author     Dmitry Torokhov <dmitry.torokhov@gmail.com>    2014-08-07 02:36:12 -0400
committer  Dmitry Torokhov <dmitry.torokhov@gmail.com>    2014-08-07 02:36:12 -0400
commit     5e2aa2ed08e2e280121dc7cf5609c87d464f12ef (patch)
tree       ca7d7b1480285e3b617fecc5b41f0ce150a82c32 /kernel
parent     f62d14a8072b9756db36ba394e2b267470a40240 (diff)
parent     fc8104bc5a3f6f49d79f45f2706f79f77a9fb2ae (diff)
Merge branch 'next' into for-linus
Prepare first round of input updates for 3.17.
Diffstat (limited to 'kernel')
-rw-r--r--  kernel/Kconfig.locks | 16
-rw-r--r--  kernel/acct.c | 6
-rw-r--r--  kernel/audit.c | 66
-rw-r--r--  kernel/auditsc.c | 27
-rw-r--r--  kernel/backtracetest.c | 18
-rw-r--r--  kernel/capability.c | 26
-rw-r--r--  kernel/cgroup.c | 1873
-rw-r--r--  kernel/cgroup_freezer.c | 138
-rw-r--r--  kernel/compat.c | 8
-rw-r--r--  kernel/context_tracking.c | 3
-rw-r--r--  kernel/cpu.c | 42
-rw-r--r--  kernel/cpuset.c | 80
-rw-r--r--  kernel/debug/debug_core.c | 4
-rw-r--r--  kernel/debug/kdb/kdb_bt.c | 2
-rw-r--r--  kernel/debug/kdb/kdb_io.c | 2
-rw-r--r--  kernel/debug/kdb/kdb_main.c | 2
-rw-r--r--  kernel/events/core.c | 368
-rw-r--r--  kernel/events/uprobes.c | 114
-rw-r--r--  kernel/exec_domain.c | 14
-rw-r--r--  kernel/exit.c | 61
-rw-r--r--  kernel/fork.c | 22
-rw-r--r--  kernel/futex.c | 243
-rw-r--r--  kernel/gcov/base.c | 6
-rw-r--r--  kernel/gcov/gcc_4_7.c | 5
-rw-r--r--  kernel/hrtimer.c | 9
-rw-r--r--  kernel/hung_task.c | 4
-rw-r--r--  kernel/irq/Kconfig | 9
-rw-r--r--  kernel/irq/chip.c | 5
-rw-r--r--  kernel/irq/internals.h | 8
-rw-r--r--  kernel/irq/irqdesc.c | 95
-rw-r--r--  kernel/irq/irqdomain.c | 6
-rw-r--r--  kernel/irq/manage.c | 4
-rw-r--r--  kernel/irq/spurious.c | 106
-rw-r--r--  kernel/kexec.c | 78
-rw-r--r--  kernel/kmod.c | 7
-rw-r--r--  kernel/kprobes.c | 392
-rw-r--r--  kernel/ksysfs.c | 5
-rw-r--r--  kernel/kthread.c | 4
-rw-r--r--  kernel/latencytop.c | 5
-rw-r--r--  kernel/locking/Makefile | 1
-rw-r--r--  kernel/locking/lockdep_internals.h | 6
-rw-r--r--  kernel/locking/locktorture.c | 12
-rw-r--r--  kernel/locking/mcs_spinlock.c | 64
-rw-r--r--  kernel/locking/mcs_spinlock.h | 9
-rw-r--r--  kernel/locking/mutex.c | 2
-rw-r--r--  kernel/locking/qrwlock.c | 133
-rw-r--r--  kernel/locking/rtmutex-debug.h | 5
-rw-r--r--  kernel/locking/rtmutex.c | 273
-rw-r--r--  kernel/locking/rtmutex.h | 5
-rw-r--r--  kernel/locking/rwsem-spinlock.c | 28
-rw-r--r--  kernel/locking/rwsem-xadd.c | 274
-rw-r--r--  kernel/locking/rwsem.c | 31
-rw-r--r--  kernel/module.c | 44
-rw-r--r--  kernel/notifier.c | 22
-rw-r--r--  kernel/panic.c | 23
-rw-r--r--  kernel/params.c | 25
-rw-r--r--  kernel/power/Kconfig | 3
-rw-r--r--  kernel/power/hibernate.c | 67
-rw-r--r--  kernel/power/main.c | 39
-rw-r--r--  kernel/power/power.h | 9
-rw-r--r--  kernel/power/process.c | 4
-rw-r--r--  kernel/power/suspend.c | 125
-rw-r--r--  kernel/power/suspend_test.c | 24
-rw-r--r--  kernel/power/swap.c | 2
-rw-r--r--  kernel/power/user.c | 3
-rw-r--r--  kernel/printk/printk.c | 332
-rw-r--r--  kernel/profile.c | 20
-rw-r--r--  kernel/rcu/rcutorture.c | 217
-rw-r--r--  kernel/rcu/tiny_plugin.h | 8
-rw-r--r--  kernel/rcu/tree.c | 469
-rw-r--r--  kernel/rcu/tree.h | 17
-rw-r--r--  kernel/rcu/tree_plugin.h | 146
-rw-r--r--  kernel/rcu/update.c | 16
-rw-r--r--  kernel/reboot.c | 21
-rw-r--r--  kernel/res_counter.c | 7
-rw-r--r--  kernel/resource.c | 7
-rw-r--r--  kernel/sched/core.c | 596
-rw-r--r--  kernel/sched/cpuacct.c | 2
-rw-r--r--  kernel/sched/cpudeadline.c | 37
-rw-r--r--  kernel/sched/cpudeadline.h | 6
-rw-r--r--  kernel/sched/cpupri.c | 16
-rw-r--r--  kernel/sched/cpupri.h | 2
-rw-r--r--  kernel/sched/cputime.c | 32
-rw-r--r--  kernel/sched/deadline.c | 30
-rw-r--r--  kernel/sched/debug.c | 2
-rw-r--r--  kernel/sched/fair.c | 647
-rw-r--r--  kernel/sched/features.h | 8
-rw-r--r--  kernel/sched/idle.c | 170
-rw-r--r--  kernel/sched/rt.c | 130
-rw-r--r--  kernel/sched/sched.h | 52
-rw-r--r--  kernel/sched/stop_task.c | 4
-rw-r--r--  kernel/sched/wait.c | 2
-rw-r--r--  kernel/seccomp.c | 114
-rw-r--r--  kernel/signal.c | 95
-rw-r--r--  kernel/smp.c | 69
-rw-r--r--  kernel/softirq.c | 4
-rw-r--r--  kernel/stop_machine.c | 1
-rw-r--r--  kernel/sys.c | 6
-rw-r--r--  kernel/sys_ni.c | 2
-rw-r--r--  kernel/sysctl.c | 125
-rw-r--r--  kernel/time/alarmtimer.c | 20
-rw-r--r--  kernel/time/ntp.c | 32
-rw-r--r--  kernel/time/sched_clock.c | 13
-rw-r--r--  kernel/time/timekeeping.c | 7
-rw-r--r--  kernel/torture.c | 40
-rw-r--r--  kernel/trace/Kconfig | 30
-rw-r--r--  kernel/trace/Makefile | 3
-rw-r--r--  kernel/trace/ftrace.c | 271
-rw-r--r--  kernel/trace/ring_buffer.c | 9
-rw-r--r--  kernel/trace/trace.c | 478
-rw-r--r--  kernel/trace/trace.h | 46
-rw-r--r--  kernel/trace/trace_benchmark.c | 198
-rw-r--r--  kernel/trace/trace_benchmark.h | 41
-rw-r--r--  kernel/trace/trace_event_perf.c | 5
-rw-r--r--  kernel/trace/trace_events.c | 14
-rw-r--r--  kernel/trace/trace_functions.c | 56
-rw-r--r--  kernel/trace/trace_functions_graph.c | 19
-rw-r--r--  kernel/trace/trace_irqsoff.c | 71
-rw-r--r--  kernel/trace/trace_kprobe.c | 74
-rw-r--r--  kernel/trace/trace_nop.c | 1
-rw-r--r--  kernel/trace/trace_output.c | 41
-rw-r--r--  kernel/trace/trace_probe.c | 65
-rw-r--r--  kernel/trace/trace_probe.h | 15
-rw-r--r--  kernel/trace/trace_sched_wakeup.c | 70
-rw-r--r--  kernel/trace/trace_selftest.c | 69
-rw-r--r--  kernel/trace/trace_stack.c | 42
-rw-r--r--  kernel/trace/trace_uprobe.c | 112
-rw-r--r--  kernel/tracepoint.c | 28
-rw-r--r--  kernel/user.c | 1
-rw-r--r--  kernel/user_namespace.c | 33
-rw-r--r--  kernel/utsname_sysctl.c | 10
-rw-r--r--  kernel/watchdog.c | 41
-rw-r--r--  kernel/workqueue.c | 493
-rw-r--r--  kernel/workqueue_internal.h | 2
134 files changed, 7066 insertions, 3912 deletions
diff --git a/kernel/Kconfig.locks b/kernel/Kconfig.locks
index d2b32ac27a39..76768ee812b2 100644
--- a/kernel/Kconfig.locks
+++ b/kernel/Kconfig.locks
@@ -220,6 +220,20 @@ config INLINE_WRITE_UNLOCK_IRQRESTORE
220 220
221endif 221endif
222 222
223config ARCH_SUPPORTS_ATOMIC_RMW
224 bool
225
223config MUTEX_SPIN_ON_OWNER 226config MUTEX_SPIN_ON_OWNER
224 def_bool y 227 def_bool y
225 depends on SMP && !DEBUG_MUTEXES 228 depends on SMP && !DEBUG_MUTEXES && ARCH_SUPPORTS_ATOMIC_RMW
229
230config RWSEM_SPIN_ON_OWNER
231 def_bool y
232 depends on SMP && RWSEM_XCHGADD_ALGORITHM && ARCH_SUPPORTS_ATOMIC_RMW
233
234config ARCH_USE_QUEUE_RWLOCK
235 bool
236
237config QUEUE_RWLOCK
238 def_bool y if ARCH_USE_QUEUE_RWLOCK
239 depends on SMP
diff --git a/kernel/acct.c b/kernel/acct.c
index 8d6e145138bb..808a86ff229d 100644
--- a/kernel/acct.c
+++ b/kernel/acct.c
@@ -55,7 +55,7 @@
55#include <linux/times.h> 55#include <linux/times.h>
56#include <linux/syscalls.h> 56#include <linux/syscalls.h>
57#include <linux/mount.h> 57#include <linux/mount.h>
58#include <asm/uaccess.h> 58#include <linux/uaccess.h>
59#include <asm/div64.h> 59#include <asm/div64.h>
60#include <linux/blkdev.h> /* sector_div */ 60#include <linux/blkdev.h> /* sector_div */
61#include <linux/pid_namespace.h> 61#include <linux/pid_namespace.h>
@@ -134,7 +134,7 @@ static int check_free_space(struct bsd_acct_struct *acct, struct file *file)
134 spin_lock(&acct_lock); 134 spin_lock(&acct_lock);
135 if (file != acct->file) { 135 if (file != acct->file) {
136 if (act) 136 if (act)
137 res = act>0; 137 res = act > 0;
138 goto out; 138 goto out;
139 } 139 }
140 140
@@ -262,7 +262,7 @@ SYSCALL_DEFINE1(acct, const char __user *, name)
262 if (name) { 262 if (name) {
263 struct filename *tmp = getname(name); 263 struct filename *tmp = getname(name);
264 if (IS_ERR(tmp)) 264 if (IS_ERR(tmp))
265 return (PTR_ERR(tmp)); 265 return PTR_ERR(tmp);
266 error = acct_on(tmp); 266 error = acct_on(tmp);
267 putname(tmp); 267 putname(tmp);
268 } else { 268 } else {
diff --git a/kernel/audit.c b/kernel/audit.c
index 47845c57eb19..3ef2e0e797e8 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -44,7 +44,7 @@
44#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt 44#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
45 45
46#include <linux/init.h> 46#include <linux/init.h>
47#include <asm/types.h> 47#include <linux/types.h>
48#include <linux/atomic.h> 48#include <linux/atomic.h>
49#include <linux/mm.h> 49#include <linux/mm.h>
50#include <linux/export.h> 50#include <linux/export.h>
@@ -424,6 +424,38 @@ static void kauditd_send_skb(struct sk_buff *skb)
424} 424}
425 425
426/* 426/*
427 * kauditd_send_multicast_skb - send the skb to multicast userspace listeners
428 *
429 * This function doesn't consume an skb as might be expected since it has to
430 * copy it anyways.
431 */
432static void kauditd_send_multicast_skb(struct sk_buff *skb)
433{
434 struct sk_buff *copy;
435 struct audit_net *aunet = net_generic(&init_net, audit_net_id);
436 struct sock *sock = aunet->nlsk;
437
438 if (!netlink_has_listeners(sock, AUDIT_NLGRP_READLOG))
439 return;
440
441 /*
442 * The seemingly wasteful skb_copy() rather than bumping the refcount
443 * using skb_get() is necessary because non-standard mods are made to
444 * the skb by the original kaudit unicast socket send routine. The
445 * existing auditd daemon assumes this breakage. Fixing this would
446 * require co-ordinating a change in the established protocol between
447 * the kaudit kernel subsystem and the auditd userspace code. There is
448 * no reason for new multicast clients to continue with this
449 * non-compliance.
450 */
451 copy = skb_copy(skb, GFP_KERNEL);
452 if (!copy)
453 return;
454
455 nlmsg_multicast(sock, copy, 0, AUDIT_NLGRP_READLOG, GFP_KERNEL);
456}
457
458/*
427 * flush_hold_queue - empty the hold queue if auditd appears 459 * flush_hold_queue - empty the hold queue if auditd appears
428 * 460 *
429 * If auditd just started, drain the queue of messages already 461 * If auditd just started, drain the queue of messages already
@@ -1076,10 +1108,22 @@ static void audit_receive(struct sk_buff *skb)
1076 mutex_unlock(&audit_cmd_mutex); 1108 mutex_unlock(&audit_cmd_mutex);
1077} 1109}
1078 1110
1111/* Run custom bind function on netlink socket group connect or bind requests. */
1112static int audit_bind(int group)
1113{
1114 if (!capable(CAP_AUDIT_READ))
1115 return -EPERM;
1116
1117 return 0;
1118}
1119
1079static int __net_init audit_net_init(struct net *net) 1120static int __net_init audit_net_init(struct net *net)
1080{ 1121{
1081 struct netlink_kernel_cfg cfg = { 1122 struct netlink_kernel_cfg cfg = {
1082 .input = audit_receive, 1123 .input = audit_receive,
1124 .bind = audit_bind,
1125 .flags = NL_CFG_F_NONROOT_RECV,
1126 .groups = AUDIT_NLGRP_MAX,
1083 }; 1127 };
1084 1128
1085 struct audit_net *aunet = net_generic(net, audit_net_id); 1129 struct audit_net *aunet = net_generic(net, audit_net_id);
@@ -1901,10 +1945,10 @@ out:
1901 * audit_log_end - end one audit record 1945 * audit_log_end - end one audit record
1902 * @ab: the audit_buffer 1946 * @ab: the audit_buffer
1903 * 1947 *
1904 * The netlink_* functions cannot be called inside an irq context, so 1948 * netlink_unicast() cannot be called inside an irq context because it blocks
1905 * the audit buffer is placed on a queue and a tasklet is scheduled to 1949 * (last arg, flags, is not set to MSG_DONTWAIT), so the audit buffer is placed
1906 * remove them from the queue outside the irq context. May be called in 1950 * on a queue and a tasklet is scheduled to remove them from the queue outside
1907 * any context. 1951 * the irq context. May be called in any context.
1908 */ 1952 */
1909void audit_log_end(struct audit_buffer *ab) 1953void audit_log_end(struct audit_buffer *ab)
1910{ 1954{
@@ -1914,6 +1958,18 @@ void audit_log_end(struct audit_buffer *ab)
1914 audit_log_lost("rate limit exceeded"); 1958 audit_log_lost("rate limit exceeded");
1915 } else { 1959 } else {
1916 struct nlmsghdr *nlh = nlmsg_hdr(ab->skb); 1960 struct nlmsghdr *nlh = nlmsg_hdr(ab->skb);
1961
1962 kauditd_send_multicast_skb(ab->skb);
1963
1964 /*
1965 * The original kaudit unicast socket sends up messages with
1966 * nlmsg_len set to the payload length rather than the entire
1967 * message length. This breaks the standard set by netlink.
1968 * The existing auditd daemon assumes this breakage. Fixing
1969 * this would require co-ordinating a change in the established
1970 * protocol between the kaudit kernel subsystem and the auditd
1971 * userspace code.
1972 */
1917 nlh->nlmsg_len = ab->skb->len - NLMSG_HDRLEN; 1973 nlh->nlmsg_len = ab->skb->len - NLMSG_HDRLEN;
1918 1974
1919 if (audit_pid) { 1975 if (audit_pid) {
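
The audit.c hunks above add an AUDIT_NLGRP_READLOG netlink multicast group, with audit_bind() requiring CAP_AUDIT_READ, so that read-only listeners can follow the audit stream without registering as the auditd daemon. A minimal userspace sketch of such a listener, assuming AUDIT_NLGRP_READLOG has the value 1 and that the process holds CAP_AUDIT_READ (the program itself is illustrative, not part of the patch):

#include <stdio.h>
#include <sys/socket.h>
#include <linux/netlink.h>

int main(void)
{
	int grp = 1;	/* AUDIT_NLGRP_READLOG, assumed value from the uapi enum */
	char buf[8192];
	struct sockaddr_nl addr = { .nl_family = AF_NETLINK };
	int sk = socket(AF_NETLINK, SOCK_RAW, NETLINK_AUDIT);

	if (sk < 0 || bind(sk, (struct sockaddr *)&addr, sizeof(addr)) < 0)
		return 1;
	/* group membership goes through audit_bind(), so CAP_AUDIT_READ is needed */
	if (setsockopt(sk, SOL_NETLINK, NETLINK_ADD_MEMBERSHIP, &grp, sizeof(grp)) < 0)
		return 1;
	for (;;) {
		ssize_t len = recv(sk, buf, sizeof(buf), 0);

		if (len <= 0)
			break;
		/* each datagram is one audit record, a copy of the unicast skb */
		printf("audit record, %zd bytes\n", len);
	}
	return 0;
}
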
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index f251a5e8d17a..21eae3c05ec0 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -728,6 +728,22 @@ static enum audit_state audit_filter_task(struct task_struct *tsk, char **key)
728 return AUDIT_BUILD_CONTEXT; 728 return AUDIT_BUILD_CONTEXT;
729} 729}
730 730
731static int audit_in_mask(const struct audit_krule *rule, unsigned long val)
732{
733 int word, bit;
734
735 if (val > 0xffffffff)
736 return false;
737
738 word = AUDIT_WORD(val);
739 if (word >= AUDIT_BITMASK_SIZE)
740 return false;
741
742 bit = AUDIT_BIT(val);
743
744 return rule->mask[word] & bit;
745}
746
731/* At syscall entry and exit time, this filter is called if the 747/* At syscall entry and exit time, this filter is called if the
732 * audit_state is not low enough that auditing cannot take place, but is 748 * audit_state is not low enough that auditing cannot take place, but is
733 * also not high enough that we already know we have to write an audit 749 * also not high enough that we already know we have to write an audit
@@ -745,11 +761,8 @@ static enum audit_state audit_filter_syscall(struct task_struct *tsk,
745 761
746 rcu_read_lock(); 762 rcu_read_lock();
747 if (!list_empty(list)) { 763 if (!list_empty(list)) {
748 int word = AUDIT_WORD(ctx->major);
749 int bit = AUDIT_BIT(ctx->major);
750
751 list_for_each_entry_rcu(e, list, list) { 764 list_for_each_entry_rcu(e, list, list) {
752 if ((e->rule.mask[word] & bit) == bit && 765 if (audit_in_mask(&e->rule, ctx->major) &&
753 audit_filter_rules(tsk, &e->rule, ctx, NULL, 766 audit_filter_rules(tsk, &e->rule, ctx, NULL,
754 &state, false)) { 767 &state, false)) {
755 rcu_read_unlock(); 768 rcu_read_unlock();
@@ -769,20 +782,16 @@ static enum audit_state audit_filter_syscall(struct task_struct *tsk,
769static int audit_filter_inode_name(struct task_struct *tsk, 782static int audit_filter_inode_name(struct task_struct *tsk,
770 struct audit_names *n, 783 struct audit_names *n,
771 struct audit_context *ctx) { 784 struct audit_context *ctx) {
772 int word, bit;
773 int h = audit_hash_ino((u32)n->ino); 785 int h = audit_hash_ino((u32)n->ino);
774 struct list_head *list = &audit_inode_hash[h]; 786 struct list_head *list = &audit_inode_hash[h];
775 struct audit_entry *e; 787 struct audit_entry *e;
776 enum audit_state state; 788 enum audit_state state;
777 789
778 word = AUDIT_WORD(ctx->major);
779 bit = AUDIT_BIT(ctx->major);
780
781 if (list_empty(list)) 790 if (list_empty(list))
782 return 0; 791 return 0;
783 792
784 list_for_each_entry_rcu(e, list, list) { 793 list_for_each_entry_rcu(e, list, list) {
785 if ((e->rule.mask[word] & bit) == bit && 794 if (audit_in_mask(&e->rule, ctx->major) &&
786 audit_filter_rules(tsk, &e->rule, ctx, n, &state, false)) { 795 audit_filter_rules(tsk, &e->rule, ctx, n, &state, false)) {
787 ctx->current_state = state; 796 ctx->current_state = state;
788 return 1; 797 return 1;
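
The new audit_in_mask() helper factors out the syscall-number-to-bitmask lookup that both filter paths above used to open-code, and adds the range checks. A worked example of the mapping, assuming the usual uapi macro definitions (AUDIT_WORD(nr) == nr / 32, AUDIT_BIT(nr) == 1 << (nr % 32)):

#include <assert.h>

#define AUDIT_WORD(nr) ((unsigned int)((nr) / 32))	/* assumed uapi definition */
#define AUDIT_BIT(nr)  (1U << ((nr) % 32))		/* assumed uapi definition */

int main(void)
{
	/* __NR_execve is 59 on x86_64: word 1, bit 27 of rule->mask[] */
	assert(AUDIT_WORD(59) == 1);
	assert(AUDIT_BIT(59) == (1U << 27));
	/* values above 0xffffffff or past AUDIT_BITMASK_SIZE words are
	 * rejected by audit_in_mask() before any array access */
	return 0;
}
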
diff --git a/kernel/backtracetest.c b/kernel/backtracetest.c
index a5e026bc45c4..1323360d90e3 100644
--- a/kernel/backtracetest.c
+++ b/kernel/backtracetest.c
@@ -19,8 +19,8 @@
19 19
20static void backtrace_test_normal(void) 20static void backtrace_test_normal(void)
21{ 21{
22 printk("Testing a backtrace from process context.\n"); 22 pr_info("Testing a backtrace from process context.\n");
23 printk("The following trace is a kernel self test and not a bug!\n"); 23 pr_info("The following trace is a kernel self test and not a bug!\n");
24 24
25 dump_stack(); 25 dump_stack();
26} 26}
@@ -37,8 +37,8 @@ static DECLARE_TASKLET(backtrace_tasklet, &backtrace_test_irq_callback, 0);
37 37
38static void backtrace_test_irq(void) 38static void backtrace_test_irq(void)
39{ 39{
40 printk("Testing a backtrace from irq context.\n"); 40 pr_info("Testing a backtrace from irq context.\n");
41 printk("The following trace is a kernel self test and not a bug!\n"); 41 pr_info("The following trace is a kernel self test and not a bug!\n");
42 42
43 init_completion(&backtrace_work); 43 init_completion(&backtrace_work);
44 tasklet_schedule(&backtrace_tasklet); 44 tasklet_schedule(&backtrace_tasklet);
@@ -51,8 +51,8 @@ static void backtrace_test_saved(void)
51 struct stack_trace trace; 51 struct stack_trace trace;
52 unsigned long entries[8]; 52 unsigned long entries[8];
53 53
54 printk("Testing a saved backtrace.\n"); 54 pr_info("Testing a saved backtrace.\n");
55 printk("The following trace is a kernel self test and not a bug!\n"); 55 pr_info("The following trace is a kernel self test and not a bug!\n");
56 56
57 trace.nr_entries = 0; 57 trace.nr_entries = 0;
58 trace.max_entries = ARRAY_SIZE(entries); 58 trace.max_entries = ARRAY_SIZE(entries);
@@ -65,19 +65,19 @@ static void backtrace_test_saved(void)
65#else 65#else
66static void backtrace_test_saved(void) 66static void backtrace_test_saved(void)
67{ 67{
68 printk("Saved backtrace test skipped.\n"); 68 pr_info("Saved backtrace test skipped.\n");
69} 69}
70#endif 70#endif
71 71
72static int backtrace_regression_test(void) 72static int backtrace_regression_test(void)
73{ 73{
74 printk("====[ backtrace testing ]===========\n"); 74 pr_info("====[ backtrace testing ]===========\n");
75 75
76 backtrace_test_normal(); 76 backtrace_test_normal();
77 backtrace_test_irq(); 77 backtrace_test_irq();
78 backtrace_test_saved(); 78 backtrace_test_saved();
79 79
80 printk("====[ end of backtrace testing ]====\n"); 80 pr_info("====[ end of backtrace testing ]====\n");
81 return 0; 81 return 0;
82} 82}
83 83
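
The printk() calls above are switched to pr_info(), which logs at KERN_INFO and, when a file defines pr_fmt(), automatically prefixes every message. A minimal, hypothetical module showing that pattern (module and function names are illustrative, not taken from backtracetest.c):

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt	/* must come before the includes */

#include <linux/kernel.h>
#include <linux/module.h>

static int __init pr_info_example_init(void)
{
	pr_info("hello\n");	/* logged as "<modname>: hello" at KERN_INFO */
	return 0;
}
module_init(pr_info_example_init);
MODULE_LICENSE("GPL");
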
diff --git a/kernel/capability.c b/kernel/capability.c
index a8d63df0c322..a5cf13c018ce 100644
--- a/kernel/capability.c
+++ b/kernel/capability.c
@@ -24,7 +24,6 @@
24 */ 24 */
25 25
26const kernel_cap_t __cap_empty_set = CAP_EMPTY_SET; 26const kernel_cap_t __cap_empty_set = CAP_EMPTY_SET;
27
28EXPORT_SYMBOL(__cap_empty_set); 27EXPORT_SYMBOL(__cap_empty_set);
29 28
30int file_caps_enabled = 1; 29int file_caps_enabled = 1;
@@ -189,7 +188,7 @@ SYSCALL_DEFINE2(capget, cap_user_header_t, header, cap_user_data_t, dataptr)
189 * 188 *
190 * An alternative would be to return an error here 189 * An alternative would be to return an error here
191 * (-ERANGE), but that causes legacy applications to 190 * (-ERANGE), but that causes legacy applications to
192 * unexpectidly fail; the capget/modify/capset aborts 191 * unexpectedly fail; the capget/modify/capset aborts
193 * before modification is attempted and the application 192 * before modification is attempted and the application
194 * fails. 193 * fails.
195 */ 194 */
@@ -395,7 +394,8 @@ EXPORT_SYMBOL(ns_capable);
395 * This does not set PF_SUPERPRIV because the caller may not 394 * This does not set PF_SUPERPRIV because the caller may not
396 * actually be privileged. 395 * actually be privileged.
397 */ 396 */
398bool file_ns_capable(const struct file *file, struct user_namespace *ns, int cap) 397bool file_ns_capable(const struct file *file, struct user_namespace *ns,
398 int cap)
399{ 399{
400 if (WARN_ON_ONCE(!cap_valid(cap))) 400 if (WARN_ON_ONCE(!cap_valid(cap)))
401 return false; 401 return false;
@@ -424,23 +424,19 @@ bool capable(int cap)
424EXPORT_SYMBOL(capable); 424EXPORT_SYMBOL(capable);
425 425
426/** 426/**
427 * inode_capable - Check superior capability over inode 427 * capable_wrt_inode_uidgid - Check nsown_capable and uid and gid mapped
428 * @inode: The inode in question 428 * @inode: The inode in question
429 * @cap: The capability in question 429 * @cap: The capability in question
430 * 430 *
431 * Return true if the current task has the given superior capability 431 * Return true if the current task has the given capability targeted at
432 * targeted at it's own user namespace and that the given inode is owned 432 * its own user namespace and that the given inode's uid and gid are
433 * by the current user namespace or a child namespace. 433 * mapped into the current user namespace.
434 *
435 * Currently we check to see if an inode is owned by the current
436 * user namespace by seeing if the inode's owner maps into the
437 * current user namespace.
438 *
439 */ 434 */
440bool inode_capable(const struct inode *inode, int cap) 435bool capable_wrt_inode_uidgid(const struct inode *inode, int cap)
441{ 436{
442 struct user_namespace *ns = current_user_ns(); 437 struct user_namespace *ns = current_user_ns();
443 438
444 return ns_capable(ns, cap) && kuid_has_mapping(ns, inode->i_uid); 439 return ns_capable(ns, cap) && kuid_has_mapping(ns, inode->i_uid) &&
440 kgid_has_mapping(ns, inode->i_gid);
445} 441}
446EXPORT_SYMBOL(inode_capable); 442EXPORT_SYMBOL(capable_wrt_inode_uidgid);
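
Renaming inode_capable() to capable_wrt_inode_uidgid() also tightens the check: privilege over an inode now requires that both its uid and gid map into the caller's user namespace. A hypothetical caller sketch in the spirit of inode_owner_or_capable(), using the new helper (the function name here is illustrative):

#include <linux/capability.h>
#include <linux/cred.h>
#include <linux/fs.h>

static bool may_act_on_inode(const struct inode *inode)
{
	if (uid_eq(current_fsuid(), inode->i_uid))
		return true;
	/* privilege only counts if i_uid AND i_gid map into the caller's
	 * user namespace, per capable_wrt_inode_uidgid() above */
	return capable_wrt_inode_uidgid(inode, CAP_FOWNER);
}
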
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 9fcdaa705b6c..70776aec2562 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -26,6 +26,8 @@
26 * distribution for more details. 26 * distribution for more details.
27 */ 27 */
28 28
29#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
30
29#include <linux/cgroup.h> 31#include <linux/cgroup.h>
30#include <linux/cred.h> 32#include <linux/cred.h>
31#include <linux/ctype.h> 33#include <linux/ctype.h>
@@ -33,6 +35,7 @@
33#include <linux/init_task.h> 35#include <linux/init_task.h>
34#include <linux/kernel.h> 36#include <linux/kernel.h>
35#include <linux/list.h> 37#include <linux/list.h>
38#include <linux/magic.h>
36#include <linux/mm.h> 39#include <linux/mm.h>
37#include <linux/mutex.h> 40#include <linux/mutex.h>
38#include <linux/mount.h> 41#include <linux/mount.h>
@@ -69,15 +72,6 @@
69 MAX_CFTYPE_NAME + 2) 72 MAX_CFTYPE_NAME + 2)
70 73
71/* 74/*
72 * cgroup_tree_mutex nests above cgroup_mutex and protects cftypes, file
73 * creation/removal and hierarchy changing operations including cgroup
74 * creation, removal, css association and controller rebinding. This outer
75 * lock is needed mainly to resolve the circular dependency between kernfs
76 * active ref and cgroup_mutex. cgroup_tree_mutex nests above both.
77 */
78static DEFINE_MUTEX(cgroup_tree_mutex);
79
80/*
81 * cgroup_mutex is the master lock. Any modification to cgroup or its 75 * cgroup_mutex is the master lock. Any modification to cgroup or its
82 * hierarchy must be performed while holding it. 76 * hierarchy must be performed while holding it.
83 * 77 *
@@ -98,16 +92,21 @@ static DECLARE_RWSEM(css_set_rwsem);
98#endif 92#endif
99 93
100/* 94/*
95 * Protects cgroup_idr and css_idr so that IDs can be released without
96 * grabbing cgroup_mutex.
97 */
98static DEFINE_SPINLOCK(cgroup_idr_lock);
99
100/*
101 * Protects cgroup_subsys->release_agent_path. Modifying it also requires 101 * Protects cgroup_subsys->release_agent_path. Modifying it also requires
102 * cgroup_mutex. Reading requires either cgroup_mutex or this spinlock. 102 * cgroup_mutex. Reading requires either cgroup_mutex or this spinlock.
103 */ 103 */
104static DEFINE_SPINLOCK(release_agent_path_lock); 104static DEFINE_SPINLOCK(release_agent_path_lock);
105 105
106#define cgroup_assert_mutexes_or_rcu_locked() \ 106#define cgroup_assert_mutex_or_rcu_locked() \
107 rcu_lockdep_assert(rcu_read_lock_held() || \ 107 rcu_lockdep_assert(rcu_read_lock_held() || \
108 lockdep_is_held(&cgroup_tree_mutex) || \
109 lockdep_is_held(&cgroup_mutex), \ 108 lockdep_is_held(&cgroup_mutex), \
110 "cgroup_[tree_]mutex or RCU read lock required"); 109 "cgroup_mutex or RCU read lock required");
111 110
112/* 111/*
113 * cgroup destruction makes heavy use of work items and there can be a lot 112 * cgroup destruction makes heavy use of work items and there can be a lot
@@ -150,6 +149,13 @@ struct cgroup_root cgrp_dfl_root;
150 */ 149 */
151static bool cgrp_dfl_root_visible; 150static bool cgrp_dfl_root_visible;
152 151
152/* some controllers are not supported in the default hierarchy */
153static const unsigned int cgrp_dfl_root_inhibit_ss_mask = 0
154#ifdef CONFIG_CGROUP_DEBUG
155 | (1 << debug_cgrp_id)
156#endif
157 ;
158
153/* The list of hierarchy roots */ 159/* The list of hierarchy roots */
154 160
155static LIST_HEAD(cgroup_roots); 161static LIST_HEAD(cgroup_roots);
@@ -159,14 +165,13 @@ static int cgroup_root_count;
159static DEFINE_IDR(cgroup_hierarchy_idr); 165static DEFINE_IDR(cgroup_hierarchy_idr);
160 166
161/* 167/*
162 * Assign a monotonically increasing serial number to cgroups. It 168 * Assign a monotonically increasing serial number to csses. It guarantees
163 * guarantees cgroups with bigger numbers are newer than those with smaller 169 * cgroups with bigger numbers are newer than those with smaller numbers.
164 * numbers. Also, as cgroups are always appended to the parent's 170 * Also, as csses are always appended to the parent's ->children list, it
165 * ->children list, it guarantees that sibling cgroups are always sorted in 171 * guarantees that sibling csses are always sorted in the ascending serial
166 * the ascending serial number order on the list. Protected by 172 * number order on the list. Protected by cgroup_mutex.
167 * cgroup_mutex.
168 */ 173 */
169static u64 cgroup_serial_nr_next = 1; 174static u64 css_serial_nr_next = 1;
170 175
171/* This flag indicates whether tasks in the fork and exit paths should 176/* This flag indicates whether tasks in the fork and exit paths should
172 * check for fork/exit handlers to call. This avoids us having to do 177 * check for fork/exit handlers to call. This avoids us having to do
@@ -179,17 +184,59 @@ static struct cftype cgroup_base_files[];
179 184
180static void cgroup_put(struct cgroup *cgrp); 185static void cgroup_put(struct cgroup *cgrp);
181static int rebind_subsystems(struct cgroup_root *dst_root, 186static int rebind_subsystems(struct cgroup_root *dst_root,
182 unsigned long ss_mask); 187 unsigned int ss_mask);
183static void cgroup_destroy_css_killed(struct cgroup *cgrp);
184static int cgroup_destroy_locked(struct cgroup *cgrp); 188static int cgroup_destroy_locked(struct cgroup *cgrp);
189static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss);
190static void css_release(struct percpu_ref *ref);
191static void kill_css(struct cgroup_subsys_state *css);
185static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[], 192static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[],
186 bool is_add); 193 bool is_add);
187static void cgroup_pidlist_destroy_all(struct cgroup *cgrp); 194static void cgroup_pidlist_destroy_all(struct cgroup *cgrp);
188 195
196/* IDR wrappers which synchronize using cgroup_idr_lock */
197static int cgroup_idr_alloc(struct idr *idr, void *ptr, int start, int end,
198 gfp_t gfp_mask)
199{
200 int ret;
201
202 idr_preload(gfp_mask);
203 spin_lock_bh(&cgroup_idr_lock);
204 ret = idr_alloc(idr, ptr, start, end, gfp_mask);
205 spin_unlock_bh(&cgroup_idr_lock);
206 idr_preload_end();
207 return ret;
208}
209
210static void *cgroup_idr_replace(struct idr *idr, void *ptr, int id)
211{
212 void *ret;
213
214 spin_lock_bh(&cgroup_idr_lock);
215 ret = idr_replace(idr, ptr, id);
216 spin_unlock_bh(&cgroup_idr_lock);
217 return ret;
218}
219
220static void cgroup_idr_remove(struct idr *idr, int id)
221{
222 spin_lock_bh(&cgroup_idr_lock);
223 idr_remove(idr, id);
224 spin_unlock_bh(&cgroup_idr_lock);
225}
226
227static struct cgroup *cgroup_parent(struct cgroup *cgrp)
228{
229 struct cgroup_subsys_state *parent_css = cgrp->self.parent;
230
231 if (parent_css)
232 return container_of(parent_css, struct cgroup, self);
233 return NULL;
234}
235
189/** 236/**
190 * cgroup_css - obtain a cgroup's css for the specified subsystem 237 * cgroup_css - obtain a cgroup's css for the specified subsystem
191 * @cgrp: the cgroup of interest 238 * @cgrp: the cgroup of interest
192 * @ss: the subsystem of interest (%NULL returns the dummy_css) 239 * @ss: the subsystem of interest (%NULL returns @cgrp->self)
193 * 240 *
194 * Return @cgrp's css (cgroup_subsys_state) associated with @ss. This 241 * Return @cgrp's css (cgroup_subsys_state) associated with @ss. This
195 * function must be called either under cgroup_mutex or rcu_read_lock() and 242 * function must be called either under cgroup_mutex or rcu_read_lock() and
@@ -202,23 +249,49 @@ static struct cgroup_subsys_state *cgroup_css(struct cgroup *cgrp,
202{ 249{
203 if (ss) 250 if (ss)
204 return rcu_dereference_check(cgrp->subsys[ss->id], 251 return rcu_dereference_check(cgrp->subsys[ss->id],
205 lockdep_is_held(&cgroup_tree_mutex) ||
206 lockdep_is_held(&cgroup_mutex)); 252 lockdep_is_held(&cgroup_mutex));
207 else 253 else
208 return &cgrp->dummy_css; 254 return &cgrp->self;
255}
256
257/**
258 * cgroup_e_css - obtain a cgroup's effective css for the specified subsystem
259 * @cgrp: the cgroup of interest
260 * @ss: the subsystem of interest (%NULL returns @cgrp->self)
261 *
262 * Similar to cgroup_css() but returns the effctive css, which is defined
263 * as the matching css of the nearest ancestor including self which has @ss
264 * enabled. If @ss is associated with the hierarchy @cgrp is on, this
265 * function is guaranteed to return non-NULL css.
266 */
267static struct cgroup_subsys_state *cgroup_e_css(struct cgroup *cgrp,
268 struct cgroup_subsys *ss)
269{
270 lockdep_assert_held(&cgroup_mutex);
271
272 if (!ss)
273 return &cgrp->self;
274
275 if (!(cgrp->root->subsys_mask & (1 << ss->id)))
276 return NULL;
277
278 while (cgroup_parent(cgrp) &&
279 !(cgroup_parent(cgrp)->child_subsys_mask & (1 << ss->id)))
280 cgrp = cgroup_parent(cgrp);
281
282 return cgroup_css(cgrp, ss);
209} 283}
210 284
211/* convenient tests for these bits */ 285/* convenient tests for these bits */
212static inline bool cgroup_is_dead(const struct cgroup *cgrp) 286static inline bool cgroup_is_dead(const struct cgroup *cgrp)
213{ 287{
214 return test_bit(CGRP_DEAD, &cgrp->flags); 288 return !(cgrp->self.flags & CSS_ONLINE);
215} 289}
216 290
217struct cgroup_subsys_state *seq_css(struct seq_file *seq) 291struct cgroup_subsys_state *of_css(struct kernfs_open_file *of)
218{ 292{
219 struct kernfs_open_file *of = seq->private;
220 struct cgroup *cgrp = of->kn->parent->priv; 293 struct cgroup *cgrp = of->kn->parent->priv;
221 struct cftype *cft = seq_cft(seq); 294 struct cftype *cft = of_cft(of);
222 295
223 /* 296 /*
224 * This is open and unprotected implementation of cgroup_css(). 297 * This is open and unprotected implementation of cgroup_css().
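
To make the cgroup_e_css() walk above concrete, a small hypothetical hierarchy (not from the patch):

/*
 * root -- A -- B, where root enables "memory" for its children (so A has
 * a memory css) but A does not enable it for its own children (so B has
 * none of its own):
 *
 *   cgroup_e_css(A, memory_ss)              -> A's own memory css
 *   cgroup_e_css(B, memory_ss)              -> A's memory css, the nearest
 *                                              ancestor with the controller
 *   cgroup_e_css(B, ss not on this root)    -> NULL
 */
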
@@ -231,9 +304,9 @@ struct cgroup_subsys_state *seq_css(struct seq_file *seq)
231 if (cft->ss) 304 if (cft->ss)
232 return rcu_dereference_raw(cgrp->subsys[cft->ss->id]); 305 return rcu_dereference_raw(cgrp->subsys[cft->ss->id]);
233 else 306 else
234 return &cgrp->dummy_css; 307 return &cgrp->self;
235} 308}
236EXPORT_SYMBOL_GPL(seq_css); 309EXPORT_SYMBOL_GPL(of_css);
237 310
238/** 311/**
239 * cgroup_is_descendant - test ancestry 312 * cgroup_is_descendant - test ancestry
@@ -249,7 +322,7 @@ bool cgroup_is_descendant(struct cgroup *cgrp, struct cgroup *ancestor)
249 while (cgrp) { 322 while (cgrp) {
250 if (cgrp == ancestor) 323 if (cgrp == ancestor)
251 return true; 324 return true;
252 cgrp = cgrp->parent; 325 cgrp = cgroup_parent(cgrp);
253 } 326 }
254 return false; 327 return false;
255} 328}
@@ -273,17 +346,30 @@ static int notify_on_release(const struct cgroup *cgrp)
273 * @ssid: the index of the subsystem, CGROUP_SUBSYS_COUNT after reaching the end 346 * @ssid: the index of the subsystem, CGROUP_SUBSYS_COUNT after reaching the end
274 * @cgrp: the target cgroup to iterate css's of 347 * @cgrp: the target cgroup to iterate css's of
275 * 348 *
276 * Should be called under cgroup_mutex. 349 * Should be called under cgroup_[tree_]mutex.
277 */ 350 */
278#define for_each_css(css, ssid, cgrp) \ 351#define for_each_css(css, ssid, cgrp) \
279 for ((ssid) = 0; (ssid) < CGROUP_SUBSYS_COUNT; (ssid)++) \ 352 for ((ssid) = 0; (ssid) < CGROUP_SUBSYS_COUNT; (ssid)++) \
280 if (!((css) = rcu_dereference_check( \ 353 if (!((css) = rcu_dereference_check( \
281 (cgrp)->subsys[(ssid)], \ 354 (cgrp)->subsys[(ssid)], \
282 lockdep_is_held(&cgroup_tree_mutex) || \
283 lockdep_is_held(&cgroup_mutex)))) { } \ 355 lockdep_is_held(&cgroup_mutex)))) { } \
284 else 356 else
285 357
286/** 358/**
359 * for_each_e_css - iterate all effective css's of a cgroup
360 * @css: the iteration cursor
361 * @ssid: the index of the subsystem, CGROUP_SUBSYS_COUNT after reaching the end
362 * @cgrp: the target cgroup to iterate css's of
363 *
364 * Should be called under cgroup_[tree_]mutex.
365 */
366#define for_each_e_css(css, ssid, cgrp) \
367 for ((ssid) = 0; (ssid) < CGROUP_SUBSYS_COUNT; (ssid)++) \
368 if (!((css) = cgroup_e_css(cgrp, cgroup_subsys[(ssid)]))) \
369 ; \
370 else
371
372/**
287 * for_each_subsys - iterate all enabled cgroup subsystems 373 * for_each_subsys - iterate all enabled cgroup subsystems
288 * @ss: the iteration cursor 374 * @ss: the iteration cursor
289 * @ssid: the index of @ss, CGROUP_SUBSYS_COUNT after reaching the end 375 * @ssid: the index of @ss, CGROUP_SUBSYS_COUNT after reaching the end
@@ -296,22 +382,13 @@ static int notify_on_release(const struct cgroup *cgrp)
296#define for_each_root(root) \ 382#define for_each_root(root) \
297 list_for_each_entry((root), &cgroup_roots, root_list) 383 list_for_each_entry((root), &cgroup_roots, root_list)
298 384
299/** 385/* iterate over child cgrps, lock should be held throughout iteration */
300 * cgroup_lock_live_group - take cgroup_mutex and check that cgrp is alive. 386#define cgroup_for_each_live_child(child, cgrp) \
301 * @cgrp: the cgroup to be checked for liveness 387 list_for_each_entry((child), &(cgrp)->self.children, self.sibling) \
302 * 388 if (({ lockdep_assert_held(&cgroup_mutex); \
303 * On success, returns true; the mutex should be later unlocked. On 389 cgroup_is_dead(child); })) \
304 * failure returns false with no lock held. 390 ; \
305 */ 391 else
306static bool cgroup_lock_live_group(struct cgroup *cgrp)
307{
308 mutex_lock(&cgroup_mutex);
309 if (cgroup_is_dead(cgrp)) {
310 mutex_unlock(&cgroup_mutex);
311 return false;
312 }
313 return true;
314}
315 392
316/* the list of cgroups eligible for automatic release. Protected by 393/* the list of cgroups eligible for automatic release. Protected by
317 * release_list_lock */ 394 * release_list_lock */
@@ -348,7 +425,7 @@ struct cgrp_cset_link {
348 * reference-counted, to improve performance when child cgroups 425 * reference-counted, to improve performance when child cgroups
349 * haven't been created. 426 * haven't been created.
350 */ 427 */
351static struct css_set init_css_set = { 428struct css_set init_css_set = {
352 .refcount = ATOMIC_INIT(1), 429 .refcount = ATOMIC_INIT(1),
353 .cgrp_links = LIST_HEAD_INIT(init_css_set.cgrp_links), 430 .cgrp_links = LIST_HEAD_INIT(init_css_set.cgrp_links),
354 .tasks = LIST_HEAD_INIT(init_css_set.tasks), 431 .tasks = LIST_HEAD_INIT(init_css_set.tasks),
@@ -359,6 +436,43 @@ static struct css_set init_css_set = {
359 436
360static int css_set_count = 1; /* 1 for init_css_set */ 437static int css_set_count = 1; /* 1 for init_css_set */
361 438
439/**
440 * cgroup_update_populated - updated populated count of a cgroup
441 * @cgrp: the target cgroup
442 * @populated: inc or dec populated count
443 *
444 * @cgrp is either getting the first task (css_set) or losing the last.
445 * Update @cgrp->populated_cnt accordingly. The count is propagated
446 * towards root so that a given cgroup's populated_cnt is zero iff the
447 * cgroup and all its descendants are empty.
448 *
449 * @cgrp's interface file "cgroup.populated" is zero if
450 * @cgrp->populated_cnt is zero and 1 otherwise. When @cgrp->populated_cnt
451 * changes from or to zero, userland is notified that the content of the
452 * interface file has changed. This can be used to detect when @cgrp and
453 * its descendants become populated or empty.
454 */
455static void cgroup_update_populated(struct cgroup *cgrp, bool populated)
456{
457 lockdep_assert_held(&css_set_rwsem);
458
459 do {
460 bool trigger;
461
462 if (populated)
463 trigger = !cgrp->populated_cnt++;
464 else
465 trigger = !--cgrp->populated_cnt;
466
467 if (!trigger)
468 break;
469
470 if (cgrp->populated_kn)
471 kernfs_notify(cgrp->populated_kn);
472 cgrp = cgroup_parent(cgrp);
473 } while (cgrp);
474}
475
362/* 476/*
363 * hash table for cgroup groups. This improves the performance to find 477 * hash table for cgroup groups. This improves the performance to find
364 * an existing css_set. This hash doesn't (currently) take into 478 * an existing css_set. This hash doesn't (currently) take into
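
The populated_cnt propagation above backs the "cgroup.populated" interface file: it reads 0 or 1, and kernfs_notify() wakes up anyone waiting on it when the value flips. A hypothetical userspace waiter, assuming the usual kernfs poll semantics (POLLPRI/POLLERR after a notify) and an illustrative mount path:

#include <fcntl.h>
#include <poll.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	char buf[4];
	struct pollfd pfd = { .events = POLLPRI };

	pfd.fd = open("/sys/fs/cgroup/mygrp/cgroup.populated", O_RDONLY);
	if (pfd.fd < 0)
		return 1;

	for (;;) {
		if (pread(pfd.fd, buf, 1, 0) == 1 && buf[0] == '0') {
			puts("cgroup subtree is empty");	/* populated_cnt hit zero */
			break;
		}
		poll(&pfd, 1, -1);	/* woken by kernfs_notify() above */
	}
	close(pfd.fd);
	return 0;
}
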
@@ -383,6 +497,8 @@ static unsigned long css_set_hash(struct cgroup_subsys_state *css[])
383static void put_css_set_locked(struct css_set *cset, bool taskexit) 497static void put_css_set_locked(struct css_set *cset, bool taskexit)
384{ 498{
385 struct cgrp_cset_link *link, *tmp_link; 499 struct cgrp_cset_link *link, *tmp_link;
500 struct cgroup_subsys *ss;
501 int ssid;
386 502
387 lockdep_assert_held(&css_set_rwsem); 503 lockdep_assert_held(&css_set_rwsem);
388 504
@@ -390,6 +506,8 @@ static void put_css_set_locked(struct css_set *cset, bool taskexit)
390 return; 506 return;
391 507
392 /* This css_set is dead. unlink it and release cgroup refcounts */ 508 /* This css_set is dead. unlink it and release cgroup refcounts */
509 for_each_subsys(ss, ssid)
510 list_del(&cset->e_cset_node[ssid]);
393 hash_del(&cset->hlist); 511 hash_del(&cset->hlist);
394 css_set_count--; 512 css_set_count--;
395 513
@@ -400,10 +518,13 @@ static void put_css_set_locked(struct css_set *cset, bool taskexit)
400 list_del(&link->cgrp_link); 518 list_del(&link->cgrp_link);
401 519
402 /* @cgrp can't go away while we're holding css_set_rwsem */ 520 /* @cgrp can't go away while we're holding css_set_rwsem */
403 if (list_empty(&cgrp->cset_links) && notify_on_release(cgrp)) { 521 if (list_empty(&cgrp->cset_links)) {
404 if (taskexit) 522 cgroup_update_populated(cgrp, false);
405 set_bit(CGRP_RELEASABLE, &cgrp->flags); 523 if (notify_on_release(cgrp)) {
406 check_for_release(cgrp); 524 if (taskexit)
525 set_bit(CGRP_RELEASABLE, &cgrp->flags);
526 check_for_release(cgrp);
527 }
407 } 528 }
408 529
409 kfree(link); 530 kfree(link);
@@ -452,20 +573,20 @@ static bool compare_css_sets(struct css_set *cset,
452{ 573{
453 struct list_head *l1, *l2; 574 struct list_head *l1, *l2;
454 575
455 if (memcmp(template, cset->subsys, sizeof(cset->subsys))) { 576 /*
456 /* Not all subsystems matched */ 577 * On the default hierarchy, there can be csets which are
578 * associated with the same set of cgroups but different csses.
579 * Let's first ensure that csses match.
580 */
581 if (memcmp(template, cset->subsys, sizeof(cset->subsys)))
457 return false; 582 return false;
458 }
459 583
460 /* 584 /*
461 * Compare cgroup pointers in order to distinguish between 585 * Compare cgroup pointers in order to distinguish between
462 * different cgroups in heirarchies with no subsystems. We 586 * different cgroups in hierarchies. As different cgroups may
463 * could get by with just this check alone (and skip the 587 * share the same effective css, this comparison is always
464 * memcmp above) but on most setups the memcmp check will 588 * necessary.
465 * avoid the need for this more expensive check on almost all
466 * candidates.
467 */ 589 */
468
469 l1 = &cset->cgrp_links; 590 l1 = &cset->cgrp_links;
470 l2 = &old_cset->cgrp_links; 591 l2 = &old_cset->cgrp_links;
471 while (1) { 592 while (1) {
@@ -529,14 +650,17 @@ static struct css_set *find_existing_css_set(struct css_set *old_cset,
529 * won't change, so no need for locking. 650 * won't change, so no need for locking.
530 */ 651 */
531 for_each_subsys(ss, i) { 652 for_each_subsys(ss, i) {
532 if (root->cgrp.subsys_mask & (1UL << i)) { 653 if (root->subsys_mask & (1UL << i)) {
533 /* Subsystem is in this hierarchy. So we want 654 /*
534 * the subsystem state from the new 655 * @ss is in this hierarchy, so we want the
535 * cgroup */ 656 * effective css from @cgrp.
536 template[i] = cgroup_css(cgrp, ss); 657 */
658 template[i] = cgroup_e_css(cgrp, ss);
537 } else { 659 } else {
538 /* Subsystem is not in this hierarchy, so we 660 /*
539 * don't want to change the subsystem state */ 661 * @ss is not in this hierarchy, so we don't want
662 * to change the css.
663 */
540 template[i] = old_cset->subsys[i]; 664 template[i] = old_cset->subsys[i];
541 } 665 }
542 } 666 }
@@ -602,10 +726,18 @@ static void link_css_set(struct list_head *tmp_links, struct css_set *cset,
602 struct cgrp_cset_link *link; 726 struct cgrp_cset_link *link;
603 727
604 BUG_ON(list_empty(tmp_links)); 728 BUG_ON(list_empty(tmp_links));
729
730 if (cgroup_on_dfl(cgrp))
731 cset->dfl_cgrp = cgrp;
732
605 link = list_first_entry(tmp_links, struct cgrp_cset_link, cset_link); 733 link = list_first_entry(tmp_links, struct cgrp_cset_link, cset_link);
606 link->cset = cset; 734 link->cset = cset;
607 link->cgrp = cgrp; 735 link->cgrp = cgrp;
736
737 if (list_empty(&cgrp->cset_links))
738 cgroup_update_populated(cgrp, true);
608 list_move(&link->cset_link, &cgrp->cset_links); 739 list_move(&link->cset_link, &cgrp->cset_links);
740
609 /* 741 /*
610 * Always add links to the tail of the list so that the list 742 * Always add links to the tail of the list so that the list
611 * is sorted by order of hierarchy creation 743 * is sorted by order of hierarchy creation
@@ -628,7 +760,9 @@ static struct css_set *find_css_set(struct css_set *old_cset,
628 struct css_set *cset; 760 struct css_set *cset;
629 struct list_head tmp_links; 761 struct list_head tmp_links;
630 struct cgrp_cset_link *link; 762 struct cgrp_cset_link *link;
763 struct cgroup_subsys *ss;
631 unsigned long key; 764 unsigned long key;
765 int ssid;
632 766
633 lockdep_assert_held(&cgroup_mutex); 767 lockdep_assert_held(&cgroup_mutex);
634 768
@@ -679,10 +813,14 @@ static struct css_set *find_css_set(struct css_set *old_cset,
679 813
680 css_set_count++; 814 css_set_count++;
681 815
682 /* Add this cgroup group to the hash table */ 816 /* Add @cset to the hash table */
683 key = css_set_hash(cset->subsys); 817 key = css_set_hash(cset->subsys);
684 hash_add(css_set_table, &cset->hlist, key); 818 hash_add(css_set_table, &cset->hlist, key);
685 819
820 for_each_subsys(ss, ssid)
821 list_add_tail(&cset->e_cset_node[ssid],
822 &cset->subsys[ssid]->cgroup->e_csets[ssid]);
823
686 up_write(&css_set_rwsem); 824 up_write(&css_set_rwsem);
687 825
688 return cset; 826 return cset;
@@ -735,14 +873,13 @@ static void cgroup_destroy_root(struct cgroup_root *root)
735 struct cgroup *cgrp = &root->cgrp; 873 struct cgroup *cgrp = &root->cgrp;
736 struct cgrp_cset_link *link, *tmp_link; 874 struct cgrp_cset_link *link, *tmp_link;
737 875
738 mutex_lock(&cgroup_tree_mutex);
739 mutex_lock(&cgroup_mutex); 876 mutex_lock(&cgroup_mutex);
740 877
741 BUG_ON(atomic_read(&root->nr_cgrps)); 878 BUG_ON(atomic_read(&root->nr_cgrps));
742 BUG_ON(!list_empty(&cgrp->children)); 879 BUG_ON(!list_empty(&cgrp->self.children));
743 880
744 /* Rebind all subsystems back to the default hierarchy */ 881 /* Rebind all subsystems back to the default hierarchy */
745 rebind_subsystems(&cgrp_dfl_root, cgrp->subsys_mask); 882 rebind_subsystems(&cgrp_dfl_root, root->subsys_mask);
746 883
747 /* 884 /*
748 * Release all the links from cset_links to this hierarchy's 885 * Release all the links from cset_links to this hierarchy's
@@ -765,7 +902,6 @@ static void cgroup_destroy_root(struct cgroup_root *root)
765 cgroup_exit_root_id(root); 902 cgroup_exit_root_id(root);
766 903
767 mutex_unlock(&cgroup_mutex); 904 mutex_unlock(&cgroup_mutex);
768 mutex_unlock(&cgroup_tree_mutex);
769 905
770 kernfs_destroy_root(root->kf_root); 906 kernfs_destroy_root(root->kf_root);
771 cgroup_free_root(root); 907 cgroup_free_root(root);
@@ -848,7 +984,7 @@ static struct cgroup *task_cgroup_from_root(struct task_struct *task,
848 * update of a tasks cgroup pointer by cgroup_attach_task() 984 * update of a tasks cgroup pointer by cgroup_attach_task()
849 */ 985 */
850 986
851static int cgroup_populate_dir(struct cgroup *cgrp, unsigned long subsys_mask); 987static int cgroup_populate_dir(struct cgroup *cgrp, unsigned int subsys_mask);
852static struct kernfs_syscall_ops cgroup_kf_syscall_ops; 988static struct kernfs_syscall_ops cgroup_kf_syscall_ops;
853static const struct file_operations proc_cgroupstats_operations; 989static const struct file_operations proc_cgroupstats_operations;
854 990
@@ -883,79 +1019,95 @@ static umode_t cgroup_file_mode(const struct cftype *cft)
883 if (cft->read_u64 || cft->read_s64 || cft->seq_show) 1019 if (cft->read_u64 || cft->read_s64 || cft->seq_show)
884 mode |= S_IRUGO; 1020 mode |= S_IRUGO;
885 1021
886 if (cft->write_u64 || cft->write_s64 || cft->write_string || 1022 if (cft->write_u64 || cft->write_s64 || cft->write)
887 cft->trigger)
888 mode |= S_IWUSR; 1023 mode |= S_IWUSR;
889 1024
890 return mode; 1025 return mode;
891} 1026}
892 1027
893static void cgroup_free_fn(struct work_struct *work) 1028static void cgroup_get(struct cgroup *cgrp)
894{ 1029{
895 struct cgroup *cgrp = container_of(work, struct cgroup, destroy_work); 1030 WARN_ON_ONCE(cgroup_is_dead(cgrp));
896 1031 css_get(&cgrp->self);
897 atomic_dec(&cgrp->root->nr_cgrps);
898 cgroup_pidlist_destroy_all(cgrp);
899
900 if (cgrp->parent) {
901 /*
902 * We get a ref to the parent, and put the ref when this
903 * cgroup is being freed, so it's guaranteed that the
904 * parent won't be destroyed before its children.
905 */
906 cgroup_put(cgrp->parent);
907 kernfs_put(cgrp->kn);
908 kfree(cgrp);
909 } else {
910 /*
911 * This is root cgroup's refcnt reaching zero, which
912 * indicates that the root should be released.
913 */
914 cgroup_destroy_root(cgrp->root);
915 }
916} 1032}
917 1033
918static void cgroup_free_rcu(struct rcu_head *head) 1034static void cgroup_put(struct cgroup *cgrp)
919{ 1035{
920 struct cgroup *cgrp = container_of(head, struct cgroup, rcu_head); 1036 css_put(&cgrp->self);
921
922 INIT_WORK(&cgrp->destroy_work, cgroup_free_fn);
923 queue_work(cgroup_destroy_wq, &cgrp->destroy_work);
924} 1037}
925 1038
926static void cgroup_get(struct cgroup *cgrp) 1039/**
1040 * cgroup_kn_unlock - unlocking helper for cgroup kernfs methods
1041 * @kn: the kernfs_node being serviced
1042 *
1043 * This helper undoes cgroup_kn_lock_live() and should be invoked before
1044 * the method finishes if locking succeeded. Note that once this function
1045 * returns the cgroup returned by cgroup_kn_lock_live() may become
1046 * inaccessible any time. If the caller intends to continue to access the
1047 * cgroup, it should pin it before invoking this function.
1048 */
1049static void cgroup_kn_unlock(struct kernfs_node *kn)
927{ 1050{
928 WARN_ON_ONCE(cgroup_is_dead(cgrp)); 1051 struct cgroup *cgrp;
929 WARN_ON_ONCE(atomic_read(&cgrp->refcnt) <= 0); 1052
930 atomic_inc(&cgrp->refcnt); 1053 if (kernfs_type(kn) == KERNFS_DIR)
1054 cgrp = kn->priv;
1055 else
1056 cgrp = kn->parent->priv;
1057
1058 mutex_unlock(&cgroup_mutex);
1059
1060 kernfs_unbreak_active_protection(kn);
1061 cgroup_put(cgrp);
931} 1062}
932 1063
933static void cgroup_put(struct cgroup *cgrp) 1064/**
1065 * cgroup_kn_lock_live - locking helper for cgroup kernfs methods
1066 * @kn: the kernfs_node being serviced
1067 *
1068 * This helper is to be used by a cgroup kernfs method currently servicing
1069 * @kn. It breaks the active protection, performs cgroup locking and
1070 * verifies that the associated cgroup is alive. Returns the cgroup if
1071 * alive; otherwise, %NULL. A successful return should be undone by a
1072 * matching cgroup_kn_unlock() invocation.
1073 *
1074 * Any cgroup kernfs method implementation which requires locking the
1075 * associated cgroup should use this helper. It avoids nesting cgroup
1076 * locking under kernfs active protection and allows all kernfs operations
1077 * including self-removal.
1078 */
1079static struct cgroup *cgroup_kn_lock_live(struct kernfs_node *kn)
934{ 1080{
935 if (!atomic_dec_and_test(&cgrp->refcnt)) 1081 struct cgroup *cgrp;
936 return; 1082
937 if (WARN_ON_ONCE(cgrp->parent && !cgroup_is_dead(cgrp))) 1083 if (kernfs_type(kn) == KERNFS_DIR)
938 return; 1084 cgrp = kn->priv;
1085 else
1086 cgrp = kn->parent->priv;
939 1087
940 /* 1088 /*
941 * XXX: cgrp->id is only used to look up css's. As cgroup and 1089 * We're gonna grab cgroup_mutex which nests outside kernfs
942 * css's lifetimes will be decoupled, it should be made 1090 * active_ref. cgroup liveliness check alone provides enough
943 * per-subsystem and moved to css->id so that lookups are 1091 * protection against removal. Ensure @cgrp stays accessible and
944 * successful until the target css is released. 1092 * break the active_ref protection.
945 */ 1093 */
1094 cgroup_get(cgrp);
1095 kernfs_break_active_protection(kn);
1096
946 mutex_lock(&cgroup_mutex); 1097 mutex_lock(&cgroup_mutex);
947 idr_remove(&cgrp->root->cgroup_idr, cgrp->id);
948 mutex_unlock(&cgroup_mutex);
949 cgrp->id = -1;
950 1098
951 call_rcu(&cgrp->rcu_head, cgroup_free_rcu); 1099 if (!cgroup_is_dead(cgrp))
1100 return cgrp;
1101
1102 cgroup_kn_unlock(kn);
1103 return NULL;
952} 1104}
953 1105
954static void cgroup_rm_file(struct cgroup *cgrp, const struct cftype *cft) 1106static void cgroup_rm_file(struct cgroup *cgrp, const struct cftype *cft)
955{ 1107{
956 char name[CGROUP_FILE_NAME_MAX]; 1108 char name[CGROUP_FILE_NAME_MAX];
957 1109
958 lockdep_assert_held(&cgroup_tree_mutex); 1110 lockdep_assert_held(&cgroup_mutex);
959 kernfs_remove_by_name(cgrp->kn, cgroup_file_name(cgrp, cft, name)); 1111 kernfs_remove_by_name(cgrp->kn, cgroup_file_name(cgrp, cft, name));
960} 1112}
961 1113
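
The cgroup_kn_lock_live()/cgroup_kn_unlock() pair above encodes the locking pattern cgroup kernfs methods are expected to follow (the older cgroup_lock_live_group() helper is dropped earlier in this diff). A sketch of a hypothetical handler using the pair, not one of the real cgroup interface files:

static ssize_t example_write(struct kernfs_open_file *of, char *buf,
			     size_t nbytes, loff_t off)
{
	struct cgroup *cgrp;

	cgrp = cgroup_kn_lock_live(of->kn);	/* pins cgrp, breaks active ref,
						 * takes cgroup_mutex */
	if (!cgrp)
		return -ENODEV;			/* cgroup already dead */

	/* ... operate on @cgrp, which is guaranteed alive here ... */

	cgroup_kn_unlock(of->kn);		/* drops cgroup_mutex and the
						 * temporary reference */
	return nbytes;
}
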
@@ -964,7 +1116,7 @@ static void cgroup_rm_file(struct cgroup *cgrp, const struct cftype *cft)
964 * @cgrp: target cgroup 1116 * @cgrp: target cgroup
965 * @subsys_mask: mask of the subsystem ids whose files should be removed 1117 * @subsys_mask: mask of the subsystem ids whose files should be removed
966 */ 1118 */
967static void cgroup_clear_dir(struct cgroup *cgrp, unsigned long subsys_mask) 1119static void cgroup_clear_dir(struct cgroup *cgrp, unsigned int subsys_mask)
968{ 1120{
969 struct cgroup_subsys *ss; 1121 struct cgroup_subsys *ss;
970 int i; 1122 int i;
@@ -972,40 +1124,40 @@ static void cgroup_clear_dir(struct cgroup *cgrp, unsigned long subsys_mask)
972 for_each_subsys(ss, i) { 1124 for_each_subsys(ss, i) {
973 struct cftype *cfts; 1125 struct cftype *cfts;
974 1126
975 if (!test_bit(i, &subsys_mask)) 1127 if (!(subsys_mask & (1 << i)))
976 continue; 1128 continue;
977 list_for_each_entry(cfts, &ss->cfts, node) 1129 list_for_each_entry(cfts, &ss->cfts, node)
978 cgroup_addrm_files(cgrp, cfts, false); 1130 cgroup_addrm_files(cgrp, cfts, false);
979 } 1131 }
980} 1132}
981 1133
982static int rebind_subsystems(struct cgroup_root *dst_root, 1134static int rebind_subsystems(struct cgroup_root *dst_root, unsigned int ss_mask)
983 unsigned long ss_mask)
984{ 1135{
985 struct cgroup_subsys *ss; 1136 struct cgroup_subsys *ss;
986 int ssid, ret; 1137 unsigned int tmp_ss_mask;
1138 int ssid, i, ret;
987 1139
988 lockdep_assert_held(&cgroup_tree_mutex);
989 lockdep_assert_held(&cgroup_mutex); 1140 lockdep_assert_held(&cgroup_mutex);
990 1141
991 for_each_subsys(ss, ssid) { 1142 for_each_subsys(ss, ssid) {
992 if (!(ss_mask & (1 << ssid))) 1143 if (!(ss_mask & (1 << ssid)))
993 continue; 1144 continue;
994 1145
995 /* if @ss is on the dummy_root, we can always move it */ 1146 /* if @ss has non-root csses attached to it, can't move */
996 if (ss->root == &cgrp_dfl_root) 1147 if (css_next_child(NULL, cgroup_css(&ss->root->cgrp, ss)))
997 continue;
998
999 /* if @ss has non-root cgroups attached to it, can't move */
1000 if (!list_empty(&ss->root->cgrp.children))
1001 return -EBUSY; 1148 return -EBUSY;
1002 1149
1003 /* can't move between two non-dummy roots either */ 1150 /* can't move between two non-dummy roots either */
1004 if (dst_root != &cgrp_dfl_root) 1151 if (ss->root != &cgrp_dfl_root && dst_root != &cgrp_dfl_root)
1005 return -EBUSY; 1152 return -EBUSY;
1006 } 1153 }
1007 1154
1008 ret = cgroup_populate_dir(&dst_root->cgrp, ss_mask); 1155 /* skip creating root files on dfl_root for inhibited subsystems */
1156 tmp_ss_mask = ss_mask;
1157 if (dst_root == &cgrp_dfl_root)
1158 tmp_ss_mask &= ~cgrp_dfl_root_inhibit_ss_mask;
1159
1160 ret = cgroup_populate_dir(&dst_root->cgrp, tmp_ss_mask);
1009 if (ret) { 1161 if (ret) {
1010 if (dst_root != &cgrp_dfl_root) 1162 if (dst_root != &cgrp_dfl_root)
1011 return ret; 1163 return ret;
@@ -1017,9 +1169,9 @@ static int rebind_subsystems(struct cgroup_root *dst_root,
1017 * Just warn about it and continue. 1169 * Just warn about it and continue.
1018 */ 1170 */
1019 if (cgrp_dfl_root_visible) { 1171 if (cgrp_dfl_root_visible) {
1020 pr_warning("cgroup: failed to create files (%d) while rebinding 0x%lx to default root\n", 1172 pr_warn("failed to create files (%d) while rebinding 0x%x to default root\n",
1021 ret, ss_mask); 1173 ret, ss_mask);
1022 pr_warning("cgroup: you may retry by moving them to a different hierarchy and unbinding\n"); 1174 pr_warn("you may retry by moving them to a different hierarchy and unbinding\n");
1023 } 1175 }
1024 } 1176 }
1025 1177
@@ -1027,15 +1179,14 @@ static int rebind_subsystems(struct cgroup_root *dst_root,
1027 * Nothing can fail from this point on. Remove files for the 1179 * Nothing can fail from this point on. Remove files for the
1028 * removed subsystems and rebind each subsystem. 1180 * removed subsystems and rebind each subsystem.
1029 */ 1181 */
1030 mutex_unlock(&cgroup_mutex);
1031 for_each_subsys(ss, ssid) 1182 for_each_subsys(ss, ssid)
1032 if (ss_mask & (1 << ssid)) 1183 if (ss_mask & (1 << ssid))
1033 cgroup_clear_dir(&ss->root->cgrp, 1 << ssid); 1184 cgroup_clear_dir(&ss->root->cgrp, 1 << ssid);
1034 mutex_lock(&cgroup_mutex);
1035 1185
1036 for_each_subsys(ss, ssid) { 1186 for_each_subsys(ss, ssid) {
1037 struct cgroup_root *src_root; 1187 struct cgroup_root *src_root;
1038 struct cgroup_subsys_state *css; 1188 struct cgroup_subsys_state *css;
1189 struct css_set *cset;
1039 1190
1040 if (!(ss_mask & (1 << ssid))) 1191 if (!(ss_mask & (1 << ssid)))
1041 continue; 1192 continue;
@@ -1050,8 +1201,19 @@ static int rebind_subsystems(struct cgroup_root *dst_root,
1050 ss->root = dst_root; 1201 ss->root = dst_root;
1051 css->cgroup = &dst_root->cgrp; 1202 css->cgroup = &dst_root->cgrp;
1052 1203
1053 src_root->cgrp.subsys_mask &= ~(1 << ssid); 1204 down_write(&css_set_rwsem);
1054 dst_root->cgrp.subsys_mask |= 1 << ssid; 1205 hash_for_each(css_set_table, i, cset, hlist)
1206 list_move_tail(&cset->e_cset_node[ss->id],
1207 &dst_root->cgrp.e_csets[ss->id]);
1208 up_write(&css_set_rwsem);
1209
1210 src_root->subsys_mask &= ~(1 << ssid);
1211 src_root->cgrp.child_subsys_mask &= ~(1 << ssid);
1212
1213 /* default hierarchy doesn't enable controllers by default */
1214 dst_root->subsys_mask |= 1 << ssid;
1215 if (dst_root != &cgrp_dfl_root)
1216 dst_root->cgrp.child_subsys_mask |= 1 << ssid;
1055 1217
1056 if (ss->bind) 1218 if (ss->bind)
1057 ss->bind(css); 1219 ss->bind(css);
@@ -1069,7 +1231,7 @@ static int cgroup_show_options(struct seq_file *seq,
1069 int ssid; 1231 int ssid;
1070 1232
1071 for_each_subsys(ss, ssid) 1233 for_each_subsys(ss, ssid)
1072 if (root->cgrp.subsys_mask & (1 << ssid)) 1234 if (root->subsys_mask & (1 << ssid))
1073 seq_printf(seq, ",%s", ss->name); 1235 seq_printf(seq, ",%s", ss->name);
1074 if (root->flags & CGRP_ROOT_SANE_BEHAVIOR) 1236 if (root->flags & CGRP_ROOT_SANE_BEHAVIOR)
1075 seq_puts(seq, ",sane_behavior"); 1237 seq_puts(seq, ",sane_behavior");
@@ -1091,8 +1253,8 @@ static int cgroup_show_options(struct seq_file *seq,
1091} 1253}
1092 1254
1093struct cgroup_sb_opts { 1255struct cgroup_sb_opts {
1094 unsigned long subsys_mask; 1256 unsigned int subsys_mask;
1095 unsigned long flags; 1257 unsigned int flags;
1096 char *release_agent; 1258 char *release_agent;
1097 bool cpuset_clone_children; 1259 bool cpuset_clone_children;
1098 char *name; 1260 char *name;
@@ -1100,24 +1262,16 @@ struct cgroup_sb_opts {
1100 bool none; 1262 bool none;
1101}; 1263};
1102 1264
1103/*
1104 * Convert a hierarchy specifier into a bitmask of subsystems and
1105 * flags. Call with cgroup_mutex held to protect the cgroup_subsys[]
1106 * array. This function takes refcounts on subsystems to be used, unless it
1107 * returns error, in which case no refcounts are taken.
1108 */
1109static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts) 1265static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
1110{ 1266{
1111 char *token, *o = data; 1267 char *token, *o = data;
1112 bool all_ss = false, one_ss = false; 1268 bool all_ss = false, one_ss = false;
1113 unsigned long mask = (unsigned long)-1; 1269 unsigned int mask = -1U;
1114 struct cgroup_subsys *ss; 1270 struct cgroup_subsys *ss;
1115 int i; 1271 int i;
1116 1272
1117 BUG_ON(!mutex_is_locked(&cgroup_mutex));
1118
1119#ifdef CONFIG_CPUSETS 1273#ifdef CONFIG_CPUSETS
1120 mask = ~(1UL << cpuset_cgrp_id); 1274 mask = ~(1U << cpuset_cgrp_id);
1121#endif 1275#endif
1122 1276
1123 memset(opts, 0, sizeof(*opts)); 1277 memset(opts, 0, sizeof(*opts));
@@ -1198,7 +1352,7 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
1198 /* Mutually exclusive option 'all' + subsystem name */ 1352 /* Mutually exclusive option 'all' + subsystem name */
1199 if (all_ss) 1353 if (all_ss)
1200 return -EINVAL; 1354 return -EINVAL;
1201 set_bit(i, &opts->subsys_mask); 1355 opts->subsys_mask |= (1 << i);
1202 one_ss = true; 1356 one_ss = true;
1203 1357
1204 break; 1358 break;
@@ -1210,12 +1364,12 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
1210 /* Consistency checks */ 1364 /* Consistency checks */
1211 1365
1212 if (opts->flags & CGRP_ROOT_SANE_BEHAVIOR) { 1366 if (opts->flags & CGRP_ROOT_SANE_BEHAVIOR) {
1213 pr_warning("cgroup: sane_behavior: this is still under development and its behaviors will change, proceed at your own risk\n"); 1367 pr_warn("sane_behavior: this is still under development and its behaviors will change, proceed at your own risk\n");
1214 1368
1215 if ((opts->flags & (CGRP_ROOT_NOPREFIX | CGRP_ROOT_XATTR)) || 1369 if ((opts->flags & (CGRP_ROOT_NOPREFIX | CGRP_ROOT_XATTR)) ||
1216 opts->cpuset_clone_children || opts->release_agent || 1370 opts->cpuset_clone_children || opts->release_agent ||
1217 opts->name) { 1371 opts->name) {
1218 pr_err("cgroup: sane_behavior: noprefix, xattr, clone_children, release_agent and name are not allowed\n"); 1372 pr_err("sane_behavior: noprefix, xattr, clone_children, release_agent and name are not allowed\n");
1219 return -EINVAL; 1373 return -EINVAL;
1220 } 1374 }
1221 } else { 1375 } else {
@@ -1227,7 +1381,7 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
1227 if (all_ss || (!one_ss && !opts->none && !opts->name)) 1381 if (all_ss || (!one_ss && !opts->none && !opts->name))
1228 for_each_subsys(ss, i) 1382 for_each_subsys(ss, i)
1229 if (!ss->disabled) 1383 if (!ss->disabled)
1230 set_bit(i, &opts->subsys_mask); 1384 opts->subsys_mask |= (1 << i);
1231 1385
1232 /* 1386 /*
1233 * We either have to specify by name or by subsystems. (So 1387 * We either have to specify by name or by subsystems. (So
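The hunks above move the option parser from set_bit() on an unsigned long to plain bit operations on an unsigned int subsys_mask. As a rough stand-alone illustration of that name-to-bitmask pattern (not the kernel code: the controller table, option string and error handling below are made up, and none of the kernel's consistency checks are performed):

#include <stdio.h>
#include <string.h>

/* illustrative controller table; the kernel derives this from cgroup_subsys[] */
static const char *const subsys_name[] = { "cpuset", "cpu", "memory", "io" };
#define NSUBSYS (sizeof(subsys_name) / sizeof(subsys_name[0]))

/* map a comma-separated list of controller names to a bitmask */
static int parse_subsys_mask(char *data, unsigned int *maskp)
{
	unsigned int mask = 0;
	char *tok;

	for (tok = strtok(data, ","); tok; tok = strtok(NULL, ",")) {
		unsigned int i;

		for (i = 0; i < NSUBSYS; i++) {
			if (!strcmp(tok, subsys_name[i])) {
				mask |= 1U << i;	/* same idea as opts->subsys_mask |= (1 << i) */
				break;
			}
		}
		if (i == NSUBSYS)
			return -1;		/* unknown controller name */
	}
	*maskp = mask;
	return 0;
}

int main(void)
{
	char opts[] = "cpu,memory";
	unsigned int mask;

	if (!parse_subsys_mask(opts, &mask))
		printf("subsys_mask = 0x%x\n", mask);	/* prints 0x6 */
	return 0;
}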
@@ -1258,14 +1412,13 @@ static int cgroup_remount(struct kernfs_root *kf_root, int *flags, char *data)
1258 int ret = 0; 1412 int ret = 0;
1259 struct cgroup_root *root = cgroup_root_from_kf(kf_root); 1413 struct cgroup_root *root = cgroup_root_from_kf(kf_root);
1260 struct cgroup_sb_opts opts; 1414 struct cgroup_sb_opts opts;
1261 unsigned long added_mask, removed_mask; 1415 unsigned int added_mask, removed_mask;
1262 1416
1263 if (root->flags & CGRP_ROOT_SANE_BEHAVIOR) { 1417 if (root->flags & CGRP_ROOT_SANE_BEHAVIOR) {
1264 pr_err("cgroup: sane_behavior: remount is not allowed\n"); 1418 pr_err("sane_behavior: remount is not allowed\n");
1265 return -EINVAL; 1419 return -EINVAL;
1266 } 1420 }
1267 1421
1268 mutex_lock(&cgroup_tree_mutex);
1269 mutex_lock(&cgroup_mutex); 1422 mutex_lock(&cgroup_mutex);
1270 1423
1271 /* See what subsystems are wanted */ 1424 /* See what subsystems are wanted */
@@ -1273,17 +1426,17 @@ static int cgroup_remount(struct kernfs_root *kf_root, int *flags, char *data)
1273 if (ret) 1426 if (ret)
1274 goto out_unlock; 1427 goto out_unlock;
1275 1428
1276 if (opts.subsys_mask != root->cgrp.subsys_mask || opts.release_agent) 1429 if (opts.subsys_mask != root->subsys_mask || opts.release_agent)
1277 pr_warning("cgroup: option changes via remount are deprecated (pid=%d comm=%s)\n", 1430 pr_warn("option changes via remount are deprecated (pid=%d comm=%s)\n",
1278 task_tgid_nr(current), current->comm); 1431 task_tgid_nr(current), current->comm);
1279 1432
1280 added_mask = opts.subsys_mask & ~root->cgrp.subsys_mask; 1433 added_mask = opts.subsys_mask & ~root->subsys_mask;
1281 removed_mask = root->cgrp.subsys_mask & ~opts.subsys_mask; 1434 removed_mask = root->subsys_mask & ~opts.subsys_mask;
1282 1435
1283 /* Don't allow flags or name to change at remount */ 1436 /* Don't allow flags or name to change at remount */
1284 if (((opts.flags ^ root->flags) & CGRP_ROOT_OPTION_MASK) || 1437 if (((opts.flags ^ root->flags) & CGRP_ROOT_OPTION_MASK) ||
1285 (opts.name && strcmp(opts.name, root->name))) { 1438 (opts.name && strcmp(opts.name, root->name))) {
1286 pr_err("cgroup: option or name mismatch, new: 0x%lx \"%s\", old: 0x%lx \"%s\"\n", 1439 pr_err("option or name mismatch, new: 0x%x \"%s\", old: 0x%x \"%s\"\n",
1287 opts.flags & CGRP_ROOT_OPTION_MASK, opts.name ?: "", 1440 opts.flags & CGRP_ROOT_OPTION_MASK, opts.name ?: "",
1288 root->flags & CGRP_ROOT_OPTION_MASK, root->name); 1441 root->flags & CGRP_ROOT_OPTION_MASK, root->name);
1289 ret = -EINVAL; 1442 ret = -EINVAL;
@@ -1291,7 +1444,7 @@ static int cgroup_remount(struct kernfs_root *kf_root, int *flags, char *data)
1291 } 1444 }
1292 1445
1293 /* remounting is not allowed for populated hierarchies */ 1446 /* remounting is not allowed for populated hierarchies */
1294 if (!list_empty(&root->cgrp.children)) { 1447 if (!list_empty(&root->cgrp.self.children)) {
1295 ret = -EBUSY; 1448 ret = -EBUSY;
1296 goto out_unlock; 1449 goto out_unlock;
1297 } 1450 }
@@ -1311,7 +1464,6 @@ static int cgroup_remount(struct kernfs_root *kf_root, int *flags, char *data)
1311 kfree(opts.release_agent); 1464 kfree(opts.release_agent);
1312 kfree(opts.name); 1465 kfree(opts.name);
1313 mutex_unlock(&cgroup_mutex); 1466 mutex_unlock(&cgroup_mutex);
1314 mutex_unlock(&cgroup_tree_mutex);
1315 return ret; 1467 return ret;
1316} 1468}
1317 1469
@@ -1369,14 +1521,22 @@ out_unlock:
1369 1521
1370static void init_cgroup_housekeeping(struct cgroup *cgrp) 1522static void init_cgroup_housekeeping(struct cgroup *cgrp)
1371{ 1523{
1372 atomic_set(&cgrp->refcnt, 1); 1524 struct cgroup_subsys *ss;
1373 INIT_LIST_HEAD(&cgrp->sibling); 1525 int ssid;
1374 INIT_LIST_HEAD(&cgrp->children); 1526
1527 INIT_LIST_HEAD(&cgrp->self.sibling);
1528 INIT_LIST_HEAD(&cgrp->self.children);
1375 INIT_LIST_HEAD(&cgrp->cset_links); 1529 INIT_LIST_HEAD(&cgrp->cset_links);
1376 INIT_LIST_HEAD(&cgrp->release_list); 1530 INIT_LIST_HEAD(&cgrp->release_list);
1377 INIT_LIST_HEAD(&cgrp->pidlists); 1531 INIT_LIST_HEAD(&cgrp->pidlists);
1378 mutex_init(&cgrp->pidlist_mutex); 1532 mutex_init(&cgrp->pidlist_mutex);
1379 cgrp->dummy_css.cgroup = cgrp; 1533 cgrp->self.cgroup = cgrp;
1534 cgrp->self.flags |= CSS_ONLINE;
1535
1536 for_each_subsys(ss, ssid)
1537 INIT_LIST_HEAD(&cgrp->e_csets[ssid]);
1538
1539 init_waitqueue_head(&cgrp->offline_waitq);
1380} 1540}
1381 1541
1382static void init_cgroup_root(struct cgroup_root *root, 1542static void init_cgroup_root(struct cgroup_root *root,
@@ -1399,21 +1559,24 @@ static void init_cgroup_root(struct cgroup_root *root,
1399 set_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->cgrp.flags); 1559 set_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->cgrp.flags);
1400} 1560}
1401 1561
1402static int cgroup_setup_root(struct cgroup_root *root, unsigned long ss_mask) 1562static int cgroup_setup_root(struct cgroup_root *root, unsigned int ss_mask)
1403{ 1563{
1404 LIST_HEAD(tmp_links); 1564 LIST_HEAD(tmp_links);
1405 struct cgroup *root_cgrp = &root->cgrp; 1565 struct cgroup *root_cgrp = &root->cgrp;
1406 struct css_set *cset; 1566 struct css_set *cset;
1407 int i, ret; 1567 int i, ret;
1408 1568
1409 lockdep_assert_held(&cgroup_tree_mutex);
1410 lockdep_assert_held(&cgroup_mutex); 1569 lockdep_assert_held(&cgroup_mutex);
1411 1570
1412 ret = idr_alloc(&root->cgroup_idr, root_cgrp, 0, 1, GFP_KERNEL); 1571 ret = cgroup_idr_alloc(&root->cgroup_idr, root_cgrp, 1, 2, GFP_NOWAIT);
1413 if (ret < 0) 1572 if (ret < 0)
1414 goto out; 1573 goto out;
1415 root_cgrp->id = ret; 1574 root_cgrp->id = ret;
1416 1575
1576 ret = percpu_ref_init(&root_cgrp->self.refcnt, css_release);
1577 if (ret)
1578 goto out;
1579
1417 /* 1580 /*
1418 * We're accessing css_set_count without locking css_set_rwsem here, 1581 * We're accessing css_set_count without locking css_set_rwsem here,
1419 * but that's OK - it can only be increased by someone holding 1582 * but that's OK - it can only be increased by someone holding
@@ -1422,11 +1585,11 @@ static int cgroup_setup_root(struct cgroup_root *root, unsigned long ss_mask)
1422 */ 1585 */
1423 ret = allocate_cgrp_cset_links(css_set_count, &tmp_links); 1586 ret = allocate_cgrp_cset_links(css_set_count, &tmp_links);
1424 if (ret) 1587 if (ret)
1425 goto out; 1588 goto cancel_ref;
1426 1589
1427 ret = cgroup_init_root_id(root); 1590 ret = cgroup_init_root_id(root);
1428 if (ret) 1591 if (ret)
1429 goto out; 1592 goto cancel_ref;
1430 1593
1431 root->kf_root = kernfs_create_root(&cgroup_kf_syscall_ops, 1594 root->kf_root = kernfs_create_root(&cgroup_kf_syscall_ops,
1432 KERNFS_ROOT_CREATE_DEACTIVATED, 1595 KERNFS_ROOT_CREATE_DEACTIVATED,
@@ -1462,7 +1625,7 @@ static int cgroup_setup_root(struct cgroup_root *root, unsigned long ss_mask)
1462 link_css_set(&tmp_links, cset, root_cgrp); 1625 link_css_set(&tmp_links, cset, root_cgrp);
1463 up_write(&css_set_rwsem); 1626 up_write(&css_set_rwsem);
1464 1627
1465 BUG_ON(!list_empty(&root_cgrp->children)); 1628 BUG_ON(!list_empty(&root_cgrp->self.children));
1466 BUG_ON(atomic_read(&root->nr_cgrps) != 1); 1629 BUG_ON(atomic_read(&root->nr_cgrps) != 1);
1467 1630
1468 kernfs_activate(root_cgrp->kn); 1631 kernfs_activate(root_cgrp->kn);
@@ -1474,6 +1637,8 @@ destroy_root:
1474 root->kf_root = NULL; 1637 root->kf_root = NULL;
1475exit_root_id: 1638exit_root_id:
1476 cgroup_exit_root_id(root); 1639 cgroup_exit_root_id(root);
1640cancel_ref:
1641 percpu_ref_cancel_init(&root_cgrp->self.refcnt);
1477out: 1642out:
1478 free_cgrp_cset_links(&tmp_links); 1643 free_cgrp_cset_links(&tmp_links);
1479 return ret; 1644 return ret;
@@ -1483,10 +1648,13 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
1483 int flags, const char *unused_dev_name, 1648 int flags, const char *unused_dev_name,
1484 void *data) 1649 void *data)
1485{ 1650{
1651 struct super_block *pinned_sb = NULL;
1652 struct cgroup_subsys *ss;
1486 struct cgroup_root *root; 1653 struct cgroup_root *root;
1487 struct cgroup_sb_opts opts; 1654 struct cgroup_sb_opts opts;
1488 struct dentry *dentry; 1655 struct dentry *dentry;
1489 int ret; 1656 int ret;
1657 int i;
1490 bool new_sb; 1658 bool new_sb;
1491 1659
1492 /* 1660 /*
@@ -1495,8 +1663,7 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
1495 */ 1663 */
1496 if (!use_task_css_set_links) 1664 if (!use_task_css_set_links)
1497 cgroup_enable_task_cg_lists(); 1665 cgroup_enable_task_cg_lists();
1498retry: 1666
1499 mutex_lock(&cgroup_tree_mutex);
1500 mutex_lock(&cgroup_mutex); 1667 mutex_lock(&cgroup_mutex);
1501 1668
1502 /* First find the desired set of subsystems */ 1669 /* First find the desired set of subsystems */
@@ -1513,6 +1680,27 @@ retry:
1513 goto out_unlock; 1680 goto out_unlock;
1514 } 1681 }
1515 1682
1683 /*
1684 * Destruction of cgroup root is asynchronous, so subsystems may
1685 * still be dying after the previous unmount. Let's drain the
1686 * dying subsystems. We just need to ensure that the ones
1687 * unmounted previously finish dying and don't care about new ones
 1688 * starting. Testing ref liveness is good enough.
1689 */
1690 for_each_subsys(ss, i) {
1691 if (!(opts.subsys_mask & (1 << i)) ||
1692 ss->root == &cgrp_dfl_root)
1693 continue;
1694
1695 if (!percpu_ref_tryget_live(&ss->root->cgrp.self.refcnt)) {
1696 mutex_unlock(&cgroup_mutex);
1697 msleep(10);
1698 ret = restart_syscall();
1699 goto out_free;
1700 }
1701 cgroup_put(&ss->root->cgrp);
1702 }
1703
1516 for_each_root(root) { 1704 for_each_root(root) {
1517 bool name_match = false; 1705 bool name_match = false;
1518 1706
@@ -1535,7 +1723,7 @@ retry:
1535 * subsystems) then they must match. 1723 * subsystems) then they must match.
1536 */ 1724 */
1537 if ((opts.subsys_mask || opts.none) && 1725 if ((opts.subsys_mask || opts.none) &&
1538 (opts.subsys_mask != root->cgrp.subsys_mask)) { 1726 (opts.subsys_mask != root->subsys_mask)) {
1539 if (!name_match) 1727 if (!name_match)
1540 continue; 1728 continue;
1541 ret = -EBUSY; 1729 ret = -EBUSY;
@@ -1544,28 +1732,35 @@ retry:
1544 1732
1545 if ((root->flags ^ opts.flags) & CGRP_ROOT_OPTION_MASK) { 1733 if ((root->flags ^ opts.flags) & CGRP_ROOT_OPTION_MASK) {
1546 if ((root->flags | opts.flags) & CGRP_ROOT_SANE_BEHAVIOR) { 1734 if ((root->flags | opts.flags) & CGRP_ROOT_SANE_BEHAVIOR) {
1547 pr_err("cgroup: sane_behavior: new mount options should match the existing superblock\n"); 1735 pr_err("sane_behavior: new mount options should match the existing superblock\n");
1548 ret = -EINVAL; 1736 ret = -EINVAL;
1549 goto out_unlock; 1737 goto out_unlock;
1550 } else { 1738 } else {
1551 pr_warning("cgroup: new mount options do not match the existing superblock, will be ignored\n"); 1739 pr_warn("new mount options do not match the existing superblock, will be ignored\n");
1552 } 1740 }
1553 } 1741 }
1554 1742
1555 /* 1743 /*
1556 * A root's lifetime is governed by its root cgroup. Zero 1744 * We want to reuse @root whose lifetime is governed by its
1557 * ref indicate that the root is being destroyed. Wait for 1745 * ->cgrp. Let's check whether @root is alive and keep it
1558 * destruction to complete so that the subsystems are free. 1746 * that way. As cgroup_kill_sb() can happen anytime, we
1559 * We can use wait_queue for the wait but this path is 1747 * want to block it by pinning the sb so that @root doesn't
1560 * super cold. Let's just sleep for a bit and retry. 1748 * get killed before mount is complete.
1749 *
1750 * With the sb pinned, tryget_live can reliably indicate
1751 * whether @root can be reused. If it's being killed,
1752 * drain it. We can use wait_queue for the wait but this
1753 * path is super cold. Let's just sleep a bit and retry.
1561 */ 1754 */
1562 if (!atomic_inc_not_zero(&root->cgrp.refcnt)) { 1755 pinned_sb = kernfs_pin_sb(root->kf_root, NULL);
1756 if (IS_ERR(pinned_sb) ||
1757 !percpu_ref_tryget_live(&root->cgrp.self.refcnt)) {
1563 mutex_unlock(&cgroup_mutex); 1758 mutex_unlock(&cgroup_mutex);
1564 mutex_unlock(&cgroup_tree_mutex); 1759 if (!IS_ERR_OR_NULL(pinned_sb))
1565 kfree(opts.release_agent); 1760 deactivate_super(pinned_sb);
1566 kfree(opts.name);
1567 msleep(10); 1761 msleep(10);
1568 goto retry; 1762 ret = restart_syscall();
1763 goto out_free;
1569 } 1764 }
1570 1765
1571 ret = 0; 1766 ret = 0;
@@ -1596,17 +1791,27 @@ retry:
1596 1791
1597out_unlock: 1792out_unlock:
1598 mutex_unlock(&cgroup_mutex); 1793 mutex_unlock(&cgroup_mutex);
1599 mutex_unlock(&cgroup_tree_mutex); 1794out_free:
1600
1601 kfree(opts.release_agent); 1795 kfree(opts.release_agent);
1602 kfree(opts.name); 1796 kfree(opts.name);
1603 1797
1604 if (ret) 1798 if (ret)
1605 return ERR_PTR(ret); 1799 return ERR_PTR(ret);
1606 1800
1607 dentry = kernfs_mount(fs_type, flags, root->kf_root, &new_sb); 1801 dentry = kernfs_mount(fs_type, flags, root->kf_root,
1802 CGROUP_SUPER_MAGIC, &new_sb);
1608 if (IS_ERR(dentry) || !new_sb) 1803 if (IS_ERR(dentry) || !new_sb)
1609 cgroup_put(&root->cgrp); 1804 cgroup_put(&root->cgrp);
1805
1806 /*
1807 * If @pinned_sb, we're reusing an existing root and holding an
1808 * extra ref on its sb. Mount is complete. Put the extra ref.
1809 */
1810 if (pinned_sb) {
1811 WARN_ON(new_sb);
1812 deactivate_super(pinned_sb);
1813 }
1814
1610 return dentry; 1815 return dentry;
1611} 1816}
1612 1817
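The reuse path above keeps an existing root only if it can still take a live reference on it; a root whose refcount has already hit zero is being torn down, so the code drops the locks, sleeps briefly and restarts the mount. A minimal userspace sketch of that tryget-or-back-off idea with a plain C11 atomic counter standing in for percpu_ref_tryget_live() (struct and function names are illustrative, not kernel API):

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

struct root {
	atomic_int refcnt;	/* 0 means the root is being destroyed */
};

/* take a reference only if the root is still live (refcnt > 0) */
static bool root_tryget_live(struct root *r)
{
	int old = atomic_load(&r->refcnt);

	while (old > 0) {
		if (atomic_compare_exchange_weak(&r->refcnt, &old, old + 1))
			return true;	/* got a ref on a live root */
	}
	return false;			/* dying: caller should back off and retry */
}

static void root_put(struct root *r)
{
	atomic_fetch_sub(&r->refcnt, 1);
}

int main(void)
{
	struct root live = { .refcnt = 1 }, dying = { .refcnt = 0 };

	printf("live root:  %s\n", root_tryget_live(&live) ? "reused" : "retry later");
	printf("dying root: %s\n", root_tryget_live(&dying) ? "reused" : "retry later");
	root_put(&live);
	return 0;
}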
@@ -1615,7 +1820,19 @@ static void cgroup_kill_sb(struct super_block *sb)
1615 struct kernfs_root *kf_root = kernfs_root_from_sb(sb); 1820 struct kernfs_root *kf_root = kernfs_root_from_sb(sb);
1616 struct cgroup_root *root = cgroup_root_from_kf(kf_root); 1821 struct cgroup_root *root = cgroup_root_from_kf(kf_root);
1617 1822
1618 cgroup_put(&root->cgrp); 1823 /*
1824 * If @root doesn't have any mounts or children, start killing it.
1825 * This prevents new mounts by disabling percpu_ref_tryget_live().
1826 * cgroup_mount() may wait for @root's release.
1827 *
1828 * And don't kill the default root.
1829 */
1830 if (css_has_online_children(&root->cgrp.self) ||
1831 root == &cgrp_dfl_root)
1832 cgroup_put(&root->cgrp);
1833 else
1834 percpu_ref_kill(&root->cgrp.self.refcnt);
1835
1619 kernfs_kill_sb(sb); 1836 kernfs_kill_sb(sb);
1620} 1837}
1621 1838
@@ -1737,7 +1954,7 @@ struct task_struct *cgroup_taskset_next(struct cgroup_taskset *tset)
1737 1954
1738/** 1955/**
1739 * cgroup_task_migrate - move a task from one cgroup to another. 1956 * cgroup_task_migrate - move a task from one cgroup to another.
1740 * @old_cgrp; the cgroup @tsk is being migrated from 1957 * @old_cgrp: the cgroup @tsk is being migrated from
1741 * @tsk: the task being migrated 1958 * @tsk: the task being migrated
1742 * @new_cset: the new css_set @tsk is being attached to 1959 * @new_cset: the new css_set @tsk is being attached to
1743 * 1960 *
@@ -1829,10 +2046,6 @@ static void cgroup_migrate_add_src(struct css_set *src_cset,
1829 2046
1830 src_cgrp = cset_cgroup_from_root(src_cset, dst_cgrp->root); 2047 src_cgrp = cset_cgroup_from_root(src_cset, dst_cgrp->root);
1831 2048
1832 /* nothing to do if this cset already belongs to the cgroup */
1833 if (src_cgrp == dst_cgrp)
1834 return;
1835
1836 if (!list_empty(&src_cset->mg_preload_node)) 2049 if (!list_empty(&src_cset->mg_preload_node))
1837 return; 2050 return;
1838 2051
@@ -1847,13 +2060,14 @@ static void cgroup_migrate_add_src(struct css_set *src_cset,
1847 2060
1848/** 2061/**
1849 * cgroup_migrate_prepare_dst - prepare destination css_sets for migration 2062 * cgroup_migrate_prepare_dst - prepare destination css_sets for migration
1850 * @dst_cgrp: the destination cgroup 2063 * @dst_cgrp: the destination cgroup (may be %NULL)
1851 * @preloaded_csets: list of preloaded source css_sets 2064 * @preloaded_csets: list of preloaded source css_sets
1852 * 2065 *
1853 * Tasks are about to be moved to @dst_cgrp and all the source css_sets 2066 * Tasks are about to be moved to @dst_cgrp and all the source css_sets
 1854 * have been preloaded to @preloaded_csets. This function looks up and 2067 * pins all destination css_sets, links each to its source, and appends them
1855 * pins all destination css_sets, links each to its source, and put them on 2068 * pins all destination css_sets, links each to its source, and append them
1856 * @preloaded_csets. 2069 * to @preloaded_csets. If @dst_cgrp is %NULL, the destination of each
2070 * source css_set is assumed to be its cgroup on the default hierarchy.
1857 * 2071 *
1858 * This function must be called after cgroup_migrate_add_src() has been 2072 * This function must be called after cgroup_migrate_add_src() has been
1859 * called on each migration source css_set. After migration is performed 2073 * called on each migration source css_set. After migration is performed
@@ -1864,19 +2078,42 @@ static int cgroup_migrate_prepare_dst(struct cgroup *dst_cgrp,
1864 struct list_head *preloaded_csets) 2078 struct list_head *preloaded_csets)
1865{ 2079{
1866 LIST_HEAD(csets); 2080 LIST_HEAD(csets);
1867 struct css_set *src_cset; 2081 struct css_set *src_cset, *tmp_cset;
1868 2082
1869 lockdep_assert_held(&cgroup_mutex); 2083 lockdep_assert_held(&cgroup_mutex);
1870 2084
2085 /*
2086 * Except for the root, child_subsys_mask must be zero for a cgroup
2087 * with tasks so that child cgroups don't compete against tasks.
2088 */
2089 if (dst_cgrp && cgroup_on_dfl(dst_cgrp) && cgroup_parent(dst_cgrp) &&
2090 dst_cgrp->child_subsys_mask)
2091 return -EBUSY;
2092
1871 /* look up the dst cset for each src cset and link it to src */ 2093 /* look up the dst cset for each src cset and link it to src */
1872 list_for_each_entry(src_cset, preloaded_csets, mg_preload_node) { 2094 list_for_each_entry_safe(src_cset, tmp_cset, preloaded_csets, mg_preload_node) {
1873 struct css_set *dst_cset; 2095 struct css_set *dst_cset;
1874 2096
1875 dst_cset = find_css_set(src_cset, dst_cgrp); 2097 dst_cset = find_css_set(src_cset,
2098 dst_cgrp ?: src_cset->dfl_cgrp);
1876 if (!dst_cset) 2099 if (!dst_cset)
1877 goto err; 2100 goto err;
1878 2101
1879 WARN_ON_ONCE(src_cset->mg_dst_cset || dst_cset->mg_dst_cset); 2102 WARN_ON_ONCE(src_cset->mg_dst_cset || dst_cset->mg_dst_cset);
2103
2104 /*
2105 * If src cset equals dst, it's noop. Drop the src.
2106 * cgroup_migrate() will skip the cset too. Note that we
2107 * can't handle src == dst as some nodes are used by both.
2108 */
2109 if (src_cset == dst_cset) {
2110 src_cset->mg_src_cgrp = NULL;
2111 list_del_init(&src_cset->mg_preload_node);
2112 put_css_set(src_cset, false);
2113 put_css_set(dst_cset, false);
2114 continue;
2115 }
2116
1880 src_cset->mg_dst_cset = dst_cset; 2117 src_cset->mg_dst_cset = dst_cset;
1881 2118
1882 if (list_empty(&dst_cset->mg_preload_node)) 2119 if (list_empty(&dst_cset->mg_preload_node))
@@ -1885,7 +2122,7 @@ static int cgroup_migrate_prepare_dst(struct cgroup *dst_cgrp,
1885 put_css_set(dst_cset, false); 2122 put_css_set(dst_cset, false);
1886 } 2123 }
1887 2124
1888 list_splice(&csets, preloaded_csets); 2125 list_splice_tail(&csets, preloaded_csets);
1889 return 0; 2126 return 0;
1890err: 2127err:
1891 cgroup_migrate_finish(&csets); 2128 cgroup_migrate_finish(&csets);
@@ -1966,7 +2203,7 @@ static int cgroup_migrate(struct cgroup *cgrp, struct task_struct *leader,
1966 return 0; 2203 return 0;
1967 2204
1968 /* check that we can legitimately attach to the cgroup */ 2205 /* check that we can legitimately attach to the cgroup */
1969 for_each_css(css, i, cgrp) { 2206 for_each_e_css(css, i, cgrp) {
1970 if (css->ss->can_attach) { 2207 if (css->ss->can_attach) {
1971 ret = css->ss->can_attach(css, &tset); 2208 ret = css->ss->can_attach(css, &tset);
1972 if (ret) { 2209 if (ret) {
@@ -1996,7 +2233,7 @@ static int cgroup_migrate(struct cgroup *cgrp, struct task_struct *leader,
1996 */ 2233 */
1997 tset.csets = &tset.dst_csets; 2234 tset.csets = &tset.dst_csets;
1998 2235
1999 for_each_css(css, i, cgrp) 2236 for_each_e_css(css, i, cgrp)
2000 if (css->ss->attach) 2237 if (css->ss->attach)
2001 css->ss->attach(css, &tset); 2238 css->ss->attach(css, &tset);
2002 2239
@@ -2004,7 +2241,7 @@ static int cgroup_migrate(struct cgroup *cgrp, struct task_struct *leader,
2004 goto out_release_tset; 2241 goto out_release_tset;
2005 2242
2006out_cancel_attach: 2243out_cancel_attach:
2007 for_each_css(css, i, cgrp) { 2244 for_each_e_css(css, i, cgrp) {
2008 if (css == failed_css) 2245 if (css == failed_css)
2009 break; 2246 break;
2010 if (css->ss->cancel_attach) 2247 if (css->ss->cancel_attach)
@@ -2063,13 +2300,20 @@ static int cgroup_attach_task(struct cgroup *dst_cgrp,
2063 * function to attach either it or all tasks in its threadgroup. Will lock 2300 * function to attach either it or all tasks in its threadgroup. Will lock
2064 * cgroup_mutex and threadgroup. 2301 * cgroup_mutex and threadgroup.
2065 */ 2302 */
2066static int attach_task_by_pid(struct cgroup *cgrp, u64 pid, bool threadgroup) 2303static ssize_t __cgroup_procs_write(struct kernfs_open_file *of, char *buf,
2304 size_t nbytes, loff_t off, bool threadgroup)
2067{ 2305{
2068 struct task_struct *tsk; 2306 struct task_struct *tsk;
2069 const struct cred *cred = current_cred(), *tcred; 2307 const struct cred *cred = current_cred(), *tcred;
2308 struct cgroup *cgrp;
2309 pid_t pid;
2070 int ret; 2310 int ret;
2071 2311
2072 if (!cgroup_lock_live_group(cgrp)) 2312 if (kstrtoint(strstrip(buf), 0, &pid) || pid < 0)
2313 return -EINVAL;
2314
2315 cgrp = cgroup_kn_lock_live(of->kn);
2316 if (!cgrp)
2073 return -ENODEV; 2317 return -ENODEV;
2074 2318
2075retry_find_task: 2319retry_find_task:
@@ -2135,8 +2379,8 @@ retry_find_task:
2135 2379
2136 put_task_struct(tsk); 2380 put_task_struct(tsk);
2137out_unlock_cgroup: 2381out_unlock_cgroup:
2138 mutex_unlock(&cgroup_mutex); 2382 cgroup_kn_unlock(of->kn);
2139 return ret; 2383 return ret ?: nbytes;
2140} 2384}
2141 2385
2142/** 2386/**
@@ -2170,43 +2414,44 @@ int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk)
2170} 2414}
2171EXPORT_SYMBOL_GPL(cgroup_attach_task_all); 2415EXPORT_SYMBOL_GPL(cgroup_attach_task_all);
2172 2416
2173static int cgroup_tasks_write(struct cgroup_subsys_state *css, 2417static ssize_t cgroup_tasks_write(struct kernfs_open_file *of,
2174 struct cftype *cft, u64 pid) 2418 char *buf, size_t nbytes, loff_t off)
2175{ 2419{
2176 return attach_task_by_pid(css->cgroup, pid, false); 2420 return __cgroup_procs_write(of, buf, nbytes, off, false);
2177} 2421}
2178 2422
2179static int cgroup_procs_write(struct cgroup_subsys_state *css, 2423static ssize_t cgroup_procs_write(struct kernfs_open_file *of,
2180 struct cftype *cft, u64 tgid) 2424 char *buf, size_t nbytes, loff_t off)
2181{ 2425{
2182 return attach_task_by_pid(css->cgroup, tgid, true); 2426 return __cgroup_procs_write(of, buf, nbytes, off, true);
2183} 2427}
2184 2428
2185static int cgroup_release_agent_write(struct cgroup_subsys_state *css, 2429static ssize_t cgroup_release_agent_write(struct kernfs_open_file *of,
2186 struct cftype *cft, char *buffer) 2430 char *buf, size_t nbytes, loff_t off)
2187{ 2431{
2188 struct cgroup_root *root = css->cgroup->root; 2432 struct cgroup *cgrp;
2433
2434 BUILD_BUG_ON(sizeof(cgrp->root->release_agent_path) < PATH_MAX);
2189 2435
2190 BUILD_BUG_ON(sizeof(root->release_agent_path) < PATH_MAX); 2436 cgrp = cgroup_kn_lock_live(of->kn);
2191 if (!cgroup_lock_live_group(css->cgroup)) 2437 if (!cgrp)
2192 return -ENODEV; 2438 return -ENODEV;
2193 spin_lock(&release_agent_path_lock); 2439 spin_lock(&release_agent_path_lock);
2194 strlcpy(root->release_agent_path, buffer, 2440 strlcpy(cgrp->root->release_agent_path, strstrip(buf),
2195 sizeof(root->release_agent_path)); 2441 sizeof(cgrp->root->release_agent_path));
2196 spin_unlock(&release_agent_path_lock); 2442 spin_unlock(&release_agent_path_lock);
2197 mutex_unlock(&cgroup_mutex); 2443 cgroup_kn_unlock(of->kn);
2198 return 0; 2444 return nbytes;
2199} 2445}
2200 2446
2201static int cgroup_release_agent_show(struct seq_file *seq, void *v) 2447static int cgroup_release_agent_show(struct seq_file *seq, void *v)
2202{ 2448{
2203 struct cgroup *cgrp = seq_css(seq)->cgroup; 2449 struct cgroup *cgrp = seq_css(seq)->cgroup;
2204 2450
2205 if (!cgroup_lock_live_group(cgrp)) 2451 spin_lock(&release_agent_path_lock);
2206 return -ENODEV;
2207 seq_puts(seq, cgrp->root->release_agent_path); 2452 seq_puts(seq, cgrp->root->release_agent_path);
2453 spin_unlock(&release_agent_path_lock);
2208 seq_putc(seq, '\n'); 2454 seq_putc(seq, '\n');
2209 mutex_unlock(&cgroup_mutex);
2210 return 0; 2455 return 0;
2211} 2456}
2212 2457
@@ -2218,6 +2463,320 @@ static int cgroup_sane_behavior_show(struct seq_file *seq, void *v)
2218 return 0; 2463 return 0;
2219} 2464}
2220 2465
2466static void cgroup_print_ss_mask(struct seq_file *seq, unsigned int ss_mask)
2467{
2468 struct cgroup_subsys *ss;
2469 bool printed = false;
2470 int ssid;
2471
2472 for_each_subsys(ss, ssid) {
2473 if (ss_mask & (1 << ssid)) {
2474 if (printed)
2475 seq_putc(seq, ' ');
2476 seq_printf(seq, "%s", ss->name);
2477 printed = true;
2478 }
2479 }
2480 if (printed)
2481 seq_putc(seq, '\n');
2482}
2483
2484/* show controllers which are currently attached to the default hierarchy */
2485static int cgroup_root_controllers_show(struct seq_file *seq, void *v)
2486{
2487 struct cgroup *cgrp = seq_css(seq)->cgroup;
2488
2489 cgroup_print_ss_mask(seq, cgrp->root->subsys_mask &
2490 ~cgrp_dfl_root_inhibit_ss_mask);
2491 return 0;
2492}
2493
2494/* show controllers which are enabled from the parent */
2495static int cgroup_controllers_show(struct seq_file *seq, void *v)
2496{
2497 struct cgroup *cgrp = seq_css(seq)->cgroup;
2498
2499 cgroup_print_ss_mask(seq, cgroup_parent(cgrp)->child_subsys_mask);
2500 return 0;
2501}
2502
2503/* show controllers which are enabled for a given cgroup's children */
2504static int cgroup_subtree_control_show(struct seq_file *seq, void *v)
2505{
2506 struct cgroup *cgrp = seq_css(seq)->cgroup;
2507
2508 cgroup_print_ss_mask(seq, cgrp->child_subsys_mask);
2509 return 0;
2510}
2511
2512/**
2513 * cgroup_update_dfl_csses - update css assoc of a subtree in default hierarchy
2514 * @cgrp: root of the subtree to update csses for
2515 *
2516 * @cgrp's child_subsys_mask has changed and its subtree's (self excluded)
2517 * css associations need to be updated accordingly. This function looks up
2518 * all css_sets which are attached to the subtree, creates the matching
2519 * updated css_sets and migrates the tasks to the new ones.
2520 */
2521static int cgroup_update_dfl_csses(struct cgroup *cgrp)
2522{
2523 LIST_HEAD(preloaded_csets);
2524 struct cgroup_subsys_state *css;
2525 struct css_set *src_cset;
2526 int ret;
2527
2528 lockdep_assert_held(&cgroup_mutex);
2529
2530 /* look up all csses currently attached to @cgrp's subtree */
2531 down_read(&css_set_rwsem);
2532 css_for_each_descendant_pre(css, cgroup_css(cgrp, NULL)) {
2533 struct cgrp_cset_link *link;
2534
2535 /* self is not affected by child_subsys_mask change */
2536 if (css->cgroup == cgrp)
2537 continue;
2538
2539 list_for_each_entry(link, &css->cgroup->cset_links, cset_link)
2540 cgroup_migrate_add_src(link->cset, cgrp,
2541 &preloaded_csets);
2542 }
2543 up_read(&css_set_rwsem);
2544
2545 /* NULL dst indicates self on default hierarchy */
2546 ret = cgroup_migrate_prepare_dst(NULL, &preloaded_csets);
2547 if (ret)
2548 goto out_finish;
2549
2550 list_for_each_entry(src_cset, &preloaded_csets, mg_preload_node) {
2551 struct task_struct *last_task = NULL, *task;
2552
2553 /* src_csets precede dst_csets, break on the first dst_cset */
2554 if (!src_cset->mg_src_cgrp)
2555 break;
2556
2557 /*
2558 * All tasks in src_cset need to be migrated to the
2559 * matching dst_cset. Empty it process by process. We
2560 * walk tasks but migrate processes. The leader might even
2561 * belong to a different cset but such src_cset would also
2562 * be among the target src_csets because the default
2563 * hierarchy enforces per-process membership.
2564 */
2565 while (true) {
2566 down_read(&css_set_rwsem);
2567 task = list_first_entry_or_null(&src_cset->tasks,
2568 struct task_struct, cg_list);
2569 if (task) {
2570 task = task->group_leader;
2571 WARN_ON_ONCE(!task_css_set(task)->mg_src_cgrp);
2572 get_task_struct(task);
2573 }
2574 up_read(&css_set_rwsem);
2575
2576 if (!task)
2577 break;
2578
2579 /* guard against possible infinite loop */
2580 if (WARN(last_task == task,
2581 "cgroup: update_dfl_csses failed to make progress, aborting in inconsistent state\n"))
2582 goto out_finish;
2583 last_task = task;
2584
2585 threadgroup_lock(task);
2586 /* raced against de_thread() from another thread? */
2587 if (!thread_group_leader(task)) {
2588 threadgroup_unlock(task);
2589 put_task_struct(task);
2590 continue;
2591 }
2592
2593 ret = cgroup_migrate(src_cset->dfl_cgrp, task, true);
2594
2595 threadgroup_unlock(task);
2596 put_task_struct(task);
2597
2598 if (WARN(ret, "cgroup: failed to update controllers for the default hierarchy (%d), further operations may crash or hang\n", ret))
2599 goto out_finish;
2600 }
2601 }
2602
2603out_finish:
2604 cgroup_migrate_finish(&preloaded_csets);
2605 return ret;
2606}
2607
2608/* change the enabled child controllers for a cgroup in the default hierarchy */
2609static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of,
2610 char *buf, size_t nbytes,
2611 loff_t off)
2612{
2613 unsigned int enable = 0, disable = 0;
2614 struct cgroup *cgrp, *child;
2615 struct cgroup_subsys *ss;
2616 char *tok;
2617 int ssid, ret;
2618
2619 /*
2620 * Parse input - space separated list of subsystem names prefixed
2621 * with either + or -.
2622 */
2623 buf = strstrip(buf);
2624 while ((tok = strsep(&buf, " "))) {
2625 if (tok[0] == '\0')
2626 continue;
2627 for_each_subsys(ss, ssid) {
2628 if (ss->disabled || strcmp(tok + 1, ss->name) ||
2629 ((1 << ss->id) & cgrp_dfl_root_inhibit_ss_mask))
2630 continue;
2631
2632 if (*tok == '+') {
2633 enable |= 1 << ssid;
2634 disable &= ~(1 << ssid);
2635 } else if (*tok == '-') {
2636 disable |= 1 << ssid;
2637 enable &= ~(1 << ssid);
2638 } else {
2639 return -EINVAL;
2640 }
2641 break;
2642 }
2643 if (ssid == CGROUP_SUBSYS_COUNT)
2644 return -EINVAL;
2645 }
2646
2647 cgrp = cgroup_kn_lock_live(of->kn);
2648 if (!cgrp)
2649 return -ENODEV;
2650
2651 for_each_subsys(ss, ssid) {
2652 if (enable & (1 << ssid)) {
2653 if (cgrp->child_subsys_mask & (1 << ssid)) {
2654 enable &= ~(1 << ssid);
2655 continue;
2656 }
2657
2658 /*
2659 * Because css offlining is asynchronous, userland
2660 * might try to re-enable the same controller while
2661 * the previous instance is still around. In such
2662 * cases, wait till it's gone using offline_waitq.
2663 */
2664 cgroup_for_each_live_child(child, cgrp) {
2665 DEFINE_WAIT(wait);
2666
2667 if (!cgroup_css(child, ss))
2668 continue;
2669
2670 cgroup_get(child);
2671 prepare_to_wait(&child->offline_waitq, &wait,
2672 TASK_UNINTERRUPTIBLE);
2673 cgroup_kn_unlock(of->kn);
2674 schedule();
2675 finish_wait(&child->offline_waitq, &wait);
2676 cgroup_put(child);
2677
2678 return restart_syscall();
2679 }
2680
2681 /* unavailable or not enabled on the parent? */
2682 if (!(cgrp_dfl_root.subsys_mask & (1 << ssid)) ||
2683 (cgroup_parent(cgrp) &&
2684 !(cgroup_parent(cgrp)->child_subsys_mask & (1 << ssid)))) {
2685 ret = -ENOENT;
2686 goto out_unlock;
2687 }
2688 } else if (disable & (1 << ssid)) {
2689 if (!(cgrp->child_subsys_mask & (1 << ssid))) {
2690 disable &= ~(1 << ssid);
2691 continue;
2692 }
2693
2694 /* a child has it enabled? */
2695 cgroup_for_each_live_child(child, cgrp) {
2696 if (child->child_subsys_mask & (1 << ssid)) {
2697 ret = -EBUSY;
2698 goto out_unlock;
2699 }
2700 }
2701 }
2702 }
2703
2704 if (!enable && !disable) {
2705 ret = 0;
2706 goto out_unlock;
2707 }
2708
2709 /*
2710 * Except for the root, child_subsys_mask must be zero for a cgroup
2711 * with tasks so that child cgroups don't compete against tasks.
2712 */
2713 if (enable && cgroup_parent(cgrp) && !list_empty(&cgrp->cset_links)) {
2714 ret = -EBUSY;
2715 goto out_unlock;
2716 }
2717
2718 /*
2719 * Create csses for enables and update child_subsys_mask. This
2720 * changes cgroup_e_css() results which in turn makes the
2721 * subsequent cgroup_update_dfl_csses() associate all tasks in the
2722 * subtree to the updated csses.
2723 */
2724 for_each_subsys(ss, ssid) {
2725 if (!(enable & (1 << ssid)))
2726 continue;
2727
2728 cgroup_for_each_live_child(child, cgrp) {
2729 ret = create_css(child, ss);
2730 if (ret)
2731 goto err_undo_css;
2732 }
2733 }
2734
2735 cgrp->child_subsys_mask |= enable;
2736 cgrp->child_subsys_mask &= ~disable;
2737
2738 ret = cgroup_update_dfl_csses(cgrp);
2739 if (ret)
2740 goto err_undo_css;
2741
2742 /* all tasks are now migrated away from the old csses, kill them */
2743 for_each_subsys(ss, ssid) {
2744 if (!(disable & (1 << ssid)))
2745 continue;
2746
2747 cgroup_for_each_live_child(child, cgrp)
2748 kill_css(cgroup_css(child, ss));
2749 }
2750
2751 kernfs_activate(cgrp->kn);
2752 ret = 0;
2753out_unlock:
2754 cgroup_kn_unlock(of->kn);
2755 return ret ?: nbytes;
2756
2757err_undo_css:
2758 cgrp->child_subsys_mask &= ~enable;
2759 cgrp->child_subsys_mask |= disable;
2760
2761 for_each_subsys(ss, ssid) {
2762 if (!(enable & (1 << ssid)))
2763 continue;
2764
2765 cgroup_for_each_live_child(child, cgrp) {
2766 struct cgroup_subsys_state *css = cgroup_css(child, ss);
2767 if (css)
2768 kill_css(css);
2769 }
2770 }
2771 goto out_unlock;
2772}
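The writer above first tokenizes a space-separated list of "+name"/"-name" entries into enable/disable masks before touching any cgroup state. A self-contained userspace version of just that parsing step, over a made-up controller table and with none of the kernel's locking or css handling:

#include <stdio.h>
#include <string.h>

static const char *const ctrl_name[] = { "cpu", "memory", "io", "pids" };
#define NCTRL (sizeof(ctrl_name) / sizeof(ctrl_name[0]))

static int parse_subtree_control(char *buf, unsigned int *enable,
				 unsigned int *disable)
{
	char *tok;

	*enable = *disable = 0;
	for (tok = strtok(buf, " "); tok; tok = strtok(NULL, " ")) {
		unsigned int i;

		for (i = 0; i < NCTRL; i++) {
			if (strcmp(tok + 1, ctrl_name[i]))
				continue;
			if (tok[0] == '+') {		/* enable wins over a prior '-' */
				*enable |= 1U << i;
				*disable &= ~(1U << i);
			} else if (tok[0] == '-') {	/* disable wins over a prior '+' */
				*disable |= 1U << i;
				*enable &= ~(1U << i);
			} else {
				return -1;
			}
			break;
		}
		if (i == NCTRL)
			return -1;			/* unknown or malformed token */
	}
	return 0;
}

int main(void)
{
	char buf[] = "+cpu -memory +io";
	unsigned int enable, disable;

	if (!parse_subtree_control(buf, &enable, &disable))
		printf("enable=0x%x disable=0x%x\n", enable, disable);	/* 0x5 0x2 */
	return 0;
}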
2773
2774static int cgroup_populated_show(struct seq_file *seq, void *v)
2775{
2776 seq_printf(seq, "%d\n", (bool)seq_css(seq)->cgroup->populated_cnt);
2777 return 0;
2778}
2779
2221static ssize_t cgroup_file_write(struct kernfs_open_file *of, char *buf, 2780static ssize_t cgroup_file_write(struct kernfs_open_file *of, char *buf,
2222 size_t nbytes, loff_t off) 2781 size_t nbytes, loff_t off)
2223{ 2782{
@@ -2226,6 +2785,9 @@ static ssize_t cgroup_file_write(struct kernfs_open_file *of, char *buf,
2226 struct cgroup_subsys_state *css; 2785 struct cgroup_subsys_state *css;
2227 int ret; 2786 int ret;
2228 2787
2788 if (cft->write)
2789 return cft->write(of, buf, nbytes, off);
2790
2229 /* 2791 /*
2230 * kernfs guarantees that a file isn't deleted with operations in 2792 * kernfs guarantees that a file isn't deleted with operations in
2231 * flight, which means that the matching css is and stays alive and 2793 * flight, which means that the matching css is and stays alive and
@@ -2236,9 +2798,7 @@ static ssize_t cgroup_file_write(struct kernfs_open_file *of, char *buf,
2236 css = cgroup_css(cgrp, cft->ss); 2798 css = cgroup_css(cgrp, cft->ss);
2237 rcu_read_unlock(); 2799 rcu_read_unlock();
2238 2800
2239 if (cft->write_string) { 2801 if (cft->write_u64) {
2240 ret = cft->write_string(css, cft, strstrip(buf));
2241 } else if (cft->write_u64) {
2242 unsigned long long v; 2802 unsigned long long v;
2243 ret = kstrtoull(buf, 0, &v); 2803 ret = kstrtoull(buf, 0, &v);
2244 if (!ret) 2804 if (!ret)
@@ -2248,8 +2808,6 @@ static ssize_t cgroup_file_write(struct kernfs_open_file *of, char *buf,
2248 ret = kstrtoll(buf, 0, &v); 2808 ret = kstrtoll(buf, 0, &v);
2249 if (!ret) 2809 if (!ret)
2250 ret = cft->write_s64(css, cft, v); 2810 ret = cft->write_s64(css, cft, v);
2251 } else if (cft->trigger) {
2252 ret = cft->trigger(css, (unsigned int)cft->private);
2253 } else { 2811 } else {
2254 ret = -EINVAL; 2812 ret = -EINVAL;
2255 } 2813 }
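cgroup_file_write() above now gives a cftype's ->write() method first claim on the buffer and only falls back to the numeric ->write_u64()/->write_s64() helpers; the old ->write_string() and ->trigger() paths are gone. A toy userspace version of that dispatch, with made-up handler and struct names:

#include <errno.h>
#include <stdio.h>
#include <stdlib.h>

struct cftype {
	int (*write)(const char *buf);			/* raw buffer handler */
	int (*write_u64)(unsigned long long v);		/* parsed u64 handler */
	int (*write_s64)(long long v);			/* parsed s64 handler */
};

static int file_write(const struct cftype *cft, const char *buf)
{
	char *end;

	if (cft->write)
		return cft->write(buf);		/* ->write() takes precedence */

	errno = 0;
	if (cft->write_u64) {
		unsigned long long v = strtoull(buf, &end, 0);

		if (errno || end == buf)
			return -EINVAL;
		return cft->write_u64(v);
	}
	if (cft->write_s64) {
		long long v = strtoll(buf, &end, 0);

		if (errno || end == buf)
			return -EINVAL;
		return cft->write_s64(v);
	}
	return -EINVAL;
}

static int show_u64(unsigned long long v)
{
	printf("u64 handler got %llu\n", v);
	return 0;
}

int main(void)
{
	const struct cftype cft = { .write_u64 = show_u64 };

	return file_write(&cft, "42");
}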
@@ -2326,20 +2884,18 @@ static int cgroup_rename(struct kernfs_node *kn, struct kernfs_node *new_parent,
2326 return -EPERM; 2884 return -EPERM;
2327 2885
2328 /* 2886 /*
2329 * We're gonna grab cgroup_tree_mutex which nests outside kernfs 2887 * We're gonna grab cgroup_mutex which nests outside kernfs
2330 * active_ref. kernfs_rename() doesn't require active_ref 2888 * active_ref. kernfs_rename() doesn't require active_ref
2331 * protection. Break them before grabbing cgroup_tree_mutex. 2889 * protection. Break them before grabbing cgroup_mutex.
2332 */ 2890 */
2333 kernfs_break_active_protection(new_parent); 2891 kernfs_break_active_protection(new_parent);
2334 kernfs_break_active_protection(kn); 2892 kernfs_break_active_protection(kn);
2335 2893
2336 mutex_lock(&cgroup_tree_mutex);
2337 mutex_lock(&cgroup_mutex); 2894 mutex_lock(&cgroup_mutex);
2338 2895
2339 ret = kernfs_rename(kn, new_parent, new_name_str); 2896 ret = kernfs_rename(kn, new_parent, new_name_str);
2340 2897
2341 mutex_unlock(&cgroup_mutex); 2898 mutex_unlock(&cgroup_mutex);
2342 mutex_unlock(&cgroup_tree_mutex);
2343 2899
2344 kernfs_unbreak_active_protection(kn); 2900 kernfs_unbreak_active_protection(kn);
2345 kernfs_unbreak_active_protection(new_parent); 2901 kernfs_unbreak_active_protection(new_parent);
@@ -2377,9 +2933,14 @@ static int cgroup_add_file(struct cgroup *cgrp, struct cftype *cft)
2377 return PTR_ERR(kn); 2933 return PTR_ERR(kn);
2378 2934
2379 ret = cgroup_kn_set_ugid(kn); 2935 ret = cgroup_kn_set_ugid(kn);
2380 if (ret) 2936 if (ret) {
2381 kernfs_remove(kn); 2937 kernfs_remove(kn);
2382 return ret; 2938 return ret;
2939 }
2940
2941 if (cft->seq_show == cgroup_populated_show)
2942 cgrp->populated_kn = kn;
2943 return 0;
2383} 2944}
2384 2945
2385/** 2946/**
@@ -2399,7 +2960,7 @@ static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[],
2399 struct cftype *cft; 2960 struct cftype *cft;
2400 int ret; 2961 int ret;
2401 2962
2402 lockdep_assert_held(&cgroup_tree_mutex); 2963 lockdep_assert_held(&cgroup_mutex);
2403 2964
2404 for (cft = cfts; cft->name[0] != '\0'; cft++) { 2965 for (cft = cfts; cft->name[0] != '\0'; cft++) {
2405 /* does cft->flags tell us to skip this file on @cgrp? */ 2966 /* does cft->flags tell us to skip this file on @cgrp? */
@@ -2407,16 +2968,16 @@ static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[],
2407 continue; 2968 continue;
2408 if ((cft->flags & CFTYPE_INSANE) && cgroup_sane_behavior(cgrp)) 2969 if ((cft->flags & CFTYPE_INSANE) && cgroup_sane_behavior(cgrp))
2409 continue; 2970 continue;
2410 if ((cft->flags & CFTYPE_NOT_ON_ROOT) && !cgrp->parent) 2971 if ((cft->flags & CFTYPE_NOT_ON_ROOT) && !cgroup_parent(cgrp))
2411 continue; 2972 continue;
2412 if ((cft->flags & CFTYPE_ONLY_ON_ROOT) && cgrp->parent) 2973 if ((cft->flags & CFTYPE_ONLY_ON_ROOT) && cgroup_parent(cgrp))
2413 continue; 2974 continue;
2414 2975
2415 if (is_add) { 2976 if (is_add) {
2416 ret = cgroup_add_file(cgrp, cft); 2977 ret = cgroup_add_file(cgrp, cft);
2417 if (ret) { 2978 if (ret) {
2418 pr_warn("cgroup_addrm_files: failed to add %s, err=%d\n", 2979 pr_warn("%s: failed to add %s, err=%d\n",
2419 cft->name, ret); 2980 __func__, cft->name, ret);
2420 return ret; 2981 return ret;
2421 } 2982 }
2422 } else { 2983 } else {
@@ -2434,11 +2995,7 @@ static int cgroup_apply_cftypes(struct cftype *cfts, bool is_add)
2434 struct cgroup_subsys_state *css; 2995 struct cgroup_subsys_state *css;
2435 int ret = 0; 2996 int ret = 0;
2436 2997
2437 lockdep_assert_held(&cgroup_tree_mutex); 2998 lockdep_assert_held(&cgroup_mutex);
2438
2439 /* don't bother if @ss isn't attached */
2440 if (ss->root == &cgrp_dfl_root)
2441 return 0;
2442 2999
2443 /* add/rm files for all cgroups created before */ 3000 /* add/rm files for all cgroups created before */
2444 css_for_each_descendant_pre(css, cgroup_css(root, ss)) { 3001 css_for_each_descendant_pre(css, cgroup_css(root, ss)) {
@@ -2506,7 +3063,7 @@ static int cgroup_init_cftypes(struct cgroup_subsys *ss, struct cftype *cfts)
2506 3063
2507static int cgroup_rm_cftypes_locked(struct cftype *cfts) 3064static int cgroup_rm_cftypes_locked(struct cftype *cfts)
2508{ 3065{
2509 lockdep_assert_held(&cgroup_tree_mutex); 3066 lockdep_assert_held(&cgroup_mutex);
2510 3067
2511 if (!cfts || !cfts[0].ss) 3068 if (!cfts || !cfts[0].ss)
2512 return -ENOENT; 3069 return -ENOENT;
@@ -2532,9 +3089,9 @@ int cgroup_rm_cftypes(struct cftype *cfts)
2532{ 3089{
2533 int ret; 3090 int ret;
2534 3091
2535 mutex_lock(&cgroup_tree_mutex); 3092 mutex_lock(&cgroup_mutex);
2536 ret = cgroup_rm_cftypes_locked(cfts); 3093 ret = cgroup_rm_cftypes_locked(cfts);
2537 mutex_unlock(&cgroup_tree_mutex); 3094 mutex_unlock(&cgroup_mutex);
2538 return ret; 3095 return ret;
2539} 3096}
2540 3097
@@ -2556,6 +3113,9 @@ int cgroup_add_cftypes(struct cgroup_subsys *ss, struct cftype *cfts)
2556{ 3113{
2557 int ret; 3114 int ret;
2558 3115
3116 if (ss->disabled)
3117 return 0;
3118
2559 if (!cfts || cfts[0].name[0] == '\0') 3119 if (!cfts || cfts[0].name[0] == '\0')
2560 return 0; 3120 return 0;
2561 3121
@@ -2563,14 +3123,14 @@ int cgroup_add_cftypes(struct cgroup_subsys *ss, struct cftype *cfts)
2563 if (ret) 3123 if (ret)
2564 return ret; 3124 return ret;
2565 3125
2566 mutex_lock(&cgroup_tree_mutex); 3126 mutex_lock(&cgroup_mutex);
2567 3127
2568 list_add_tail(&cfts->node, &ss->cfts); 3128 list_add_tail(&cfts->node, &ss->cfts);
2569 ret = cgroup_apply_cftypes(cfts, true); 3129 ret = cgroup_apply_cftypes(cfts, true);
2570 if (ret) 3130 if (ret)
2571 cgroup_rm_cftypes_locked(cfts); 3131 cgroup_rm_cftypes_locked(cfts);
2572 3132
2573 mutex_unlock(&cgroup_tree_mutex); 3133 mutex_unlock(&cgroup_mutex);
2574 return ret; 3134 return ret;
2575} 3135}
2576 3136
@@ -2594,57 +3154,65 @@ static int cgroup_task_count(const struct cgroup *cgrp)
2594 3154
2595/** 3155/**
2596 * css_next_child - find the next child of a given css 3156 * css_next_child - find the next child of a given css
2597 * @pos_css: the current position (%NULL to initiate traversal) 3157 * @pos: the current position (%NULL to initiate traversal)
2598 * @parent_css: css whose children to walk 3158 * @parent: css whose children to walk
2599 * 3159 *
2600 * This function returns the next child of @parent_css and should be called 3160 * This function returns the next child of @parent and should be called
2601 * under either cgroup_mutex or RCU read lock. The only requirement is 3161 * under either cgroup_mutex or RCU read lock. The only requirement is
2602 * that @parent_css and @pos_css are accessible. The next sibling is 3162 * that @parent and @pos are accessible. The next sibling is guaranteed to
2603 * guaranteed to be returned regardless of their states. 3163 * be returned regardless of their states.
3164 *
3165 * If a subsystem synchronizes ->css_online() and the start of iteration, a
 3166 * css which finished ->css_online() is guaranteed to be visible in
3167 * future iterations and will stay visible until the last reference is put.
3168 * A css which hasn't finished ->css_online() or already finished
3169 * ->css_offline() may show up during traversal. It's each subsystem's
3170 * responsibility to synchronize against on/offlining.
2604 */ 3171 */
2605struct cgroup_subsys_state * 3172struct cgroup_subsys_state *css_next_child(struct cgroup_subsys_state *pos,
2606css_next_child(struct cgroup_subsys_state *pos_css, 3173 struct cgroup_subsys_state *parent)
2607 struct cgroup_subsys_state *parent_css)
2608{ 3174{
2609 struct cgroup *pos = pos_css ? pos_css->cgroup : NULL; 3175 struct cgroup_subsys_state *next;
2610 struct cgroup *cgrp = parent_css->cgroup;
2611 struct cgroup *next;
2612 3176
2613 cgroup_assert_mutexes_or_rcu_locked(); 3177 cgroup_assert_mutex_or_rcu_locked();
2614 3178
2615 /* 3179 /*
2616 * @pos could already have been removed. Once a cgroup is removed, 3180 * @pos could already have been unlinked from the sibling list.
2617 * its ->sibling.next is no longer updated when its next sibling 3181 * Once a cgroup is removed, its ->sibling.next is no longer
2618 * changes. As CGRP_DEAD assertion is serialized and happens 3182 * updated when its next sibling changes. CSS_RELEASED is set when
2619 * before the cgroup is taken off the ->sibling list, if we see it 3183 * @pos is taken off list, at which time its next pointer is valid,
2620 * unasserted, it's guaranteed that the next sibling hasn't 3184 * and, as releases are serialized, the one pointed to by the next
2621 * finished its grace period even if it's already removed, and thus 3185 * pointer is guaranteed to not have started release yet. This
2622 * safe to dereference from this RCU critical section. If 3186 * implies that if we observe !CSS_RELEASED on @pos in this RCU
2623 * ->sibling.next is inaccessible, cgroup_is_dead() is guaranteed 3187 * critical section, the one pointed to by its next pointer is
2624 * to be visible as %true here. 3188 * guaranteed to not have finished its RCU grace period even if we
3189 * have dropped rcu_read_lock() inbetween iterations.
2625 * 3190 *
2626 * If @pos is dead, its next pointer can't be dereferenced; 3191 * If @pos has CSS_RELEASED set, its next pointer can't be
2627 * however, as each cgroup is given a monotonically increasing 3192 * dereferenced; however, as each css is given a monotonically
2628 * unique serial number and always appended to the sibling list, 3193 * increasing unique serial number and always appended to the
2629 * the next one can be found by walking the parent's children until 3194 * sibling list, the next one can be found by walking the parent's
2630 * we see a cgroup with higher serial number than @pos's. While 3195 * children until the first css with higher serial number than
2631 * this path can be slower, it's taken only when either the current 3196 * @pos's. While this path can be slower, it happens iff iteration
2632 * cgroup is removed or iteration and removal race. 3197 * races against release and the race window is very small.
2633 */ 3198 */
2634 if (!pos) { 3199 if (!pos) {
2635 next = list_entry_rcu(cgrp->children.next, struct cgroup, sibling); 3200 next = list_entry_rcu(parent->children.next, struct cgroup_subsys_state, sibling);
2636 } else if (likely(!cgroup_is_dead(pos))) { 3201 } else if (likely(!(pos->flags & CSS_RELEASED))) {
2637 next = list_entry_rcu(pos->sibling.next, struct cgroup, sibling); 3202 next = list_entry_rcu(pos->sibling.next, struct cgroup_subsys_state, sibling);
2638 } else { 3203 } else {
2639 list_for_each_entry_rcu(next, &cgrp->children, sibling) 3204 list_for_each_entry_rcu(next, &parent->children, sibling)
2640 if (next->serial_nr > pos->serial_nr) 3205 if (next->serial_nr > pos->serial_nr)
2641 break; 3206 break;
2642 } 3207 }
2643 3208
2644 if (&next->sibling == &cgrp->children) 3209 /*
2645 return NULL; 3210 * @next, if not pointing to the head, can be dereferenced and is
2646 3211 * the next sibling.
2647 return cgroup_css(next, parent_css->ss); 3212 */
3213 if (&next->sibling != &parent->children)
3214 return next;
3215 return NULL;
2648} 3216}
2649 3217
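css_next_child() above can resume iteration even when @pos has already been taken off the sibling list: serial numbers only grow and new csses are appended, so the next live sibling is simply the first one with a higher serial. A small userspace model of that resumption rule (a plain singly linked list, no RCU, no cgroups):

#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

struct node {
	struct node *next;	/* next sibling, NULL at end of list */
	unsigned long serial;	/* monotonically increasing at insertion */
	bool released;		/* set when taken off the list */
};

/* return the sibling after @pos, or the first child when @pos is NULL */
static struct node *next_child(struct node *head, struct node *pos)
{
	struct node *n;

	if (!pos)
		return head;
	if (!pos->released)
		return pos->next;
	/* @pos was removed: find the first sibling with a higher serial */
	for (n = head; n; n = n->next)
		if (n->serial > pos->serial)
			return n;
	return NULL;
}

int main(void)
{
	struct node c = { NULL, 3, false };
	struct node b = { &c, 2, false };
	struct node a = { &b, 1, false };
	struct node *pos = &b;

	/* pretend @b is removed while the caller still holds a pointer to it */
	a.next = &c;
	b.released = true;

	pos = next_child(&a, pos);
	printf("resumed at serial %lu\n", pos ? pos->serial : 0);	/* 3 */
	return 0;
}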
2650/** 3218/**
@@ -2660,6 +3228,13 @@ css_next_child(struct cgroup_subsys_state *pos_css,
2660 * doesn't require the whole traversal to be contained in a single critical 3228 * doesn't require the whole traversal to be contained in a single critical
2661 * section. This function will return the correct next descendant as long 3229 * section. This function will return the correct next descendant as long
2662 * as both @pos and @root are accessible and @pos is a descendant of @root. 3230 * as both @pos and @root are accessible and @pos is a descendant of @root.
3231 *
3232 * If a subsystem synchronizes ->css_online() and the start of iteration, a
 3233 * css which finished ->css_online() is guaranteed to be visible in
3234 * future iterations and will stay visible until the last reference is put.
3235 * A css which hasn't finished ->css_online() or already finished
3236 * ->css_offline() may show up during traversal. It's each subsystem's
3237 * responsibility to synchronize against on/offlining.
2663 */ 3238 */
2664struct cgroup_subsys_state * 3239struct cgroup_subsys_state *
2665css_next_descendant_pre(struct cgroup_subsys_state *pos, 3240css_next_descendant_pre(struct cgroup_subsys_state *pos,
@@ -2667,7 +3242,7 @@ css_next_descendant_pre(struct cgroup_subsys_state *pos,
2667{ 3242{
2668 struct cgroup_subsys_state *next; 3243 struct cgroup_subsys_state *next;
2669 3244
2670 cgroup_assert_mutexes_or_rcu_locked(); 3245 cgroup_assert_mutex_or_rcu_locked();
2671 3246
2672 /* if first iteration, visit @root */ 3247 /* if first iteration, visit @root */
2673 if (!pos) 3248 if (!pos)
@@ -2680,10 +3255,10 @@ css_next_descendant_pre(struct cgroup_subsys_state *pos,
2680 3255
2681 /* no child, visit my or the closest ancestor's next sibling */ 3256 /* no child, visit my or the closest ancestor's next sibling */
2682 while (pos != root) { 3257 while (pos != root) {
2683 next = css_next_child(pos, css_parent(pos)); 3258 next = css_next_child(pos, pos->parent);
2684 if (next) 3259 if (next)
2685 return next; 3260 return next;
2686 pos = css_parent(pos); 3261 pos = pos->parent;
2687 } 3262 }
2688 3263
2689 return NULL; 3264 return NULL;
@@ -2707,7 +3282,7 @@ css_rightmost_descendant(struct cgroup_subsys_state *pos)
2707{ 3282{
2708 struct cgroup_subsys_state *last, *tmp; 3283 struct cgroup_subsys_state *last, *tmp;
2709 3284
2710 cgroup_assert_mutexes_or_rcu_locked(); 3285 cgroup_assert_mutex_or_rcu_locked();
2711 3286
2712 do { 3287 do {
2713 last = pos; 3288 last = pos;
@@ -2747,6 +3322,13 @@ css_leftmost_descendant(struct cgroup_subsys_state *pos)
2747 * section. This function will return the correct next descendant as long 3322 * section. This function will return the correct next descendant as long
2748 * as both @pos and @cgroup are accessible and @pos is a descendant of 3323 * as both @pos and @cgroup are accessible and @pos is a descendant of
2749 * @cgroup. 3324 * @cgroup.
3325 *
3326 * If a subsystem synchronizes ->css_online() and the start of iteration, a
 3327 * css which finished ->css_online() is guaranteed to be visible in
3328 * future iterations and will stay visible until the last reference is put.
3329 * A css which hasn't finished ->css_online() or already finished
3330 * ->css_offline() may show up during traversal. It's each subsystem's
3331 * responsibility to synchronize against on/offlining.
2750 */ 3332 */
2751struct cgroup_subsys_state * 3333struct cgroup_subsys_state *
2752css_next_descendant_post(struct cgroup_subsys_state *pos, 3334css_next_descendant_post(struct cgroup_subsys_state *pos,
@@ -2754,7 +3336,7 @@ css_next_descendant_post(struct cgroup_subsys_state *pos,
2754{ 3336{
2755 struct cgroup_subsys_state *next; 3337 struct cgroup_subsys_state *next;
2756 3338
2757 cgroup_assert_mutexes_or_rcu_locked(); 3339 cgroup_assert_mutex_or_rcu_locked();
2758 3340
2759 /* if first iteration, visit leftmost descendant which may be @root */ 3341 /* if first iteration, visit leftmost descendant which may be @root */
2760 if (!pos) 3342 if (!pos)
@@ -2765,12 +3347,36 @@ css_next_descendant_post(struct cgroup_subsys_state *pos,
2765 return NULL; 3347 return NULL;
2766 3348
2767 /* if there's an unvisited sibling, visit its leftmost descendant */ 3349 /* if there's an unvisited sibling, visit its leftmost descendant */
2768 next = css_next_child(pos, css_parent(pos)); 3350 next = css_next_child(pos, pos->parent);
2769 if (next) 3351 if (next)
2770 return css_leftmost_descendant(next); 3352 return css_leftmost_descendant(next);
2771 3353
2772 /* no sibling left, visit parent */ 3354 /* no sibling left, visit parent */
2773 return css_parent(pos); 3355 return pos->parent;
3356}
3357
3358/**
3359 * css_has_online_children - does a css have online children
3360 * @css: the target css
3361 *
3362 * Returns %true if @css has any online children; otherwise, %false. This
3363 * function can be called from any context but the caller is responsible
3364 * for synchronizing against on/offlining as necessary.
3365 */
3366bool css_has_online_children(struct cgroup_subsys_state *css)
3367{
3368 struct cgroup_subsys_state *child;
3369 bool ret = false;
3370
3371 rcu_read_lock();
3372 css_for_each_child(child, css) {
3373 if (child->flags & CSS_ONLINE) {
3374 ret = true;
3375 break;
3376 }
3377 }
3378 rcu_read_unlock();
3379 return ret;
2774} 3380}
2775 3381
2776/** 3382/**
@@ -2781,27 +3387,36 @@ css_next_descendant_post(struct cgroup_subsys_state *pos,
2781 */ 3387 */
2782static void css_advance_task_iter(struct css_task_iter *it) 3388static void css_advance_task_iter(struct css_task_iter *it)
2783{ 3389{
2784 struct list_head *l = it->cset_link; 3390 struct list_head *l = it->cset_pos;
2785 struct cgrp_cset_link *link; 3391 struct cgrp_cset_link *link;
2786 struct css_set *cset; 3392 struct css_set *cset;
2787 3393
2788 /* Advance to the next non-empty css_set */ 3394 /* Advance to the next non-empty css_set */
2789 do { 3395 do {
2790 l = l->next; 3396 l = l->next;
2791 if (l == &it->origin_css->cgroup->cset_links) { 3397 if (l == it->cset_head) {
2792 it->cset_link = NULL; 3398 it->cset_pos = NULL;
2793 return; 3399 return;
2794 } 3400 }
2795 link = list_entry(l, struct cgrp_cset_link, cset_link); 3401
2796 cset = link->cset; 3402 if (it->ss) {
3403 cset = container_of(l, struct css_set,
3404 e_cset_node[it->ss->id]);
3405 } else {
3406 link = list_entry(l, struct cgrp_cset_link, cset_link);
3407 cset = link->cset;
3408 }
2797 } while (list_empty(&cset->tasks) && list_empty(&cset->mg_tasks)); 3409 } while (list_empty(&cset->tasks) && list_empty(&cset->mg_tasks));
2798 3410
2799 it->cset_link = l; 3411 it->cset_pos = l;
2800 3412
2801 if (!list_empty(&cset->tasks)) 3413 if (!list_empty(&cset->tasks))
2802 it->task = cset->tasks.next; 3414 it->task_pos = cset->tasks.next;
2803 else 3415 else
2804 it->task = cset->mg_tasks.next; 3416 it->task_pos = cset->mg_tasks.next;
3417
3418 it->tasks_head = &cset->tasks;
3419 it->mg_tasks_head = &cset->mg_tasks;
2805} 3420}
2806 3421
2807/** 3422/**
@@ -2827,8 +3442,14 @@ void css_task_iter_start(struct cgroup_subsys_state *css,
2827 3442
2828 down_read(&css_set_rwsem); 3443 down_read(&css_set_rwsem);
2829 3444
2830 it->origin_css = css; 3445 it->ss = css->ss;
2831 it->cset_link = &css->cgroup->cset_links; 3446
3447 if (it->ss)
3448 it->cset_pos = &css->cgroup->e_csets[css->ss->id];
3449 else
3450 it->cset_pos = &css->cgroup->cset_links;
3451
3452 it->cset_head = it->cset_pos;
2832 3453
2833 css_advance_task_iter(it); 3454 css_advance_task_iter(it);
2834} 3455}
@@ -2844,12 +3465,10 @@ void css_task_iter_start(struct cgroup_subsys_state *css,
2844struct task_struct *css_task_iter_next(struct css_task_iter *it) 3465struct task_struct *css_task_iter_next(struct css_task_iter *it)
2845{ 3466{
2846 struct task_struct *res; 3467 struct task_struct *res;
2847 struct list_head *l = it->task; 3468 struct list_head *l = it->task_pos;
2848 struct cgrp_cset_link *link = list_entry(it->cset_link,
2849 struct cgrp_cset_link, cset_link);
2850 3469
2851 /* If the iterator cg is NULL, we have no tasks */ 3470 /* If the iterator cg is NULL, we have no tasks */
2852 if (!it->cset_link) 3471 if (!it->cset_pos)
2853 return NULL; 3472 return NULL;
2854 res = list_entry(l, struct task_struct, cg_list); 3473 res = list_entry(l, struct task_struct, cg_list);
2855 3474
@@ -2860,13 +3479,13 @@ struct task_struct *css_task_iter_next(struct css_task_iter *it)
2860 */ 3479 */
2861 l = l->next; 3480 l = l->next;
2862 3481
2863 if (l == &link->cset->tasks) 3482 if (l == it->tasks_head)
2864 l = link->cset->mg_tasks.next; 3483 l = it->mg_tasks_head->next;
2865 3484
2866 if (l == &link->cset->mg_tasks) 3485 if (l == it->mg_tasks_head)
2867 css_advance_task_iter(it); 3486 css_advance_task_iter(it);
2868 else 3487 else
2869 it->task = l; 3488 it->task_pos = l;
2870 3489
2871 return res; 3490 return res;
2872} 3491}
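
The iterator rework above replaces the cgrp_cset_link bookkeeping with cset_pos/task_pos plus cached tasks_head/mg_tasks_head, so css_task_iter_next() can fall through from ->tasks to ->mg_tasks and then ask css_advance_task_iter() for the next non-empty cset. A minimal userspace model of that two-level walk — names are invented and flat arrays stand in for the kernel's list_heads:

    /* Illustrative only: csets -> tasks list, then mg_tasks list. */
    #include <stdio.h>
    #include <stddef.h>

    struct cset {
            const int *tasks;     int nr_tasks;
            const int *mg_tasks;  int nr_mg_tasks;
    };

    struct task_iter {
            const struct cset *csets;
            int nr_csets;
            int cset_idx;   /* which cset we are in              */
            int in_mg;      /* 0: walking ->tasks, 1: ->mg_tasks */
            int pos;        /* index inside the current list     */
    };

    static void iter_start(struct task_iter *it, const struct cset *csets, int n)
    {
            it->csets = csets;
            it->nr_csets = n;
            it->cset_idx = -1;
            it->in_mg = 0;
            it->pos = 0;
    }

    /* Return the next task id, or -1 when exhausted: finish ->tasks,
     * fall through to ->mg_tasks, then advance to the next cset. */
    static int iter_next(struct task_iter *it)
    {
            for (;;) {
                    if (it->cset_idx >= 0) {
                            const struct cset *c = &it->csets[it->cset_idx];

                            if (!it->in_mg && it->pos < c->nr_tasks)
                                    return c->tasks[it->pos++];
                            if (!it->in_mg) {
                                    it->in_mg = 1;
                                    it->pos = 0;
                            }
                            if (it->pos < c->nr_mg_tasks)
                                    return c->mg_tasks[it->pos++];
                    }
                    /* like css_advance_task_iter(): next cset or done */
                    if (++it->cset_idx >= it->nr_csets)
                            return -1;
                    it->in_mg = 0;
                    it->pos = 0;
            }
    }

    int main(void)
    {
            static const int t0[] = { 1, 2 }, m1[] = { 7 };
            struct cset csets[] = {
                    { t0, 2, NULL, 0 },
                    { NULL, 0, m1, 1 },
            };
            struct task_iter it;
            int id;

            iter_start(&it, csets, 2);
            while ((id = iter_next(&it)) != -1)
                    printf("task %d\n", id);
            return 0;
    }
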
@@ -2919,7 +3538,7 @@ int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from)
2919 * ->can_attach() fails. 3538 * ->can_attach() fails.
2920 */ 3539 */
2921 do { 3540 do {
2922 css_task_iter_start(&from->dummy_css, &it); 3541 css_task_iter_start(&from->self, &it);
2923 task = css_task_iter_next(&it); 3542 task = css_task_iter_next(&it);
2924 if (task) 3543 if (task)
2925 get_task_struct(task); 3544 get_task_struct(task);
@@ -3184,7 +3803,7 @@ static int pidlist_array_load(struct cgroup *cgrp, enum cgroup_filetype type,
3184 if (!array) 3803 if (!array)
3185 return -ENOMEM; 3804 return -ENOMEM;
3186 /* now, populate the array */ 3805 /* now, populate the array */
3187 css_task_iter_start(&cgrp->dummy_css, &it); 3806 css_task_iter_start(&cgrp->self, &it);
3188 while ((tsk = css_task_iter_next(&it))) { 3807 while ((tsk = css_task_iter_next(&it))) {
3189 if (unlikely(n == length)) 3808 if (unlikely(n == length))
3190 break; 3809 break;
@@ -3246,7 +3865,7 @@ int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry)
3246 3865
3247 /* 3866 /*
3248 * We aren't being called from kernfs and there's no guarantee on 3867 * We aren't being called from kernfs and there's no guarantee on
3249 * @kn->priv's validity. For this and css_tryget_from_dir(), 3868 * @kn->priv's validity. For this and css_tryget_online_from_dir(),
3250 * @kn->priv is RCU safe. Let's do the RCU dancing. 3869 * @kn->priv is RCU safe. Let's do the RCU dancing.
3251 */ 3870 */
3252 rcu_read_lock(); 3871 rcu_read_lock();
@@ -3258,7 +3877,7 @@ int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry)
3258 } 3877 }
3259 rcu_read_unlock(); 3878 rcu_read_unlock();
3260 3879
3261 css_task_iter_start(&cgrp->dummy_css, &it); 3880 css_task_iter_start(&cgrp->self, &it);
3262 while ((tsk = css_task_iter_next(&it))) { 3881 while ((tsk = css_task_iter_next(&it))) {
3263 switch (tsk->state) { 3882 switch (tsk->state) {
3264 case TASK_RUNNING: 3883 case TASK_RUNNING:
@@ -3388,17 +4007,6 @@ static int cgroup_pidlist_show(struct seq_file *s, void *v)
3388 return seq_printf(s, "%d\n", *(int *)v); 4007 return seq_printf(s, "%d\n", *(int *)v);
3389} 4008}
3390 4009
3391/*
3392 * seq_operations functions for iterating on pidlists through seq_file -
3393 * independent of whether it's tasks or procs
3394 */
3395static const struct seq_operations cgroup_pidlist_seq_operations = {
3396 .start = cgroup_pidlist_start,
3397 .stop = cgroup_pidlist_stop,
3398 .next = cgroup_pidlist_next,
3399 .show = cgroup_pidlist_show,
3400};
3401
3402static u64 cgroup_read_notify_on_release(struct cgroup_subsys_state *css, 4010static u64 cgroup_read_notify_on_release(struct cgroup_subsys_state *css,
3403 struct cftype *cft) 4011 struct cftype *cft)
3404{ 4012{
@@ -3440,7 +4048,7 @@ static struct cftype cgroup_base_files[] = {
3440 .seq_stop = cgroup_pidlist_stop, 4048 .seq_stop = cgroup_pidlist_stop,
3441 .seq_show = cgroup_pidlist_show, 4049 .seq_show = cgroup_pidlist_show,
3442 .private = CGROUP_FILE_PROCS, 4050 .private = CGROUP_FILE_PROCS,
3443 .write_u64 = cgroup_procs_write, 4051 .write = cgroup_procs_write,
3444 .mode = S_IRUGO | S_IWUSR, 4052 .mode = S_IRUGO | S_IWUSR,
3445 }, 4053 },
3446 { 4054 {
@@ -3454,6 +4062,27 @@ static struct cftype cgroup_base_files[] = {
3454 .flags = CFTYPE_ONLY_ON_ROOT, 4062 .flags = CFTYPE_ONLY_ON_ROOT,
3455 .seq_show = cgroup_sane_behavior_show, 4063 .seq_show = cgroup_sane_behavior_show,
3456 }, 4064 },
4065 {
4066 .name = "cgroup.controllers",
4067 .flags = CFTYPE_ONLY_ON_DFL | CFTYPE_ONLY_ON_ROOT,
4068 .seq_show = cgroup_root_controllers_show,
4069 },
4070 {
4071 .name = "cgroup.controllers",
4072 .flags = CFTYPE_ONLY_ON_DFL | CFTYPE_NOT_ON_ROOT,
4073 .seq_show = cgroup_controllers_show,
4074 },
4075 {
4076 .name = "cgroup.subtree_control",
4077 .flags = CFTYPE_ONLY_ON_DFL,
4078 .seq_show = cgroup_subtree_control_show,
4079 .write = cgroup_subtree_control_write,
4080 },
4081 {
4082 .name = "cgroup.populated",
4083 .flags = CFTYPE_ONLY_ON_DFL | CFTYPE_NOT_ON_ROOT,
4084 .seq_show = cgroup_populated_show,
4085 },
3457 4086
3458 /* 4087 /*
3459 * Historical crazy stuff. These don't have "cgroup." prefix and 4088 * Historical crazy stuff. These don't have "cgroup." prefix and
@@ -3468,7 +4097,7 @@ static struct cftype cgroup_base_files[] = {
3468 .seq_stop = cgroup_pidlist_stop, 4097 .seq_stop = cgroup_pidlist_stop,
3469 .seq_show = cgroup_pidlist_show, 4098 .seq_show = cgroup_pidlist_show,
3470 .private = CGROUP_FILE_TASKS, 4099 .private = CGROUP_FILE_TASKS,
3471 .write_u64 = cgroup_tasks_write, 4100 .write = cgroup_tasks_write,
3472 .mode = S_IRUGO | S_IWUSR, 4101 .mode = S_IRUGO | S_IWUSR,
3473 }, 4102 },
3474 { 4103 {
@@ -3481,7 +4110,7 @@ static struct cftype cgroup_base_files[] = {
3481 .name = "release_agent", 4110 .name = "release_agent",
3482 .flags = CFTYPE_INSANE | CFTYPE_ONLY_ON_ROOT, 4111 .flags = CFTYPE_INSANE | CFTYPE_ONLY_ON_ROOT,
3483 .seq_show = cgroup_release_agent_show, 4112 .seq_show = cgroup_release_agent_show,
3484 .write_string = cgroup_release_agent_write, 4113 .write = cgroup_release_agent_write,
3485 .max_write_len = PATH_MAX - 1, 4114 .max_write_len = PATH_MAX - 1,
3486 }, 4115 },
3487 { } /* terminate */ 4116 { } /* terminate */
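
The new cgroup.controllers, cgroup.subtree_control and cgroup.populated files are only exposed on the default (unified) hierarchy. A hedged userspace sketch of poking two of them; it assumes a unified-hierarchy mount at /sys/fs/cgroup and sufficient privileges, and the paths may need adjusting on a given system:

    /* Illustrative only; the paths are assumptions, not guarantees. */
    #include <stdio.h>

    int main(void)
    {
            char buf[256];
            FILE *f;

            /* which controllers the root offers to its children */
            f = fopen("/sys/fs/cgroup/cgroup.controllers", "r");
            if (f) {
                    if (fgets(buf, sizeof(buf), f))
                            printf("controllers: %s", buf);
                    fclose(f);
            }

            /* enable the memory controller for children, if available */
            f = fopen("/sys/fs/cgroup/cgroup.subtree_control", "w");
            if (f) {
                    if (fputs("+memory", f) == EOF)
                            perror("subtree_control");
                    fclose(f);
            }
            return 0;
    }
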
@@ -3494,7 +4123,7 @@ static struct cftype cgroup_base_files[] = {
3494 * 4123 *
3495 * On failure, no file is added. 4124 * On failure, no file is added.
3496 */ 4125 */
3497static int cgroup_populate_dir(struct cgroup *cgrp, unsigned long subsys_mask) 4126static int cgroup_populate_dir(struct cgroup *cgrp, unsigned int subsys_mask)
3498{ 4127{
3499 struct cgroup_subsys *ss; 4128 struct cgroup_subsys *ss;
3500 int i, ret = 0; 4129 int i, ret = 0;
@@ -3503,7 +4132,7 @@ static int cgroup_populate_dir(struct cgroup *cgrp, unsigned long subsys_mask)
3503 for_each_subsys(ss, i) { 4132 for_each_subsys(ss, i) {
3504 struct cftype *cfts; 4133 struct cftype *cfts;
3505 4134
3506 if (!test_bit(i, &subsys_mask)) 4135 if (!(subsys_mask & (1 << i)))
3507 continue; 4136 continue;
3508 4137
3509 list_for_each_entry(cfts, &ss->cfts, node) { 4138 list_for_each_entry(cfts, &ss->cfts, node) {
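
With subsys_mask now a plain unsigned int, the test_bit() call becomes ordinary mask arithmetic. A tiny standalone example of the same check, using a made-up subsystem id:

    #include <stdio.h>

    int main(void)
    {
            unsigned int subsys_mask = 0;
            int id = 2;                          /* made-up subsystem id */

            subsys_mask |= 1U << id;             /* enable it            */

            for (int i = 0; i < 8; i++)
                    if (subsys_mask & (1U << i)) /* the open-coded test  */
                            printf("subsys %d enabled\n", i);
            return 0;
    }
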
@@ -3525,9 +4154,9 @@ err:
3525 * Implemented in kill_css(). 4154 * Implemented in kill_css().
3526 * 4155 *
3527 * 2. When the percpu_ref is confirmed to be visible as killed on all CPUs 4156 * 2. When the percpu_ref is confirmed to be visible as killed on all CPUs
3528 * and thus css_tryget() is guaranteed to fail, the css can be offlined 4157 * and thus css_tryget_online() is guaranteed to fail, the css can be
3529 * by invoking offline_css(). After offlining, the base ref is put. 4158 * offlined by invoking offline_css(). After offlining, the base ref is
3530 * Implemented in css_killed_work_fn(). 4159 * put. Implemented in css_killed_work_fn().
3531 * 4160 *
3532 * 3. When the percpu_ref reaches zero, the only possible remaining 4161 * 3. When the percpu_ref reaches zero, the only possible remaining
3533 * accessors are inside RCU read sections. css_release() schedules the 4162 * accessors are inside RCU read sections. css_release() schedules the
@@ -3546,11 +4175,37 @@ static void css_free_work_fn(struct work_struct *work)
3546 container_of(work, struct cgroup_subsys_state, destroy_work); 4175 container_of(work, struct cgroup_subsys_state, destroy_work);
3547 struct cgroup *cgrp = css->cgroup; 4176 struct cgroup *cgrp = css->cgroup;
3548 4177
3549 if (css->parent) 4178 if (css->ss) {
3550 css_put(css->parent); 4179 /* css free path */
4180 if (css->parent)
4181 css_put(css->parent);
3551 4182
3552 css->ss->css_free(css); 4183 css->ss->css_free(css);
3553 cgroup_put(cgrp); 4184 cgroup_put(cgrp);
4185 } else {
4186 /* cgroup free path */
4187 atomic_dec(&cgrp->root->nr_cgrps);
4188 cgroup_pidlist_destroy_all(cgrp);
4189
4190 if (cgroup_parent(cgrp)) {
4191 /*
4192 * We get a ref to the parent, and put the ref when
4193 * this cgroup is being freed, so it's guaranteed
4194 * that the parent won't be destroyed before its
4195 * children.
4196 */
4197 cgroup_put(cgroup_parent(cgrp));
4198 kernfs_put(cgrp->kn);
4199 kfree(cgrp);
4200 } else {
4201 /*
4202 * This is root cgroup's refcnt reaching zero,
4203 * which indicates that the root should be
4204 * released.
4205 */
4206 cgroup_destroy_root(cgrp->root);
4207 }
4208 }
3554} 4209}
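
The comment block above describes the three-step lifecycle: kill the ref so css_tryget_online() fails, offline once the kill is confirmed, free when the count reaches zero. As a rough single-counter analogy only — the kernel uses percpu_ref plus RCU grace periods, none of which appears here — a userspace model with C11 atomics and a DEAD bit:

    /* Illustrative only; every name below is invented for the sketch. */
    #include <stdatomic.h>
    #include <stdbool.h>
    #include <stdio.h>

    #define DEAD_BIAS (1u << 31)

    struct obj {
            atomic_uint ref;        /* low bits: count, top bit: killed */
    };

    static bool obj_tryget(struct obj *o)       /* like css_tryget_online() */
    {
            unsigned int v = atomic_load(&o->ref);

            while (!(v & DEAD_BIAS))
                    if (atomic_compare_exchange_weak(&o->ref, &v, v + 1))
                            return true;
            return false;                       /* already killed */
    }

    static void obj_free(struct obj *o)         /* step 3: last ref gone */
    {
            (void)o;
            printf("freed\n");
    }

    static void obj_put(struct obj *o)
    {
            unsigned int v = atomic_fetch_sub(&o->ref, 1) - 1;

            if ((v & ~DEAD_BIAS) == 0)
                    obj_free(o);
    }

    static void obj_kill(struct obj *o)         /* steps 1+2: mark dead, offline */
    {
            atomic_fetch_or(&o->ref, DEAD_BIAS);
            printf("offlined\n");               /* would be offline_css() */
            obj_put(o);                         /* drop the base reference */
    }

    int main(void)
    {
            struct obj o;

            atomic_init(&o.ref, 1);             /* base reference */
            if (obj_tryget(&o))
                    obj_put(&o);                /* a transient user */
            obj_kill(&o);                       /* tryget now fails */
            printf("tryget after kill: %d\n", obj_tryget(&o));
            return 0;
    }
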
3555 4210
3556static void css_free_rcu_fn(struct rcu_head *rcu_head) 4211static void css_free_rcu_fn(struct rcu_head *rcu_head)
@@ -3562,26 +4217,59 @@ static void css_free_rcu_fn(struct rcu_head *rcu_head)
3562 queue_work(cgroup_destroy_wq, &css->destroy_work); 4217 queue_work(cgroup_destroy_wq, &css->destroy_work);
3563} 4218}
3564 4219
4220static void css_release_work_fn(struct work_struct *work)
4221{
4222 struct cgroup_subsys_state *css =
4223 container_of(work, struct cgroup_subsys_state, destroy_work);
4224 struct cgroup_subsys *ss = css->ss;
4225 struct cgroup *cgrp = css->cgroup;
4226
4227 mutex_lock(&cgroup_mutex);
4228
4229 css->flags |= CSS_RELEASED;
4230 list_del_rcu(&css->sibling);
4231
4232 if (ss) {
4233 /* css release path */
4234 cgroup_idr_remove(&ss->css_idr, css->id);
4235 } else {
4236 /* cgroup release path */
4237 cgroup_idr_remove(&cgrp->root->cgroup_idr, cgrp->id);
4238 cgrp->id = -1;
4239 }
4240
4241 mutex_unlock(&cgroup_mutex);
4242
4243 call_rcu(&css->rcu_head, css_free_rcu_fn);
4244}
4245
3565static void css_release(struct percpu_ref *ref) 4246static void css_release(struct percpu_ref *ref)
3566{ 4247{
3567 struct cgroup_subsys_state *css = 4248 struct cgroup_subsys_state *css =
3568 container_of(ref, struct cgroup_subsys_state, refcnt); 4249 container_of(ref, struct cgroup_subsys_state, refcnt);
3569 4250
3570 RCU_INIT_POINTER(css->cgroup->subsys[css->ss->id], NULL); 4251 INIT_WORK(&css->destroy_work, css_release_work_fn);
3571 call_rcu(&css->rcu_head, css_free_rcu_fn); 4252 queue_work(cgroup_destroy_wq, &css->destroy_work);
3572} 4253}
3573 4254
3574static void init_css(struct cgroup_subsys_state *css, struct cgroup_subsys *ss, 4255static void init_and_link_css(struct cgroup_subsys_state *css,
3575 struct cgroup *cgrp) 4256 struct cgroup_subsys *ss, struct cgroup *cgrp)
3576{ 4257{
4258 lockdep_assert_held(&cgroup_mutex);
4259
4260 cgroup_get(cgrp);
4261
4262 memset(css, 0, sizeof(*css));
3577 css->cgroup = cgrp; 4263 css->cgroup = cgrp;
3578 css->ss = ss; 4264 css->ss = ss;
3579 css->flags = 0; 4265 INIT_LIST_HEAD(&css->sibling);
4266 INIT_LIST_HEAD(&css->children);
4267 css->serial_nr = css_serial_nr_next++;
3580 4268
3581 if (cgrp->parent) 4269 if (cgroup_parent(cgrp)) {
3582 css->parent = cgroup_css(cgrp->parent, ss); 4270 css->parent = cgroup_css(cgroup_parent(cgrp), ss);
3583 else 4271 css_get(css->parent);
3584 css->flags |= CSS_ROOT; 4272 }
3585 4273
3586 BUG_ON(cgroup_css(cgrp, ss)); 4274 BUG_ON(cgroup_css(cgrp, ss));
3587} 4275}
@@ -3592,14 +4280,12 @@ static int online_css(struct cgroup_subsys_state *css)
3592 struct cgroup_subsys *ss = css->ss; 4280 struct cgroup_subsys *ss = css->ss;
3593 int ret = 0; 4281 int ret = 0;
3594 4282
3595 lockdep_assert_held(&cgroup_tree_mutex);
3596 lockdep_assert_held(&cgroup_mutex); 4283 lockdep_assert_held(&cgroup_mutex);
3597 4284
3598 if (ss->css_online) 4285 if (ss->css_online)
3599 ret = ss->css_online(css); 4286 ret = ss->css_online(css);
3600 if (!ret) { 4287 if (!ret) {
3601 css->flags |= CSS_ONLINE; 4288 css->flags |= CSS_ONLINE;
3602 css->cgroup->nr_css++;
3603 rcu_assign_pointer(css->cgroup->subsys[ss->id], css); 4289 rcu_assign_pointer(css->cgroup->subsys[ss->id], css);
3604 } 4290 }
3605 return ret; 4291 return ret;
@@ -3610,7 +4296,6 @@ static void offline_css(struct cgroup_subsys_state *css)
3610{ 4296{
3611 struct cgroup_subsys *ss = css->ss; 4297 struct cgroup_subsys *ss = css->ss;
3612 4298
3613 lockdep_assert_held(&cgroup_tree_mutex);
3614 lockdep_assert_held(&cgroup_mutex); 4299 lockdep_assert_held(&cgroup_mutex);
3615 4300
3616 if (!(css->flags & CSS_ONLINE)) 4301 if (!(css->flags & CSS_ONLINE))
@@ -3620,8 +4305,9 @@ static void offline_css(struct cgroup_subsys_state *css)
3620 ss->css_offline(css); 4305 ss->css_offline(css);
3621 4306
3622 css->flags &= ~CSS_ONLINE; 4307 css->flags &= ~CSS_ONLINE;
3623 css->cgroup->nr_css--; 4308 RCU_INIT_POINTER(css->cgroup->subsys[ss->id], NULL);
3624 RCU_INIT_POINTER(css->cgroup->subsys[ss->id], css); 4309
4310 wake_up_all(&css->cgroup->offline_waitq);
3625} 4311}
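
offline_css() now ends with wake_up_all(&css->cgroup->offline_waitq), letting anyone waiting for the css to go offline proceed. A pthread analogy of that wait/wake pair — a condition variable instead of a waitqueue, compile with -pthread:

    /* Illustrative only; not the kernel waitqueue API. */
    #include <pthread.h>
    #include <stdbool.h>
    #include <stdio.h>
    #include <unistd.h>

    static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
    static pthread_cond_t offline_waitq = PTHREAD_COND_INITIALIZER;
    static bool online = true;

    static void *waiter(void *arg)
    {
            pthread_mutex_lock(&lock);
            while (online)                          /* wait_event() equivalent */
                    pthread_cond_wait(&offline_waitq, &lock);
            pthread_mutex_unlock(&lock);
            printf("waiter %ld: css went offline\n", (long)arg);
            return NULL;
    }

    int main(void)
    {
            pthread_t t[2];

            for (long i = 0; i < 2; i++)
                    pthread_create(&t[i], NULL, waiter, (void *)i);

            sleep(1);                               /* let the waiters block */

            pthread_mutex_lock(&lock);
            online = false;                         /* offline_css() clears CSS_ONLINE */
            pthread_cond_broadcast(&offline_waitq); /* wake_up_all() */
            pthread_mutex_unlock(&lock);

            for (int i = 0; i < 2; i++)
                    pthread_join(t[i], NULL);
            return 0;
    }
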
3626 4312
3627/** 4313/**
@@ -3635,111 +4321,102 @@ static void offline_css(struct cgroup_subsys_state *css)
3635 */ 4321 */
3636static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss) 4322static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss)
3637{ 4323{
3638 struct cgroup *parent = cgrp->parent; 4324 struct cgroup *parent = cgroup_parent(cgrp);
4325 struct cgroup_subsys_state *parent_css = cgroup_css(parent, ss);
3639 struct cgroup_subsys_state *css; 4326 struct cgroup_subsys_state *css;
3640 int err; 4327 int err;
3641 4328
3642 lockdep_assert_held(&cgroup_mutex); 4329 lockdep_assert_held(&cgroup_mutex);
3643 4330
3644 css = ss->css_alloc(cgroup_css(parent, ss)); 4331 css = ss->css_alloc(parent_css);
3645 if (IS_ERR(css)) 4332 if (IS_ERR(css))
3646 return PTR_ERR(css); 4333 return PTR_ERR(css);
3647 4334
4335 init_and_link_css(css, ss, cgrp);
4336
3648 err = percpu_ref_init(&css->refcnt, css_release); 4337 err = percpu_ref_init(&css->refcnt, css_release);
3649 if (err) 4338 if (err)
3650 goto err_free_css; 4339 goto err_free_css;
3651 4340
3652 init_css(css, ss, cgrp); 4341 err = cgroup_idr_alloc(&ss->css_idr, NULL, 2, 0, GFP_NOWAIT);
4342 if (err < 0)
4343 goto err_free_percpu_ref;
4344 css->id = err;
3653 4345
3654 err = cgroup_populate_dir(cgrp, 1 << ss->id); 4346 err = cgroup_populate_dir(cgrp, 1 << ss->id);
3655 if (err) 4347 if (err)
3656 goto err_free_percpu_ref; 4348 goto err_free_id;
4349
4350 /* @css is ready to be brought online now, make it visible */
4351 list_add_tail_rcu(&css->sibling, &parent_css->children);
4352 cgroup_idr_replace(&ss->css_idr, css, css->id);
3657 4353
3658 err = online_css(css); 4354 err = online_css(css);
3659 if (err) 4355 if (err)
3660 goto err_clear_dir; 4356 goto err_list_del;
3661
3662 cgroup_get(cgrp);
3663 css_get(css->parent);
3664
3665 cgrp->subsys_mask |= 1 << ss->id;
3666 4357
3667 if (ss->broken_hierarchy && !ss->warned_broken_hierarchy && 4358 if (ss->broken_hierarchy && !ss->warned_broken_hierarchy &&
3668 parent->parent) { 4359 cgroup_parent(parent)) {
3669 pr_warning("cgroup: %s (%d) created nested cgroup for controller \"%s\" which has incomplete hierarchy support. Nested cgroups may change behavior in the future.\n", 4360 pr_warn("%s (%d) created nested cgroup for controller \"%s\" which has incomplete hierarchy support. Nested cgroups may change behavior in the future.\n",
3670 current->comm, current->pid, ss->name); 4361 current->comm, current->pid, ss->name);
3671 if (!strcmp(ss->name, "memory")) 4362 if (!strcmp(ss->name, "memory"))
3672 pr_warning("cgroup: \"memory\" requires setting use_hierarchy to 1 on the root.\n"); 4363 pr_warn("\"memory\" requires setting use_hierarchy to 1 on the root\n");
3673 ss->warned_broken_hierarchy = true; 4364 ss->warned_broken_hierarchy = true;
3674 } 4365 }
3675 4366
3676 return 0; 4367 return 0;
3677 4368
3678err_clear_dir: 4369err_list_del:
4370 list_del_rcu(&css->sibling);
3679 cgroup_clear_dir(css->cgroup, 1 << css->ss->id); 4371 cgroup_clear_dir(css->cgroup, 1 << css->ss->id);
4372err_free_id:
4373 cgroup_idr_remove(&ss->css_idr, css->id);
3680err_free_percpu_ref: 4374err_free_percpu_ref:
3681 percpu_ref_cancel_init(&css->refcnt); 4375 percpu_ref_cancel_init(&css->refcnt);
3682err_free_css: 4376err_free_css:
3683 ss->css_free(css); 4377 call_rcu(&css->rcu_head, css_free_rcu_fn);
3684 return err; 4378 return err;
3685} 4379}
3686 4380
3687/** 4381static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name,
3688 * cgroup_create - create a cgroup 4382 umode_t mode)
3689 * @parent: cgroup that will be parent of the new cgroup
3690 * @name: name of the new cgroup
3691 * @mode: mode to set on new cgroup
3692 */
3693static long cgroup_create(struct cgroup *parent, const char *name,
3694 umode_t mode)
3695{ 4383{
3696 struct cgroup *cgrp; 4384 struct cgroup *parent, *cgrp;
3697 struct cgroup_root *root = parent->root; 4385 struct cgroup_root *root;
3698 int ssid, err;
3699 struct cgroup_subsys *ss; 4386 struct cgroup_subsys *ss;
3700 struct kernfs_node *kn; 4387 struct kernfs_node *kn;
4388 int ssid, ret;
3701 4389
3702 /* 4390 parent = cgroup_kn_lock_live(parent_kn);
3703 * XXX: The default hierarchy isn't fully implemented yet. Block 4391 if (!parent)
3704 * !root cgroup creation on it for now. 4392 return -ENODEV;
3705 */ 4393 root = parent->root;
3706 if (root == &cgrp_dfl_root)
3707 return -EINVAL;
3708 4394
3709 /* allocate the cgroup and its ID, 0 is reserved for the root */ 4395 /* allocate the cgroup and its ID, 0 is reserved for the root */
3710 cgrp = kzalloc(sizeof(*cgrp), GFP_KERNEL); 4396 cgrp = kzalloc(sizeof(*cgrp), GFP_KERNEL);
3711 if (!cgrp) 4397 if (!cgrp) {
3712 return -ENOMEM; 4398 ret = -ENOMEM;
3713 4399 goto out_unlock;
3714 mutex_lock(&cgroup_tree_mutex);
3715
3716 /*
3717 * Only live parents can have children. Note that the liveliness
3718 * check isn't strictly necessary because cgroup_mkdir() and
3719 * cgroup_rmdir() are fully synchronized by i_mutex; however, do it
3720 * anyway so that locking is contained inside cgroup proper and we
3721 * don't get nasty surprises if we ever grow another caller.
3722 */
3723 if (!cgroup_lock_live_group(parent)) {
3724 err = -ENODEV;
3725 goto err_unlock_tree;
3726 } 4400 }
3727 4401
4402 ret = percpu_ref_init(&cgrp->self.refcnt, css_release);
4403 if (ret)
4404 goto out_free_cgrp;
4405
3728 /* 4406 /*
3729 * Temporarily set the pointer to NULL, so idr_find() won't return 4407 * Temporarily set the pointer to NULL, so idr_find() won't return
3730 * a half-baked cgroup. 4408 * a half-baked cgroup.
3731 */ 4409 */
3732 cgrp->id = idr_alloc(&root->cgroup_idr, NULL, 1, 0, GFP_KERNEL); 4410 cgrp->id = cgroup_idr_alloc(&root->cgroup_idr, NULL, 2, 0, GFP_NOWAIT);
3733 if (cgrp->id < 0) { 4411 if (cgrp->id < 0) {
3734 err = -ENOMEM; 4412 ret = -ENOMEM;
3735 goto err_unlock; 4413 goto out_cancel_ref;
3736 } 4414 }
3737 4415
3738 init_cgroup_housekeeping(cgrp); 4416 init_cgroup_housekeeping(cgrp);
3739 4417
3740 cgrp->parent = parent; 4418 cgrp->self.parent = &parent->self;
3741 cgrp->dummy_css.parent = &parent->dummy_css; 4419 cgrp->root = root;
3742 cgrp->root = parent->root;
3743 4420
3744 if (notify_on_release(parent)) 4421 if (notify_on_release(parent))
3745 set_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags); 4422 set_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);
@@ -3750,8 +4427,8 @@ static long cgroup_create(struct cgroup *parent, const char *name,
3750 /* create the directory */ 4427 /* create the directory */
3751 kn = kernfs_create_dir(parent->kn, name, mode, cgrp); 4428 kn = kernfs_create_dir(parent->kn, name, mode, cgrp);
3752 if (IS_ERR(kn)) { 4429 if (IS_ERR(kn)) {
3753 err = PTR_ERR(kn); 4430 ret = PTR_ERR(kn);
3754 goto err_free_id; 4431 goto out_free_id;
3755 } 4432 }
3756 cgrp->kn = kn; 4433 cgrp->kn = kn;
3757 4434
@@ -3761,10 +4438,10 @@ static long cgroup_create(struct cgroup *parent, const char *name,
3761 */ 4438 */
3762 kernfs_get(kn); 4439 kernfs_get(kn);
3763 4440
3764 cgrp->serial_nr = cgroup_serial_nr_next++; 4441 cgrp->self.serial_nr = css_serial_nr_next++;
3765 4442
3766 /* allocation complete, commit to creation */ 4443 /* allocation complete, commit to creation */
3767 list_add_tail_rcu(&cgrp->sibling, &cgrp->parent->children); 4444 list_add_tail_rcu(&cgrp->self.sibling, &cgroup_parent(cgrp)->self.children);
3768 atomic_inc(&root->nr_cgrps); 4445 atomic_inc(&root->nr_cgrps);
3769 cgroup_get(parent); 4446 cgroup_get(parent);
3770 4447
@@ -3772,107 +4449,66 @@ static long cgroup_create(struct cgroup *parent, const char *name,
3772 * @cgrp is now fully operational. If something fails after this 4449 * @cgrp is now fully operational. If something fails after this
3773 * point, it'll be released via the normal destruction path. 4450 * point, it'll be released via the normal destruction path.
3774 */ 4451 */
3775 idr_replace(&root->cgroup_idr, cgrp, cgrp->id); 4452 cgroup_idr_replace(&root->cgroup_idr, cgrp, cgrp->id);
3776 4453
3777 err = cgroup_kn_set_ugid(kn); 4454 ret = cgroup_kn_set_ugid(kn);
3778 if (err) 4455 if (ret)
3779 goto err_destroy; 4456 goto out_destroy;
3780 4457
3781 err = cgroup_addrm_files(cgrp, cgroup_base_files, true); 4458 ret = cgroup_addrm_files(cgrp, cgroup_base_files, true);
3782 if (err) 4459 if (ret)
3783 goto err_destroy; 4460 goto out_destroy;
3784 4461
3785 /* let's create and online css's */ 4462 /* let's create and online css's */
3786 for_each_subsys(ss, ssid) { 4463 for_each_subsys(ss, ssid) {
3787 if (root->cgrp.subsys_mask & (1 << ssid)) { 4464 if (parent->child_subsys_mask & (1 << ssid)) {
3788 err = create_css(cgrp, ss); 4465 ret = create_css(cgrp, ss);
3789 if (err) 4466 if (ret)
3790 goto err_destroy; 4467 goto out_destroy;
3791 } 4468 }
3792 } 4469 }
3793 4470
3794 kernfs_activate(kn); 4471 /*
4472 * On the default hierarchy, a child doesn't automatically inherit
4473 * child_subsys_mask from the parent. Each is configured manually.
4474 */
4475 if (!cgroup_on_dfl(cgrp))
4476 cgrp->child_subsys_mask = parent->child_subsys_mask;
3795 4477
3796 mutex_unlock(&cgroup_mutex); 4478 kernfs_activate(kn);
3797 mutex_unlock(&cgroup_tree_mutex);
3798 4479
3799 return 0; 4480 ret = 0;
4481 goto out_unlock;
3800 4482
3801err_free_id: 4483out_free_id:
3802 idr_remove(&root->cgroup_idr, cgrp->id); 4484 cgroup_idr_remove(&root->cgroup_idr, cgrp->id);
3803err_unlock: 4485out_cancel_ref:
3804 mutex_unlock(&cgroup_mutex); 4486 percpu_ref_cancel_init(&cgrp->self.refcnt);
3805err_unlock_tree: 4487out_free_cgrp:
3806 mutex_unlock(&cgroup_tree_mutex);
3807 kfree(cgrp); 4488 kfree(cgrp);
3808 return err; 4489out_unlock:
4490 cgroup_kn_unlock(parent_kn);
4491 return ret;
3809 4492
3810err_destroy: 4493out_destroy:
3811 cgroup_destroy_locked(cgrp); 4494 cgroup_destroy_locked(cgrp);
3812 mutex_unlock(&cgroup_mutex); 4495 goto out_unlock;
3813 mutex_unlock(&cgroup_tree_mutex);
3814 return err;
3815}
3816
3817static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name,
3818 umode_t mode)
3819{
3820 struct cgroup *parent = parent_kn->priv;
3821 int ret;
3822
3823 /*
3824 * cgroup_create() grabs cgroup_tree_mutex which nests outside
3825 * kernfs active_ref and cgroup_create() already synchronizes
3826 * properly against removal through cgroup_lock_live_group().
3827 * Break it before calling cgroup_create().
3828 */
3829 cgroup_get(parent);
3830 kernfs_break_active_protection(parent_kn);
3831
3832 ret = cgroup_create(parent, name, mode);
3833
3834 kernfs_unbreak_active_protection(parent_kn);
3835 cgroup_put(parent);
3836 return ret;
3837} 4496}
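
The rewritten cgroup_mkdir() uses the usual goto-unwind shape: allocate in order and, on failure, jump to the label that releases only what already succeeded, with a single out_unlock exit. The same shape in a self-contained userspace function, with invented resources:

    /* Illustrative only; the resources here are just malloc()s. */
    #include <stdio.h>
    #include <stdlib.h>

    struct thing { char *name; char *buf; };

    static struct thing *make_thing(void)
    {
            struct thing *t;

            t = malloc(sizeof(*t));
            if (!t)
                    goto out;

            t->name = malloc(32);
            if (!t->name)
                    goto out_free_thing;

            t->buf = malloc(256);
            if (!t->buf)
                    goto out_free_name;

            return t;                       /* success: keep everything */

    out_free_name:
            free(t->name);
    out_free_thing:
            free(t);
    out:
            return NULL;
    }

    int main(void)
    {
            struct thing *t = make_thing();

            printf("make_thing() %s\n", t ? "succeeded" : "failed");
            if (t) {
                    free(t->buf);
                    free(t->name);
                    free(t);
            }
            return 0;
    }
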
3838 4497
3839/* 4498/*
3840 * This is called when the refcnt of a css is confirmed to be killed. 4499 * This is called when the refcnt of a css is confirmed to be killed.
3841 * css_tryget() is now guaranteed to fail. 4500 * css_tryget_online() is now guaranteed to fail. Tell the subsystem to
4501 * initiate destruction and put the css ref from kill_css().
3842 */ 4502 */
3843static void css_killed_work_fn(struct work_struct *work) 4503static void css_killed_work_fn(struct work_struct *work)
3844{ 4504{
3845 struct cgroup_subsys_state *css = 4505 struct cgroup_subsys_state *css =
3846 container_of(work, struct cgroup_subsys_state, destroy_work); 4506 container_of(work, struct cgroup_subsys_state, destroy_work);
3847 struct cgroup *cgrp = css->cgroup;
3848 4507
3849 mutex_lock(&cgroup_tree_mutex);
3850 mutex_lock(&cgroup_mutex); 4508 mutex_lock(&cgroup_mutex);
3851
3852 /*
3853 * css_tryget() is guaranteed to fail now. Tell subsystems to
3854 * initate destruction.
3855 */
3856 offline_css(css); 4509 offline_css(css);
3857
3858 /*
3859 * If @cgrp is marked dead, it's waiting for refs of all css's to
3860 * be disabled before proceeding to the second phase of cgroup
3861 * destruction. If we are the last one, kick it off.
3862 */
3863 if (!cgrp->nr_css && cgroup_is_dead(cgrp))
3864 cgroup_destroy_css_killed(cgrp);
3865
3866 mutex_unlock(&cgroup_mutex); 4510 mutex_unlock(&cgroup_mutex);
3867 mutex_unlock(&cgroup_tree_mutex);
3868 4511
3869 /*
3870 * Put the css refs from kill_css(). Each css holds an extra
3871 * reference to the cgroup's dentry and cgroup removal proceeds
3872 * regardless of css refs. On the last put of each css, whenever
3873 * that may be, the extra dentry ref is put so that dentry
3874 * destruction happens only after all css's are released.
3875 */
3876 css_put(css); 4512 css_put(css);
3877} 4513}
3878 4514
@@ -3886,9 +4522,18 @@ static void css_killed_ref_fn(struct percpu_ref *ref)
3886 queue_work(cgroup_destroy_wq, &css->destroy_work); 4522 queue_work(cgroup_destroy_wq, &css->destroy_work);
3887} 4523}
3888 4524
3889static void __kill_css(struct cgroup_subsys_state *css) 4525/**
4526 * kill_css - destroy a css
4527 * @css: css to destroy
4528 *
4529 * This function initiates destruction of @css by removing cgroup interface
4530 * files and putting its base reference. ->css_offline() will be invoked
4531 * asynchronously once css_tryget_online() is guaranteed to fail and when
4532 * the reference count reaches zero, @css will be released.
4533 */
4534static void kill_css(struct cgroup_subsys_state *css)
3890{ 4535{
3891 lockdep_assert_held(&cgroup_tree_mutex); 4536 lockdep_assert_held(&cgroup_mutex);
3892 4537
3893 /* 4538 /*
3894 * This must happen before css is disassociated with its cgroup. 4539 * This must happen before css is disassociated with its cgroup.
@@ -3905,7 +4550,7 @@ static void __kill_css(struct cgroup_subsys_state *css)
3905 /* 4550 /*
3906 * cgroup core guarantees that, by the time ->css_offline() is 4551 * cgroup core guarantees that, by the time ->css_offline() is
3907 * invoked, no new css reference will be given out via 4552 * invoked, no new css reference will be given out via
3908 * css_tryget(). We can't simply call percpu_ref_kill() and 4553 * css_tryget_online(). We can't simply call percpu_ref_kill() and
3909 * proceed to offlining css's because percpu_ref_kill() doesn't 4554 * proceed to offlining css's because percpu_ref_kill() doesn't
3910 * guarantee that the ref is seen as killed on all CPUs on return. 4555 * guarantee that the ref is seen as killed on all CPUs on return.
3911 * 4556 *
@@ -3916,36 +4561,14 @@ static void __kill_css(struct cgroup_subsys_state *css)
3916} 4561}
3917 4562
3918/** 4563/**
3919 * kill_css - destroy a css
3920 * @css: css to destroy
3921 *
3922 * This function initiates destruction of @css by removing cgroup interface
3923 * files and putting its base reference. ->css_offline() will be invoked
3924 * asynchronously once css_tryget() is guaranteed to fail and when the
3925 * reference count reaches zero, @css will be released.
3926 */
3927static void kill_css(struct cgroup_subsys_state *css)
3928{
3929 struct cgroup *cgrp = css->cgroup;
3930
3931 lockdep_assert_held(&cgroup_tree_mutex);
3932
3933 /* if already killed, noop */
3934 if (cgrp->subsys_mask & (1 << css->ss->id)) {
3935 cgrp->subsys_mask &= ~(1 << css->ss->id);
3936 __kill_css(css);
3937 }
3938}
3939
3940/**
3941 * cgroup_destroy_locked - the first stage of cgroup destruction 4564 * cgroup_destroy_locked - the first stage of cgroup destruction
3942 * @cgrp: cgroup to be destroyed 4565 * @cgrp: cgroup to be destroyed
3943 * 4566 *
3944 * css's make use of percpu refcnts whose killing latency shouldn't be 4567 * css's make use of percpu refcnts whose killing latency shouldn't be
3945 * exposed to userland and are RCU protected. Also, cgroup core needs to 4568 * exposed to userland and are RCU protected. Also, cgroup core needs to
3946 * guarantee that css_tryget() won't succeed by the time ->css_offline() is 4569 * guarantee that css_tryget_online() won't succeed by the time
3947 * invoked. To satisfy all the requirements, destruction is implemented in 4570 * ->css_offline() is invoked. To satisfy all the requirements,
3948 * the following two steps. 4571 * destruction is implemented in the following two steps.
3949 * 4572 *
3950 * s1. Verify @cgrp can be destroyed and mark it dying. Remove all 4573 * s1. Verify @cgrp can be destroyed and mark it dying. Remove all
3951 * userland visible parts and start killing the percpu refcnts of 4574 * userland visible parts and start killing the percpu refcnts of
@@ -3964,12 +4587,10 @@ static void kill_css(struct cgroup_subsys_state *css)
3964static int cgroup_destroy_locked(struct cgroup *cgrp) 4587static int cgroup_destroy_locked(struct cgroup *cgrp)
3965 __releases(&cgroup_mutex) __acquires(&cgroup_mutex) 4588 __releases(&cgroup_mutex) __acquires(&cgroup_mutex)
3966{ 4589{
3967 struct cgroup *child;
3968 struct cgroup_subsys_state *css; 4590 struct cgroup_subsys_state *css;
3969 bool empty; 4591 bool empty;
3970 int ssid; 4592 int ssid;
3971 4593
3972 lockdep_assert_held(&cgroup_tree_mutex);
3973 lockdep_assert_held(&cgroup_mutex); 4594 lockdep_assert_held(&cgroup_mutex);
3974 4595
3975 /* 4596 /*
@@ -3983,127 +4604,68 @@ static int cgroup_destroy_locked(struct cgroup *cgrp)
3983 return -EBUSY; 4604 return -EBUSY;
3984 4605
3985 /* 4606 /*
3986 * Make sure there's no live children. We can't test ->children 4607 * Make sure there's no live children. We can't test emptiness of
3987 * emptiness as dead children linger on it while being destroyed; 4608 * ->self.children as dead children linger on it while being
3988 * otherwise, "rmdir parent/child parent" may fail with -EBUSY. 4609 * drained; otherwise, "rmdir parent/child parent" may fail.
3989 */ 4610 */
3990 empty = true; 4611 if (css_has_online_children(&cgrp->self))
3991 rcu_read_lock();
3992 list_for_each_entry_rcu(child, &cgrp->children, sibling) {
3993 empty = cgroup_is_dead(child);
3994 if (!empty)
3995 break;
3996 }
3997 rcu_read_unlock();
3998 if (!empty)
3999 return -EBUSY; 4612 return -EBUSY;
4000 4613
4001 /* 4614 /*
4002 * Mark @cgrp dead. This prevents further task migration and child 4615 * Mark @cgrp dead. This prevents further task migration and child
4003 * creation by disabling cgroup_lock_live_group(). Note that 4616 * creation by disabling cgroup_lock_live_group().
4004 * CGRP_DEAD assertion is depended upon by css_next_child() to
4005 * resume iteration after dropping RCU read lock. See
4006 * css_next_child() for details.
4007 */ 4617 */
4008 set_bit(CGRP_DEAD, &cgrp->flags); 4618 cgrp->self.flags &= ~CSS_ONLINE;
4009 4619
4010 /* 4620 /* initiate massacre of all css's */
4011 * Initiate massacre of all css's. cgroup_destroy_css_killed()
4012 * will be invoked to perform the rest of destruction once the
4013 * percpu refs of all css's are confirmed to be killed. This
4014 * involves removing the subsystem's files, drop cgroup_mutex.
4015 */
4016 mutex_unlock(&cgroup_mutex);
4017 for_each_css(css, ssid, cgrp) 4621 for_each_css(css, ssid, cgrp)
4018 kill_css(css); 4622 kill_css(css);
4019 mutex_lock(&cgroup_mutex);
4020 4623
4021 /* CGRP_DEAD is set, remove from ->release_list for the last time */ 4624 /* CSS_ONLINE is clear, remove from ->release_list for the last time */
4022 raw_spin_lock(&release_list_lock); 4625 raw_spin_lock(&release_list_lock);
4023 if (!list_empty(&cgrp->release_list)) 4626 if (!list_empty(&cgrp->release_list))
4024 list_del_init(&cgrp->release_list); 4627 list_del_init(&cgrp->release_list);
4025 raw_spin_unlock(&release_list_lock); 4628 raw_spin_unlock(&release_list_lock);
4026 4629
4027 /* 4630 /*
4028 * If @cgrp has css's attached, the second stage of cgroup 4631 * Remove @cgrp directory along with the base files. @cgrp has an
4029 * destruction is kicked off from css_killed_work_fn() after the 4632 * extra ref on its kn.
4030 * refs of all attached css's are killed. If @cgrp doesn't have
4031 * any css, we kick it off here.
4032 */ 4633 */
4033 if (!cgrp->nr_css) 4634 kernfs_remove(cgrp->kn);
4034 cgroup_destroy_css_killed(cgrp);
4035 4635
4036 /* remove @cgrp directory along with the base files */ 4636 set_bit(CGRP_RELEASABLE, &cgroup_parent(cgrp)->flags);
4037 mutex_unlock(&cgroup_mutex); 4637 check_for_release(cgroup_parent(cgrp));
4038
4039 /*
4040 * There are two control paths which try to determine cgroup from
4041 * dentry without going through kernfs - cgroupstats_build() and
4042 * css_tryget_from_dir(). Those are supported by RCU protecting
4043 * clearing of cgrp->kn->priv backpointer, which should happen
4044 * after all files under it have been removed.
4045 */
4046 kernfs_remove(cgrp->kn); /* @cgrp has an extra ref on its kn */
4047 RCU_INIT_POINTER(*(void __rcu __force **)&cgrp->kn->priv, NULL);
4048 4638
4049 mutex_lock(&cgroup_mutex); 4639 /* put the base reference */
4640 percpu_ref_kill(&cgrp->self.refcnt);
4050 4641
4051 return 0; 4642 return 0;
4052}; 4643};
4053 4644
4054/**
4055 * cgroup_destroy_css_killed - the second step of cgroup destruction
4056 * @work: cgroup->destroy_free_work
4057 *
4058 * This function is invoked from a work item for a cgroup which is being
4059 * destroyed after all css's are offlined and performs the rest of
4060 * destruction. This is the second step of destruction described in the
4061 * comment above cgroup_destroy_locked().
4062 */
4063static void cgroup_destroy_css_killed(struct cgroup *cgrp)
4064{
4065 struct cgroup *parent = cgrp->parent;
4066
4067 lockdep_assert_held(&cgroup_tree_mutex);
4068 lockdep_assert_held(&cgroup_mutex);
4069
4070 /* delete this cgroup from parent->children */
4071 list_del_rcu(&cgrp->sibling);
4072
4073 cgroup_put(cgrp);
4074
4075 set_bit(CGRP_RELEASABLE, &parent->flags);
4076 check_for_release(parent);
4077}
4078
4079static int cgroup_rmdir(struct kernfs_node *kn) 4645static int cgroup_rmdir(struct kernfs_node *kn)
4080{ 4646{
4081 struct cgroup *cgrp = kn->priv; 4647 struct cgroup *cgrp;
4082 int ret = 0; 4648 int ret = 0;
4083 4649
4084 /* 4650 cgrp = cgroup_kn_lock_live(kn);
4085 * This is self-destruction but @kn can't be removed while this 4651 if (!cgrp)
4086 * callback is in progress. Let's break active protection. Once 4652 return 0;
4087 * the protection is broken, @cgrp can be destroyed at any point. 4653 cgroup_get(cgrp); /* for @kn->priv clearing */
4088 * Pin it so that it stays accessible.
4089 */
4090 cgroup_get(cgrp);
4091 kernfs_break_active_protection(kn);
4092 4654
4093 mutex_lock(&cgroup_tree_mutex); 4655 ret = cgroup_destroy_locked(cgrp);
4094 mutex_lock(&cgroup_mutex); 4656
4657 cgroup_kn_unlock(kn);
4095 4658
4096 /* 4659 /*
4097 * @cgrp might already have been destroyed while we're trying to 4660 * There are two control paths which try to determine cgroup from
4098 * grab the mutexes. 4661 * dentry without going through kernfs - cgroupstats_build() and
4662 * css_tryget_online_from_dir(). Those are supported by RCU
4663 * protecting clearing of cgrp->kn->priv backpointer, which should
4664 * happen after all files under it have been removed.
4099 */ 4665 */
4100 if (!cgroup_is_dead(cgrp)) 4666 if (!ret)
4101 ret = cgroup_destroy_locked(cgrp); 4667 RCU_INIT_POINTER(*(void __rcu __force **)&kn->priv, NULL);
4102
4103 mutex_unlock(&cgroup_mutex);
4104 mutex_unlock(&cgroup_tree_mutex);
4105 4668
4106 kernfs_unbreak_active_protection(kn);
4107 cgroup_put(cgrp); 4669 cgroup_put(cgrp);
4108 return ret; 4670 return ret;
4109} 4671}
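
cgroup_rmdir() now clears kn->priv only after a successful destroy, so the dentry-based lookups mentioned in the comment see either a live cgroup or NULL. A minimal model of that cleared-backpointer idea; a C11 atomic pointer and a NULL check stand in for rcu_assign_pointer()/rcu_dereference():

    /* Illustrative only; no RCU, just an atomic pointer. */
    #include <stdatomic.h>
    #include <stdio.h>

    struct dir_node {
            _Atomic(void *) priv;           /* back-pointer to the owner */
    };

    struct owner {
            const char *name;
    };

    static void lookup(struct dir_node *kn)
    {
            struct owner *o = atomic_load(&kn->priv);

            if (o)
                    printf("found %s\n", o->name);
            else
                    printf("already removed\n");
    }

    int main(void)
    {
            struct owner o = { "child" };
            struct dir_node kn;

            atomic_init(&kn.priv, &o);
            lookup(&kn);                    /* before rmdir */

            atomic_store(&kn.priv, NULL);   /* rmdir clears the backpointer */
            lookup(&kn);                    /* late lookups now fail cleanly */
            return 0;
    }
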
@@ -4116,15 +4678,15 @@ static struct kernfs_syscall_ops cgroup_kf_syscall_ops = {
4116 .rename = cgroup_rename, 4678 .rename = cgroup_rename,
4117}; 4679};
4118 4680
4119static void __init cgroup_init_subsys(struct cgroup_subsys *ss) 4681static void __init cgroup_init_subsys(struct cgroup_subsys *ss, bool early)
4120{ 4682{
4121 struct cgroup_subsys_state *css; 4683 struct cgroup_subsys_state *css;
4122 4684
4123 printk(KERN_INFO "Initializing cgroup subsys %s\n", ss->name); 4685 printk(KERN_INFO "Initializing cgroup subsys %s\n", ss->name);
4124 4686
4125 mutex_lock(&cgroup_tree_mutex);
4126 mutex_lock(&cgroup_mutex); 4687 mutex_lock(&cgroup_mutex);
4127 4688
4689 idr_init(&ss->css_idr);
4128 INIT_LIST_HEAD(&ss->cfts); 4690 INIT_LIST_HEAD(&ss->cfts);
4129 4691
4130 /* Create the root cgroup state for this subsystem */ 4692 /* Create the root cgroup state for this subsystem */
@@ -4132,7 +4694,21 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss)
4132 css = ss->css_alloc(cgroup_css(&cgrp_dfl_root.cgrp, ss)); 4694 css = ss->css_alloc(cgroup_css(&cgrp_dfl_root.cgrp, ss));
4133 /* We don't handle early failures gracefully */ 4695 /* We don't handle early failures gracefully */
4134 BUG_ON(IS_ERR(css)); 4696 BUG_ON(IS_ERR(css));
4135 init_css(css, ss, &cgrp_dfl_root.cgrp); 4697 init_and_link_css(css, ss, &cgrp_dfl_root.cgrp);
4698
4699 /*
4700 * Root csses are never destroyed and we can't initialize
4701 * percpu_ref during early init. Disable refcnting.
4702 */
4703 css->flags |= CSS_NO_REF;
4704
4705 if (early) {
4706 /* allocation can't be done safely during early init */
4707 css->id = 1;
4708 } else {
4709 css->id = cgroup_idr_alloc(&ss->css_idr, css, 1, 2, GFP_KERNEL);
4710 BUG_ON(css->id < 0);
4711 }
4136 4712
4137 /* Update the init_css_set to contain a subsys 4713 /* Update the init_css_set to contain a subsys
4138 * pointer to this state - since the subsystem is 4714 * pointer to this state - since the subsystem is
@@ -4149,10 +4725,7 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss)
4149 4725
4150 BUG_ON(online_css(css)); 4726 BUG_ON(online_css(css));
4151 4727
4152 cgrp_dfl_root.cgrp.subsys_mask |= 1 << ss->id;
4153
4154 mutex_unlock(&cgroup_mutex); 4728 mutex_unlock(&cgroup_mutex);
4155 mutex_unlock(&cgroup_tree_mutex);
4156} 4729}
4157 4730
4158/** 4731/**
@@ -4169,6 +4742,8 @@ int __init cgroup_init_early(void)
4169 int i; 4742 int i;
4170 4743
4171 init_cgroup_root(&cgrp_dfl_root, &opts); 4744 init_cgroup_root(&cgrp_dfl_root, &opts);
4745 cgrp_dfl_root.cgrp.self.flags |= CSS_NO_REF;
4746
4172 RCU_INIT_POINTER(init_task.cgroups, &init_css_set); 4747 RCU_INIT_POINTER(init_task.cgroups, &init_css_set);
4173 4748
4174 for_each_subsys(ss, i) { 4749 for_each_subsys(ss, i) {
@@ -4183,7 +4758,7 @@ int __init cgroup_init_early(void)
4183 ss->name = cgroup_subsys_name[i]; 4758 ss->name = cgroup_subsys_name[i];
4184 4759
4185 if (ss->early_init) 4760 if (ss->early_init)
4186 cgroup_init_subsys(ss); 4761 cgroup_init_subsys(ss, true);
4187 } 4762 }
4188 return 0; 4763 return 0;
4189} 4764}
@@ -4202,7 +4777,6 @@ int __init cgroup_init(void)
4202 4777
4203 BUG_ON(cgroup_init_cftypes(NULL, cgroup_base_files)); 4778 BUG_ON(cgroup_init_cftypes(NULL, cgroup_base_files));
4204 4779
4205 mutex_lock(&cgroup_tree_mutex);
4206 mutex_lock(&cgroup_mutex); 4780 mutex_lock(&cgroup_mutex);
4207 4781
4208 /* Add init_css_set to the hash table */ 4782 /* Add init_css_set to the hash table */
@@ -4212,18 +4786,31 @@ int __init cgroup_init(void)
4212 BUG_ON(cgroup_setup_root(&cgrp_dfl_root, 0)); 4786 BUG_ON(cgroup_setup_root(&cgrp_dfl_root, 0));
4213 4787
4214 mutex_unlock(&cgroup_mutex); 4788 mutex_unlock(&cgroup_mutex);
4215 mutex_unlock(&cgroup_tree_mutex);
4216 4789
4217 for_each_subsys(ss, ssid) { 4790 for_each_subsys(ss, ssid) {
4218 if (!ss->early_init) 4791 if (ss->early_init) {
4219 cgroup_init_subsys(ss); 4792 struct cgroup_subsys_state *css =
4793 init_css_set.subsys[ss->id];
4794
4795 css->id = cgroup_idr_alloc(&ss->css_idr, css, 1, 2,
4796 GFP_KERNEL);
4797 BUG_ON(css->id < 0);
4798 } else {
4799 cgroup_init_subsys(ss, false);
4800 }
4801
4802 list_add_tail(&init_css_set.e_cset_node[ssid],
4803 &cgrp_dfl_root.cgrp.e_csets[ssid]);
4220 4804
4221 /* 4805 /*
4222 * cftype registration needs kmalloc and can't be done 4806 * Setting dfl_root subsys_mask needs to consider the
4223 * during early_init. Register base cftypes separately. 4807 * disabled flag and cftype registration needs kmalloc,
4808 * both of which aren't available during early_init.
4224 */ 4809 */
4225 if (ss->base_cftypes) 4810 if (!ss->disabled) {
4811 cgrp_dfl_root.subsys_mask |= 1 << ss->id;
4226 WARN_ON(cgroup_add_cftypes(ss, ss->base_cftypes)); 4812 WARN_ON(cgroup_add_cftypes(ss, ss->base_cftypes));
4813 }
4227 } 4814 }
4228 4815
4229 cgroup_kobj = kobject_create_and_add("cgroup", fs_kobj); 4816 cgroup_kobj = kobject_create_and_add("cgroup", fs_kobj);
@@ -4306,7 +4893,7 @@ int proc_cgroup_show(struct seq_file *m, void *v)
4306 4893
4307 seq_printf(m, "%d:", root->hierarchy_id); 4894 seq_printf(m, "%d:", root->hierarchy_id);
4308 for_each_subsys(ss, ssid) 4895 for_each_subsys(ss, ssid)
4309 if (root->cgrp.subsys_mask & (1 << ssid)) 4896 if (root->subsys_mask & (1 << ssid))
4310 seq_printf(m, "%s%s", count++ ? "," : "", ss->name); 4897 seq_printf(m, "%s%s", count++ ? "," : "", ss->name);
4311 if (strlen(root->name)) 4898 if (strlen(root->name))
4312 seq_printf(m, "%sname=%s", count ? "," : "", 4899 seq_printf(m, "%sname=%s", count ? "," : "",
@@ -4501,8 +5088,8 @@ void cgroup_exit(struct task_struct *tsk)
4501 5088
4502static void check_for_release(struct cgroup *cgrp) 5089static void check_for_release(struct cgroup *cgrp)
4503{ 5090{
4504 if (cgroup_is_releasable(cgrp) && 5091 if (cgroup_is_releasable(cgrp) && list_empty(&cgrp->cset_links) &&
4505 list_empty(&cgrp->cset_links) && list_empty(&cgrp->children)) { 5092 !css_has_online_children(&cgrp->self)) {
4506 /* 5093 /*
4507 * Control Group is currently removable. If it's not 5094
4508 * already queued for a userspace notification, queue 5095 * already queued for a userspace notification, queue
@@ -4619,7 +5206,7 @@ static int __init cgroup_disable(char *str)
4619__setup("cgroup_disable=", cgroup_disable); 5206__setup("cgroup_disable=", cgroup_disable);
4620 5207
4621/** 5208/**
4622 * css_tryget_from_dir - get corresponding css from the dentry of a cgroup dir 5209 * css_tryget_online_from_dir - get corresponding css from a cgroup dentry
4623 * @dentry: directory dentry of interest 5210 * @dentry: directory dentry of interest
4624 * @ss: subsystem of interest 5211 * @ss: subsystem of interest
4625 * 5212 *
@@ -4627,8 +5214,8 @@ __setup("cgroup_disable=", cgroup_disable);
4627 * to get the corresponding css and return it. If such css doesn't exist 5214 * to get the corresponding css and return it. If such css doesn't exist
4628 * or can't be pinned, an ERR_PTR value is returned. 5215 * or can't be pinned, an ERR_PTR value is returned.
4629 */ 5216 */
4630struct cgroup_subsys_state *css_tryget_from_dir(struct dentry *dentry, 5217struct cgroup_subsys_state *css_tryget_online_from_dir(struct dentry *dentry,
4631 struct cgroup_subsys *ss) 5218 struct cgroup_subsys *ss)
4632{ 5219{
4633 struct kernfs_node *kn = kernfs_node_from_dentry(dentry); 5220 struct kernfs_node *kn = kernfs_node_from_dentry(dentry);
4634 struct cgroup_subsys_state *css = NULL; 5221 struct cgroup_subsys_state *css = NULL;
@@ -4644,13 +5231,13 @@ struct cgroup_subsys_state *css_tryget_from_dir(struct dentry *dentry,
4644 /* 5231 /*
4645 * This path doesn't originate from kernfs and @kn could already 5232 * This path doesn't originate from kernfs and @kn could already
4646 * have been or be removed at any point. @kn->priv is RCU 5233 * have been or be removed at any point. @kn->priv is RCU
4647 * protected for this access. See destroy_locked() for details. 5234 * protected for this access. See cgroup_rmdir() for details.
4648 */ 5235 */
4649 cgrp = rcu_dereference(kn->priv); 5236 cgrp = rcu_dereference(kn->priv);
4650 if (cgrp) 5237 if (cgrp)
4651 css = cgroup_css(cgrp, ss); 5238 css = cgroup_css(cgrp, ss);
4652 5239
4653 if (!css || !css_tryget(css)) 5240 if (!css || !css_tryget_online(css))
4654 css = ERR_PTR(-ENOENT); 5241 css = ERR_PTR(-ENOENT);
4655 5242
4656 rcu_read_unlock(); 5243 rcu_read_unlock();
@@ -4667,14 +5254,8 @@ struct cgroup_subsys_state *css_tryget_from_dir(struct dentry *dentry,
4667 */ 5254 */
4668struct cgroup_subsys_state *css_from_id(int id, struct cgroup_subsys *ss) 5255struct cgroup_subsys_state *css_from_id(int id, struct cgroup_subsys *ss)
4669{ 5256{
4670 struct cgroup *cgrp; 5257 WARN_ON_ONCE(!rcu_read_lock_held());
4671 5258 return idr_find(&ss->css_idr, id);
4672 cgroup_assert_mutexes_or_rcu_locked();
4673
4674 cgrp = idr_find(&ss->root->cgroup_idr, id);
4675 if (cgrp)
4676 return cgroup_css(cgrp, ss);
4677 return NULL;
4678} 5259}
4679 5260
4680#ifdef CONFIG_CGROUP_DEBUG 5261#ifdef CONFIG_CGROUP_DEBUG
diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c
index 2bc4a2256444..a79e40f9d700 100644
--- a/kernel/cgroup_freezer.c
+++ b/kernel/cgroup_freezer.c
@@ -21,6 +21,7 @@
21#include <linux/uaccess.h> 21#include <linux/uaccess.h>
22#include <linux/freezer.h> 22#include <linux/freezer.h>
23#include <linux/seq_file.h> 23#include <linux/seq_file.h>
24#include <linux/mutex.h>
24 25
25/* 26/*
26 * A cgroup is freezing if any FREEZING flags are set. FREEZING_SELF is 27 * A cgroup is freezing if any FREEZING flags are set. FREEZING_SELF is
@@ -42,9 +43,10 @@ enum freezer_state_flags {
42struct freezer { 43struct freezer {
43 struct cgroup_subsys_state css; 44 struct cgroup_subsys_state css;
44 unsigned int state; 45 unsigned int state;
45 spinlock_t lock;
46}; 46};
47 47
48static DEFINE_MUTEX(freezer_mutex);
49
48static inline struct freezer *css_freezer(struct cgroup_subsys_state *css) 50static inline struct freezer *css_freezer(struct cgroup_subsys_state *css)
49{ 51{
50 return css ? container_of(css, struct freezer, css) : NULL; 52 return css ? container_of(css, struct freezer, css) : NULL;
@@ -57,7 +59,7 @@ static inline struct freezer *task_freezer(struct task_struct *task)
57 59
58static struct freezer *parent_freezer(struct freezer *freezer) 60static struct freezer *parent_freezer(struct freezer *freezer)
59{ 61{
60 return css_freezer(css_parent(&freezer->css)); 62 return css_freezer(freezer->css.parent);
61} 63}
62 64
63bool cgroup_freezing(struct task_struct *task) 65bool cgroup_freezing(struct task_struct *task)
@@ -71,10 +73,6 @@ bool cgroup_freezing(struct task_struct *task)
71 return ret; 73 return ret;
72} 74}
73 75
74/*
75 * cgroups_write_string() limits the size of freezer state strings to
76 * CGROUP_LOCAL_BUFFER_SIZE
77 */
78static const char *freezer_state_strs(unsigned int state) 76static const char *freezer_state_strs(unsigned int state)
79{ 77{
80 if (state & CGROUP_FROZEN) 78 if (state & CGROUP_FROZEN)
@@ -93,7 +91,6 @@ freezer_css_alloc(struct cgroup_subsys_state *parent_css)
93 if (!freezer) 91 if (!freezer)
94 return ERR_PTR(-ENOMEM); 92 return ERR_PTR(-ENOMEM);
95 93
96 spin_lock_init(&freezer->lock);
97 return &freezer->css; 94 return &freezer->css;
98} 95}
99 96
@@ -110,14 +107,7 @@ static int freezer_css_online(struct cgroup_subsys_state *css)
110 struct freezer *freezer = css_freezer(css); 107 struct freezer *freezer = css_freezer(css);
111 struct freezer *parent = parent_freezer(freezer); 108 struct freezer *parent = parent_freezer(freezer);
112 109
113 /* 110 mutex_lock(&freezer_mutex);
114 * The following double locking and freezing state inheritance
115 * guarantee that @cgroup can never escape ancestors' freezing
116 * states. See css_for_each_descendant_pre() for details.
117 */
118 if (parent)
119 spin_lock_irq(&parent->lock);
120 spin_lock_nested(&freezer->lock, SINGLE_DEPTH_NESTING);
121 111
122 freezer->state |= CGROUP_FREEZER_ONLINE; 112 freezer->state |= CGROUP_FREEZER_ONLINE;
123 113
@@ -126,10 +116,7 @@ static int freezer_css_online(struct cgroup_subsys_state *css)
126 atomic_inc(&system_freezing_cnt); 116 atomic_inc(&system_freezing_cnt);
127 } 117 }
128 118
129 spin_unlock(&freezer->lock); 119 mutex_unlock(&freezer_mutex);
130 if (parent)
131 spin_unlock_irq(&parent->lock);
132
133 return 0; 120 return 0;
134} 121}
135 122
@@ -144,14 +131,14 @@ static void freezer_css_offline(struct cgroup_subsys_state *css)
144{ 131{
145 struct freezer *freezer = css_freezer(css); 132 struct freezer *freezer = css_freezer(css);
146 133
147 spin_lock_irq(&freezer->lock); 134 mutex_lock(&freezer_mutex);
148 135
149 if (freezer->state & CGROUP_FREEZING) 136 if (freezer->state & CGROUP_FREEZING)
150 atomic_dec(&system_freezing_cnt); 137 atomic_dec(&system_freezing_cnt);
151 138
152 freezer->state = 0; 139 freezer->state = 0;
153 140
154 spin_unlock_irq(&freezer->lock); 141 mutex_unlock(&freezer_mutex);
155} 142}
156 143
157static void freezer_css_free(struct cgroup_subsys_state *css) 144static void freezer_css_free(struct cgroup_subsys_state *css)
@@ -175,7 +162,7 @@ static void freezer_attach(struct cgroup_subsys_state *new_css,
175 struct task_struct *task; 162 struct task_struct *task;
176 bool clear_frozen = false; 163 bool clear_frozen = false;
177 164
178 spin_lock_irq(&freezer->lock); 165 mutex_lock(&freezer_mutex);
179 166
180 /* 167 /*
181 * Make the new tasks conform to the current state of @new_css. 168 * Make the new tasks conform to the current state of @new_css.
@@ -197,21 +184,13 @@ static void freezer_attach(struct cgroup_subsys_state *new_css,
197 } 184 }
198 } 185 }
199 186
200 spin_unlock_irq(&freezer->lock); 187 /* propagate FROZEN clearing upwards */
201
202 /*
203 * Propagate FROZEN clearing upwards. We may race with
204 * update_if_frozen(), but as long as both work bottom-up, either
205 * update_if_frozen() sees child's FROZEN cleared or we clear the
206 * parent's FROZEN later. No parent w/ !FROZEN children can be
207 * left FROZEN.
208 */
209 while (clear_frozen && (freezer = parent_freezer(freezer))) { 188 while (clear_frozen && (freezer = parent_freezer(freezer))) {
210 spin_lock_irq(&freezer->lock);
211 freezer->state &= ~CGROUP_FROZEN; 189 freezer->state &= ~CGROUP_FROZEN;
212 clear_frozen = freezer->state & CGROUP_FREEZING; 190 clear_frozen = freezer->state & CGROUP_FREEZING;
213 spin_unlock_irq(&freezer->lock);
214 } 191 }
192
193 mutex_unlock(&freezer_mutex);
215} 194}
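
With the per-freezer spinlocks gone, freezer_attach() can propagate the FROZEN clearing up the ancestors in one pass under freezer_mutex. A pthread sketch of that single-lock upward walk — the FREEZING/FROZEN bits below are simplified stand-ins, compile with -pthread:

    /* Illustrative only; not the kernel freezer state machine. */
    #include <pthread.h>
    #include <stdio.h>

    #define FREEZING 0x1
    #define FROZEN   0x2

    struct freezer {
            unsigned int state;
            struct freezer *parent;
    };

    static pthread_mutex_t freezer_mutex = PTHREAD_MUTEX_INITIALIZER;

    /* A thawed task arrived in @f: clear FROZEN here and keep walking
     * up while the ancestor was itself still freezing. */
    static void clear_frozen_upwards(struct freezer *f)
    {
            pthread_mutex_lock(&freezer_mutex);
            for (; f; f = f->parent) {
                    f->state &= ~FROZEN;
                    if (!(f->state & FREEZING))
                            break;
            }
            pthread_mutex_unlock(&freezer_mutex);
    }

    int main(void)
    {
            struct freezer root = { FREEZING | FROZEN, NULL };
            struct freezer child = { FREEZING | FROZEN, &root };

            clear_frozen_upwards(&child);
            printf("child: %#x root: %#x\n", child.state, root.state);
            return 0;
    }
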
216 195
217/** 196/**
@@ -228,9 +207,6 @@ static void freezer_fork(struct task_struct *task)
228{ 207{
229 struct freezer *freezer; 208 struct freezer *freezer;
230 209
231 rcu_read_lock();
232 freezer = task_freezer(task);
233
234 /* 210 /*
235 * The root cgroup is non-freezable, so we can skip locking the 211 * The root cgroup is non-freezable, so we can skip locking the
236 * freezer. This is safe regardless of race with task migration. 212 * freezer. This is safe regardless of race with task migration.
@@ -238,24 +214,18 @@ static void freezer_fork(struct task_struct *task)
238 * to do. If we lost and root is the new cgroup, noop is still the 214 * to do. If we lost and root is the new cgroup, noop is still the
239 * right thing to do. 215 * right thing to do.
240 */ 216 */
241 if (!parent_freezer(freezer)) 217 if (task_css_is_root(task, freezer_cgrp_id))
242 goto out; 218 return;
243 219
244 /* 220 mutex_lock(&freezer_mutex);
245 * Grab @freezer->lock and freeze @task after verifying @task still 221 rcu_read_lock();
246 * belongs to @freezer and it's freezing. The former is for the 222
247 * case where we have raced against task migration and lost and 223 freezer = task_freezer(task);
248 * @task is already in a different cgroup which may not be frozen. 224 if (freezer->state & CGROUP_FREEZING)
249 * This isn't strictly necessary as freeze_task() is allowed to be
250 * called spuriously but let's do it anyway for, if nothing else,
251 * documentation.
252 */
253 spin_lock_irq(&freezer->lock);
254 if (freezer == task_freezer(task) && (freezer->state & CGROUP_FREEZING))
255 freeze_task(task); 225 freeze_task(task);
256 spin_unlock_irq(&freezer->lock); 226
257out:
258 rcu_read_unlock(); 227 rcu_read_unlock();
228 mutex_unlock(&freezer_mutex);
259} 229}
260 230
261/** 231/**
@@ -281,22 +251,24 @@ static void update_if_frozen(struct cgroup_subsys_state *css)
281 struct css_task_iter it; 251 struct css_task_iter it;
282 struct task_struct *task; 252 struct task_struct *task;
283 253
284 WARN_ON_ONCE(!rcu_read_lock_held()); 254 lockdep_assert_held(&freezer_mutex);
285
286 spin_lock_irq(&freezer->lock);
287 255
288 if (!(freezer->state & CGROUP_FREEZING) || 256 if (!(freezer->state & CGROUP_FREEZING) ||
289 (freezer->state & CGROUP_FROZEN)) 257 (freezer->state & CGROUP_FROZEN))
290 goto out_unlock; 258 return;
291 259
292 /* are all (live) children frozen? */ 260 /* are all (live) children frozen? */
261 rcu_read_lock();
293 css_for_each_child(pos, css) { 262 css_for_each_child(pos, css) {
294 struct freezer *child = css_freezer(pos); 263 struct freezer *child = css_freezer(pos);
295 264
296 if ((child->state & CGROUP_FREEZER_ONLINE) && 265 if ((child->state & CGROUP_FREEZER_ONLINE) &&
297 !(child->state & CGROUP_FROZEN)) 266 !(child->state & CGROUP_FROZEN)) {
298 goto out_unlock; 267 rcu_read_unlock();
268 return;
269 }
299 } 270 }
271 rcu_read_unlock();
300 272
301 /* are all tasks frozen? */ 273 /* are all tasks frozen? */
302 css_task_iter_start(css, &it); 274 css_task_iter_start(css, &it);
@@ -317,21 +289,29 @@ static void update_if_frozen(struct cgroup_subsys_state *css)
317 freezer->state |= CGROUP_FROZEN; 289 freezer->state |= CGROUP_FROZEN;
318out_iter_end: 290out_iter_end:
319 css_task_iter_end(&it); 291 css_task_iter_end(&it);
320out_unlock:
321 spin_unlock_irq(&freezer->lock);
322} 292}
323 293
324static int freezer_read(struct seq_file *m, void *v) 294static int freezer_read(struct seq_file *m, void *v)
325{ 295{
326 struct cgroup_subsys_state *css = seq_css(m), *pos; 296 struct cgroup_subsys_state *css = seq_css(m), *pos;
327 297
298 mutex_lock(&freezer_mutex);
328 rcu_read_lock(); 299 rcu_read_lock();
329 300
330 /* update states bottom-up */ 301 /* update states bottom-up */
331 css_for_each_descendant_post(pos, css) 302 css_for_each_descendant_post(pos, css) {
303 if (!css_tryget_online(pos))
304 continue;
305 rcu_read_unlock();
306
332 update_if_frozen(pos); 307 update_if_frozen(pos);
333 308
309 rcu_read_lock();
310 css_put(pos);
311 }
312
334 rcu_read_unlock(); 313 rcu_read_unlock();
314 mutex_unlock(&freezer_mutex);
335 315
336 seq_puts(m, freezer_state_strs(css_freezer(css)->state)); 316 seq_puts(m, freezer_state_strs(css_freezer(css)->state));
337 seq_putc(m, '\n'); 317 seq_putc(m, '\n');
@@ -373,7 +353,7 @@ static void freezer_apply_state(struct freezer *freezer, bool freeze,
373 unsigned int state) 353 unsigned int state)
374{ 354{
375 /* also synchronizes against task migration, see freezer_attach() */ 355 /* also synchronizes against task migration, see freezer_attach() */
376 lockdep_assert_held(&freezer->lock); 356 lockdep_assert_held(&freezer_mutex);
377 357
378 if (!(freezer->state & CGROUP_FREEZER_ONLINE)) 358 if (!(freezer->state & CGROUP_FREEZER_ONLINE))
379 return; 359 return;
@@ -414,47 +394,47 @@ static void freezer_change_state(struct freezer *freezer, bool freeze)
414 * descendant will try to inherit its parent's FREEZING state as 394 * descendant will try to inherit its parent's FREEZING state as
415 * CGROUP_FREEZING_PARENT. 395 * CGROUP_FREEZING_PARENT.
416 */ 396 */
397 mutex_lock(&freezer_mutex);
417 rcu_read_lock(); 398 rcu_read_lock();
418 css_for_each_descendant_pre(pos, &freezer->css) { 399 css_for_each_descendant_pre(pos, &freezer->css) {
419 struct freezer *pos_f = css_freezer(pos); 400 struct freezer *pos_f = css_freezer(pos);
420 struct freezer *parent = parent_freezer(pos_f); 401 struct freezer *parent = parent_freezer(pos_f);
421 402
422 spin_lock_irq(&pos_f->lock); 403 if (!css_tryget_online(pos))
404 continue;
405 rcu_read_unlock();
423 406
424 if (pos_f == freezer) { 407 if (pos_f == freezer)
425 freezer_apply_state(pos_f, freeze, 408 freezer_apply_state(pos_f, freeze,
426 CGROUP_FREEZING_SELF); 409 CGROUP_FREEZING_SELF);
427 } else { 410 else
428 /*
429 * Our update to @parent->state is already visible
430 * which is all we need. No need to lock @parent.
431 * For more info on synchronization, see
432 * freezer_post_create().
433 */
434 freezer_apply_state(pos_f, 411 freezer_apply_state(pos_f,
435 parent->state & CGROUP_FREEZING, 412 parent->state & CGROUP_FREEZING,
436 CGROUP_FREEZING_PARENT); 413 CGROUP_FREEZING_PARENT);
437 }
438 414
439 spin_unlock_irq(&pos_f->lock); 415 rcu_read_lock();
416 css_put(pos);
440 } 417 }
441 rcu_read_unlock(); 418 rcu_read_unlock();
419 mutex_unlock(&freezer_mutex);
442} 420}
443 421
444static int freezer_write(struct cgroup_subsys_state *css, struct cftype *cft, 422static ssize_t freezer_write(struct kernfs_open_file *of,
445 char *buffer) 423 char *buf, size_t nbytes, loff_t off)
446{ 424{
447 bool freeze; 425 bool freeze;
448 426
449 if (strcmp(buffer, freezer_state_strs(0)) == 0) 427 buf = strstrip(buf);
428
429 if (strcmp(buf, freezer_state_strs(0)) == 0)
450 freeze = false; 430 freeze = false;
451 else if (strcmp(buffer, freezer_state_strs(CGROUP_FROZEN)) == 0) 431 else if (strcmp(buf, freezer_state_strs(CGROUP_FROZEN)) == 0)
452 freeze = true; 432 freeze = true;
453 else 433 else
454 return -EINVAL; 434 return -EINVAL;
455 435
456 freezer_change_state(css_freezer(css), freeze); 436 freezer_change_state(css_freezer(of_css(of)), freeze);
457 return 0; 437 return nbytes;
458} 438}
459 439
460static u64 freezer_self_freezing_read(struct cgroup_subsys_state *css, 440static u64 freezer_self_freezing_read(struct cgroup_subsys_state *css,
@@ -478,7 +458,7 @@ static struct cftype files[] = {
478 .name = "state", 458 .name = "state",
479 .flags = CFTYPE_NOT_ON_ROOT, 459 .flags = CFTYPE_NOT_ON_ROOT,
480 .seq_show = freezer_read, 460 .seq_show = freezer_read,
481 .write_string = freezer_write, 461 .write = freezer_write,
482 }, 462 },
483 { 463 {
484 .name = "self_freezing", 464 .name = "self_freezing",
diff --git a/kernel/compat.c b/kernel/compat.c
index e40b0430b562..633394f442f8 100644
--- a/kernel/compat.c
+++ b/kernel/compat.c
@@ -157,7 +157,7 @@ static int __compat_put_timespec(const struct timespec *ts, struct compat_timesp
157int compat_get_timeval(struct timeval *tv, const void __user *utv) 157int compat_get_timeval(struct timeval *tv, const void __user *utv)
158{ 158{
159 if (COMPAT_USE_64BIT_TIME) 159 if (COMPAT_USE_64BIT_TIME)
160 return copy_from_user(tv, utv, sizeof *tv) ? -EFAULT : 0; 160 return copy_from_user(tv, utv, sizeof(*tv)) ? -EFAULT : 0;
161 else 161 else
162 return __compat_get_timeval(tv, utv); 162 return __compat_get_timeval(tv, utv);
163} 163}
@@ -166,7 +166,7 @@ EXPORT_SYMBOL_GPL(compat_get_timeval);
166int compat_put_timeval(const struct timeval *tv, void __user *utv) 166int compat_put_timeval(const struct timeval *tv, void __user *utv)
167{ 167{
168 if (COMPAT_USE_64BIT_TIME) 168 if (COMPAT_USE_64BIT_TIME)
169 return copy_to_user(utv, tv, sizeof *tv) ? -EFAULT : 0; 169 return copy_to_user(utv, tv, sizeof(*tv)) ? -EFAULT : 0;
170 else 170 else
171 return __compat_put_timeval(tv, utv); 171 return __compat_put_timeval(tv, utv);
172} 172}
@@ -175,7 +175,7 @@ EXPORT_SYMBOL_GPL(compat_put_timeval);
175int compat_get_timespec(struct timespec *ts, const void __user *uts) 175int compat_get_timespec(struct timespec *ts, const void __user *uts)
176{ 176{
177 if (COMPAT_USE_64BIT_TIME) 177 if (COMPAT_USE_64BIT_TIME)
178 return copy_from_user(ts, uts, sizeof *ts) ? -EFAULT : 0; 178 return copy_from_user(ts, uts, sizeof(*ts)) ? -EFAULT : 0;
179 else 179 else
180 return __compat_get_timespec(ts, uts); 180 return __compat_get_timespec(ts, uts);
181} 181}
@@ -184,7 +184,7 @@ EXPORT_SYMBOL_GPL(compat_get_timespec);
184int compat_put_timespec(const struct timespec *ts, void __user *uts) 184int compat_put_timespec(const struct timespec *ts, void __user *uts)
185{ 185{
186 if (COMPAT_USE_64BIT_TIME) 186 if (COMPAT_USE_64BIT_TIME)
187 return copy_to_user(uts, ts, sizeof *ts) ? -EFAULT : 0; 187 return copy_to_user(uts, ts, sizeof(*ts)) ? -EFAULT : 0;
188 else 188 else
189 return __compat_put_timespec(ts, uts); 189 return __compat_put_timespec(ts, uts);
190} 190}
diff --git a/kernel/context_tracking.c b/kernel/context_tracking.c
index 019d45008448..5664985c46a0 100644
--- a/kernel/context_tracking.c
+++ b/kernel/context_tracking.c
@@ -19,6 +19,7 @@
19#include <linux/sched.h> 19#include <linux/sched.h>
20#include <linux/hardirq.h> 20#include <linux/hardirq.h>
21#include <linux/export.h> 21#include <linux/export.h>
22#include <linux/kprobes.h>
22 23
23#define CREATE_TRACE_POINTS 24#define CREATE_TRACE_POINTS
24#include <trace/events/context_tracking.h> 25#include <trace/events/context_tracking.h>
@@ -104,6 +105,7 @@ void context_tracking_user_enter(void)
104 } 105 }
105 local_irq_restore(flags); 106 local_irq_restore(flags);
106} 107}
108NOKPROBE_SYMBOL(context_tracking_user_enter);
107 109
108#ifdef CONFIG_PREEMPT 110#ifdef CONFIG_PREEMPT
109/** 111/**
@@ -181,6 +183,7 @@ void context_tracking_user_exit(void)
181 } 183 }
182 local_irq_restore(flags); 184 local_irq_restore(flags);
183} 185}
186NOKPROBE_SYMBOL(context_tracking_user_exit);
184 187
185/** 188/**
186 * __context_tracking_task_switch - context switch the syscall callbacks 189 * __context_tracking_task_switch - context switch the syscall callbacks
diff --git a/kernel/cpu.c b/kernel/cpu.c
index a9e710eef0e2..a343bde710b1 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -20,6 +20,7 @@
20#include <linux/gfp.h> 20#include <linux/gfp.h>
21#include <linux/suspend.h> 21#include <linux/suspend.h>
22#include <linux/lockdep.h> 22#include <linux/lockdep.h>
23#include <trace/events/power.h>
23 24
24#include "smpboot.h" 25#include "smpboot.h"
25 26
@@ -283,8 +284,7 @@ static inline void check_for_tasks(int cpu)
283 task_cputime(p, &utime, &stime); 284 task_cputime(p, &utime, &stime);
284 if (task_cpu(p) == cpu && p->state == TASK_RUNNING && 285 if (task_cpu(p) == cpu && p->state == TASK_RUNNING &&
285 (utime || stime)) 286 (utime || stime))
286 printk(KERN_WARNING "Task %s (pid = %d) is on cpu %d " 287 pr_warn("Task %s (pid = %d) is on cpu %d (state = %ld, flags = %x)\n",
287 "(state = %ld, flags = %x)\n",
288 p->comm, task_pid_nr(p), cpu, 288 p->comm, task_pid_nr(p), cpu,
289 p->state, p->flags); 289 p->state, p->flags);
290 } 290 }
@@ -336,8 +336,8 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen)
336 if (err) { 336 if (err) {
337 nr_calls--; 337 nr_calls--;
338 __cpu_notify(CPU_DOWN_FAILED | mod, hcpu, nr_calls, NULL); 338 __cpu_notify(CPU_DOWN_FAILED | mod, hcpu, nr_calls, NULL);
339 printk("%s: attempt to take down CPU %u failed\n", 339 pr_warn("%s: attempt to take down CPU %u failed\n",
340 __func__, cpu); 340 __func__, cpu);
341 goto out_release; 341 goto out_release;
342 } 342 }
343 343
@@ -444,8 +444,8 @@ static int _cpu_up(unsigned int cpu, int tasks_frozen)
444 ret = __cpu_notify(CPU_UP_PREPARE | mod, hcpu, -1, &nr_calls); 444 ret = __cpu_notify(CPU_UP_PREPARE | mod, hcpu, -1, &nr_calls);
445 if (ret) { 445 if (ret) {
446 nr_calls--; 446 nr_calls--;
447 printk(KERN_WARNING "%s: attempt to bring up CPU %u failed\n", 447 pr_warn("%s: attempt to bring up CPU %u failed\n",
448 __func__, cpu); 448 __func__, cpu);
449 goto out_notify; 449 goto out_notify;
450 } 450 }
451 451
@@ -475,11 +475,10 @@ int cpu_up(unsigned int cpu)
475 int err = 0; 475 int err = 0;
476 476
477 if (!cpu_possible(cpu)) { 477 if (!cpu_possible(cpu)) {
478 printk(KERN_ERR "can't online cpu %d because it is not " 478 pr_err("can't online cpu %d because it is not configured as may-hotadd at boot time\n",
479 "configured as may-hotadd at boot time\n", cpu); 479 cpu);
480#if defined(CONFIG_IA64) 480#if defined(CONFIG_IA64)
481 printk(KERN_ERR "please check additional_cpus= boot " 481 pr_err("please check additional_cpus= boot parameter\n");
482 "parameter\n");
483#endif 482#endif
484 return -EINVAL; 483 return -EINVAL;
485 } 484 }
@@ -518,16 +517,17 @@ int disable_nonboot_cpus(void)
518 */ 517 */
519 cpumask_clear(frozen_cpus); 518 cpumask_clear(frozen_cpus);
520 519
521 printk("Disabling non-boot CPUs ...\n"); 520 pr_info("Disabling non-boot CPUs ...\n");
522 for_each_online_cpu(cpu) { 521 for_each_online_cpu(cpu) {
523 if (cpu == first_cpu) 522 if (cpu == first_cpu)
524 continue; 523 continue;
524 trace_suspend_resume(TPS("CPU_OFF"), cpu, true);
525 error = _cpu_down(cpu, 1); 525 error = _cpu_down(cpu, 1);
526 trace_suspend_resume(TPS("CPU_OFF"), cpu, false);
526 if (!error) 527 if (!error)
527 cpumask_set_cpu(cpu, frozen_cpus); 528 cpumask_set_cpu(cpu, frozen_cpus);
528 else { 529 else {
529 printk(KERN_ERR "Error taking CPU%d down: %d\n", 530 pr_err("Error taking CPU%d down: %d\n", cpu, error);
530 cpu, error);
531 break; 531 break;
532 } 532 }
533 } 533 }
@@ -537,7 +537,7 @@ int disable_nonboot_cpus(void)
537 /* Make sure the CPUs won't be enabled by someone else */ 537 /* Make sure the CPUs won't be enabled by someone else */
538 cpu_hotplug_disabled = 1; 538 cpu_hotplug_disabled = 1;
539 } else { 539 } else {
540 printk(KERN_ERR "Non-boot CPUs are not disabled\n"); 540 pr_err("Non-boot CPUs are not disabled\n");
541 } 541 }
542 cpu_maps_update_done(); 542 cpu_maps_update_done();
543 return error; 543 return error;
@@ -561,17 +561,19 @@ void __ref enable_nonboot_cpus(void)
561 if (cpumask_empty(frozen_cpus)) 561 if (cpumask_empty(frozen_cpus))
562 goto out; 562 goto out;
563 563
564 printk(KERN_INFO "Enabling non-boot CPUs ...\n"); 564 pr_info("Enabling non-boot CPUs ...\n");
565 565
566 arch_enable_nonboot_cpus_begin(); 566 arch_enable_nonboot_cpus_begin();
567 567
568 for_each_cpu(cpu, frozen_cpus) { 568 for_each_cpu(cpu, frozen_cpus) {
569 trace_suspend_resume(TPS("CPU_ON"), cpu, true);
569 error = _cpu_up(cpu, 1); 570 error = _cpu_up(cpu, 1);
571 trace_suspend_resume(TPS("CPU_ON"), cpu, false);
570 if (!error) { 572 if (!error) {
571 printk(KERN_INFO "CPU%d is up\n", cpu); 573 pr_info("CPU%d is up\n", cpu);
572 continue; 574 continue;
573 } 575 }
574 printk(KERN_WARNING "Error taking CPU%d up: %d\n", cpu, error); 576 pr_warn("Error taking CPU%d up: %d\n", cpu, error);
575 } 577 }
576 578
577 arch_enable_nonboot_cpus_end(); 579 arch_enable_nonboot_cpus_end();
@@ -726,10 +728,12 @@ void set_cpu_present(unsigned int cpu, bool present)
726 728
727void set_cpu_online(unsigned int cpu, bool online) 729void set_cpu_online(unsigned int cpu, bool online)
728{ 730{
729 if (online) 731 if (online) {
730 cpumask_set_cpu(cpu, to_cpumask(cpu_online_bits)); 732 cpumask_set_cpu(cpu, to_cpumask(cpu_online_bits));
731 else 733 cpumask_set_cpu(cpu, to_cpumask(cpu_active_bits));
734 } else {
732 cpumask_clear_cpu(cpu, to_cpumask(cpu_online_bits)); 735 cpumask_clear_cpu(cpu, to_cpumask(cpu_online_bits));
736 }
733} 737}
734 738
735void set_cpu_active(unsigned int cpu, bool active) 739void set_cpu_active(unsigned int cpu, bool active)
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 3d54c418bd06..116a4164720a 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -61,12 +61,7 @@
61#include <linux/cgroup.h> 61#include <linux/cgroup.h>
62#include <linux/wait.h> 62#include <linux/wait.h>
63 63
64/* 64struct static_key cpusets_enabled_key __read_mostly = STATIC_KEY_INIT_FALSE;
65 * Tracks how many cpusets are currently defined in system.
66 * When there is only one cpuset (the root cpuset) we can
67 * short circuit some hooks.
68 */
69int number_of_cpusets __read_mostly;
70 65
71/* See "Frequency meter" comments, below. */ 66/* See "Frequency meter" comments, below. */
72 67
@@ -124,7 +119,7 @@ static inline struct cpuset *task_cs(struct task_struct *task)
124 119
125static inline struct cpuset *parent_cs(struct cpuset *cs) 120static inline struct cpuset *parent_cs(struct cpuset *cs)
126{ 121{
127 return css_cs(css_parent(&cs->css)); 122 return css_cs(cs->css.parent);
128} 123}
129 124
130#ifdef CONFIG_NUMA 125#ifdef CONFIG_NUMA
@@ -611,7 +606,7 @@ static int generate_sched_domains(cpumask_var_t **domains,
611 goto done; 606 goto done;
612 } 607 }
613 608
614 csa = kmalloc(number_of_cpusets * sizeof(cp), GFP_KERNEL); 609 csa = kmalloc(nr_cpusets() * sizeof(cp), GFP_KERNEL);
615 if (!csa) 610 if (!csa)
616 goto done; 611 goto done;
617 csn = 0; 612 csn = 0;
@@ -696,11 +691,8 @@ restart:
696 if (nslot == ndoms) { 691 if (nslot == ndoms) {
697 static int warnings = 10; 692 static int warnings = 10;
698 if (warnings) { 693 if (warnings) {
699 printk(KERN_WARNING 694 pr_warn("rebuild_sched_domains confused: nslot %d, ndoms %d, csn %d, i %d, apn %d\n",
700 "rebuild_sched_domains confused:" 695 nslot, ndoms, csn, i, apn);
701 " nslot %d, ndoms %d, csn %d, i %d,"
702 " apn %d\n",
703 nslot, ndoms, csn, i, apn);
704 warnings--; 696 warnings--;
705 } 697 }
706 continue; 698 continue;
@@ -875,7 +867,7 @@ static void update_tasks_cpumask_hier(struct cpuset *root_cs, bool update_root)
875 continue; 867 continue;
876 } 868 }
877 } 869 }
878 if (!css_tryget(&cp->css)) 870 if (!css_tryget_online(&cp->css))
879 continue; 871 continue;
880 rcu_read_unlock(); 872 rcu_read_unlock();
881 873
@@ -890,6 +882,7 @@ static void update_tasks_cpumask_hier(struct cpuset *root_cs, bool update_root)
890/** 882/**
891 * update_cpumask - update the cpus_allowed mask of a cpuset and all tasks in it 883 * update_cpumask - update the cpus_allowed mask of a cpuset and all tasks in it
892 * @cs: the cpuset to consider 884 * @cs: the cpuset to consider
885 * @trialcs: trial cpuset
893 * @buf: buffer of cpu numbers written to this cpuset 886 * @buf: buffer of cpu numbers written to this cpuset
894 */ 887 */
895static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs, 888static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
@@ -1110,7 +1103,7 @@ static void update_tasks_nodemask_hier(struct cpuset *root_cs, bool update_root)
1110 continue; 1103 continue;
1111 } 1104 }
1112 } 1105 }
1113 if (!css_tryget(&cp->css)) 1106 if (!css_tryget_online(&cp->css))
1114 continue; 1107 continue;
1115 rcu_read_unlock(); 1108 rcu_read_unlock();
1116 1109
@@ -1188,7 +1181,13 @@ done:
1188 1181
1189int current_cpuset_is_being_rebound(void) 1182int current_cpuset_is_being_rebound(void)
1190{ 1183{
1191 return task_cs(current) == cpuset_being_rebound; 1184 int ret;
1185
1186 rcu_read_lock();
1187 ret = task_cs(current) == cpuset_being_rebound;
1188 rcu_read_unlock();
1189
1190 return ret;
1192} 1191}
1193 1192
1194static int update_relax_domain_level(struct cpuset *cs, s64 val) 1193static int update_relax_domain_level(struct cpuset *cs, s64 val)
@@ -1605,13 +1604,15 @@ out_unlock:
1605/* 1604/*
1606 * Common handling for a write to a "cpus" or "mems" file. 1605 * Common handling for a write to a "cpus" or "mems" file.
1607 */ 1606 */
1608static int cpuset_write_resmask(struct cgroup_subsys_state *css, 1607static ssize_t cpuset_write_resmask(struct kernfs_open_file *of,
1609 struct cftype *cft, char *buf) 1608 char *buf, size_t nbytes, loff_t off)
1610{ 1609{
1611 struct cpuset *cs = css_cs(css); 1610 struct cpuset *cs = css_cs(of_css(of));
1612 struct cpuset *trialcs; 1611 struct cpuset *trialcs;
1613 int retval = -ENODEV; 1612 int retval = -ENODEV;
1614 1613
1614 buf = strstrip(buf);
1615
1615 /* 1616 /*
1616 * CPU or memory hotunplug may leave @cs w/o any execution 1617 * CPU or memory hotunplug may leave @cs w/o any execution
1617 * resources, in which case the hotplug code asynchronously updates 1618 * resources, in which case the hotplug code asynchronously updates
@@ -1622,7 +1623,17 @@ static int cpuset_write_resmask(struct cgroup_subsys_state *css,
1622 * resources, wait for the previously scheduled operations before 1623 * resources, wait for the previously scheduled operations before
1623 * proceeding, so that we don't end up keep removing tasks added 1624 * proceeding, so that we don't end up keep removing tasks added
1624 * after execution capability is restored. 1625 * after execution capability is restored.
1626 *
1627 * cpuset_hotplug_work calls back into cgroup core via
1628 * cgroup_transfer_tasks() and waiting for it from a cgroupfs
1629 * operation like this one can lead to a deadlock through kernfs
1630 * active_ref protection. Let's break the protection. Losing the
1631 * protection is okay as we check whether @cs is online after
1632 * grabbing cpuset_mutex anyway. This only happens on the legacy
1633 * hierarchies.
1625 */ 1634 */
1635 css_get(&cs->css);
1636 kernfs_break_active_protection(of->kn);
1626 flush_work(&cpuset_hotplug_work); 1637 flush_work(&cpuset_hotplug_work);
1627 1638
1628 mutex_lock(&cpuset_mutex); 1639 mutex_lock(&cpuset_mutex);
@@ -1635,7 +1646,7 @@ static int cpuset_write_resmask(struct cgroup_subsys_state *css,
1635 goto out_unlock; 1646 goto out_unlock;
1636 } 1647 }
1637 1648
1638 switch (cft->private) { 1649 switch (of_cft(of)->private) {
1639 case FILE_CPULIST: 1650 case FILE_CPULIST:
1640 retval = update_cpumask(cs, trialcs, buf); 1651 retval = update_cpumask(cs, trialcs, buf);
1641 break; 1652 break;
@@ -1650,7 +1661,9 @@ static int cpuset_write_resmask(struct cgroup_subsys_state *css,
1650 free_trial_cpuset(trialcs); 1661 free_trial_cpuset(trialcs);
1651out_unlock: 1662out_unlock:
1652 mutex_unlock(&cpuset_mutex); 1663 mutex_unlock(&cpuset_mutex);
1653 return retval; 1664 kernfs_unbreak_active_protection(of->kn);
1665 css_put(&cs->css);
1666 return retval ?: nbytes;
1654} 1667}
1655 1668
1656/* 1669/*
@@ -1752,7 +1765,7 @@ static struct cftype files[] = {
1752 { 1765 {
1753 .name = "cpus", 1766 .name = "cpus",
1754 .seq_show = cpuset_common_seq_show, 1767 .seq_show = cpuset_common_seq_show,
1755 .write_string = cpuset_write_resmask, 1768 .write = cpuset_write_resmask,
1756 .max_write_len = (100U + 6 * NR_CPUS), 1769 .max_write_len = (100U + 6 * NR_CPUS),
1757 .private = FILE_CPULIST, 1770 .private = FILE_CPULIST,
1758 }, 1771 },
@@ -1760,7 +1773,7 @@ static struct cftype files[] = {
1760 { 1773 {
1761 .name = "mems", 1774 .name = "mems",
1762 .seq_show = cpuset_common_seq_show, 1775 .seq_show = cpuset_common_seq_show,
1763 .write_string = cpuset_write_resmask, 1776 .write = cpuset_write_resmask,
1764 .max_write_len = (100U + 6 * MAX_NUMNODES), 1777 .max_write_len = (100U + 6 * MAX_NUMNODES),
1765 .private = FILE_MEMLIST, 1778 .private = FILE_MEMLIST,
1766 }, 1779 },
@@ -1888,7 +1901,7 @@ static int cpuset_css_online(struct cgroup_subsys_state *css)
1888 if (is_spread_slab(parent)) 1901 if (is_spread_slab(parent))
1889 set_bit(CS_SPREAD_SLAB, &cs->flags); 1902 set_bit(CS_SPREAD_SLAB, &cs->flags);
1890 1903
1891 number_of_cpusets++; 1904 cpuset_inc();
1892 1905
1893 if (!test_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags)) 1906 if (!test_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags))
1894 goto out_unlock; 1907 goto out_unlock;
@@ -1939,7 +1952,7 @@ static void cpuset_css_offline(struct cgroup_subsys_state *css)
1939 if (is_sched_load_balance(cs)) 1952 if (is_sched_load_balance(cs))
1940 update_flag(CS_SCHED_LOAD_BALANCE, cs, 0); 1953 update_flag(CS_SCHED_LOAD_BALANCE, cs, 0);
1941 1954
1942 number_of_cpusets--; 1955 cpuset_dec();
1943 clear_bit(CS_ONLINE, &cs->flags); 1956 clear_bit(CS_ONLINE, &cs->flags);
1944 1957
1945 mutex_unlock(&cpuset_mutex); 1958 mutex_unlock(&cpuset_mutex);
@@ -1992,7 +2005,6 @@ int __init cpuset_init(void)
1992 if (!alloc_cpumask_var(&cpus_attach, GFP_KERNEL)) 2005 if (!alloc_cpumask_var(&cpus_attach, GFP_KERNEL))
1993 BUG(); 2006 BUG();
1994 2007
1995 number_of_cpusets = 1;
1996 return 0; 2008 return 0;
1997} 2009}
1998 2010
@@ -2017,7 +2029,7 @@ static void remove_tasks_in_empty_cpuset(struct cpuset *cs)
2017 parent = parent_cs(parent); 2029 parent = parent_cs(parent);
2018 2030
2019 if (cgroup_transfer_tasks(parent->css.cgroup, cs->css.cgroup)) { 2031 if (cgroup_transfer_tasks(parent->css.cgroup, cs->css.cgroup)) {
2020 printk(KERN_ERR "cpuset: failed to transfer tasks out of empty cpuset "); 2032 pr_err("cpuset: failed to transfer tasks out of empty cpuset ");
2021 pr_cont_cgroup_name(cs->css.cgroup); 2033 pr_cont_cgroup_name(cs->css.cgroup);
2022 pr_cont("\n"); 2034 pr_cont("\n");
2023 } 2035 }
@@ -2155,7 +2167,7 @@ static void cpuset_hotplug_workfn(struct work_struct *work)
2155 2167
2156 rcu_read_lock(); 2168 rcu_read_lock();
2157 cpuset_for_each_descendant_pre(cs, pos_css, &top_cpuset) { 2169 cpuset_for_each_descendant_pre(cs, pos_css, &top_cpuset) {
2158 if (cs == &top_cpuset || !css_tryget(&cs->css)) 2170 if (cs == &top_cpuset || !css_tryget_online(&cs->css))
2159 continue; 2171 continue;
2160 rcu_read_unlock(); 2172 rcu_read_unlock();
2161 2173
@@ -2536,7 +2548,7 @@ int cpuset_mems_allowed_intersects(const struct task_struct *tsk1,
2536 2548
2537/** 2549/**
2538 * cpuset_print_task_mems_allowed - prints task's cpuset and mems_allowed 2550 * cpuset_print_task_mems_allowed - prints task's cpuset and mems_allowed
2539 * @task: pointer to task_struct of some task. 2551 * @tsk: pointer to task_struct of some task.
2540 * 2552 *
2541 * Description: Prints @task's name, cpuset name, and cached copy of its 2553 * Description: Prints @task's name, cpuset name, and cached copy of its
2542 * mems_allowed to the kernel log. 2554 * mems_allowed to the kernel log.
@@ -2554,7 +2566,7 @@ void cpuset_print_task_mems_allowed(struct task_struct *tsk)
2554 cgrp = task_cs(tsk)->css.cgroup; 2566 cgrp = task_cs(tsk)->css.cgroup;
2555 nodelist_scnprintf(cpuset_nodelist, CPUSET_NODELIST_LEN, 2567 nodelist_scnprintf(cpuset_nodelist, CPUSET_NODELIST_LEN,
2556 tsk->mems_allowed); 2568 tsk->mems_allowed);
2557 printk(KERN_INFO "%s cpuset=", tsk->comm); 2569 pr_info("%s cpuset=", tsk->comm);
2558 pr_cont_cgroup_name(cgrp); 2570 pr_cont_cgroup_name(cgrp);
2559 pr_cont(" mems_allowed=%s\n", cpuset_nodelist); 2571 pr_cont(" mems_allowed=%s\n", cpuset_nodelist);
2560 2572
@@ -2646,10 +2658,10 @@ out:
2646/* Display task mems_allowed in /proc/<pid>/status file. */ 2658/* Display task mems_allowed in /proc/<pid>/status file. */
2647void cpuset_task_status_allowed(struct seq_file *m, struct task_struct *task) 2659void cpuset_task_status_allowed(struct seq_file *m, struct task_struct *task)
2648{ 2660{
2649 seq_printf(m, "Mems_allowed:\t"); 2661 seq_puts(m, "Mems_allowed:\t");
2650 seq_nodemask(m, &task->mems_allowed); 2662 seq_nodemask(m, &task->mems_allowed);
2651 seq_printf(m, "\n"); 2663 seq_puts(m, "\n");
2652 seq_printf(m, "Mems_allowed_list:\t"); 2664 seq_puts(m, "Mems_allowed_list:\t");
2653 seq_nodemask_list(m, &task->mems_allowed); 2665 seq_nodemask_list(m, &task->mems_allowed);
2654 seq_printf(m, "\n"); 2666 seq_puts(m, "\n");
2655} 2667}
diff --git a/kernel/debug/debug_core.c b/kernel/debug/debug_core.c
index 2956c8da1605..1adf62b39b96 100644
--- a/kernel/debug/debug_core.c
+++ b/kernel/debug/debug_core.c
@@ -534,7 +534,7 @@ return_normal:
534 kgdb_info[cpu].exception_state &= 534 kgdb_info[cpu].exception_state &=
535 ~(DCPU_WANT_MASTER | DCPU_IS_SLAVE); 535 ~(DCPU_WANT_MASTER | DCPU_IS_SLAVE);
536 kgdb_info[cpu].enter_kgdb--; 536 kgdb_info[cpu].enter_kgdb--;
537 smp_mb__before_atomic_dec(); 537 smp_mb__before_atomic();
538 atomic_dec(&slaves_in_kgdb); 538 atomic_dec(&slaves_in_kgdb);
539 dbg_touch_watchdogs(); 539 dbg_touch_watchdogs();
540 local_irq_restore(flags); 540 local_irq_restore(flags);
@@ -662,7 +662,7 @@ kgdb_restore:
662 kgdb_info[cpu].exception_state &= 662 kgdb_info[cpu].exception_state &=
663 ~(DCPU_WANT_MASTER | DCPU_IS_SLAVE); 663 ~(DCPU_WANT_MASTER | DCPU_IS_SLAVE);
664 kgdb_info[cpu].enter_kgdb--; 664 kgdb_info[cpu].enter_kgdb--;
665 smp_mb__before_atomic_dec(); 665 smp_mb__before_atomic();
666 atomic_dec(&masters_in_kgdb); 666 atomic_dec(&masters_in_kgdb);
667 /* Free kgdb_active */ 667 /* Free kgdb_active */
668 atomic_set(&kgdb_active, -1); 668 atomic_set(&kgdb_active, -1);
diff --git a/kernel/debug/kdb/kdb_bt.c b/kernel/debug/kdb/kdb_bt.c
index b03e0e814e43..fe15fff5df53 100644
--- a/kernel/debug/kdb/kdb_bt.c
+++ b/kernel/debug/kdb/kdb_bt.c
@@ -21,7 +21,7 @@
21static void kdb_show_stack(struct task_struct *p, void *addr) 21static void kdb_show_stack(struct task_struct *p, void *addr)
22{ 22{
23 int old_lvl = console_loglevel; 23 int old_lvl = console_loglevel;
24 console_loglevel = 15; 24 console_loglevel = CONSOLE_LOGLEVEL_MOTORMOUTH;
25 kdb_trap_printk++; 25 kdb_trap_printk++;
26 kdb_set_current_task(p); 26 kdb_set_current_task(p);
27 if (addr) { 27 if (addr) {
diff --git a/kernel/debug/kdb/kdb_io.c b/kernel/debug/kdb/kdb_io.c
index 14ff4849262c..7c70812caea5 100644
--- a/kernel/debug/kdb/kdb_io.c
+++ b/kernel/debug/kdb/kdb_io.c
@@ -710,7 +710,7 @@ kdb_printit:
710 } 710 }
711 if (logging) { 711 if (logging) {
712 saved_loglevel = console_loglevel; 712 saved_loglevel = console_loglevel;
713 console_loglevel = 0; 713 console_loglevel = CONSOLE_LOGLEVEL_SILENT;
714 printk(KERN_INFO "%s", kdb_buffer); 714 printk(KERN_INFO "%s", kdb_buffer);
715 } 715 }
716 716
diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c
index 0b097c8a1e50..2f7c760305ca 100644
--- a/kernel/debug/kdb/kdb_main.c
+++ b/kernel/debug/kdb/kdb_main.c
@@ -1091,7 +1091,7 @@ static int kdb_reboot(int argc, const char **argv)
1091static void kdb_dumpregs(struct pt_regs *regs) 1091static void kdb_dumpregs(struct pt_regs *regs)
1092{ 1092{
1093 int old_lvl = console_loglevel; 1093 int old_lvl = console_loglevel;
1094 console_loglevel = 15; 1094 console_loglevel = CONSOLE_LOGLEVEL_MOTORMOUTH;
1095 kdb_trap_printk++; 1095 kdb_trap_printk++;
1096 show_regs(regs); 1096 show_regs(regs);
1097 kdb_trap_printk--; 1097 kdb_trap_printk--;
diff --git a/kernel/events/core.c b/kernel/events/core.c
index f83a71a3e46d..b0c95f0f06fd 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -39,6 +39,8 @@
39#include <linux/hw_breakpoint.h> 39#include <linux/hw_breakpoint.h>
40#include <linux/mm_types.h> 40#include <linux/mm_types.h>
41#include <linux/cgroup.h> 41#include <linux/cgroup.h>
42#include <linux/module.h>
43#include <linux/mman.h>
42 44
43#include "internal.h" 45#include "internal.h"
44 46
@@ -607,7 +609,8 @@ static inline int perf_cgroup_connect(int fd, struct perf_event *event,
607 if (!f.file) 609 if (!f.file)
608 return -EBADF; 610 return -EBADF;
609 611
610 css = css_tryget_from_dir(f.file->f_dentry, &perf_event_cgrp_subsys); 612 css = css_tryget_online_from_dir(f.file->f_dentry,
613 &perf_event_cgrp_subsys);
611 if (IS_ERR(css)) { 614 if (IS_ERR(css)) {
612 ret = PTR_ERR(css); 615 ret = PTR_ERR(css);
613 goto out; 616 goto out;
@@ -1443,6 +1446,11 @@ group_sched_out(struct perf_event *group_event,
1443 cpuctx->exclusive = 0; 1446 cpuctx->exclusive = 0;
1444} 1447}
1445 1448
1449struct remove_event {
1450 struct perf_event *event;
1451 bool detach_group;
1452};
1453
1446/* 1454/*
1447 * Cross CPU call to remove a performance event 1455 * Cross CPU call to remove a performance event
1448 * 1456 *
@@ -1451,12 +1459,15 @@ group_sched_out(struct perf_event *group_event,
1451 */ 1459 */
1452static int __perf_remove_from_context(void *info) 1460static int __perf_remove_from_context(void *info)
1453{ 1461{
1454 struct perf_event *event = info; 1462 struct remove_event *re = info;
1463 struct perf_event *event = re->event;
1455 struct perf_event_context *ctx = event->ctx; 1464 struct perf_event_context *ctx = event->ctx;
1456 struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); 1465 struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
1457 1466
1458 raw_spin_lock(&ctx->lock); 1467 raw_spin_lock(&ctx->lock);
1459 event_sched_out(event, cpuctx, ctx); 1468 event_sched_out(event, cpuctx, ctx);
1469 if (re->detach_group)
1470 perf_group_detach(event);
1460 list_del_event(event, ctx); 1471 list_del_event(event, ctx);
1461 if (!ctx->nr_events && cpuctx->task_ctx == ctx) { 1472 if (!ctx->nr_events && cpuctx->task_ctx == ctx) {
1462 ctx->is_active = 0; 1473 ctx->is_active = 0;
@@ -1481,10 +1492,14 @@ static int __perf_remove_from_context(void *info)
1481 * When called from perf_event_exit_task, it's OK because the 1492 * When called from perf_event_exit_task, it's OK because the
1482 * context has been detached from its task. 1493 * context has been detached from its task.
1483 */ 1494 */
1484static void perf_remove_from_context(struct perf_event *event) 1495static void perf_remove_from_context(struct perf_event *event, bool detach_group)
1485{ 1496{
1486 struct perf_event_context *ctx = event->ctx; 1497 struct perf_event_context *ctx = event->ctx;
1487 struct task_struct *task = ctx->task; 1498 struct task_struct *task = ctx->task;
1499 struct remove_event re = {
1500 .event = event,
1501 .detach_group = detach_group,
1502 };
1488 1503
1489 lockdep_assert_held(&ctx->mutex); 1504 lockdep_assert_held(&ctx->mutex);
1490 1505
@@ -1493,12 +1508,12 @@ static void perf_remove_from_context(struct perf_event *event)
1493 * Per cpu events are removed via an smp call and 1508 * Per cpu events are removed via an smp call and
1494 * the removal is always successful. 1509 * the removal is always successful.
1495 */ 1510 */
1496 cpu_function_call(event->cpu, __perf_remove_from_context, event); 1511 cpu_function_call(event->cpu, __perf_remove_from_context, &re);
1497 return; 1512 return;
1498 } 1513 }
1499 1514
1500retry: 1515retry:
1501 if (!task_function_call(task, __perf_remove_from_context, event)) 1516 if (!task_function_call(task, __perf_remove_from_context, &re))
1502 return; 1517 return;
1503 1518
1504 raw_spin_lock_irq(&ctx->lock); 1519 raw_spin_lock_irq(&ctx->lock);
@@ -1515,6 +1530,8 @@ retry:
1515 * Since the task isn't running, its safe to remove the event, us 1530 * Since the task isn't running, its safe to remove the event, us
1516 * holding the ctx->lock ensures the task won't get scheduled in. 1531 * holding the ctx->lock ensures the task won't get scheduled in.
1517 */ 1532 */
1533 if (detach_group)
1534 perf_group_detach(event);
1518 list_del_event(event, ctx); 1535 list_del_event(event, ctx);
1519 raw_spin_unlock_irq(&ctx->lock); 1536 raw_spin_unlock_irq(&ctx->lock);
1520} 1537}
@@ -1663,6 +1680,8 @@ event_sched_in(struct perf_event *event,
1663 u64 tstamp = perf_event_time(event); 1680 u64 tstamp = perf_event_time(event);
1664 int ret = 0; 1681 int ret = 0;
1665 1682
1683 lockdep_assert_held(&ctx->lock);
1684
1666 if (event->state <= PERF_EVENT_STATE_OFF) 1685 if (event->state <= PERF_EVENT_STATE_OFF)
1667 return 0; 1686 return 0;
1668 1687
@@ -2301,7 +2320,7 @@ static void perf_event_context_sched_out(struct task_struct *task, int ctxn,
2301 next_parent = rcu_dereference(next_ctx->parent_ctx); 2320 next_parent = rcu_dereference(next_ctx->parent_ctx);
2302 2321
2303 /* If neither context have a parent context; they cannot be clones. */ 2322 /* If neither context have a parent context; they cannot be clones. */
2304 if (!parent && !next_parent) 2323 if (!parent || !next_parent)
2305 goto unlock; 2324 goto unlock;
2306 2325
2307 if (next_parent == ctx || next_ctx == parent || next_parent == parent) { 2326 if (next_parent == ctx || next_ctx == parent || next_parent == parent) {
@@ -2956,6 +2975,22 @@ out:
2956 local_irq_restore(flags); 2975 local_irq_restore(flags);
2957} 2976}
2958 2977
2978void perf_event_exec(void)
2979{
2980 struct perf_event_context *ctx;
2981 int ctxn;
2982
2983 rcu_read_lock();
2984 for_each_task_context_nr(ctxn) {
2985 ctx = current->perf_event_ctxp[ctxn];
2986 if (!ctx)
2987 continue;
2988
2989 perf_event_enable_on_exec(ctx);
2990 }
2991 rcu_read_unlock();
2992}
2993
2959/* 2994/*
2960 * Cross CPU call to read the hardware event 2995 * Cross CPU call to read the hardware event
2961 */ 2996 */
@@ -3178,7 +3213,8 @@ static void free_event_rcu(struct rcu_head *head)
3178} 3213}
3179 3214
3180static void ring_buffer_put(struct ring_buffer *rb); 3215static void ring_buffer_put(struct ring_buffer *rb);
3181static void ring_buffer_detach(struct perf_event *event, struct ring_buffer *rb); 3216static void ring_buffer_attach(struct perf_event *event,
3217 struct ring_buffer *rb);
3182 3218
3183static void unaccount_event_cpu(struct perf_event *event, int cpu) 3219static void unaccount_event_cpu(struct perf_event *event, int cpu)
3184{ 3220{
@@ -3229,17 +3265,19 @@ static void __free_event(struct perf_event *event)
3229 if (event->ctx) 3265 if (event->ctx)
3230 put_ctx(event->ctx); 3266 put_ctx(event->ctx);
3231 3267
3268 if (event->pmu)
3269 module_put(event->pmu->module);
3270
3232 call_rcu(&event->rcu_head, free_event_rcu); 3271 call_rcu(&event->rcu_head, free_event_rcu);
3233} 3272}
3234static void free_event(struct perf_event *event) 3273
3274static void _free_event(struct perf_event *event)
3235{ 3275{
3236 irq_work_sync(&event->pending); 3276 irq_work_sync(&event->pending);
3237 3277
3238 unaccount_event(event); 3278 unaccount_event(event);
3239 3279
3240 if (event->rb) { 3280 if (event->rb) {
3241 struct ring_buffer *rb;
3242
3243 /* 3281 /*
3244 * Can happen when we close an event with re-directed output. 3282 * Can happen when we close an event with re-directed output.
3245 * 3283 *
@@ -3247,57 +3285,38 @@ static void free_event(struct perf_event *event)
3247 * over us; possibly making our ring_buffer_put() the last. 3285 * over us; possibly making our ring_buffer_put() the last.
3248 */ 3286 */
3249 mutex_lock(&event->mmap_mutex); 3287 mutex_lock(&event->mmap_mutex);
3250 rb = event->rb; 3288 ring_buffer_attach(event, NULL);
3251 if (rb) {
3252 rcu_assign_pointer(event->rb, NULL);
3253 ring_buffer_detach(event, rb);
3254 ring_buffer_put(rb); /* could be last */
3255 }
3256 mutex_unlock(&event->mmap_mutex); 3289 mutex_unlock(&event->mmap_mutex);
3257 } 3290 }
3258 3291
3259 if (is_cgroup_event(event)) 3292 if (is_cgroup_event(event))
3260 perf_detach_cgroup(event); 3293 perf_detach_cgroup(event);
3261 3294
3262
3263 __free_event(event); 3295 __free_event(event);
3264} 3296}
3265 3297
3266int perf_event_release_kernel(struct perf_event *event) 3298/*
3299 * Used to free events which have a known refcount of 1, such as in error paths
3300 * where the event isn't exposed yet and inherited events.
3301 */
3302static void free_event(struct perf_event *event)
3267{ 3303{
3268 struct perf_event_context *ctx = event->ctx; 3304 if (WARN(atomic_long_cmpxchg(&event->refcount, 1, 0) != 1,
3269 3305 "unexpected event refcount: %ld; ptr=%p\n",
3270 WARN_ON_ONCE(ctx->parent_ctx); 3306 atomic_long_read(&event->refcount), event)) {
3271 /* 3307 /* leak to avoid use-after-free */
3272 * There are two ways this annotation is useful: 3308 return;
3273 * 3309 }
3274 * 1) there is a lock recursion from perf_event_exit_task
3275 * see the comment there.
3276 *
3277 * 2) there is a lock-inversion with mmap_sem through
3278 * perf_event_read_group(), which takes faults while
3279 * holding ctx->mutex, however this is called after
3280 * the last filedesc died, so there is no possibility
3281 * to trigger the AB-BA case.
3282 */
3283 mutex_lock_nested(&ctx->mutex, SINGLE_DEPTH_NESTING);
3284 raw_spin_lock_irq(&ctx->lock);
3285 perf_group_detach(event);
3286 raw_spin_unlock_irq(&ctx->lock);
3287 perf_remove_from_context(event);
3288 mutex_unlock(&ctx->mutex);
3289
3290 free_event(event);
3291 3310
3292 return 0; 3311 _free_event(event);
3293} 3312}
3294EXPORT_SYMBOL_GPL(perf_event_release_kernel);
3295 3313
3296/* 3314/*
3297 * Called when the last reference to the file is gone. 3315 * Called when the last reference to the file is gone.
3298 */ 3316 */
3299static void put_event(struct perf_event *event) 3317static void put_event(struct perf_event *event)
3300{ 3318{
3319 struct perf_event_context *ctx = event->ctx;
3301 struct task_struct *owner; 3320 struct task_struct *owner;
3302 3321
3303 if (!atomic_long_dec_and_test(&event->refcount)) 3322 if (!atomic_long_dec_and_test(&event->refcount))
@@ -3336,8 +3355,32 @@ static void put_event(struct perf_event *event)
3336 put_task_struct(owner); 3355 put_task_struct(owner);
3337 } 3356 }
3338 3357
3339 perf_event_release_kernel(event); 3358 WARN_ON_ONCE(ctx->parent_ctx);
3359 /*
3360 * There are two ways this annotation is useful:
3361 *
3362 * 1) there is a lock recursion from perf_event_exit_task
3363 * see the comment there.
3364 *
3365 * 2) there is a lock-inversion with mmap_sem through
3366 * perf_event_read_group(), which takes faults while
3367 * holding ctx->mutex, however this is called after
3368 * the last filedesc died, so there is no possibility
3369 * to trigger the AB-BA case.
3370 */
3371 mutex_lock_nested(&ctx->mutex, SINGLE_DEPTH_NESTING);
3372 perf_remove_from_context(event, true);
3373 mutex_unlock(&ctx->mutex);
3374
3375 _free_event(event);
3376}
3377
3378int perf_event_release_kernel(struct perf_event *event)
3379{
3380 put_event(event);
3381 return 0;
3340} 3382}
3383EXPORT_SYMBOL_GPL(perf_event_release_kernel);
3341 3384
3342static int perf_release(struct inode *inode, struct file *file) 3385static int perf_release(struct inode *inode, struct file *file)
3343{ 3386{
@@ -3839,28 +3882,47 @@ unlock:
3839static void ring_buffer_attach(struct perf_event *event, 3882static void ring_buffer_attach(struct perf_event *event,
3840 struct ring_buffer *rb) 3883 struct ring_buffer *rb)
3841{ 3884{
3885 struct ring_buffer *old_rb = NULL;
3842 unsigned long flags; 3886 unsigned long flags;
3843 3887
3844 if (!list_empty(&event->rb_entry)) 3888 if (event->rb) {
3845 return; 3889 /*
3890 * Should be impossible, we set this when removing
3891 * event->rb_entry and wait/clear when adding event->rb_entry.
3892 */
3893 WARN_ON_ONCE(event->rcu_pending);
3846 3894
3847 spin_lock_irqsave(&rb->event_lock, flags); 3895 old_rb = event->rb;
3848 if (list_empty(&event->rb_entry)) 3896 event->rcu_batches = get_state_synchronize_rcu();
3849 list_add(&event->rb_entry, &rb->event_list); 3897 event->rcu_pending = 1;
3850 spin_unlock_irqrestore(&rb->event_lock, flags);
3851}
3852 3898
3853static void ring_buffer_detach(struct perf_event *event, struct ring_buffer *rb) 3899 spin_lock_irqsave(&old_rb->event_lock, flags);
3854{ 3900 list_del_rcu(&event->rb_entry);
3855 unsigned long flags; 3901 spin_unlock_irqrestore(&old_rb->event_lock, flags);
3902 }
3856 3903
3857 if (list_empty(&event->rb_entry)) 3904 if (event->rcu_pending && rb) {
3858 return; 3905 cond_synchronize_rcu(event->rcu_batches);
3906 event->rcu_pending = 0;
3907 }
3859 3908
3860 spin_lock_irqsave(&rb->event_lock, flags); 3909 if (rb) {
3861 list_del_init(&event->rb_entry); 3910 spin_lock_irqsave(&rb->event_lock, flags);
3862 wake_up_all(&event->waitq); 3911 list_add_rcu(&event->rb_entry, &rb->event_list);
3863 spin_unlock_irqrestore(&rb->event_lock, flags); 3912 spin_unlock_irqrestore(&rb->event_lock, flags);
3913 }
3914
3915 rcu_assign_pointer(event->rb, rb);
3916
3917 if (old_rb) {
3918 ring_buffer_put(old_rb);
3919 /*
3920 * Since we detached before setting the new rb, so that we
3921 * could attach the new rb, we could have missed a wakeup.
3922 * Provide it now.
3923 */
3924 wake_up_all(&event->waitq);
3925 }
3864} 3926}
3865 3927
3866static void ring_buffer_wakeup(struct perf_event *event) 3928static void ring_buffer_wakeup(struct perf_event *event)
@@ -3929,7 +3991,7 @@ static void perf_mmap_close(struct vm_area_struct *vma)
3929{ 3991{
3930 struct perf_event *event = vma->vm_file->private_data; 3992 struct perf_event *event = vma->vm_file->private_data;
3931 3993
3932 struct ring_buffer *rb = event->rb; 3994 struct ring_buffer *rb = ring_buffer_get(event);
3933 struct user_struct *mmap_user = rb->mmap_user; 3995 struct user_struct *mmap_user = rb->mmap_user;
3934 int mmap_locked = rb->mmap_locked; 3996 int mmap_locked = rb->mmap_locked;
3935 unsigned long size = perf_data_size(rb); 3997 unsigned long size = perf_data_size(rb);
@@ -3937,18 +3999,14 @@ static void perf_mmap_close(struct vm_area_struct *vma)
3937 atomic_dec(&rb->mmap_count); 3999 atomic_dec(&rb->mmap_count);
3938 4000
3939 if (!atomic_dec_and_mutex_lock(&event->mmap_count, &event->mmap_mutex)) 4001 if (!atomic_dec_and_mutex_lock(&event->mmap_count, &event->mmap_mutex))
3940 return; 4002 goto out_put;
3941 4003
3942 /* Detach current event from the buffer. */ 4004 ring_buffer_attach(event, NULL);
3943 rcu_assign_pointer(event->rb, NULL);
3944 ring_buffer_detach(event, rb);
3945 mutex_unlock(&event->mmap_mutex); 4005 mutex_unlock(&event->mmap_mutex);
3946 4006
3947 /* If there's still other mmap()s of this buffer, we're done. */ 4007 /* If there's still other mmap()s of this buffer, we're done. */
3948 if (atomic_read(&rb->mmap_count)) { 4008 if (atomic_read(&rb->mmap_count))
3949 ring_buffer_put(rb); /* can't be last */ 4009 goto out_put;
3950 return;
3951 }
3952 4010
3953 /* 4011 /*
3954 * No other mmap()s, detach from all other events that might redirect 4012 * No other mmap()s, detach from all other events that might redirect
@@ -3978,11 +4036,9 @@ again:
3978 * still restart the iteration to make sure we're not now 4036 * still restart the iteration to make sure we're not now
3979 * iterating the wrong list. 4037 * iterating the wrong list.
3980 */ 4038 */
3981 if (event->rb == rb) { 4039 if (event->rb == rb)
3982 rcu_assign_pointer(event->rb, NULL); 4040 ring_buffer_attach(event, NULL);
3983 ring_buffer_detach(event, rb); 4041
3984 ring_buffer_put(rb); /* can't be last, we still have one */
3985 }
3986 mutex_unlock(&event->mmap_mutex); 4042 mutex_unlock(&event->mmap_mutex);
3987 put_event(event); 4043 put_event(event);
3988 4044
@@ -4007,6 +4063,7 @@ again:
4007 vma->vm_mm->pinned_vm -= mmap_locked; 4063 vma->vm_mm->pinned_vm -= mmap_locked;
4008 free_uid(mmap_user); 4064 free_uid(mmap_user);
4009 4065
4066out_put:
4010 ring_buffer_put(rb); /* could be last */ 4067 ring_buffer_put(rb); /* could be last */
4011} 4068}
4012 4069
@@ -4124,7 +4181,6 @@ again:
4124 vma->vm_mm->pinned_vm += extra; 4181 vma->vm_mm->pinned_vm += extra;
4125 4182
4126 ring_buffer_attach(event, rb); 4183 ring_buffer_attach(event, rb);
4127 rcu_assign_pointer(event->rb, rb);
4128 4184
4129 perf_event_init_userpage(event); 4185 perf_event_init_userpage(event);
4130 perf_event_update_userpage(event); 4186 perf_event_update_userpage(event);
@@ -5036,21 +5092,9 @@ static void perf_event_comm_event(struct perf_comm_event *comm_event)
5036 NULL); 5092 NULL);
5037} 5093}
5038 5094
5039void perf_event_comm(struct task_struct *task) 5095void perf_event_comm(struct task_struct *task, bool exec)
5040{ 5096{
5041 struct perf_comm_event comm_event; 5097 struct perf_comm_event comm_event;
5042 struct perf_event_context *ctx;
5043 int ctxn;
5044
5045 rcu_read_lock();
5046 for_each_task_context_nr(ctxn) {
5047 ctx = task->perf_event_ctxp[ctxn];
5048 if (!ctx)
5049 continue;
5050
5051 perf_event_enable_on_exec(ctx);
5052 }
5053 rcu_read_unlock();
5054 5098
5055 if (!atomic_read(&nr_comm_events)) 5099 if (!atomic_read(&nr_comm_events))
5056 return; 5100 return;
@@ -5062,7 +5106,7 @@ void perf_event_comm(struct task_struct *task)
5062 .event_id = { 5106 .event_id = {
5063 .header = { 5107 .header = {
5064 .type = PERF_RECORD_COMM, 5108 .type = PERF_RECORD_COMM,
5065 .misc = 0, 5109 .misc = exec ? PERF_RECORD_MISC_COMM_EXEC : 0,
5066 /* .size */ 5110 /* .size */
5067 }, 5111 },
5068 /* .pid */ 5112 /* .pid */
@@ -5085,6 +5129,7 @@ struct perf_mmap_event {
5085 int maj, min; 5129 int maj, min;
5086 u64 ino; 5130 u64 ino;
5087 u64 ino_generation; 5131 u64 ino_generation;
5132 u32 prot, flags;
5088 5133
5089 struct { 5134 struct {
5090 struct perf_event_header header; 5135 struct perf_event_header header;
@@ -5126,6 +5171,8 @@ static void perf_event_mmap_output(struct perf_event *event,
5126 mmap_event->event_id.header.size += sizeof(mmap_event->min); 5171 mmap_event->event_id.header.size += sizeof(mmap_event->min);
5127 mmap_event->event_id.header.size += sizeof(mmap_event->ino); 5172 mmap_event->event_id.header.size += sizeof(mmap_event->ino);
5128 mmap_event->event_id.header.size += sizeof(mmap_event->ino_generation); 5173 mmap_event->event_id.header.size += sizeof(mmap_event->ino_generation);
5174 mmap_event->event_id.header.size += sizeof(mmap_event->prot);
5175 mmap_event->event_id.header.size += sizeof(mmap_event->flags);
5129 } 5176 }
5130 5177
5131 perf_event_header__init_id(&mmap_event->event_id.header, &sample, event); 5178 perf_event_header__init_id(&mmap_event->event_id.header, &sample, event);
@@ -5144,6 +5191,8 @@ static void perf_event_mmap_output(struct perf_event *event,
5144 perf_output_put(&handle, mmap_event->min); 5191 perf_output_put(&handle, mmap_event->min);
5145 perf_output_put(&handle, mmap_event->ino); 5192 perf_output_put(&handle, mmap_event->ino);
5146 perf_output_put(&handle, mmap_event->ino_generation); 5193 perf_output_put(&handle, mmap_event->ino_generation);
5194 perf_output_put(&handle, mmap_event->prot);
5195 perf_output_put(&handle, mmap_event->flags);
5147 } 5196 }
5148 5197
5149 __output_copy(&handle, mmap_event->file_name, 5198 __output_copy(&handle, mmap_event->file_name,
@@ -5162,6 +5211,7 @@ static void perf_event_mmap_event(struct perf_mmap_event *mmap_event)
5162 struct file *file = vma->vm_file; 5211 struct file *file = vma->vm_file;
5163 int maj = 0, min = 0; 5212 int maj = 0, min = 0;
5164 u64 ino = 0, gen = 0; 5213 u64 ino = 0, gen = 0;
5214 u32 prot = 0, flags = 0;
5165 unsigned int size; 5215 unsigned int size;
5166 char tmp[16]; 5216 char tmp[16];
5167 char *buf = NULL; 5217 char *buf = NULL;
@@ -5192,6 +5242,28 @@ static void perf_event_mmap_event(struct perf_mmap_event *mmap_event)
5192 gen = inode->i_generation; 5242 gen = inode->i_generation;
5193 maj = MAJOR(dev); 5243 maj = MAJOR(dev);
5194 min = MINOR(dev); 5244 min = MINOR(dev);
5245
5246 if (vma->vm_flags & VM_READ)
5247 prot |= PROT_READ;
5248 if (vma->vm_flags & VM_WRITE)
5249 prot |= PROT_WRITE;
5250 if (vma->vm_flags & VM_EXEC)
5251 prot |= PROT_EXEC;
5252
5253 if (vma->vm_flags & VM_MAYSHARE)
5254 flags = MAP_SHARED;
5255 else
5256 flags = MAP_PRIVATE;
5257
5258 if (vma->vm_flags & VM_DENYWRITE)
5259 flags |= MAP_DENYWRITE;
5260 if (vma->vm_flags & VM_MAYEXEC)
5261 flags |= MAP_EXECUTABLE;
5262 if (vma->vm_flags & VM_LOCKED)
5263 flags |= MAP_LOCKED;
5264 if (vma->vm_flags & VM_HUGETLB)
5265 flags |= MAP_HUGETLB;
5266
5195 goto got_name; 5267 goto got_name;
5196 } else { 5268 } else {
5197 name = (char *)arch_vma_name(vma); 5269 name = (char *)arch_vma_name(vma);
@@ -5232,6 +5304,8 @@ got_name:
5232 mmap_event->min = min; 5304 mmap_event->min = min;
5233 mmap_event->ino = ino; 5305 mmap_event->ino = ino;
5234 mmap_event->ino_generation = gen; 5306 mmap_event->ino_generation = gen;
5307 mmap_event->prot = prot;
5308 mmap_event->flags = flags;
5235 5309
5236 if (!(vma->vm_flags & VM_EXEC)) 5310 if (!(vma->vm_flags & VM_EXEC))
5237 mmap_event->event_id.header.misc |= PERF_RECORD_MISC_MMAP_DATA; 5311 mmap_event->event_id.header.misc |= PERF_RECORD_MISC_MMAP_DATA;
@@ -5272,6 +5346,8 @@ void perf_event_mmap(struct vm_area_struct *vma)
5272 /* .min (attr_mmap2 only) */ 5346 /* .min (attr_mmap2 only) */
5273 /* .ino (attr_mmap2 only) */ 5347 /* .ino (attr_mmap2 only) */
5274 /* .ino_generation (attr_mmap2 only) */ 5348 /* .ino_generation (attr_mmap2 only) */
5349 /* .prot (attr_mmap2 only) */
5350 /* .flags (attr_mmap2 only) */
5275 }; 5351 };
5276 5352
5277 perf_event_mmap_event(&mmap_event); 5353 perf_event_mmap_event(&mmap_event);
@@ -5408,6 +5484,9 @@ struct swevent_htable {
5408 5484
5409 /* Recursion avoidance in each contexts */ 5485 /* Recursion avoidance in each contexts */
5410 int recursion[PERF_NR_CONTEXTS]; 5486 int recursion[PERF_NR_CONTEXTS];
5487
5488 /* Keeps track of cpu being initialized/exited */
5489 bool online;
5411}; 5490};
5412 5491
5413static DEFINE_PER_CPU(struct swevent_htable, swevent_htable); 5492static DEFINE_PER_CPU(struct swevent_htable, swevent_htable);
@@ -5654,8 +5733,14 @@ static int perf_swevent_add(struct perf_event *event, int flags)
5654 hwc->state = !(flags & PERF_EF_START); 5733 hwc->state = !(flags & PERF_EF_START);
5655 5734
5656 head = find_swevent_head(swhash, event); 5735 head = find_swevent_head(swhash, event);
5657 if (WARN_ON_ONCE(!head)) 5736 if (!head) {
5737 /*
5738 * We can race with cpu hotplug code. Do not
5739 * WARN if the cpu just got unplugged.
5740 */
5741 WARN_ON_ONCE(swhash->online);
5658 return -EINVAL; 5742 return -EINVAL;
5743 }
5659 5744
5660 hlist_add_head_rcu(&event->hlist_entry, head); 5745 hlist_add_head_rcu(&event->hlist_entry, head);
5661 5746
@@ -6551,6 +6636,7 @@ free_pdc:
6551 free_percpu(pmu->pmu_disable_count); 6636 free_percpu(pmu->pmu_disable_count);
6552 goto unlock; 6637 goto unlock;
6553} 6638}
6639EXPORT_SYMBOL_GPL(perf_pmu_register);
6554 6640
6555void perf_pmu_unregister(struct pmu *pmu) 6641void perf_pmu_unregister(struct pmu *pmu)
6556{ 6642{
@@ -6572,6 +6658,7 @@ void perf_pmu_unregister(struct pmu *pmu)
6572 put_device(pmu->dev); 6658 put_device(pmu->dev);
6573 free_pmu_context(pmu); 6659 free_pmu_context(pmu);
6574} 6660}
6661EXPORT_SYMBOL_GPL(perf_pmu_unregister);
6575 6662
6576struct pmu *perf_init_event(struct perf_event *event) 6663struct pmu *perf_init_event(struct perf_event *event)
6577{ 6664{
@@ -6585,6 +6672,10 @@ struct pmu *perf_init_event(struct perf_event *event)
6585 pmu = idr_find(&pmu_idr, event->attr.type); 6672 pmu = idr_find(&pmu_idr, event->attr.type);
6586 rcu_read_unlock(); 6673 rcu_read_unlock();
6587 if (pmu) { 6674 if (pmu) {
6675 if (!try_module_get(pmu->module)) {
6676 pmu = ERR_PTR(-ENODEV);
6677 goto unlock;
6678 }
6588 event->pmu = pmu; 6679 event->pmu = pmu;
6589 ret = pmu->event_init(event); 6680 ret = pmu->event_init(event);
6590 if (ret) 6681 if (ret)
@@ -6593,6 +6684,10 @@ struct pmu *perf_init_event(struct perf_event *event)
6593 } 6684 }
6594 6685
6595 list_for_each_entry_rcu(pmu, &pmus, entry) { 6686 list_for_each_entry_rcu(pmu, &pmus, entry) {
6687 if (!try_module_get(pmu->module)) {
6688 pmu = ERR_PTR(-ENODEV);
6689 goto unlock;
6690 }
6596 event->pmu = pmu; 6691 event->pmu = pmu;
6597 ret = pmu->event_init(event); 6692 ret = pmu->event_init(event);
6598 if (!ret) 6693 if (!ret)
@@ -6771,6 +6866,7 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
6771err_pmu: 6866err_pmu:
6772 if (event->destroy) 6867 if (event->destroy)
6773 event->destroy(event); 6868 event->destroy(event);
6869 module_put(pmu->module);
6774err_ns: 6870err_ns:
6775 if (event->ns) 6871 if (event->ns)
6776 put_pid_ns(event->ns); 6872 put_pid_ns(event->ns);
@@ -6834,10 +6930,6 @@ static int perf_copy_attr(struct perf_event_attr __user *uattr,
6834 if (ret) 6930 if (ret)
6835 return -EFAULT; 6931 return -EFAULT;
6836 6932
6837 /* disabled for now */
6838 if (attr->mmap2)
6839 return -EINVAL;
6840
6841 if (attr->__reserved_1) 6933 if (attr->__reserved_1)
6842 return -EINVAL; 6934 return -EINVAL;
6843 6935
@@ -6914,7 +7006,7 @@ err_size:
6914static int 7006static int
6915perf_event_set_output(struct perf_event *event, struct perf_event *output_event) 7007perf_event_set_output(struct perf_event *event, struct perf_event *output_event)
6916{ 7008{
6917 struct ring_buffer *rb = NULL, *old_rb = NULL; 7009 struct ring_buffer *rb = NULL;
6918 int ret = -EINVAL; 7010 int ret = -EINVAL;
6919 7011
6920 if (!output_event) 7012 if (!output_event)
@@ -6942,8 +7034,6 @@ set:
6942 if (atomic_read(&event->mmap_count)) 7034 if (atomic_read(&event->mmap_count))
6943 goto unlock; 7035 goto unlock;
6944 7036
6945 old_rb = event->rb;
6946
6947 if (output_event) { 7037 if (output_event) {
6948 /* get the rb we want to redirect to */ 7038 /* get the rb we want to redirect to */
6949 rb = ring_buffer_get(output_event); 7039 rb = ring_buffer_get(output_event);
@@ -6951,23 +7041,7 @@ set:
6951 goto unlock; 7041 goto unlock;
6952 } 7042 }
6953 7043
6954 if (old_rb) 7044 ring_buffer_attach(event, rb);
6955 ring_buffer_detach(event, old_rb);
6956
6957 if (rb)
6958 ring_buffer_attach(event, rb);
6959
6960 rcu_assign_pointer(event->rb, rb);
6961
6962 if (old_rb) {
6963 ring_buffer_put(old_rb);
6964 /*
6965 * Since we detached before setting the new rb, so that we
6966 * could attach the new rb, we could have missed a wakeup.
6967 * Provide it now.
6968 */
6969 wake_up_all(&event->waitq);
6970 }
6971 7045
6972 ret = 0; 7046 ret = 0;
6973unlock: 7047unlock:
@@ -7018,6 +7092,9 @@ SYSCALL_DEFINE5(perf_event_open,
7018 if (attr.freq) { 7092 if (attr.freq) {
7019 if (attr.sample_freq > sysctl_perf_event_sample_rate) 7093 if (attr.sample_freq > sysctl_perf_event_sample_rate)
7020 return -EINVAL; 7094 return -EINVAL;
7095 } else {
7096 if (attr.sample_period & (1ULL << 63))
7097 return -EINVAL;
7021 } 7098 }
7022 7099
7023 /* 7100 /*
@@ -7055,20 +7132,33 @@ SYSCALL_DEFINE5(perf_event_open,
7055 } 7132 }
7056 } 7133 }
7057 7134
7135 if (task && group_leader &&
7136 group_leader->attr.inherit != attr.inherit) {
7137 err = -EINVAL;
7138 goto err_task;
7139 }
7140
7058 get_online_cpus(); 7141 get_online_cpus();
7059 7142
7060 event = perf_event_alloc(&attr, cpu, task, group_leader, NULL, 7143 event = perf_event_alloc(&attr, cpu, task, group_leader, NULL,
7061 NULL, NULL); 7144 NULL, NULL);
7062 if (IS_ERR(event)) { 7145 if (IS_ERR(event)) {
7063 err = PTR_ERR(event); 7146 err = PTR_ERR(event);
7064 goto err_task; 7147 goto err_cpus;
7065 } 7148 }
7066 7149
7067 if (flags & PERF_FLAG_PID_CGROUP) { 7150 if (flags & PERF_FLAG_PID_CGROUP) {
7068 err = perf_cgroup_connect(pid, event, &attr, group_leader); 7151 err = perf_cgroup_connect(pid, event, &attr, group_leader);
7069 if (err) { 7152 if (err) {
7070 __free_event(event); 7153 __free_event(event);
7071 goto err_task; 7154 goto err_cpus;
7155 }
7156 }
7157
7158 if (is_sampling_event(event)) {
7159 if (event->pmu->capabilities & PERF_PMU_CAP_NO_INTERRUPT) {
7160 err = -ENOTSUPP;
7161 goto err_alloc;
7072 } 7162 }
7073 } 7163 }
7074 7164
@@ -7165,7 +7255,7 @@ SYSCALL_DEFINE5(perf_event_open,
7165 struct perf_event_context *gctx = group_leader->ctx; 7255 struct perf_event_context *gctx = group_leader->ctx;
7166 7256
7167 mutex_lock(&gctx->mutex); 7257 mutex_lock(&gctx->mutex);
7168 perf_remove_from_context(group_leader); 7258 perf_remove_from_context(group_leader, false);
7169 7259
7170 /* 7260 /*
7171 * Removing from the context ends up with disabled 7261 * Removing from the context ends up with disabled
@@ -7175,7 +7265,7 @@ SYSCALL_DEFINE5(perf_event_open,
7175 perf_event__state_init(group_leader); 7265 perf_event__state_init(group_leader);
7176 list_for_each_entry(sibling, &group_leader->sibling_list, 7266 list_for_each_entry(sibling, &group_leader->sibling_list,
7177 group_entry) { 7267 group_entry) {
7178 perf_remove_from_context(sibling); 7268 perf_remove_from_context(sibling, false);
7179 perf_event__state_init(sibling); 7269 perf_event__state_init(sibling);
7180 put_ctx(gctx); 7270 put_ctx(gctx);
7181 } 7271 }
@@ -7230,8 +7320,9 @@ err_context:
7230 put_ctx(ctx); 7320 put_ctx(ctx);
7231err_alloc: 7321err_alloc:
7232 free_event(event); 7322 free_event(event);
7233err_task: 7323err_cpus:
7234 put_online_cpus(); 7324 put_online_cpus();
7325err_task:
7235 if (task) 7326 if (task)
7236 put_task_struct(task); 7327 put_task_struct(task);
7237err_group_fd: 7328err_group_fd:
@@ -7305,7 +7396,7 @@ void perf_pmu_migrate_context(struct pmu *pmu, int src_cpu, int dst_cpu)
7305 mutex_lock(&src_ctx->mutex); 7396 mutex_lock(&src_ctx->mutex);
7306 list_for_each_entry_safe(event, tmp, &src_ctx->event_list, 7397 list_for_each_entry_safe(event, tmp, &src_ctx->event_list,
7307 event_entry) { 7398 event_entry) {
7308 perf_remove_from_context(event); 7399 perf_remove_from_context(event, false);
7309 unaccount_event_cpu(event, src_cpu); 7400 unaccount_event_cpu(event, src_cpu);
7310 put_ctx(src_ctx); 7401 put_ctx(src_ctx);
7311 list_add(&event->migrate_entry, &events); 7402 list_add(&event->migrate_entry, &events);
@@ -7367,13 +7458,7 @@ __perf_event_exit_task(struct perf_event *child_event,
7367 struct perf_event_context *child_ctx, 7458 struct perf_event_context *child_ctx,
7368 struct task_struct *child) 7459 struct task_struct *child)
7369{ 7460{
7370 if (child_event->parent) { 7461 perf_remove_from_context(child_event, true);
7371 raw_spin_lock_irq(&child_ctx->lock);
7372 perf_group_detach(child_event);
7373 raw_spin_unlock_irq(&child_ctx->lock);
7374 }
7375
7376 perf_remove_from_context(child_event);
7377 7462
7378 /* 7463 /*
7379 * It can happen that the parent exits first, and has events 7464 * It can happen that the parent exits first, and has events
@@ -7388,7 +7473,7 @@ __perf_event_exit_task(struct perf_event *child_event,
7388 7473
7389static void perf_event_exit_task_context(struct task_struct *child, int ctxn) 7474static void perf_event_exit_task_context(struct task_struct *child, int ctxn)
7390{ 7475{
7391 struct perf_event *child_event, *tmp; 7476 struct perf_event *child_event, *next;
7392 struct perf_event_context *child_ctx; 7477 struct perf_event_context *child_ctx;
7393 unsigned long flags; 7478 unsigned long flags;
7394 7479
@@ -7442,24 +7527,9 @@ static void perf_event_exit_task_context(struct task_struct *child, int ctxn)
7442 */ 7527 */
7443 mutex_lock(&child_ctx->mutex); 7528 mutex_lock(&child_ctx->mutex);
7444 7529
7445again: 7530 list_for_each_entry_safe(child_event, next, &child_ctx->event_list, event_entry)
7446 list_for_each_entry_safe(child_event, tmp, &child_ctx->pinned_groups,
7447 group_entry)
7448 __perf_event_exit_task(child_event, child_ctx, child);
7449
7450 list_for_each_entry_safe(child_event, tmp, &child_ctx->flexible_groups,
7451 group_entry)
7452 __perf_event_exit_task(child_event, child_ctx, child); 7531 __perf_event_exit_task(child_event, child_ctx, child);
7453 7532
7454 /*
7455 * If the last event was a group event, it will have appended all
7456 * its siblings to the list, but we obtained 'tmp' before that which
7457 * will still point to the list head terminating the iteration.
7458 */
7459 if (!list_empty(&child_ctx->pinned_groups) ||
7460 !list_empty(&child_ctx->flexible_groups))
7461 goto again;
7462
7463 mutex_unlock(&child_ctx->mutex); 7533 mutex_unlock(&child_ctx->mutex);
7464 7534
7465 put_ctx(child_ctx); 7535 put_ctx(child_ctx);
@@ -7724,6 +7794,8 @@ int perf_event_init_context(struct task_struct *child, int ctxn)
7724 * swapped under us. 7794 * swapped under us.
7725 */ 7795 */
7726 parent_ctx = perf_pin_task_context(parent, ctxn); 7796 parent_ctx = perf_pin_task_context(parent, ctxn);
7797 if (!parent_ctx)
7798 return 0;
7727 7799
7728 /* 7800 /*
7729 * No need to check if parent_ctx != NULL here; since we saw 7801 * No need to check if parent_ctx != NULL here; since we saw
@@ -7835,6 +7907,7 @@ static void perf_event_init_cpu(int cpu)
7835 struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu); 7907 struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu);
7836 7908
7837 mutex_lock(&swhash->hlist_mutex); 7909 mutex_lock(&swhash->hlist_mutex);
7910 swhash->online = true;
7838 if (swhash->hlist_refcount > 0) { 7911 if (swhash->hlist_refcount > 0) {
7839 struct swevent_hlist *hlist; 7912 struct swevent_hlist *hlist;
7840 7913
@@ -7857,14 +7930,14 @@ static void perf_pmu_rotate_stop(struct pmu *pmu)
7857 7930
7858static void __perf_event_exit_context(void *__info) 7931static void __perf_event_exit_context(void *__info)
7859{ 7932{
7933 struct remove_event re = { .detach_group = false };
7860 struct perf_event_context *ctx = __info; 7934 struct perf_event_context *ctx = __info;
7861 struct perf_event *event;
7862 7935
7863 perf_pmu_rotate_stop(ctx->pmu); 7936 perf_pmu_rotate_stop(ctx->pmu);
7864 7937
7865 rcu_read_lock(); 7938 rcu_read_lock();
7866 list_for_each_entry_rcu(event, &ctx->event_list, event_entry) 7939 list_for_each_entry_rcu(re.event, &ctx->event_list, event_entry)
7867 __perf_remove_from_context(event); 7940 __perf_remove_from_context(&re);
7868 rcu_read_unlock(); 7941 rcu_read_unlock();
7869} 7942}
7870 7943
@@ -7892,6 +7965,7 @@ static void perf_event_exit_cpu(int cpu)
7892 perf_event_exit_cpu_context(cpu); 7965 perf_event_exit_cpu_context(cpu);
7893 7966
7894 mutex_lock(&swhash->hlist_mutex); 7967 mutex_lock(&swhash->hlist_mutex);
7968 swhash->online = false;
7895 swevent_hlist_release(swhash); 7969 swevent_hlist_release(swhash);
7896 mutex_unlock(&swhash->hlist_mutex); 7970 mutex_unlock(&swhash->hlist_mutex);
7897} 7971}
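
The core.c hunks above replace the bare event pointer handed to the cross-CPU callback with a struct remove_event carrying a detach_group flag, so __perf_remove_from_context() can also detach the group when invoked from the exit path. Below is a minimal userspace C sketch of the same pattern, bundling the target and a flag behind the single void * argument such callbacks receive; the names are illustrative, not the kernel's.

#include <stdbool.h>
#include <stdio.h>

struct event { const char *name; };

struct remove_args {
	struct event *event;
	bool detach_group;		/* mirrors remove_event.detach_group */
};

/* Stand-in for a cross-CPU callback that only receives one pointer. */
static void remove_cb(void *info)
{
	struct remove_args *a = info;

	printf("removing %s%s\n", a->event->name,
	       a->detach_group ? " (detaching group)" : "");
}

int main(void)
{
	struct event ev = { "cycles" };
	struct remove_args a = { .event = &ev, .detach_group = true };

	remove_cb(&a);			/* the kernel runs this via an IPI */
	return 0;
}
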
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index 04709b66369d..6f3254e8c137 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -36,6 +36,7 @@
36#include "../../mm/internal.h" /* munlock_vma_page */ 36#include "../../mm/internal.h" /* munlock_vma_page */
37#include <linux/percpu-rwsem.h> 37#include <linux/percpu-rwsem.h>
38#include <linux/task_work.h> 38#include <linux/task_work.h>
39#include <linux/shmem_fs.h>
39 40
40#include <linux/uprobes.h> 41#include <linux/uprobes.h>
41 42
@@ -60,8 +61,6 @@ static struct percpu_rw_semaphore dup_mmap_sem;
60 61
61/* Have a copy of original instruction */ 62/* Have a copy of original instruction */
62#define UPROBE_COPY_INSN 0 63#define UPROBE_COPY_INSN 0
63/* Can skip singlestep */
64#define UPROBE_SKIP_SSTEP 1
65 64
66struct uprobe { 65struct uprobe {
67 struct rb_node rb_node; /* node in the rb tree */ 66 struct rb_node rb_node; /* node in the rb tree */
@@ -129,7 +128,7 @@ struct xol_area {
129 */ 128 */
130static bool valid_vma(struct vm_area_struct *vma, bool is_register) 129static bool valid_vma(struct vm_area_struct *vma, bool is_register)
131{ 130{
132 vm_flags_t flags = VM_HUGETLB | VM_MAYEXEC | VM_SHARED; 131 vm_flags_t flags = VM_HUGETLB | VM_MAYEXEC | VM_MAYSHARE;
133 132
134 if (is_register) 133 if (is_register)
135 flags |= VM_WRITE; 134 flags |= VM_WRITE;
@@ -281,18 +280,13 @@ static int verify_opcode(struct page *page, unsigned long vaddr, uprobe_opcode_t
281 * supported by that architecture then we need to modify is_trap_at_addr and 280 * supported by that architecture then we need to modify is_trap_at_addr and
282 * uprobe_write_opcode accordingly. This would never be a problem for archs 281 * uprobe_write_opcode accordingly. This would never be a problem for archs
283 * that have fixed length instructions. 282 * that have fixed length instructions.
284 */ 283 *
285
286/*
287 * uprobe_write_opcode - write the opcode at a given virtual address. 284 * uprobe_write_opcode - write the opcode at a given virtual address.
288 * @mm: the probed process address space. 285 * @mm: the probed process address space.
289 * @vaddr: the virtual address to store the opcode. 286 * @vaddr: the virtual address to store the opcode.
290 * @opcode: opcode to be written at @vaddr. 287 * @opcode: opcode to be written at @vaddr.
291 * 288 *
292 * Called with mm->mmap_sem held (for read and with a reference to 289 * Called with mm->mmap_sem held for write.
293 * mm).
294 *
295 * For mm @mm, write the opcode at @vaddr.
296 * Return 0 (success) or a negative errno. 290 * Return 0 (success) or a negative errno.
297 */ 291 */
298int uprobe_write_opcode(struct mm_struct *mm, unsigned long vaddr, 292int uprobe_write_opcode(struct mm_struct *mm, unsigned long vaddr,
@@ -312,21 +306,25 @@ retry:
312 if (ret <= 0) 306 if (ret <= 0)
313 goto put_old; 307 goto put_old;
314 308
309 ret = anon_vma_prepare(vma);
310 if (ret)
311 goto put_old;
312
315 ret = -ENOMEM; 313 ret = -ENOMEM;
316 new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, vaddr); 314 new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, vaddr);
317 if (!new_page) 315 if (!new_page)
318 goto put_old; 316 goto put_old;
319 317
320 __SetPageUptodate(new_page); 318 if (mem_cgroup_charge_anon(new_page, mm, GFP_KERNEL))
319 goto put_new;
321 320
321 __SetPageUptodate(new_page);
322 copy_highpage(new_page, old_page); 322 copy_highpage(new_page, old_page);
323 copy_to_page(new_page, vaddr, &opcode, UPROBE_SWBP_INSN_SIZE); 323 copy_to_page(new_page, vaddr, &opcode, UPROBE_SWBP_INSN_SIZE);
324 324
325 ret = anon_vma_prepare(vma);
326 if (ret)
327 goto put_new;
328
329 ret = __replace_page(vma, vaddr, old_page, new_page); 325 ret = __replace_page(vma, vaddr, old_page, new_page);
326 if (ret)
327 mem_cgroup_uncharge_page(new_page);
330 328
331put_new: 329put_new:
332 page_cache_release(new_page); 330 page_cache_release(new_page);
@@ -491,12 +489,9 @@ static struct uprobe *alloc_uprobe(struct inode *inode, loff_t offset)
491 uprobe->offset = offset; 489 uprobe->offset = offset;
492 init_rwsem(&uprobe->register_rwsem); 490 init_rwsem(&uprobe->register_rwsem);
493 init_rwsem(&uprobe->consumer_rwsem); 491 init_rwsem(&uprobe->consumer_rwsem);
494 /* For now assume that the instruction need not be single-stepped */
495 __set_bit(UPROBE_SKIP_SSTEP, &uprobe->flags);
496 492
497 /* add to uprobes_tree, sorted on inode:offset */ 493 /* add to uprobes_tree, sorted on inode:offset */
498 cur_uprobe = insert_uprobe(uprobe); 494 cur_uprobe = insert_uprobe(uprobe);
499
500 /* a uprobe exists for this inode:offset combination */ 495 /* a uprobe exists for this inode:offset combination */
501 if (cur_uprobe) { 496 if (cur_uprobe) {
502 kfree(uprobe); 497 kfree(uprobe);
@@ -542,14 +537,15 @@ static int __copy_insn(struct address_space *mapping, struct file *filp,
542 void *insn, int nbytes, loff_t offset) 537 void *insn, int nbytes, loff_t offset)
543{ 538{
544 struct page *page; 539 struct page *page;
545
546 if (!mapping->a_ops->readpage)
547 return -EIO;
548 /* 540 /*
549 * Ensure that the page that has the original instruction is 541 * Ensure that the page that has the original instruction is populated
550 * populated and in page-cache. 542 * and in page-cache. If ->readpage == NULL it must be shmem_mapping(),
543 * see uprobe_register().
551 */ 544 */
552 page = read_mapping_page(mapping, offset >> PAGE_CACHE_SHIFT, filp); 545 if (mapping->a_ops->readpage)
546 page = read_mapping_page(mapping, offset >> PAGE_CACHE_SHIFT, filp);
547 else
548 page = shmem_read_mapping_page(mapping, offset >> PAGE_CACHE_SHIFT);
553 if (IS_ERR(page)) 549 if (IS_ERR(page))
554 return PTR_ERR(page); 550 return PTR_ERR(page);
555 551
@@ -850,7 +846,7 @@ static void __uprobe_unregister(struct uprobe *uprobe, struct uprobe_consumer *u
850{ 846{
851 int err; 847 int err;
852 848
853 if (!consumer_del(uprobe, uc)) /* WARN? */ 849 if (WARN_ON(!consumer_del(uprobe, uc)))
854 return; 850 return;
855 851
856 err = register_for_each_vma(uprobe, NULL); 852 err = register_for_each_vma(uprobe, NULL);
@@ -885,6 +881,9 @@ int uprobe_register(struct inode *inode, loff_t offset, struct uprobe_consumer *
885 if (!uc->handler && !uc->ret_handler) 881 if (!uc->handler && !uc->ret_handler)
886 return -EINVAL; 882 return -EINVAL;
887 883
884 /* copy_insn() uses read_mapping_page() or shmem_read_mapping_page() */
885 if (!inode->i_mapping->a_ops->readpage && !shmem_mapping(inode->i_mapping))
886 return -EIO;
888 /* Racy, just to catch the obvious mistakes */ 887 /* Racy, just to catch the obvious mistakes */
889 if (offset > i_size_read(inode)) 888 if (offset > i_size_read(inode))
890 return -EINVAL; 889 return -EINVAL;
@@ -928,7 +927,7 @@ int uprobe_apply(struct inode *inode, loff_t offset,
928 int ret = -ENOENT; 927 int ret = -ENOENT;
929 928
930 uprobe = find_uprobe(inode, offset); 929 uprobe = find_uprobe(inode, offset);
931 if (!uprobe) 930 if (WARN_ON(!uprobe))
932 return ret; 931 return ret;
933 932
934 down_write(&uprobe->register_rwsem); 933 down_write(&uprobe->register_rwsem);
@@ -953,7 +952,7 @@ void uprobe_unregister(struct inode *inode, loff_t offset, struct uprobe_consume
953 struct uprobe *uprobe; 952 struct uprobe *uprobe;
954 953
955 uprobe = find_uprobe(inode, offset); 954 uprobe = find_uprobe(inode, offset);
956 if (!uprobe) 955 if (WARN_ON(!uprobe))
957 return; 956 return;
958 957
959 down_write(&uprobe->register_rwsem); 958 down_write(&uprobe->register_rwsem);
@@ -1296,14 +1295,8 @@ static unsigned long xol_get_insn_slot(struct uprobe *uprobe)
1296 if (unlikely(!xol_vaddr)) 1295 if (unlikely(!xol_vaddr))
1297 return 0; 1296 return 0;
1298 1297
1299 /* Initialize the slot */ 1298 arch_uprobe_copy_ixol(area->page, xol_vaddr,
1300 copy_to_page(area->page, xol_vaddr, 1299 &uprobe->arch.ixol, sizeof(uprobe->arch.ixol));
1301 &uprobe->arch.ixol, sizeof(uprobe->arch.ixol));
1302 /*
1303 * We probably need flush_icache_user_range() but it needs vma.
1304 * This should work on supported architectures too.
1305 */
1306 flush_dcache_page(area->page);
1307 1300
1308 return xol_vaddr; 1301 return xol_vaddr;
1309} 1302}
@@ -1346,6 +1339,21 @@ static void xol_free_insn_slot(struct task_struct *tsk)
1346 } 1339 }
1347} 1340}
1348 1341
1342void __weak arch_uprobe_copy_ixol(struct page *page, unsigned long vaddr,
1343 void *src, unsigned long len)
1344{
1345 /* Initialize the slot */
1346 copy_to_page(page, vaddr, src, len);
1347
1348 /*
1349 * We probably need flush_icache_user_range() but it needs vma.
1350 * This should work on most architectures by default. If an
1351 * architecture needs to do something different, it can define
1352 * its own version of the function.
1353 */
1354 flush_dcache_page(page);
1355}
1356
1349/** 1357/**
1350 * uprobe_get_swbp_addr - compute address of swbp given post-swbp regs 1358 * uprobe_get_swbp_addr - compute address of swbp given post-swbp regs
1351 * @regs: Reflects the saved state of the task after it has hit a breakpoint 1359 * @regs: Reflects the saved state of the task after it has hit a breakpoint
@@ -1357,6 +1365,16 @@ unsigned long __weak uprobe_get_swbp_addr(struct pt_regs *regs)
1357 return instruction_pointer(regs) - UPROBE_SWBP_INSN_SIZE; 1365 return instruction_pointer(regs) - UPROBE_SWBP_INSN_SIZE;
1358} 1366}
1359 1367
1368unsigned long uprobe_get_trap_addr(struct pt_regs *regs)
1369{
1370 struct uprobe_task *utask = current->utask;
1371
1372 if (unlikely(utask && utask->active_uprobe))
1373 return utask->vaddr;
1374
1375 return instruction_pointer(regs);
1376}
1377
1360/* 1378/*
1361 * Called with no locks held. 1379 * Called with no locks held.
1362 * Called in context of a exiting or a exec-ing thread. 1380 * Called in context of a exiting or a exec-ing thread.
@@ -1628,20 +1646,6 @@ bool uprobe_deny_signal(void)
1628 return true; 1646 return true;
1629} 1647}
1630 1648
1631/*
1632 * Avoid singlestepping the original instruction if the original instruction
1633 * is a NOP or can be emulated.
1634 */
1635static bool can_skip_sstep(struct uprobe *uprobe, struct pt_regs *regs)
1636{
1637 if (test_bit(UPROBE_SKIP_SSTEP, &uprobe->flags)) {
1638 if (arch_uprobe_skip_sstep(&uprobe->arch, regs))
1639 return true;
1640 clear_bit(UPROBE_SKIP_SSTEP, &uprobe->flags);
1641 }
1642 return false;
1643}
1644
1645static void mmf_recalc_uprobes(struct mm_struct *mm) 1649static void mmf_recalc_uprobes(struct mm_struct *mm)
1646{ 1650{
1647 struct vm_area_struct *vma; 1651 struct vm_area_struct *vma;
@@ -1868,13 +1872,13 @@ static void handle_swbp(struct pt_regs *regs)
1868 1872
1869 handler_chain(uprobe, regs); 1873 handler_chain(uprobe, regs);
1870 1874
1871 if (can_skip_sstep(uprobe, regs)) 1875 if (arch_uprobe_skip_sstep(&uprobe->arch, regs))
1872 goto out; 1876 goto out;
1873 1877
1874 if (!pre_ssout(uprobe, regs, bp_vaddr)) 1878 if (!pre_ssout(uprobe, regs, bp_vaddr))
1875 return; 1879 return;
1876 1880
1877 /* can_skip_sstep() succeeded, or restart if can't singlestep */ 1881 /* arch_uprobe_skip_sstep() succeeded, or restart if can't singlestep */
1878out: 1882out:
1879 put_uprobe(uprobe); 1883 put_uprobe(uprobe);
1880} 1884}
@@ -1886,10 +1890,11 @@ out:
1886static void handle_singlestep(struct uprobe_task *utask, struct pt_regs *regs) 1890static void handle_singlestep(struct uprobe_task *utask, struct pt_regs *regs)
1887{ 1891{
1888 struct uprobe *uprobe; 1892 struct uprobe *uprobe;
1893 int err = 0;
1889 1894
1890 uprobe = utask->active_uprobe; 1895 uprobe = utask->active_uprobe;
1891 if (utask->state == UTASK_SSTEP_ACK) 1896 if (utask->state == UTASK_SSTEP_ACK)
1892 arch_uprobe_post_xol(&uprobe->arch, regs); 1897 err = arch_uprobe_post_xol(&uprobe->arch, regs);
1893 else if (utask->state == UTASK_SSTEP_TRAPPED) 1898 else if (utask->state == UTASK_SSTEP_TRAPPED)
1894 arch_uprobe_abort_xol(&uprobe->arch, regs); 1899 arch_uprobe_abort_xol(&uprobe->arch, regs);
1895 else 1900 else
@@ -1903,6 +1908,11 @@ static void handle_singlestep(struct uprobe_task *utask, struct pt_regs *regs)
1903 spin_lock_irq(&current->sighand->siglock); 1908 spin_lock_irq(&current->sighand->siglock);
1904 recalc_sigpending(); /* see uprobe_deny_signal() */ 1909 recalc_sigpending(); /* see uprobe_deny_signal() */
1905 spin_unlock_irq(&current->sighand->siglock); 1910 spin_unlock_irq(&current->sighand->siglock);
1911
1912 if (unlikely(err)) {
1913 uprobe_warn(current, "execute the probed insn, sending SIGILL.");
1914 force_sig_info(SIGILL, SEND_SIG_FORCED, current);
1915 }
1906} 1916}
1907 1917
1908/* 1918/*
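
The xol_get_insn_slot() change above moves slot initialization into arch_uprobe_copy_ixol(), defined __weak so an architecture with different cache-maintenance needs can provide its own copy routine. A small userspace sketch of the weak-symbol override pattern follows (a GCC/Clang extension); copy_ixol() is an illustrative stand-in, not the kernel function.

#include <stdio.h>
#include <string.h>

/* Generic default, standing in for copy_to_page() + flush_dcache_page(). */
__attribute__((weak)) void copy_ixol(char *dst, const char *src, size_t len)
{
	memcpy(dst, src, len);
	puts("generic copy_ixol: plain copy + dcache flush");
}

/*
 * An architecture needing extra icache maintenance would supply a strong
 * copy_ixol() definition in its own object file, silently replacing the
 * weak default above at link time.
 */

int main(void)
{
	char slot[16];

	copy_ixol(slot, "\xcc", 1);	/* e.g. copy the breakpoint insn */
	return 0;
}
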
diff --git a/kernel/exec_domain.c b/kernel/exec_domain.c
index 0dbeae374225..83d4382f5699 100644
--- a/kernel/exec_domain.c
+++ b/kernel/exec_domain.c
@@ -37,7 +37,7 @@ static unsigned long ident_map[32] = {
37struct exec_domain default_exec_domain = { 37struct exec_domain default_exec_domain = {
38 .name = "Linux", /* name */ 38 .name = "Linux", /* name */
39 .handler = default_handler, /* lcall7 causes a seg fault. */ 39 .handler = default_handler, /* lcall7 causes a seg fault. */
40 .pers_low = 0, /* PER_LINUX personality. */ 40 .pers_low = 0, /* PER_LINUX personality. */
41 .pers_high = 0, /* PER_LINUX personality. */ 41 .pers_high = 0, /* PER_LINUX personality. */
42 .signal_map = ident_map, /* Identity map signals. */ 42 .signal_map = ident_map, /* Identity map signals. */
43 .signal_invmap = ident_map, /* - both ways. */ 43 .signal_invmap = ident_map, /* - both ways. */
@@ -83,7 +83,7 @@ lookup_exec_domain(unsigned int personality)
83 ep = &default_exec_domain; 83 ep = &default_exec_domain;
84out: 84out:
85 read_unlock(&exec_domains_lock); 85 read_unlock(&exec_domains_lock);
86 return (ep); 86 return ep;
87} 87}
88 88
89int 89int
@@ -110,8 +110,9 @@ register_exec_domain(struct exec_domain *ep)
110 110
111out: 111out:
112 write_unlock(&exec_domains_lock); 112 write_unlock(&exec_domains_lock);
113 return (err); 113 return err;
114} 114}
115EXPORT_SYMBOL(register_exec_domain);
115 116
116int 117int
117unregister_exec_domain(struct exec_domain *ep) 118unregister_exec_domain(struct exec_domain *ep)
@@ -133,6 +134,7 @@ unregister:
133 write_unlock(&exec_domains_lock); 134 write_unlock(&exec_domains_lock);
134 return 0; 135 return 0;
135} 136}
137EXPORT_SYMBOL(unregister_exec_domain);
136 138
137int __set_personality(unsigned int personality) 139int __set_personality(unsigned int personality)
138{ 140{
@@ -144,6 +146,7 @@ int __set_personality(unsigned int personality)
144 146
145 return 0; 147 return 0;
146} 148}
149EXPORT_SYMBOL(__set_personality);
147 150
148#ifdef CONFIG_PROC_FS 151#ifdef CONFIG_PROC_FS
149static int execdomains_proc_show(struct seq_file *m, void *v) 152static int execdomains_proc_show(struct seq_file *m, void *v)
@@ -188,8 +191,3 @@ SYSCALL_DEFINE1(personality, unsigned int, personality)
188 191
189 return old; 192 return old;
190} 193}
191
192
193EXPORT_SYMBOL(register_exec_domain);
194EXPORT_SYMBOL(unregister_exec_domain);
195EXPORT_SYMBOL(__set_personality);
diff --git a/kernel/exit.c b/kernel/exit.c
index 6ed6a1d552b5..e5c4668f1799 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -313,46 +313,7 @@ kill_orphaned_pgrp(struct task_struct *tsk, struct task_struct *parent)
313 } 313 }
314} 314}
315 315
316/* 316#ifdef CONFIG_MEMCG
317 * Let kernel threads use this to say that they allow a certain signal.
318 * Must not be used if kthread was cloned with CLONE_SIGHAND.
319 */
320int allow_signal(int sig)
321{
322 if (!valid_signal(sig) || sig < 1)
323 return -EINVAL;
324
325 spin_lock_irq(&current->sighand->siglock);
326 /* This is only needed for daemonize()'ed kthreads */
327 sigdelset(&current->blocked, sig);
328 /*
329 * Kernel threads handle their own signals. Let the signal code
330 * know it'll be handled, so that they don't get converted to
331 * SIGKILL or just silently dropped.
332 */
333 current->sighand->action[(sig)-1].sa.sa_handler = (void __user *)2;
334 recalc_sigpending();
335 spin_unlock_irq(&current->sighand->siglock);
336 return 0;
337}
338
339EXPORT_SYMBOL(allow_signal);
340
341int disallow_signal(int sig)
342{
343 if (!valid_signal(sig) || sig < 1)
344 return -EINVAL;
345
346 spin_lock_irq(&current->sighand->siglock);
347 current->sighand->action[(sig)-1].sa.sa_handler = SIG_IGN;
348 recalc_sigpending();
349 spin_unlock_irq(&current->sighand->siglock);
350 return 0;
351}
352
353EXPORT_SYMBOL(disallow_signal);
354
355#ifdef CONFIG_MM_OWNER
356/* 317/*
357 * A task is exiting. If it owned this mm, find a new owner for the mm. 318 * A task is exiting. If it owned this mm, find a new owner for the mm.
358 */ 319 */
@@ -395,14 +356,18 @@ retry:
395 } 356 }
396 357
397 /* 358 /*
398 * Search through everything else. We should not get 359 * Search through everything else, we should not get here often.
399 * here often
400 */ 360 */
401 do_each_thread(g, c) { 361 for_each_process(g) {
402 if (c->mm == mm) 362 if (g->flags & PF_KTHREAD)
403 goto assign_new_owner; 363 continue;
404 } while_each_thread(g, c); 364 for_each_thread(g, c) {
405 365 if (c->mm == mm)
366 goto assign_new_owner;
367 if (c->mm)
368 break;
369 }
370 }
406 read_unlock(&tasklist_lock); 371 read_unlock(&tasklist_lock);
407 /* 372 /*
408 * We found no owner yet mm_users > 1: this implies that we are 373 * We found no owner yet mm_users > 1: this implies that we are
@@ -434,7 +399,7 @@ assign_new_owner:
434 task_unlock(c); 399 task_unlock(c);
435 put_task_struct(c); 400 put_task_struct(c);
436} 401}
437#endif /* CONFIG_MM_OWNER */ 402#endif /* CONFIG_MEMCG */
438 403
439/* 404/*
440 * Turn us into a lazy TLB process if we 405 * Turn us into a lazy TLB process if we
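
mm_update_next_owner() now walks for_each_process()/for_each_thread() directly, skipping PF_KTHREAD tasks and abandoning a process's thread list as soon as one of its threads is seen with some other mm, since the threads of a process share their mm. A userspace model of that two-level search; the types and fields are simplified stand-ins, not the kernel's.

#include <stddef.h>
#include <stdio.h>

struct thread { const void *mm; };
struct proc   { int kthread; struct thread threads[2]; size_t nthreads; };

static const struct thread *find_owner(struct proc *procs, size_t n,
				       const void *mm)
{
	for (size_t p = 0; p < n; p++) {
		if (procs[p].kthread)		/* kernel threads own no mm */
			continue;
		for (size_t t = 0; t < procs[p].nthreads; t++) {
			const struct thread *c = &procs[p].threads[t];

			if (c->mm == mm)
				return c;	/* candidate for new owner  */
			if (c->mm)
				break;		/* threads share their mm   */
		}
	}
	return NULL;
}

int main(void)
{
	int target;
	struct proc procs[] = {
		{ .kthread = 1 },
		{ .threads = { { &target } }, .nthreads = 1 },
	};

	printf("owner %sfound\n", find_owner(procs, 2, &target) ? "" : "not ");
	return 0;
}
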
diff --git a/kernel/fork.c b/kernel/fork.c
index 54a8d26f612f..6a13c46cd87d 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -150,15 +150,15 @@ void __weak arch_release_thread_info(struct thread_info *ti)
150static struct thread_info *alloc_thread_info_node(struct task_struct *tsk, 150static struct thread_info *alloc_thread_info_node(struct task_struct *tsk,
151 int node) 151 int node)
152{ 152{
153 struct page *page = alloc_pages_node(node, THREADINFO_GFP_ACCOUNTED, 153 struct page *page = alloc_kmem_pages_node(node, THREADINFO_GFP,
154 THREAD_SIZE_ORDER); 154 THREAD_SIZE_ORDER);
155 155
156 return page ? page_address(page) : NULL; 156 return page ? page_address(page) : NULL;
157} 157}
158 158
159static inline void free_thread_info(struct thread_info *ti) 159static inline void free_thread_info(struct thread_info *ti)
160{ 160{
161 free_memcg_kmem_pages((unsigned long)ti, THREAD_SIZE_ORDER); 161 free_kmem_pages((unsigned long)ti, THREAD_SIZE_ORDER);
162} 162}
163# else 163# else
164static struct kmem_cache *thread_info_cache; 164static struct kmem_cache *thread_info_cache;
@@ -1099,12 +1099,12 @@ static void rt_mutex_init_task(struct task_struct *p)
1099#endif 1099#endif
1100} 1100}
1101 1101
1102#ifdef CONFIG_MM_OWNER 1102#ifdef CONFIG_MEMCG
1103void mm_init_owner(struct mm_struct *mm, struct task_struct *p) 1103void mm_init_owner(struct mm_struct *mm, struct task_struct *p)
1104{ 1104{
1105 mm->owner = p; 1105 mm->owner = p;
1106} 1106}
1107#endif /* CONFIG_MM_OWNER */ 1107#endif /* CONFIG_MEMCG */
1108 1108
1109/* 1109/*
1110 * Initialize POSIX timer handling for a single task. 1110 * Initialize POSIX timer handling for a single task.
@@ -1487,7 +1487,9 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1487 1487
1488 total_forks++; 1488 total_forks++;
1489 spin_unlock(&current->sighand->siglock); 1489 spin_unlock(&current->sighand->siglock);
1490 syscall_tracepoint_update(p);
1490 write_unlock_irq(&tasklist_lock); 1491 write_unlock_irq(&tasklist_lock);
1492
1491 proc_fork_connector(p); 1493 proc_fork_connector(p);
1492 cgroup_post_fork(p); 1494 cgroup_post_fork(p);
1493 if (clone_flags & CLONE_THREAD) 1495 if (clone_flags & CLONE_THREAD)
@@ -1606,10 +1608,12 @@ long do_fork(unsigned long clone_flags,
1606 */ 1608 */
1607 if (!IS_ERR(p)) { 1609 if (!IS_ERR(p)) {
1608 struct completion vfork; 1610 struct completion vfork;
1611 struct pid *pid;
1609 1612
1610 trace_sched_process_fork(current, p); 1613 trace_sched_process_fork(current, p);
1611 1614
1612 nr = task_pid_vnr(p); 1615 pid = get_task_pid(p, PIDTYPE_PID);
1616 nr = pid_vnr(pid);
1613 1617
1614 if (clone_flags & CLONE_PARENT_SETTID) 1618 if (clone_flags & CLONE_PARENT_SETTID)
1615 put_user(nr, parent_tidptr); 1619 put_user(nr, parent_tidptr);
@@ -1624,12 +1628,14 @@ long do_fork(unsigned long clone_flags,
1624 1628
1625 /* forking complete and child started to run, tell ptracer */ 1629 /* forking complete and child started to run, tell ptracer */
1626 if (unlikely(trace)) 1630 if (unlikely(trace))
1627 ptrace_event(trace, nr); 1631 ptrace_event_pid(trace, pid);
1628 1632
1629 if (clone_flags & CLONE_VFORK) { 1633 if (clone_flags & CLONE_VFORK) {
1630 if (!wait_for_vfork_done(p, &vfork)) 1634 if (!wait_for_vfork_done(p, &vfork))
1631 ptrace_event(PTRACE_EVENT_VFORK_DONE, nr); 1635 ptrace_event_pid(PTRACE_EVENT_VFORK_DONE, pid);
1632 } 1636 }
1637
1638 put_pid(pid);
1633 } else { 1639 } else {
1634 nr = PTR_ERR(p); 1640 nr = PTR_ERR(p);
1635 } 1641 }
diff --git a/kernel/futex.c b/kernel/futex.c
index 5f589279e462..b632b5f3f094 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -267,7 +267,7 @@ static inline void futex_get_mm(union futex_key *key)
267 * get_futex_key() implies a full barrier. This is relied upon 267 * get_futex_key() implies a full barrier. This is relied upon
268 * as full barrier (B), see the ordering comment above. 268 * as full barrier (B), see the ordering comment above.
269 */ 269 */
270 smp_mb__after_atomic_inc(); 270 smp_mb__after_atomic();
271} 271}
272 272
273/* 273/*
@@ -280,7 +280,7 @@ static inline void hb_waiters_inc(struct futex_hash_bucket *hb)
280 /* 280 /*
281 * Full barrier (A), see the ordering comment above. 281 * Full barrier (A), see the ordering comment above.
282 */ 282 */
283 smp_mb__after_atomic_inc(); 283 smp_mb__after_atomic();
284#endif 284#endif
285} 285}
286 286
@@ -743,6 +743,55 @@ void exit_pi_state_list(struct task_struct *curr)
743 raw_spin_unlock_irq(&curr->pi_lock); 743 raw_spin_unlock_irq(&curr->pi_lock);
744} 744}
745 745
746/*
747 * We need to check the following states:
748 *
749 * Waiter | pi_state | pi->owner | uTID | uODIED | ?
750 *
751 * [1] NULL | --- | --- | 0 | 0/1 | Valid
752 * [2] NULL | --- | --- | >0 | 0/1 | Valid
753 *
754 * [3] Found | NULL | -- | Any | 0/1 | Invalid
755 *
756 * [4] Found | Found | NULL | 0 | 1 | Valid
757 * [5] Found | Found | NULL | >0 | 1 | Invalid
758 *
759 * [6] Found | Found | task | 0 | 1 | Valid
760 *
761 * [7] Found | Found | NULL | Any | 0 | Invalid
762 *
763 * [8] Found | Found | task | ==taskTID | 0/1 | Valid
764 * [9] Found | Found | task | 0 | 0 | Invalid
765 * [10] Found | Found | task | !=taskTID | 0/1 | Invalid
766 *
767 * [1] Indicates that the kernel can acquire the futex atomically. We
768 * came came here due to a stale FUTEX_WAITERS/FUTEX_OWNER_DIED bit.
769 *
770 * [2] Valid, if TID does not belong to a kernel thread. If no matching
771 * thread is found then it indicates that the owner TID has died.
772 *
773 * [3] Invalid. The waiter is queued on a non PI futex
774 *
775 * [4] Valid state after exit_robust_list(), which sets the user space
776 * value to FUTEX_WAITERS | FUTEX_OWNER_DIED.
777 *
778 * [5] The user space value got manipulated between exit_robust_list()
779 * and exit_pi_state_list()
780 *
781 * [6] Valid state after exit_pi_state_list() which sets the new owner in
782 * the pi_state but cannot access the user space value.
783 *
784 * [7] pi_state->owner can only be NULL when the OWNER_DIED bit is set.
785 *
786 * [8] Owner and user space value match
787 *
788 * [9] There is no transient state which sets the user space TID to 0
789 * except exit_robust_list(), but this is indicated by the
790 * FUTEX_OWNER_DIED bit. See [4]
791 *
792 * [10] There is no transient state which leaves owner and user space
793 * TID out of sync.
794 */
746static int 795static int
747lookup_pi_state(u32 uval, struct futex_hash_bucket *hb, 796lookup_pi_state(u32 uval, struct futex_hash_bucket *hb,
748 union futex_key *key, struct futex_pi_state **ps) 797 union futex_key *key, struct futex_pi_state **ps)
@@ -755,12 +804,13 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb,
755 plist_for_each_entry_safe(this, next, &hb->chain, list) { 804 plist_for_each_entry_safe(this, next, &hb->chain, list) {
756 if (match_futex(&this->key, key)) { 805 if (match_futex(&this->key, key)) {
757 /* 806 /*
758 * Another waiter already exists - bump up 807 * Sanity check the waiter before increasing
759 * the refcount and return its pi_state: 808 * the refcount and attaching to it.
760 */ 809 */
761 pi_state = this->pi_state; 810 pi_state = this->pi_state;
762 /* 811 /*
763 * Userspace might have messed up non-PI and PI futexes 812 * Userspace might have messed up non-PI and
813 * PI futexes [3]
764 */ 814 */
765 if (unlikely(!pi_state)) 815 if (unlikely(!pi_state))
766 return -EINVAL; 816 return -EINVAL;
@@ -768,34 +818,70 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb,
768 WARN_ON(!atomic_read(&pi_state->refcount)); 818 WARN_ON(!atomic_read(&pi_state->refcount));
769 819
770 /* 820 /*
771 * When pi_state->owner is NULL then the owner died 821 * Handle the owner died case:
772 * and another waiter is on the fly. pi_state->owner
773 * is fixed up by the task which acquires
774 * pi_state->rt_mutex.
775 *
776 * We do not check for pid == 0 which can happen when
777 * the owner died and robust_list_exit() cleared the
778 * TID.
779 */ 822 */
780 if (pid && pi_state->owner) { 823 if (uval & FUTEX_OWNER_DIED) {
824 /*
825 * exit_pi_state_list sets owner to NULL and
826 * wakes the topmost waiter. The task which
827 * acquires the pi_state->rt_mutex will fixup
828 * owner.
829 */
830 if (!pi_state->owner) {
831 /*
832 * No pi state owner, but the user
833 * space TID is not 0. Inconsistent
834 * state. [5]
835 */
836 if (pid)
837 return -EINVAL;
838 /*
839 * Take a ref on the state and
840 * return. [4]
841 */
842 goto out_state;
843 }
844
781 /* 845 /*
782 * Bail out if user space manipulated the 846 * If TID is 0, then either the dying owner
783 * futex value. 847 * has not yet executed exit_pi_state_list()
848 * or some waiter acquired the rtmutex in the
849 * pi state, but did not yet fixup the TID in
850 * user space.
851 *
852 * Take a ref on the state and return. [6]
784 */ 853 */
785 if (pid != task_pid_vnr(pi_state->owner)) 854 if (!pid)
855 goto out_state;
856 } else {
857 /*
858 * If the owner died bit is not set,
859 * then the pi_state must have an
860 * owner. [7]
861 */
862 if (!pi_state->owner)
786 return -EINVAL; 863 return -EINVAL;
787 } 864 }
788 865
866 /*
867 * Bail out if user space manipulated the
868 * futex value. If pi state exists then the
869 * owner TID must be the same as the user
870 * space TID. [9/10]
871 */
872 if (pid != task_pid_vnr(pi_state->owner))
873 return -EINVAL;
874
875 out_state:
789 atomic_inc(&pi_state->refcount); 876 atomic_inc(&pi_state->refcount);
790 *ps = pi_state; 877 *ps = pi_state;
791
792 return 0; 878 return 0;
793 } 879 }
794 } 880 }
795 881
796 /* 882 /*
797 * We are the first waiter - try to look up the real owner and attach 883 * We are the first waiter - try to look up the real owner and attach
798 * the new pi_state to it, but bail out when TID = 0 884 * the new pi_state to it, but bail out when TID = 0 [1]
799 */ 885 */
800 if (!pid) 886 if (!pid)
801 return -ESRCH; 887 return -ESRCH;
@@ -803,6 +889,11 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb,
803 if (!p) 889 if (!p)
804 return -ESRCH; 890 return -ESRCH;
805 891
892 if (!p->mm) {
893 put_task_struct(p);
894 return -EPERM;
895 }
896
806 /* 897 /*
807 * We need to look at the task state flags to figure out, 898 * We need to look at the task state flags to figure out,
808 * whether the task is exiting. To protect against the do_exit 899 * whether the task is exiting. To protect against the do_exit
@@ -823,6 +914,9 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb,
823 return ret; 914 return ret;
824 } 915 }
825 916
917 /*
918 * No existing pi state. First waiter. [2]
919 */
826 pi_state = alloc_pi_state(); 920 pi_state = alloc_pi_state();
827 921
828 /* 922 /*
@@ -894,10 +988,18 @@ retry:
894 return -EDEADLK; 988 return -EDEADLK;
895 989
896 /* 990 /*
897 * Surprise - we got the lock. Just return to userspace: 991 * Surprise - we got the lock, but we do not trust user space at all.
898 */ 992 */
899 if (unlikely(!curval)) 993 if (unlikely(!curval)) {
900 return 1; 994 /*
995 * We verify whether there is kernel state for this
996 * futex. If not, we can safely assume, that the 0 ->
997 * TID transition is correct. If state exists, we do
998 * not bother to fixup the user space state as it was
999 * corrupted already.
1000 */
1001 return futex_top_waiter(hb, key) ? -EINVAL : 1;
1002 }
901 1003
902 uval = curval; 1004 uval = curval;
903 1005
@@ -1028,6 +1130,7 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this)
1028 struct task_struct *new_owner; 1130 struct task_struct *new_owner;
1029 struct futex_pi_state *pi_state = this->pi_state; 1131 struct futex_pi_state *pi_state = this->pi_state;
1030 u32 uninitialized_var(curval), newval; 1132 u32 uninitialized_var(curval), newval;
1133 int ret = 0;
1031 1134
1032 if (!pi_state) 1135 if (!pi_state)
1033 return -EINVAL; 1136 return -EINVAL;
@@ -1051,23 +1154,19 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this)
1051 new_owner = this->task; 1154 new_owner = this->task;
1052 1155
1053 /* 1156 /*
1054 * We pass it to the next owner. (The WAITERS bit is always 1157 * We pass it to the next owner. The WAITERS bit is always
1055 * kept enabled while there is PI state around. We must also 1158 * kept enabled while there is PI state around. We cleanup the
1056 * preserve the owner died bit.) 1159 * owner died bit, because we are the owner.
1057 */ 1160 */
1058 if (!(uval & FUTEX_OWNER_DIED)) { 1161 newval = FUTEX_WAITERS | task_pid_vnr(new_owner);
1059 int ret = 0;
1060 1162
1061 newval = FUTEX_WAITERS | task_pid_vnr(new_owner); 1163 if (cmpxchg_futex_value_locked(&curval, uaddr, uval, newval))
1062 1164 ret = -EFAULT;
1063 if (cmpxchg_futex_value_locked(&curval, uaddr, uval, newval)) 1165 else if (curval != uval)
1064 ret = -EFAULT; 1166 ret = -EINVAL;
1065 else if (curval != uval) 1167 if (ret) {
1066 ret = -EINVAL; 1168 raw_spin_unlock(&pi_state->pi_mutex.wait_lock);
1067 if (ret) { 1169 return ret;
1068 raw_spin_unlock(&pi_state->pi_mutex.wait_lock);
1069 return ret;
1070 }
1071 } 1170 }
1072 1171
1073 raw_spin_lock_irq(&pi_state->owner->pi_lock); 1172 raw_spin_lock_irq(&pi_state->owner->pi_lock);
@@ -1347,7 +1446,7 @@ void requeue_pi_wake_futex(struct futex_q *q, union futex_key *key,
1347 * 1446 *
1348 * Return: 1447 * Return:
1349 * 0 - failed to acquire the lock atomically; 1448 * 0 - failed to acquire the lock atomically;
1350 * 1 - acquired the lock; 1449 * >0 - acquired the lock, return value is vpid of the top_waiter
1351 * <0 - error 1450 * <0 - error
1352 */ 1451 */
1353static int futex_proxy_trylock_atomic(u32 __user *pifutex, 1452static int futex_proxy_trylock_atomic(u32 __user *pifutex,
@@ -1358,7 +1457,7 @@ static int futex_proxy_trylock_atomic(u32 __user *pifutex,
1358{ 1457{
1359 struct futex_q *top_waiter = NULL; 1458 struct futex_q *top_waiter = NULL;
1360 u32 curval; 1459 u32 curval;
1361 int ret; 1460 int ret, vpid;
1362 1461
1363 if (get_futex_value_locked(&curval, pifutex)) 1462 if (get_futex_value_locked(&curval, pifutex))
1364 return -EFAULT; 1463 return -EFAULT;
@@ -1386,11 +1485,13 @@ static int futex_proxy_trylock_atomic(u32 __user *pifutex,
1386 * the contended case or if set_waiters is 1. The pi_state is returned 1485 * the contended case or if set_waiters is 1. The pi_state is returned
1387 * in ps in contended cases. 1486 * in ps in contended cases.
1388 */ 1487 */
1488 vpid = task_pid_vnr(top_waiter->task);
1389 ret = futex_lock_pi_atomic(pifutex, hb2, key2, ps, top_waiter->task, 1489 ret = futex_lock_pi_atomic(pifutex, hb2, key2, ps, top_waiter->task,
1390 set_waiters); 1490 set_waiters);
1391 if (ret == 1) 1491 if (ret == 1) {
1392 requeue_pi_wake_futex(top_waiter, key2, hb2); 1492 requeue_pi_wake_futex(top_waiter, key2, hb2);
1393 1493 return vpid;
1494 }
1394 return ret; 1495 return ret;
1395} 1496}
1396 1497
@@ -1421,10 +1522,16 @@ static int futex_requeue(u32 __user *uaddr1, unsigned int flags,
1421 struct futex_pi_state *pi_state = NULL; 1522 struct futex_pi_state *pi_state = NULL;
1422 struct futex_hash_bucket *hb1, *hb2; 1523 struct futex_hash_bucket *hb1, *hb2;
1423 struct futex_q *this, *next; 1524 struct futex_q *this, *next;
1424 u32 curval2;
1425 1525
1426 if (requeue_pi) { 1526 if (requeue_pi) {
1427 /* 1527 /*
1528 * Requeue PI only works on two distinct uaddrs. This
1529 * check is only valid for private futexes. See below.
1530 */
1531 if (uaddr1 == uaddr2)
1532 return -EINVAL;
1533
1534 /*
1428 * requeue_pi requires a pi_state, try to allocate it now 1535 * requeue_pi requires a pi_state, try to allocate it now
1429 * without any locks in case it fails. 1536 * without any locks in case it fails.
1430 */ 1537 */
@@ -1462,6 +1569,15 @@ retry:
1462 if (unlikely(ret != 0)) 1569 if (unlikely(ret != 0))
1463 goto out_put_key1; 1570 goto out_put_key1;
1464 1571
1572 /*
1573 * The check above which compares uaddrs is not sufficient for
1574 * shared futexes. We need to compare the keys:
1575 */
1576 if (requeue_pi && match_futex(&key1, &key2)) {
1577 ret = -EINVAL;
1578 goto out_put_keys;
1579 }
1580
1465 hb1 = hash_futex(&key1); 1581 hb1 = hash_futex(&key1);
1466 hb2 = hash_futex(&key2); 1582 hb2 = hash_futex(&key2);
1467 1583
@@ -1509,16 +1625,25 @@ retry_private:
1509 * At this point the top_waiter has either taken uaddr2 or is 1625 * At this point the top_waiter has either taken uaddr2 or is
1510 * waiting on it. If the former, then the pi_state will not 1626 * waiting on it. If the former, then the pi_state will not
1511 * exist yet, look it up one more time to ensure we have a 1627 * exist yet, look it up one more time to ensure we have a
1512 * reference to it. 1628 * reference to it. If the lock was taken, ret contains the
1629 * vpid of the top waiter task.
1513 */ 1630 */
1514 if (ret == 1) { 1631 if (ret > 0) {
1515 WARN_ON(pi_state); 1632 WARN_ON(pi_state);
1516 drop_count++; 1633 drop_count++;
1517 task_count++; 1634 task_count++;
1518 ret = get_futex_value_locked(&curval2, uaddr2); 1635 /*
1519 if (!ret) 1636 * If we acquired the lock, then the user
1520 ret = lookup_pi_state(curval2, hb2, &key2, 1637 * space value of uaddr2 should be vpid. It
1521 &pi_state); 1638 * cannot be changed by the top waiter as it
1639 * is blocked on hb2 lock if it tries to do
1640 * so. If something fiddled with it behind our
1641 * back the pi state lookup might unearth
1642 * it. So we rather use the known value than
1643 * rereading and handing potential crap to
1644 * lookup_pi_state.
1645 */
1646 ret = lookup_pi_state(ret, hb2, &key2, &pi_state);
1522 } 1647 }
1523 1648
1524 switch (ret) { 1649 switch (ret) {
@@ -2301,9 +2426,10 @@ retry:
2301 /* 2426 /*
2302 * To avoid races, try to do the TID -> 0 atomic transition 2427 * To avoid races, try to do the TID -> 0 atomic transition
2303 * again. If it succeeds then we can return without waking 2428 * again. If it succeeds then we can return without waking
2304 * anyone else up: 2429 * anyone else up. We only try this if neither the waiters nor
2430 * the owner died bit are set.
2305 */ 2431 */
2306 if (!(uval & FUTEX_OWNER_DIED) && 2432 if (!(uval & ~FUTEX_TID_MASK) &&
2307 cmpxchg_futex_value_locked(&uval, uaddr, vpid, 0)) 2433 cmpxchg_futex_value_locked(&uval, uaddr, vpid, 0))
2308 goto pi_faulted; 2434 goto pi_faulted;
2309 /* 2435 /*
@@ -2333,11 +2459,9 @@ retry:
2333 /* 2459 /*
2334 * No waiters - kernel unlocks the futex: 2460 * No waiters - kernel unlocks the futex:
2335 */ 2461 */
2336 if (!(uval & FUTEX_OWNER_DIED)) { 2462 ret = unlock_futex_pi(uaddr, uval);
2337 ret = unlock_futex_pi(uaddr, uval); 2463 if (ret == -EFAULT)
2338 if (ret == -EFAULT) 2464 goto pi_faulted;
2339 goto pi_faulted;
2340 }
2341 2465
2342out_unlock: 2466out_unlock:
2343 spin_unlock(&hb->lock); 2467 spin_unlock(&hb->lock);
@@ -2499,6 +2623,15 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
2499 if (ret) 2623 if (ret)
2500 goto out_key2; 2624 goto out_key2;
2501 2625
2626 /*
2627 * The check above which compares uaddrs is not sufficient for
2628 * shared futexes. We need to compare the keys:
2629 */
2630 if (match_futex(&q.key, &key2)) {
2631 ret = -EINVAL;
2632 goto out_put_keys;
2633 }
2634
2502 /* Queue the futex_q, drop the hb lock, wait for wakeup. */ 2635 /* Queue the futex_q, drop the hb lock, wait for wakeup. */
2503 futex_wait_queue_me(hb, &q, to); 2636 futex_wait_queue_me(hb, &q, to);
2504 2637
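
The reworked lookup_pi_state() above encodes the state table added in the same hunk: case [3] rejects a waiter without pi_state, the FUTEX_OWNER_DIED branch distinguishes [4]/[5]/[6], a missing owner without the died bit is [7], and the final TID comparison separates [8] from [9]/[10]. A compact userspace model of those checks; the struct and fields are simplified stand-ins, not the kernel's.

#include <errno.h>
#include <stdbool.h>
#include <stdio.h>

struct pi_model {
	bool has_pi_state;	/* the found waiter carries a pi_state	*/
	bool has_owner;		/* pi_state->owner != NULL		*/
	unsigned int owner_tid;	/* task_pid_vnr(pi_state->owner)	*/
};

/* Returns 0 when attaching to the existing pi_state is allowed. */
static int check_attach(const struct pi_model *s, unsigned int utid,
			bool owner_died)
{
	if (!s->has_pi_state)
		return -EINVAL;				/* [3]		*/

	if (owner_died) {
		if (!s->has_owner)
			return utid ? -EINVAL : 0;	/* [5] / [4]	*/
		if (!utid)
			return 0;			/* [6]		*/
	} else if (!s->has_owner) {
		return -EINVAL;				/* [7]		*/
	}
	/* [8] when the TIDs match, [9]/[10] otherwise. */
	return utid == s->owner_tid ? 0 : -EINVAL;
}

int main(void)
{
	struct pi_model s = { true, true, 1234 };

	printf("%d %d\n", check_attach(&s, 1234, false),	/*   0	*/
			  check_attach(&s, 0, false));		/* -22	*/
	return 0;
}
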
diff --git a/kernel/gcov/base.c b/kernel/gcov/base.c
index f45b75b713c0..b358a802fd18 100644
--- a/kernel/gcov/base.c
+++ b/kernel/gcov/base.c
@@ -85,6 +85,12 @@ void __gcov_merge_ior(gcov_type *counters, unsigned int n_counters)
85} 85}
86EXPORT_SYMBOL(__gcov_merge_ior); 86EXPORT_SYMBOL(__gcov_merge_ior);
87 87
88void __gcov_merge_time_profile(gcov_type *counters, unsigned int n_counters)
89{
90 /* Unused. */
91}
92EXPORT_SYMBOL(__gcov_merge_time_profile);
93
88/** 94/**
89 * gcov_enable_events - enable event reporting through gcov_event() 95 * gcov_enable_events - enable event reporting through gcov_event()
90 * 96 *
diff --git a/kernel/gcov/gcc_4_7.c b/kernel/gcov/gcc_4_7.c
index 2c6e4631c814..826ba9fb5e32 100644
--- a/kernel/gcov/gcc_4_7.c
+++ b/kernel/gcov/gcc_4_7.c
@@ -18,7 +18,12 @@
18#include <linux/vmalloc.h> 18#include <linux/vmalloc.h>
19#include "gcov.h" 19#include "gcov.h"
20 20
21#if __GNUC__ == 4 && __GNUC_MINOR__ >= 9
22#define GCOV_COUNTERS 9
23#else
21#define GCOV_COUNTERS 8 24#define GCOV_COUNTERS 8
25#endif
26
22#define GCOV_TAG_FUNCTION_LENGTH 3 27#define GCOV_TAG_FUNCTION_LENGTH 3
23 28
24static struct gcov_info *gcov_info_head; 29static struct gcov_info *gcov_info_head;
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index 6b715c0af1b1..3ab28993f6e0 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -990,11 +990,8 @@ int __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim,
990 /* Remove an active timer from the queue: */ 990 /* Remove an active timer from the queue: */
991 ret = remove_hrtimer(timer, base); 991 ret = remove_hrtimer(timer, base);
992 992
993 /* Switch the timer base, if necessary: */
994 new_base = switch_hrtimer_base(timer, base, mode & HRTIMER_MODE_PINNED);
995
996 if (mode & HRTIMER_MODE_REL) { 993 if (mode & HRTIMER_MODE_REL) {
997 tim = ktime_add_safe(tim, new_base->get_time()); 994 tim = ktime_add_safe(tim, base->get_time());
998 /* 995 /*
999 * CONFIG_TIME_LOW_RES is a temporary way for architectures 996 * CONFIG_TIME_LOW_RES is a temporary way for architectures
1000 * to signal that they simply return xtime in 997 * to signal that they simply return xtime in
@@ -1009,6 +1006,9 @@ int __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim,
1009 1006
1010 hrtimer_set_expires_range_ns(timer, tim, delta_ns); 1007 hrtimer_set_expires_range_ns(timer, tim, delta_ns);
1011 1008
1009 /* Switch the timer base, if necessary: */
1010 new_base = switch_hrtimer_base(timer, base, mode & HRTIMER_MODE_PINNED);
1011
1012 timer_stats_hrtimer_set_start_info(timer); 1012 timer_stats_hrtimer_set_start_info(timer);
1013 1013
1014 leftmost = enqueue_hrtimer(timer, new_base); 1014 leftmost = enqueue_hrtimer(timer, new_base);
@@ -1039,6 +1039,7 @@ int __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim,
1039 1039
1040 return ret; 1040 return ret;
1041} 1041}
1042EXPORT_SYMBOL_GPL(__hrtimer_start_range_ns);
1042 1043
1043/** 1044/**
1044 * hrtimer_start_range_ns - (re)start an hrtimer on the current CPU 1045 * hrtimer_start_range_ns - (re)start an hrtimer on the current CPU
diff --git a/kernel/hung_task.c b/kernel/hung_task.c
index 06bb1417b063..06db12434d72 100644
--- a/kernel/hung_task.c
+++ b/kernel/hung_task.c
@@ -52,8 +52,10 @@ unsigned int __read_mostly sysctl_hung_task_panic =
52 52
53static int __init hung_task_panic_setup(char *str) 53static int __init hung_task_panic_setup(char *str)
54{ 54{
55 sysctl_hung_task_panic = simple_strtoul(str, NULL, 0); 55 int rc = kstrtouint(str, 0, &sysctl_hung_task_panic);
56 56
57 if (rc)
58 return rc;
57 return 1; 59 return 1;
58} 60}
59__setup("hung_task_panic=", hung_task_panic_setup); 61__setup("hung_task_panic=", hung_task_panic_setup);
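
The hung_task_panic= handler above switches from simple_strtoul() to kstrtouint(), which rejects overflow and trailing garbage instead of silently returning a partial value. A userspace sketch of that stricter parsing; parse_uint() is an illustrative stand-in, not a kernel helper.

#include <errno.h>
#include <limits.h>
#include <stdio.h>
#include <stdlib.h>

static int parse_uint(const char *s, unsigned int *out)
{
	unsigned long val;
	char *end;

	errno = 0;
	val = strtoul(s, &end, 0);
	if (errno || end == s || *end != '\0' || val > UINT_MAX)
		return -EINVAL;		/* reject garbage and overflow */
	*out = (unsigned int)val;
	return 0;
}

int main(void)
{
	unsigned int v;

	printf("%d\n", parse_uint("1", &v));	/*  0, v == 1       */
	printf("%d\n", parse_uint("1x", &v));	/* -EINVAL, i.e. -22 */
	return 0;
}
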
diff --git a/kernel/irq/Kconfig b/kernel/irq/Kconfig
index 07cbdfea9ae2..d269cecdfbf0 100644
--- a/kernel/irq/Kconfig
+++ b/kernel/irq/Kconfig
@@ -5,6 +5,10 @@ menu "IRQ subsystem"
5config MAY_HAVE_SPARSE_IRQ 5config MAY_HAVE_SPARSE_IRQ
6 bool 6 bool
7 7
8# Legacy support, required for itanic
9config GENERIC_IRQ_LEGACY
10 bool
11
8# Enable the generic irq autoprobe mechanism 12# Enable the generic irq autoprobe mechanism
9config GENERIC_IRQ_PROBE 13config GENERIC_IRQ_PROBE
10 bool 14 bool
@@ -17,6 +21,11 @@ config GENERIC_IRQ_SHOW
17config GENERIC_IRQ_SHOW_LEVEL 21config GENERIC_IRQ_SHOW_LEVEL
18 bool 22 bool
19 23
24# Facility to allocate a hardware interrupt. This is legacy support
25# and should not be used in new code. Use irq domains instead.
26config GENERIC_IRQ_LEGACY_ALLOC_HWIRQ
27 bool
28
20# Support for delayed migration from interrupt context 29# Support for delayed migration from interrupt context
21config GENERIC_PENDING_IRQ 30config GENERIC_PENDING_IRQ
22 bool 31 bool
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index 6397df2d6945..a2b28a2fd7b1 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -40,10 +40,9 @@ int irq_set_chip(unsigned int irq, struct irq_chip *chip)
40 irq_put_desc_unlock(desc, flags); 40 irq_put_desc_unlock(desc, flags);
41 /* 41 /*
42 * For !CONFIG_SPARSE_IRQ make the irq show up in 42 * For !CONFIG_SPARSE_IRQ make the irq show up in
43 * allocated_irqs. For the CONFIG_SPARSE_IRQ case, it is 43 * allocated_irqs.
44 * already marked, and this call is harmless.
45 */ 44 */
46 irq_reserve_irq(irq); 45 irq_mark_irq(irq);
47 return 0; 46 return 0;
48} 47}
49EXPORT_SYMBOL(irq_set_chip); 48EXPORT_SYMBOL(irq_set_chip);
diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h
index ddf1ffeb79f1..099ea2e0eb88 100644
--- a/kernel/irq/internals.h
+++ b/kernel/irq/internals.h
@@ -33,7 +33,7 @@ enum {
33}; 33};
34 34
35/* 35/*
36 * Bit masks for desc->state 36 * Bit masks for desc->core_internal_state__do_not_mess_with_it
37 * 37 *
38 * IRQS_AUTODETECT - autodetection in progress 38 * IRQS_AUTODETECT - autodetection in progress
39 * IRQS_SPURIOUS_DISABLED - was disabled due to spurious interrupt 39 * IRQS_SPURIOUS_DISABLED - was disabled due to spurious interrupt
@@ -76,6 +76,12 @@ extern void mask_irq(struct irq_desc *desc);
76extern void unmask_irq(struct irq_desc *desc); 76extern void unmask_irq(struct irq_desc *desc);
77extern void unmask_threaded_irq(struct irq_desc *desc); 77extern void unmask_threaded_irq(struct irq_desc *desc);
78 78
79#ifdef CONFIG_SPARSE_IRQ
80static inline void irq_mark_irq(unsigned int irq) { }
81#else
82extern void irq_mark_irq(unsigned int irq);
83#endif
84
79extern void init_kstat_irqs(struct irq_desc *desc, int node, int nr); 85extern void init_kstat_irqs(struct irq_desc *desc, int node, int nr);
80 86
81irqreturn_t handle_irq_event_percpu(struct irq_desc *desc, struct irqaction *action); 87irqreturn_t handle_irq_event_percpu(struct irq_desc *desc, struct irqaction *action);
diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c
index bb07f2928f4b..1487a123db5c 100644
--- a/kernel/irq/irqdesc.c
+++ b/kernel/irq/irqdesc.c
@@ -278,7 +278,12 @@ EXPORT_SYMBOL(irq_to_desc);
278 278
279static void free_desc(unsigned int irq) 279static void free_desc(unsigned int irq)
280{ 280{
281 dynamic_irq_cleanup(irq); 281 struct irq_desc *desc = irq_to_desc(irq);
282 unsigned long flags;
283
284 raw_spin_lock_irqsave(&desc->lock, flags);
285 desc_set_defaults(irq, desc, desc_node(desc), NULL);
286 raw_spin_unlock_irqrestore(&desc->lock, flags);
282} 287}
283 288
284static inline int alloc_descs(unsigned int start, unsigned int cnt, int node, 289static inline int alloc_descs(unsigned int start, unsigned int cnt, int node,
@@ -299,6 +304,20 @@ static int irq_expand_nr_irqs(unsigned int nr)
299 return -ENOMEM; 304 return -ENOMEM;
300} 305}
301 306
307void irq_mark_irq(unsigned int irq)
308{
309 mutex_lock(&sparse_irq_lock);
310 bitmap_set(allocated_irqs, irq, 1);
311 mutex_unlock(&sparse_irq_lock);
312}
313
314#ifdef CONFIG_GENERIC_IRQ_LEGACY
315void irq_init_desc(unsigned int irq)
316{
317 free_desc(irq);
318}
319#endif
320
302#endif /* !CONFIG_SPARSE_IRQ */ 321#endif /* !CONFIG_SPARSE_IRQ */
303 322
304/** 323/**
@@ -396,30 +415,56 @@ err:
396} 415}
397EXPORT_SYMBOL_GPL(__irq_alloc_descs); 416EXPORT_SYMBOL_GPL(__irq_alloc_descs);
398 417
418#ifdef CONFIG_GENERIC_IRQ_LEGACY_ALLOC_HWIRQ
399/** 419/**
400 * irq_reserve_irqs - mark irqs allocated 420 * irq_alloc_hwirqs - Allocate an irq descriptor and initialize the hardware
401 * @from: mark from irq number 421 * @cnt: number of interrupts to allocate
402 * @cnt: number of irqs to mark 422 * @node: node on which to allocate
403 * 423 *
404 * Returns 0 on success or an appropriate error code 424 * Returns an interrupt number > 0 or 0, if the allocation fails.
405 */ 425 */
406int irq_reserve_irqs(unsigned int from, unsigned int cnt) 426unsigned int irq_alloc_hwirqs(int cnt, int node)
407{ 427{
408 unsigned int start; 428 int i, irq = __irq_alloc_descs(-1, 0, cnt, node, NULL);
409 int ret = 0;
410 429
411 if (!cnt || (from + cnt) > nr_irqs) 430 if (irq < 0)
412 return -EINVAL; 431 return 0;
413 432
414 mutex_lock(&sparse_irq_lock); 433 for (i = irq; cnt > 0; i++, cnt--) {
415 start = bitmap_find_next_zero_area(allocated_irqs, nr_irqs, from, cnt, 0); 434 if (arch_setup_hwirq(i, node))
416 if (start == from) 435 goto err;
417 bitmap_set(allocated_irqs, start, cnt); 436 irq_clear_status_flags(i, _IRQ_NOREQUEST);
418 else 437 }
419 ret = -EEXIST; 438 return irq;
420 mutex_unlock(&sparse_irq_lock); 439
421 return ret; 440err:
441 for (i--; i >= irq; i--) {
442 irq_set_status_flags(i, _IRQ_NOREQUEST | _IRQ_NOPROBE);
443 arch_teardown_hwirq(i);
444 }
445 irq_free_descs(irq, cnt);
446 return 0;
447}
448EXPORT_SYMBOL_GPL(irq_alloc_hwirqs);
449
450/**
451 * irq_free_hwirqs - Free irq descriptor and cleanup the hardware
452 * @from: Free from irq number
453 * @cnt: number of interrupts to free
454 *
455 */
456void irq_free_hwirqs(unsigned int from, int cnt)
457{
458 int i, j;
459
460 for (i = from, j = cnt; j > 0; i++, j--) {
461 irq_set_status_flags(i, _IRQ_NOREQUEST | _IRQ_NOPROBE);
462 arch_teardown_hwirq(i);
463 }
464 irq_free_descs(from, cnt);
422} 465}
466EXPORT_SYMBOL_GPL(irq_free_hwirqs);
467#endif
423 468
424/** 469/**
425 * irq_get_next_irq - get next allocated irq number 470 * irq_get_next_irq - get next allocated irq number
@@ -482,20 +527,6 @@ int irq_set_percpu_devid(unsigned int irq)
482 return 0; 527 return 0;
483} 528}
484 529
485/**
486 * dynamic_irq_cleanup - cleanup a dynamically allocated irq
487 * @irq: irq number to initialize
488 */
489void dynamic_irq_cleanup(unsigned int irq)
490{
491 struct irq_desc *desc = irq_to_desc(irq);
492 unsigned long flags;
493
494 raw_spin_lock_irqsave(&desc->lock, flags);
495 desc_set_defaults(irq, desc, desc_node(desc), NULL);
496 raw_spin_unlock_irqrestore(&desc->lock, flags);
497}
498
499void kstat_incr_irq_this_cpu(unsigned int irq) 530void kstat_incr_irq_this_cpu(unsigned int irq)
500{ 531{
501 kstat_incr_irqs_this_cpu(irq, irq_to_desc(irq)); 532 kstat_incr_irqs_this_cpu(irq, irq_to_desc(irq));
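
irq_alloc_hwirqs() above sets up cnt consecutive descriptors and, when arch_setup_hwirq() fails partway through, tears down the ones already configured before freeing the whole range. A minimal userspace model of that allocate-then-unwind loop; setup() and teardown() are illustrative stand-ins.

#include <stdio.h>

static int setup(int i)
{
	printf("setup %d\n", i);
	return i == 3 ? -1 : 0;		/* pretend the third one fails */
}

static void teardown(int i)
{
	printf("teardown %d\n", i);
}

/* Returns the first index on success, -1 after rolling back on failure. */
static int alloc_range(int first, int cnt)
{
	int i;

	for (i = first; cnt > 0; i++, cnt--) {
		if (setup(i))
			goto err;
	}
	return first;
err:
	for (i--; i >= first; i--)	/* unwind what was already set up */
		teardown(i);
	return -1;
}

int main(void)
{
	printf("%d\n", alloc_range(1, 4));	/* fails at 3, unwinds 2 and 1 */
	return 0;
}
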
diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c
index f14033700c25..eb5e10e32e05 100644
--- a/kernel/irq/irqdomain.c
+++ b/kernel/irq/irqdomain.c
@@ -27,14 +27,14 @@ static struct irq_domain *irq_default_domain;
27 * __irq_domain_add() - Allocate a new irq_domain data structure 27 * __irq_domain_add() - Allocate a new irq_domain data structure
28 * @of_node: optional device-tree node of the interrupt controller 28 * @of_node: optional device-tree node of the interrupt controller
29 * @size: Size of linear map; 0 for radix mapping only 29 * @size: Size of linear map; 0 for radix mapping only
30 * @hwirq_max: Maximum number of interrupts supported by controller
30 * @direct_max: Maximum value of direct maps; Use ~0 for no limit; 0 for no 31 * @direct_max: Maximum value of direct maps; Use ~0 for no limit; 0 for no
31 * direct mapping 32 * direct mapping
32 * @ops: map/unmap domain callbacks 33 * @ops: map/unmap domain callbacks
33 * @host_data: Controller private data pointer 34 * @host_data: Controller private data pointer
34 * 35 *
35 * Allocates and initialize and irq_domain structure. Caller is expected to 36 * Allocates and initializes an irq_domain structure.
36 * register allocated irq_domain with irq_domain_register(). Returns pointer 37 * Returns pointer to IRQ domain, or NULL on failure.
37 * to IRQ domain, or NULL on failure.
38 */ 38 */
39struct irq_domain *__irq_domain_add(struct device_node *of_node, int size, 39struct irq_domain *__irq_domain_add(struct device_node *of_node, int size,
40 irq_hw_number_t hwirq_max, int direct_max, 40 irq_hw_number_t hwirq_max, int direct_max,
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index d34131ca372b..3dc6a61bf06a 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -886,8 +886,8 @@ static int irq_thread(void *data)
886 irq_thread_check_affinity(desc, action); 886 irq_thread_check_affinity(desc, action);
887 887
888 action_ret = handler_fn(desc, action); 888 action_ret = handler_fn(desc, action);
889 if (!noirqdebug) 889 if (action_ret == IRQ_HANDLED)
890 note_interrupt(action->irq, desc, action_ret); 890 atomic_inc(&desc->threads_handled);
891 891
892 wake_threads_waitq(desc); 892 wake_threads_waitq(desc);
893 } 893 }
diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c
index a1d8cc63b56e..e2514b0e439e 100644
--- a/kernel/irq/spurious.c
+++ b/kernel/irq/spurious.c
@@ -270,6 +270,8 @@ try_misrouted_irq(unsigned int irq, struct irq_desc *desc,
270 return action && (action->flags & IRQF_IRQPOLL); 270 return action && (action->flags & IRQF_IRQPOLL);
271} 271}
272 272
273#define SPURIOUS_DEFERRED 0x80000000
274
273void note_interrupt(unsigned int irq, struct irq_desc *desc, 275void note_interrupt(unsigned int irq, struct irq_desc *desc,
274 irqreturn_t action_ret) 276 irqreturn_t action_ret)
275{ 277{
@@ -277,15 +279,111 @@ void note_interrupt(unsigned int irq, struct irq_desc *desc,
277 irq_settings_is_polled(desc)) 279 irq_settings_is_polled(desc))
278 return; 280 return;
279 281
280 /* we get here again via the threaded handler */
281 if (action_ret == IRQ_WAKE_THREAD)
282 return;
283
284 if (bad_action_ret(action_ret)) { 282 if (bad_action_ret(action_ret)) {
285 report_bad_irq(irq, desc, action_ret); 283 report_bad_irq(irq, desc, action_ret);
286 return; 284 return;
287 } 285 }
288 286
287 /*
288 * We cannot call note_interrupt from the threaded handler
289 * because we need to look at the compound of all handlers
290 * (primary and threaded). Aside of that in the threaded
291 * shared case we have no serialization against an incoming
292 * hardware interrupt while we are dealing with a threaded
293 * result.
294 *
295 * So in case a thread is woken, we just note the fact and
296 * defer the analysis to the next hardware interrupt.
297 *
298 * The threaded handlers store whether they successfully
299 * handled an interrupt and we check whether that number
300 * changed versus the last invocation.
301 *
302 * We could handle all interrupts with the delayed-by-one
303 * mechanism, but for the non-forced threaded case we'd just
304 * add pointless overhead to the straight hardirq interrupts
305 * for the sake of a few lines less code.
306 */
307 if (action_ret & IRQ_WAKE_THREAD) {
308 /*
309 * There is a thread woken. Check whether one of the
310 * shared primary handlers returned IRQ_HANDLED. If
311 * not we defer the spurious detection to the next
312 * interrupt.
313 */
314 if (action_ret == IRQ_WAKE_THREAD) {
315 int handled;
316 /*
317 * We use bit 31 of thread_handled_last to
318 * denote the deferred spurious detection
319 * active. No locking necessary as
320 * thread_handled_last is only accessed here
321 * and we have the guarantee that hard
322 * interrupts are not reentrant.
323 */
324 if (!(desc->threads_handled_last & SPURIOUS_DEFERRED)) {
325 desc->threads_handled_last |= SPURIOUS_DEFERRED;
326 return;
327 }
328 /*
329 * Check whether one of the threaded handlers
330 * returned IRQ_HANDLED since the last
331 * interrupt happened.
332 *
333 * For simplicity we just set bit 31, as it is
334 * set in threads_handled_last as well. So we
335 * avoid extra masking. And we really do not
336 * care about the high bits of the handled
337 * count. We just care about the count being
338 * different than the one we saw before.
339 */
340 handled = atomic_read(&desc->threads_handled);
341 handled |= SPURIOUS_DEFERRED;
342 if (handled != desc->threads_handled_last) {
343 action_ret = IRQ_HANDLED;
344 /*
345 * Note: We keep the SPURIOUS_DEFERRED
346 * bit set. We are handling the
347 * previous invocation right now.
348 * Keep it for the current one, so the
349 * next hardware interrupt will
350 * account for it.
351 */
352 desc->threads_handled_last = handled;
353 } else {
354 /*
355 * None of the threaded handlers felt
356 * responsible for the last interrupt
357 *
358 * We keep the SPURIOUS_DEFERRED bit
359 * set in threads_handled_last as we
360 * need to account for the current
361 * interrupt as well.
362 */
363 action_ret = IRQ_NONE;
364 }
365 } else {
366 /*
367 * One of the primary handlers returned
368 * IRQ_HANDLED. So we don't care about the
369 * threaded handlers on the same line. Clear
370 * the deferred detection bit.
371 *
372 * In theory we could/should check whether the
373 * deferred bit is set and take the result of
374 * the previous run into account here as
375 * well. But it's really not worth the
376 * trouble. If every other interrupt is
377 * handled we never trigger the spurious
378 * detector. And if this is just the one out
379 * of 100k unhandled ones which is handled
380 * then we merely delay the spurious detection
381 * by one hard interrupt. Not a real problem.
382 */
383 desc->threads_handled_last &= ~SPURIOUS_DEFERRED;
384 }
385 }
386
289 if (unlikely(action_ret == IRQ_NONE)) { 387 if (unlikely(action_ret == IRQ_NONE)) {
290 /* 388 /*
291 * If we are seeing only the odd spurious IRQ caused by 389 * If we are seeing only the odd spurious IRQ caused by
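
The hunk above is a small state machine: the threaded handlers bump desc->threads_handled whenever they handle an interrupt, and note_interrupt() compares that counter against a snapshot whose bit 31 records that a deferral is already pending. A stand-alone C sketch of just that bookkeeping follows; the struct and function names are invented for illustration, and only the SPURIOUS_DEFERRED flag and the snapshot-and-compare logic mirror the patch.

#include <stdio.h>

#define SPURIOUS_DEFERRED 0x80000000U

enum verdict { DEFER, HANDLED, UNHANDLED };

struct fake_desc {
	unsigned int threads_handled;      /* bumped by threaded handlers */
	unsigned int threads_handled_last; /* snapshot, bit 31 = deferral armed */
};

/* Called on each hard interrupt whose primary handler only woke a thread. */
static enum verdict note_wake_thread(struct fake_desc *desc)
{
	unsigned int handled;

	if (!(desc->threads_handled_last & SPURIOUS_DEFERRED)) {
		/* First wake-up seen: arm the deferral, judge next time. */
		desc->threads_handled_last |= SPURIOUS_DEFERRED;
		return DEFER;
	}

	/* Bit 31 is set on both sides, so no extra masking is needed. */
	handled = desc->threads_handled | SPURIOUS_DEFERRED;
	if (handled != desc->threads_handled_last) {
		desc->threads_handled_last = handled;
		return HANDLED;   /* maps to action_ret = IRQ_HANDLED */
	}
	return UNHANDLED;         /* maps to action_ret = IRQ_NONE */
}

int main(void)
{
	struct fake_desc d = { 0, 0 };

	printf("%d\n", note_wake_thread(&d)); /* 0: deferred */
	d.threads_handled++;                  /* a thread handled the IRQ */
	printf("%d\n", note_wake_thread(&d)); /* 1: counter moved */
	printf("%d\n", note_wake_thread(&d)); /* 2: counter unchanged */
	return 0;
}

The three enum values correspond to the three outcomes the patch folds into action_ret: defer on the first wake-up, IRQ_HANDLED when the counter moved, IRQ_NONE when it did not.
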
diff --git a/kernel/kexec.c b/kernel/kexec.c
index c8380ad203bc..369f41a94124 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -125,8 +125,8 @@ static struct page *kimage_alloc_page(struct kimage *image,
125 unsigned long dest); 125 unsigned long dest);
126 126
127static int do_kimage_alloc(struct kimage **rimage, unsigned long entry, 127static int do_kimage_alloc(struct kimage **rimage, unsigned long entry,
128 unsigned long nr_segments, 128 unsigned long nr_segments,
129 struct kexec_segment __user *segments) 129 struct kexec_segment __user *segments)
130{ 130{
131 size_t segment_bytes; 131 size_t segment_bytes;
132 struct kimage *image; 132 struct kimage *image;
@@ -257,13 +257,13 @@ static int kimage_normal_alloc(struct kimage **rimage, unsigned long entry,
257 image->control_code_page = kimage_alloc_control_pages(image, 257 image->control_code_page = kimage_alloc_control_pages(image,
258 get_order(KEXEC_CONTROL_PAGE_SIZE)); 258 get_order(KEXEC_CONTROL_PAGE_SIZE));
259 if (!image->control_code_page) { 259 if (!image->control_code_page) {
260 printk(KERN_ERR "Could not allocate control_code_buffer\n"); 260 pr_err("Could not allocate control_code_buffer\n");
261 goto out_free; 261 goto out_free;
262 } 262 }
263 263
264 image->swap_page = kimage_alloc_control_pages(image, 0); 264 image->swap_page = kimage_alloc_control_pages(image, 0);
265 if (!image->swap_page) { 265 if (!image->swap_page) {
266 printk(KERN_ERR "Could not allocate swap buffer\n"); 266 pr_err("Could not allocate swap buffer\n");
267 goto out_free; 267 goto out_free;
268 } 268 }
269 269
@@ -332,7 +332,7 @@ static int kimage_crash_alloc(struct kimage **rimage, unsigned long entry,
332 image->control_code_page = kimage_alloc_control_pages(image, 332 image->control_code_page = kimage_alloc_control_pages(image,
333 get_order(KEXEC_CONTROL_PAGE_SIZE)); 333 get_order(KEXEC_CONTROL_PAGE_SIZE));
334 if (!image->control_code_page) { 334 if (!image->control_code_page) {
335 printk(KERN_ERR "Could not allocate control_code_buffer\n"); 335 pr_err("Could not allocate control_code_buffer\n");
336 goto out_free; 336 goto out_free;
337 } 337 }
338 338
@@ -621,8 +621,8 @@ static void kimage_terminate(struct kimage *image)
621 621
622#define for_each_kimage_entry(image, ptr, entry) \ 622#define for_each_kimage_entry(image, ptr, entry) \
623 for (ptr = &image->head; (entry = *ptr) && !(entry & IND_DONE); \ 623 for (ptr = &image->head; (entry = *ptr) && !(entry & IND_DONE); \
624 ptr = (entry & IND_INDIRECTION)? \ 624 ptr = (entry & IND_INDIRECTION) ? \
625 phys_to_virt((entry & PAGE_MASK)): ptr +1) 625 phys_to_virt((entry & PAGE_MASK)) : ptr + 1)
626 626
627static void kimage_free_entry(kimage_entry_t entry) 627static void kimage_free_entry(kimage_entry_t entry)
628{ 628{
@@ -650,8 +650,7 @@ static void kimage_free(struct kimage *image)
650 * done with it. 650 * done with it.
651 */ 651 */
652 ind = entry; 652 ind = entry;
653 } 653 } else if (entry & IND_SOURCE)
654 else if (entry & IND_SOURCE)
655 kimage_free_entry(entry); 654 kimage_free_entry(entry);
656 } 655 }
657 /* Free the final indirection page */ 656 /* Free the final indirection page */
@@ -774,8 +773,7 @@ static struct page *kimage_alloc_page(struct kimage *image,
774 addr = old_addr; 773 addr = old_addr;
775 page = old_page; 774 page = old_page;
776 break; 775 break;
777 } 776 } else {
778 else {
779 /* Place the page on the destination list I 777 /* Place the page on the destination list I
780 * will use it later. 778 * will use it later.
781 */ 779 */
@@ -1059,7 +1057,7 @@ COMPAT_SYSCALL_DEFINE4(kexec_load, compat_ulong_t, entry,
1059 return -EINVAL; 1057 return -EINVAL;
1060 1058
1061 ksegments = compat_alloc_user_space(nr_segments * sizeof(out)); 1059 ksegments = compat_alloc_user_space(nr_segments * sizeof(out));
1062 for (i=0; i < nr_segments; i++) { 1060 for (i = 0; i < nr_segments; i++) {
1063 result = copy_from_user(&in, &segments[i], sizeof(in)); 1061 result = copy_from_user(&in, &segments[i], sizeof(in));
1064 if (result) 1062 if (result)
1065 return -EFAULT; 1063 return -EFAULT;
@@ -1214,14 +1212,14 @@ void crash_save_cpu(struct pt_regs *regs, int cpu)
1214 * squirrelled away. ELF notes happen to provide 1212 * squirrelled away. ELF notes happen to provide
1215 * all of that, so there is no need to invent something new. 1213 * all of that, so there is no need to invent something new.
1216 */ 1214 */
1217 buf = (u32*)per_cpu_ptr(crash_notes, cpu); 1215 buf = (u32 *)per_cpu_ptr(crash_notes, cpu);
1218 if (!buf) 1216 if (!buf)
1219 return; 1217 return;
1220 memset(&prstatus, 0, sizeof(prstatus)); 1218 memset(&prstatus, 0, sizeof(prstatus));
1221 prstatus.pr_pid = current->pid; 1219 prstatus.pr_pid = current->pid;
1222 elf_core_copy_kernel_regs(&prstatus.pr_reg, regs); 1220 elf_core_copy_kernel_regs(&prstatus.pr_reg, regs);
1223 buf = append_elf_note(buf, KEXEC_CORE_NOTE_NAME, NT_PRSTATUS, 1221 buf = append_elf_note(buf, KEXEC_CORE_NOTE_NAME, NT_PRSTATUS,
1224 &prstatus, sizeof(prstatus)); 1222 &prstatus, sizeof(prstatus));
1225 final_note(buf); 1223 final_note(buf);
1226} 1224}
1227 1225
@@ -1230,8 +1228,7 @@ static int __init crash_notes_memory_init(void)
1230 /* Allocate memory for saving cpu registers. */ 1228 /* Allocate memory for saving cpu registers. */
1231 crash_notes = alloc_percpu(note_buf_t); 1229 crash_notes = alloc_percpu(note_buf_t);
1232 if (!crash_notes) { 1230 if (!crash_notes) {
1233 printk("Kexec: Memory allocation for saving cpu register" 1231 pr_warn("Kexec: Memory allocation for saving cpu register states failed\n");
1234 " states failed\n");
1235 return -ENOMEM; 1232 return -ENOMEM;
1236 } 1233 }
1237 return 0; 1234 return 0;
@@ -1253,10 +1250,10 @@ subsys_initcall(crash_notes_memory_init);
1253 * 1250 *
1254 * The function returns 0 on success and -EINVAL on failure. 1251 * The function returns 0 on success and -EINVAL on failure.
1255 */ 1252 */
1256static int __init parse_crashkernel_mem(char *cmdline, 1253static int __init parse_crashkernel_mem(char *cmdline,
1257 unsigned long long system_ram, 1254 unsigned long long system_ram,
1258 unsigned long long *crash_size, 1255 unsigned long long *crash_size,
1259 unsigned long long *crash_base) 1256 unsigned long long *crash_base)
1260{ 1257{
1261 char *cur = cmdline, *tmp; 1258 char *cur = cmdline, *tmp;
1262 1259
@@ -1267,12 +1264,12 @@ static int __init parse_crashkernel_mem(char *cmdline,
1267 /* get the start of the range */ 1264 /* get the start of the range */
1268 start = memparse(cur, &tmp); 1265 start = memparse(cur, &tmp);
1269 if (cur == tmp) { 1266 if (cur == tmp) {
1270 pr_warning("crashkernel: Memory value expected\n"); 1267 pr_warn("crashkernel: Memory value expected\n");
1271 return -EINVAL; 1268 return -EINVAL;
1272 } 1269 }
1273 cur = tmp; 1270 cur = tmp;
1274 if (*cur != '-') { 1271 if (*cur != '-') {
1275 pr_warning("crashkernel: '-' expected\n"); 1272 pr_warn("crashkernel: '-' expected\n");
1276 return -EINVAL; 1273 return -EINVAL;
1277 } 1274 }
1278 cur++; 1275 cur++;
@@ -1281,31 +1278,30 @@ static int __init parse_crashkernel_mem(char *cmdline,
1281 if (*cur != ':') { 1278 if (*cur != ':') {
1282 end = memparse(cur, &tmp); 1279 end = memparse(cur, &tmp);
1283 if (cur == tmp) { 1280 if (cur == tmp) {
1284 pr_warning("crashkernel: Memory " 1281 pr_warn("crashkernel: Memory value expected\n");
1285 "value expected\n");
1286 return -EINVAL; 1282 return -EINVAL;
1287 } 1283 }
1288 cur = tmp; 1284 cur = tmp;
1289 if (end <= start) { 1285 if (end <= start) {
1290 pr_warning("crashkernel: end <= start\n"); 1286 pr_warn("crashkernel: end <= start\n");
1291 return -EINVAL; 1287 return -EINVAL;
1292 } 1288 }
1293 } 1289 }
1294 1290
1295 if (*cur != ':') { 1291 if (*cur != ':') {
1296 pr_warning("crashkernel: ':' expected\n"); 1292 pr_warn("crashkernel: ':' expected\n");
1297 return -EINVAL; 1293 return -EINVAL;
1298 } 1294 }
1299 cur++; 1295 cur++;
1300 1296
1301 size = memparse(cur, &tmp); 1297 size = memparse(cur, &tmp);
1302 if (cur == tmp) { 1298 if (cur == tmp) {
1303 pr_warning("Memory value expected\n"); 1299 pr_warn("Memory value expected\n");
1304 return -EINVAL; 1300 return -EINVAL;
1305 } 1301 }
1306 cur = tmp; 1302 cur = tmp;
1307 if (size >= system_ram) { 1303 if (size >= system_ram) {
1308 pr_warning("crashkernel: invalid size\n"); 1304 pr_warn("crashkernel: invalid size\n");
1309 return -EINVAL; 1305 return -EINVAL;
1310 } 1306 }
1311 1307
@@ -1323,8 +1319,7 @@ static int __init parse_crashkernel_mem(char *cmdline,
1323 cur++; 1319 cur++;
1324 *crash_base = memparse(cur, &tmp); 1320 *crash_base = memparse(cur, &tmp);
1325 if (cur == tmp) { 1321 if (cur == tmp) {
1326 pr_warning("Memory value expected " 1322 pr_warn("Memory value expected after '@'\n");
1327 "after '@'\n");
1328 return -EINVAL; 1323 return -EINVAL;
1329 } 1324 }
1330 } 1325 }
@@ -1336,26 +1331,26 @@ static int __init parse_crashkernel_mem(char *cmdline,
1336/* 1331/*
1337 * That function parses "simple" (old) crashkernel command lines like 1332 * That function parses "simple" (old) crashkernel command lines like
1338 * 1333 *
1339 * crashkernel=size[@offset] 1334 * crashkernel=size[@offset]
1340 * 1335 *
1341 * It returns 0 on success and -EINVAL on failure. 1336 * It returns 0 on success and -EINVAL on failure.
1342 */ 1337 */
1343static int __init parse_crashkernel_simple(char *cmdline, 1338static int __init parse_crashkernel_simple(char *cmdline,
1344 unsigned long long *crash_size, 1339 unsigned long long *crash_size,
1345 unsigned long long *crash_base) 1340 unsigned long long *crash_base)
1346{ 1341{
1347 char *cur = cmdline; 1342 char *cur = cmdline;
1348 1343
1349 *crash_size = memparse(cmdline, &cur); 1344 *crash_size = memparse(cmdline, &cur);
1350 if (cmdline == cur) { 1345 if (cmdline == cur) {
1351 pr_warning("crashkernel: memory value expected\n"); 1346 pr_warn("crashkernel: memory value expected\n");
1352 return -EINVAL; 1347 return -EINVAL;
1353 } 1348 }
1354 1349
1355 if (*cur == '@') 1350 if (*cur == '@')
1356 *crash_base = memparse(cur+1, &cur); 1351 *crash_base = memparse(cur+1, &cur);
1357 else if (*cur != ' ' && *cur != '\0') { 1352 else if (*cur != ' ' && *cur != '\0') {
1358 pr_warning("crashkernel: unrecognized char\n"); 1353 pr_warn("crashkernel: unrecognized char\n");
1359 return -EINVAL; 1354 return -EINVAL;
1360 } 1355 }
1361 1356
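
For reference, the "simple" form accepted by the function above looks like crashkernel=128M@16M on the kernel command line. The sketch below approximates it in user space; parse_mem() is a stand-in for the kernel's memparse() and the separate warning messages are collapsed into a single error return, so treat it as an illustration of the syntax rather than the kernel parser.

#include <stdio.h>
#include <stdlib.h>

/* Stand-in for memparse(): number with optional K/M/G suffix. */
static unsigned long long parse_mem(const char *s, char **end)
{
	unsigned long long val = strtoull(s, end, 0);

	switch (**end) {
	case 'G': case 'g': val <<= 10; /* fall through */
	case 'M': case 'm': val <<= 10; /* fall through */
	case 'K': case 'k': val <<= 10; (*end)++; break;
	}
	return val;
}

/* Parses crashkernel=size[@offset], e.g. "128M@16M". */
static int parse_simple(const char *cmdline,
			unsigned long long *size, unsigned long long *base)
{
	char *cur;

	*size = parse_mem(cmdline, &cur);
	if (cur == cmdline)
		return -1;              /* memory value expected */
	if (*cur == '@')
		*base = parse_mem(cur + 1, &cur);
	else if (*cur != ' ' && *cur != '\0')
		return -1;              /* unrecognized character */
	return 0;
}

int main(void)
{
	unsigned long long size = 0, base = 0;

	if (!parse_simple("128M@16M", &size, &base))
		printf("size=%llu base=%llu\n", size, base);
	return 0;
}
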
@@ -1622,6 +1617,7 @@ static int __init crash_save_vmcoreinfo_init(void)
1622#ifdef CONFIG_MEMORY_FAILURE 1617#ifdef CONFIG_MEMORY_FAILURE
1623 VMCOREINFO_NUMBER(PG_hwpoison); 1618 VMCOREINFO_NUMBER(PG_hwpoison);
1624#endif 1619#endif
1620 VMCOREINFO_NUMBER(PG_head_mask);
1625 VMCOREINFO_NUMBER(PAGE_BUDDY_MAPCOUNT_VALUE); 1621 VMCOREINFO_NUMBER(PAGE_BUDDY_MAPCOUNT_VALUE);
1626 1622
1627 arch_crash_save_vmcoreinfo(); 1623 arch_crash_save_vmcoreinfo();
@@ -1683,7 +1679,15 @@ int kernel_kexec(void)
1683 kexec_in_progress = true; 1679 kexec_in_progress = true;
1684 kernel_restart_prepare(NULL); 1680 kernel_restart_prepare(NULL);
1685 migrate_to_reboot_cpu(); 1681 migrate_to_reboot_cpu();
1686 printk(KERN_EMERG "Starting new kernel\n"); 1682
1683 /*
1684 * migrate_to_reboot_cpu() disables CPU hotplug assuming that
1685 * no further code needs to use CPU hotplug (which is true in
1686 * the reboot case). However, the kexec path depends on using
1687 * CPU hotplug again; so re-enable it here.
1688 */
1689 cpu_hotplug_enable();
1690 pr_emerg("Starting new kernel\n");
1687 machine_shutdown(); 1691 machine_shutdown();
1688 } 1692 }
1689 1693
diff --git a/kernel/kmod.c b/kernel/kmod.c
index 6b375af4958d..8637e041a247 100644
--- a/kernel/kmod.c
+++ b/kernel/kmod.c
@@ -285,10 +285,7 @@ static int wait_for_helper(void *data)
285 pid_t pid; 285 pid_t pid;
286 286
287 /* If SIGCLD is ignored sys_wait4 won't populate the status. */ 287 /* If SIGCLD is ignored sys_wait4 won't populate the status. */
288 spin_lock_irq(&current->sighand->siglock); 288 kernel_sigaction(SIGCHLD, SIG_DFL);
289 current->sighand->action[SIGCHLD-1].sa.sa_handler = SIG_DFL;
290 spin_unlock_irq(&current->sighand->siglock);
291
292 pid = kernel_thread(____call_usermodehelper, sub_info, SIGCHLD); 289 pid = kernel_thread(____call_usermodehelper, sub_info, SIGCHLD);
293 if (pid < 0) { 290 if (pid < 0) {
294 sub_info->retval = pid; 291 sub_info->retval = pid;
@@ -498,7 +495,7 @@ int __usermodehelper_disable(enum umh_disable_depth depth)
498static void helper_lock(void) 495static void helper_lock(void)
499{ 496{
500 atomic_inc(&running_helpers); 497 atomic_inc(&running_helpers);
501 smp_mb__after_atomic_inc(); 498 smp_mb__after_atomic();
502} 499}
503 500
504static void helper_unlock(void) 501static void helper_unlock(void)
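
The kmod.c hunk replaces the open-coded sighand manipulation with kernel_sigaction(SIGCHLD, SIG_DFL). The reason is the one the comment states: if SIGCHLD is ignored, the child is reaped automatically and wait4() has no status to return. The same rule applies in user space, which the minimal POSIX program below demonstrates; it is an analogy, not kernel code.

#include <signal.h>
#include <stdio.h>
#include <string.h>
#include <sys/wait.h>
#include <unistd.h>

int main(void)
{
	struct sigaction sa;
	pid_t pid;
	int status;

	/* With SIGCHLD set to SIG_IGN the child would be auto-reaped and
	 * waitpid() could not report its status; resetting to SIG_DFL is
	 * the user-space twin of kernel_sigaction(SIGCHLD, SIG_DFL). */
	memset(&sa, 0, sizeof(sa));
	sa.sa_handler = SIG_DFL;
	sigaction(SIGCHLD, &sa, NULL);

	pid = fork();
	if (pid == 0)
		_exit(42);
	if (waitpid(pid, &status, 0) == pid && WIFEXITED(status))
		printf("child exited with %d\n", WEXITSTATUS(status));
	return 0;
}
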
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index ceeadfcabb76..3214289df5a7 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -86,21 +86,8 @@ static raw_spinlock_t *kretprobe_table_lock_ptr(unsigned long hash)
86 return &(kretprobe_table_locks[hash].lock); 86 return &(kretprobe_table_locks[hash].lock);
87} 87}
88 88
89/* 89/* Blacklist -- list of struct kprobe_blacklist_entry */
90 * Normally, functions that we'd want to prohibit kprobes in, are marked 90static LIST_HEAD(kprobe_blacklist);
91 * __kprobes. But, there are cases where such functions already belong to
92 * a different section (__sched for preempt_schedule)
93 *
94 * For such cases, we now have a blacklist
95 */
96static struct kprobe_blackpoint kprobe_blacklist[] = {
97 {"preempt_schedule",},
98 {"native_get_debugreg",},
99 {"irq_entries_start",},
100 {"common_interrupt",},
101 {"mcount",}, /* mcount can be called from everywhere */
102 {NULL} /* Terminator */
103};
104 91
105#ifdef __ARCH_WANT_KPROBES_INSN_SLOT 92#ifdef __ARCH_WANT_KPROBES_INSN_SLOT
106/* 93/*
@@ -151,13 +138,13 @@ struct kprobe_insn_cache kprobe_insn_slots = {
151 .insn_size = MAX_INSN_SIZE, 138 .insn_size = MAX_INSN_SIZE,
152 .nr_garbage = 0, 139 .nr_garbage = 0,
153}; 140};
154static int __kprobes collect_garbage_slots(struct kprobe_insn_cache *c); 141static int collect_garbage_slots(struct kprobe_insn_cache *c);
155 142
156/** 143/**
157 * __get_insn_slot() - Find a slot on an executable page for an instruction. 144 * __get_insn_slot() - Find a slot on an executable page for an instruction.
158 * We allocate an executable page if there's no room on existing ones. 145 * We allocate an executable page if there's no room on existing ones.
159 */ 146 */
160kprobe_opcode_t __kprobes *__get_insn_slot(struct kprobe_insn_cache *c) 147kprobe_opcode_t *__get_insn_slot(struct kprobe_insn_cache *c)
161{ 148{
162 struct kprobe_insn_page *kip; 149 struct kprobe_insn_page *kip;
163 kprobe_opcode_t *slot = NULL; 150 kprobe_opcode_t *slot = NULL;
@@ -214,7 +201,7 @@ out:
214} 201}
215 202
216/* Return 1 if all garbage slots are collected, otherwise 0. */ 203
217static int __kprobes collect_one_slot(struct kprobe_insn_page *kip, int idx) 204static int collect_one_slot(struct kprobe_insn_page *kip, int idx)
218{ 205{
219 kip->slot_used[idx] = SLOT_CLEAN; 206 kip->slot_used[idx] = SLOT_CLEAN;
220 kip->nused--; 207 kip->nused--;
@@ -235,7 +222,7 @@ static int __kprobes collect_one_slot(struct kprobe_insn_page *kip, int idx)
235 return 0; 222 return 0;
236} 223}
237 224
238static int __kprobes collect_garbage_slots(struct kprobe_insn_cache *c) 225static int collect_garbage_slots(struct kprobe_insn_cache *c)
239{ 226{
240 struct kprobe_insn_page *kip, *next; 227 struct kprobe_insn_page *kip, *next;
241 228
@@ -257,8 +244,8 @@ static int __kprobes collect_garbage_slots(struct kprobe_insn_cache *c)
257 return 0; 244 return 0;
258} 245}
259 246
260void __kprobes __free_insn_slot(struct kprobe_insn_cache *c, 247void __free_insn_slot(struct kprobe_insn_cache *c,
261 kprobe_opcode_t *slot, int dirty) 248 kprobe_opcode_t *slot, int dirty)
262{ 249{
263 struct kprobe_insn_page *kip; 250 struct kprobe_insn_page *kip;
264 251
@@ -314,7 +301,7 @@ static inline void reset_kprobe_instance(void)
314 * OR 301 * OR
315 * - with preemption disabled - from arch/xxx/kernel/kprobes.c 302 * - with preemption disabled - from arch/xxx/kernel/kprobes.c
316 */ 303 */
317struct kprobe __kprobes *get_kprobe(void *addr) 304struct kprobe *get_kprobe(void *addr)
318{ 305{
319 struct hlist_head *head; 306 struct hlist_head *head;
320 struct kprobe *p; 307 struct kprobe *p;
@@ -327,8 +314,9 @@ struct kprobe __kprobes *get_kprobe(void *addr)
327 314
328 return NULL; 315 return NULL;
329} 316}
317NOKPROBE_SYMBOL(get_kprobe);
330 318
331static int __kprobes aggr_pre_handler(struct kprobe *p, struct pt_regs *regs); 319static int aggr_pre_handler(struct kprobe *p, struct pt_regs *regs);
332 320
333/* Return true if the kprobe is an aggregator */ 321/* Return true if the kprobe is an aggregator */
334static inline int kprobe_aggrprobe(struct kprobe *p) 322static inline int kprobe_aggrprobe(struct kprobe *p)
@@ -360,7 +348,7 @@ static bool kprobes_allow_optimization;
360 * Call all pre_handler on the list, but ignores its return value. 348 * Call all pre_handler on the list, but ignores its return value.
361 * This must be called from arch-dep optimized caller. 349 * This must be called from arch-dep optimized caller.
362 */ 350 */
363void __kprobes opt_pre_handler(struct kprobe *p, struct pt_regs *regs) 351void opt_pre_handler(struct kprobe *p, struct pt_regs *regs)
364{ 352{
365 struct kprobe *kp; 353 struct kprobe *kp;
366 354
@@ -372,9 +360,10 @@ void __kprobes opt_pre_handler(struct kprobe *p, struct pt_regs *regs)
372 reset_kprobe_instance(); 360 reset_kprobe_instance();
373 } 361 }
374} 362}
363NOKPROBE_SYMBOL(opt_pre_handler);
375 364
376/* Free optimized instructions and optimized_kprobe */ 365/* Free optimized instructions and optimized_kprobe */
377static __kprobes void free_aggr_kprobe(struct kprobe *p) 366static void free_aggr_kprobe(struct kprobe *p)
378{ 367{
379 struct optimized_kprobe *op; 368 struct optimized_kprobe *op;
380 369
@@ -412,7 +401,7 @@ static inline int kprobe_disarmed(struct kprobe *p)
412} 401}
413 402
414/* Return true(!0) if the probe is queued on (un)optimizing lists */ 403/* Return true(!0) if the probe is queued on (un)optimizing lists */
415static int __kprobes kprobe_queued(struct kprobe *p) 404static int kprobe_queued(struct kprobe *p)
416{ 405{
417 struct optimized_kprobe *op; 406 struct optimized_kprobe *op;
418 407
@@ -428,7 +417,7 @@ static int __kprobes kprobe_queued(struct kprobe *p)
428 * Return an optimized kprobe whose optimizing code replaces 417 * Return an optimized kprobe whose optimizing code replaces
429 * instructions including addr (exclude breakpoint). 418 * instructions including addr (exclude breakpoint).
430 */ 419 */
431static struct kprobe *__kprobes get_optimized_kprobe(unsigned long addr) 420static struct kprobe *get_optimized_kprobe(unsigned long addr)
432{ 421{
433 int i; 422 int i;
434 struct kprobe *p = NULL; 423 struct kprobe *p = NULL;
@@ -460,7 +449,7 @@ static DECLARE_DELAYED_WORK(optimizing_work, kprobe_optimizer);
460 * Optimize (replace a breakpoint with a jump) kprobes listed on 449 * Optimize (replace a breakpoint with a jump) kprobes listed on
461 * optimizing_list. 450 * optimizing_list.
462 */ 451 */
463static __kprobes void do_optimize_kprobes(void) 452static void do_optimize_kprobes(void)
464{ 453{
465 /* Optimization is never done when disarmed */ 454
466 if (kprobes_all_disarmed || !kprobes_allow_optimization || 455 if (kprobes_all_disarmed || !kprobes_allow_optimization ||
@@ -488,7 +477,7 @@ static __kprobes void do_optimize_kprobes(void)
488 * Unoptimize (replace a jump with a breakpoint and remove the breakpoint 477 * Unoptimize (replace a jump with a breakpoint and remove the breakpoint
489 * if need) kprobes listed on unoptimizing_list. 478 * if need) kprobes listed on unoptimizing_list.
490 */ 479 */
491static __kprobes void do_unoptimize_kprobes(void) 480static void do_unoptimize_kprobes(void)
492{ 481{
493 struct optimized_kprobe *op, *tmp; 482 struct optimized_kprobe *op, *tmp;
494 483
@@ -520,7 +509,7 @@ static __kprobes void do_unoptimize_kprobes(void)
520} 509}
521 510
522/* Reclaim all kprobes on the free_list */ 511/* Reclaim all kprobes on the free_list */
523static __kprobes void do_free_cleaned_kprobes(void) 512static void do_free_cleaned_kprobes(void)
524{ 513{
525 struct optimized_kprobe *op, *tmp; 514 struct optimized_kprobe *op, *tmp;
526 515
@@ -532,13 +521,13 @@ static __kprobes void do_free_cleaned_kprobes(void)
532} 521}
533 522
534/* Start optimizer after OPTIMIZE_DELAY passed */ 523/* Start optimizer after OPTIMIZE_DELAY passed */
535static __kprobes void kick_kprobe_optimizer(void) 524static void kick_kprobe_optimizer(void)
536{ 525{
537 schedule_delayed_work(&optimizing_work, OPTIMIZE_DELAY); 526 schedule_delayed_work(&optimizing_work, OPTIMIZE_DELAY);
538} 527}
539 528
540/* Kprobe jump optimizer */ 529/* Kprobe jump optimizer */
541static __kprobes void kprobe_optimizer(struct work_struct *work) 530static void kprobe_optimizer(struct work_struct *work)
542{ 531{
543 mutex_lock(&kprobe_mutex); 532 mutex_lock(&kprobe_mutex);
544 /* Lock modules while optimizing kprobes */ 533 /* Lock modules while optimizing kprobes */
@@ -574,7 +563,7 @@ static __kprobes void kprobe_optimizer(struct work_struct *work)
574} 563}
575 564
576/* Wait for completing optimization and unoptimization */ 565/* Wait for completing optimization and unoptimization */
577static __kprobes void wait_for_kprobe_optimizer(void) 566static void wait_for_kprobe_optimizer(void)
578{ 567{
579 mutex_lock(&kprobe_mutex); 568 mutex_lock(&kprobe_mutex);
580 569
@@ -593,7 +582,7 @@ static __kprobes void wait_for_kprobe_optimizer(void)
593} 582}
594 583
595/* Optimize kprobe if p is ready to be optimized */ 584/* Optimize kprobe if p is ready to be optimized */
596static __kprobes void optimize_kprobe(struct kprobe *p) 585static void optimize_kprobe(struct kprobe *p)
597{ 586{
598 struct optimized_kprobe *op; 587 struct optimized_kprobe *op;
599 588
@@ -627,7 +616,7 @@ static __kprobes void optimize_kprobe(struct kprobe *p)
627} 616}
628 617
629/* Short cut to direct unoptimizing */ 618/* Short cut to direct unoptimizing */
630static __kprobes void force_unoptimize_kprobe(struct optimized_kprobe *op) 619static void force_unoptimize_kprobe(struct optimized_kprobe *op)
631{ 620{
632 get_online_cpus(); 621 get_online_cpus();
633 arch_unoptimize_kprobe(op); 622 arch_unoptimize_kprobe(op);
@@ -637,7 +626,7 @@ static __kprobes void force_unoptimize_kprobe(struct optimized_kprobe *op)
637} 626}
638 627
639/* Unoptimize a kprobe if p is optimized */ 628/* Unoptimize a kprobe if p is optimized */
640static __kprobes void unoptimize_kprobe(struct kprobe *p, bool force) 629static void unoptimize_kprobe(struct kprobe *p, bool force)
641{ 630{
642 struct optimized_kprobe *op; 631 struct optimized_kprobe *op;
643 632
@@ -697,7 +686,7 @@ static void reuse_unused_kprobe(struct kprobe *ap)
697} 686}
698 687
699/* Remove optimized instructions */ 688/* Remove optimized instructions */
700static void __kprobes kill_optimized_kprobe(struct kprobe *p) 689static void kill_optimized_kprobe(struct kprobe *p)
701{ 690{
702 struct optimized_kprobe *op; 691 struct optimized_kprobe *op;
703 692
@@ -723,7 +712,7 @@ static void __kprobes kill_optimized_kprobe(struct kprobe *p)
723} 712}
724 713
725/* Try to prepare optimized instructions */ 714/* Try to prepare optimized instructions */
726static __kprobes void prepare_optimized_kprobe(struct kprobe *p) 715static void prepare_optimized_kprobe(struct kprobe *p)
727{ 716{
728 struct optimized_kprobe *op; 717 struct optimized_kprobe *op;
729 718
@@ -732,7 +721,7 @@ static __kprobes void prepare_optimized_kprobe(struct kprobe *p)
732} 721}
733 722
734/* Allocate new optimized_kprobe and try to prepare optimized instructions */ 723/* Allocate new optimized_kprobe and try to prepare optimized instructions */
735static __kprobes struct kprobe *alloc_aggr_kprobe(struct kprobe *p) 724static struct kprobe *alloc_aggr_kprobe(struct kprobe *p)
736{ 725{
737 struct optimized_kprobe *op; 726 struct optimized_kprobe *op;
738 727
@@ -747,13 +736,13 @@ static __kprobes struct kprobe *alloc_aggr_kprobe(struct kprobe *p)
747 return &op->kp; 736 return &op->kp;
748} 737}
749 738
750static void __kprobes init_aggr_kprobe(struct kprobe *ap, struct kprobe *p); 739static void init_aggr_kprobe(struct kprobe *ap, struct kprobe *p);
751 740
752/* 741/*
753 * Prepare an optimized_kprobe and optimize it 742 * Prepare an optimized_kprobe and optimize it
754 * NOTE: p must be a normal registered kprobe 743 * NOTE: p must be a normal registered kprobe
755 */ 744 */
756static __kprobes void try_to_optimize_kprobe(struct kprobe *p) 745static void try_to_optimize_kprobe(struct kprobe *p)
757{ 746{
758 struct kprobe *ap; 747 struct kprobe *ap;
759 struct optimized_kprobe *op; 748 struct optimized_kprobe *op;
@@ -787,7 +776,7 @@ out:
787} 776}
788 777
789#ifdef CONFIG_SYSCTL 778#ifdef CONFIG_SYSCTL
790static void __kprobes optimize_all_kprobes(void) 779static void optimize_all_kprobes(void)
791{ 780{
792 struct hlist_head *head; 781 struct hlist_head *head;
793 struct kprobe *p; 782 struct kprobe *p;
@@ -810,7 +799,7 @@ out:
810 mutex_unlock(&kprobe_mutex); 799 mutex_unlock(&kprobe_mutex);
811} 800}
812 801
813static void __kprobes unoptimize_all_kprobes(void) 802static void unoptimize_all_kprobes(void)
814{ 803{
815 struct hlist_head *head; 804 struct hlist_head *head;
816 struct kprobe *p; 805 struct kprobe *p;
@@ -861,7 +850,7 @@ int proc_kprobes_optimization_handler(struct ctl_table *table, int write,
861#endif /* CONFIG_SYSCTL */ 850#endif /* CONFIG_SYSCTL */
862 851
863/* Put a breakpoint for a probe. Must be called with text_mutex locked */ 852/* Put a breakpoint for a probe. Must be called with text_mutex locked */
864static void __kprobes __arm_kprobe(struct kprobe *p) 853static void __arm_kprobe(struct kprobe *p)
865{ 854{
866 struct kprobe *_p; 855 struct kprobe *_p;
867 856
@@ -876,7 +865,7 @@ static void __kprobes __arm_kprobe(struct kprobe *p)
876} 865}
877 866
878/* Remove the breakpoint of a probe. Must be called with text_mutex locked */ 867/* Remove the breakpoint of a probe. Must be called with text_mutex locked */
879static void __kprobes __disarm_kprobe(struct kprobe *p, bool reopt) 868static void __disarm_kprobe(struct kprobe *p, bool reopt)
880{ 869{
881 struct kprobe *_p; 870 struct kprobe *_p;
882 871
@@ -911,13 +900,13 @@ static void reuse_unused_kprobe(struct kprobe *ap)
911 BUG_ON(kprobe_unused(ap)); 900 BUG_ON(kprobe_unused(ap));
912} 901}
913 902
914static __kprobes void free_aggr_kprobe(struct kprobe *p) 903static void free_aggr_kprobe(struct kprobe *p)
915{ 904{
916 arch_remove_kprobe(p); 905 arch_remove_kprobe(p);
917 kfree(p); 906 kfree(p);
918} 907}
919 908
920static __kprobes struct kprobe *alloc_aggr_kprobe(struct kprobe *p) 909static struct kprobe *alloc_aggr_kprobe(struct kprobe *p)
921{ 910{
922 return kzalloc(sizeof(struct kprobe), GFP_KERNEL); 911 return kzalloc(sizeof(struct kprobe), GFP_KERNEL);
923} 912}
@@ -931,7 +920,7 @@ static struct ftrace_ops kprobe_ftrace_ops __read_mostly = {
931static int kprobe_ftrace_enabled; 920static int kprobe_ftrace_enabled;
932 921
933/* Must ensure p->addr is really on ftrace */ 922/* Must ensure p->addr is really on ftrace */
934static int __kprobes prepare_kprobe(struct kprobe *p) 923static int prepare_kprobe(struct kprobe *p)
935{ 924{
936 if (!kprobe_ftrace(p)) 925 if (!kprobe_ftrace(p))
937 return arch_prepare_kprobe(p); 926 return arch_prepare_kprobe(p);
@@ -940,7 +929,7 @@ static int __kprobes prepare_kprobe(struct kprobe *p)
940} 929}
941 930
942/* Caller must lock kprobe_mutex */ 931/* Caller must lock kprobe_mutex */
943static void __kprobes arm_kprobe_ftrace(struct kprobe *p) 932static void arm_kprobe_ftrace(struct kprobe *p)
944{ 933{
945 int ret; 934 int ret;
946 935
@@ -955,7 +944,7 @@ static void __kprobes arm_kprobe_ftrace(struct kprobe *p)
955} 944}
956 945
957/* Caller must lock kprobe_mutex */ 946/* Caller must lock kprobe_mutex */
958static void __kprobes disarm_kprobe_ftrace(struct kprobe *p) 947static void disarm_kprobe_ftrace(struct kprobe *p)
959{ 948{
960 int ret; 949 int ret;
961 950
@@ -975,7 +964,7 @@ static void __kprobes disarm_kprobe_ftrace(struct kprobe *p)
975#endif 964#endif
976 965
977/* Arm a kprobe with text_mutex */ 966/* Arm a kprobe with text_mutex */
978static void __kprobes arm_kprobe(struct kprobe *kp) 967static void arm_kprobe(struct kprobe *kp)
979{ 968{
980 if (unlikely(kprobe_ftrace(kp))) { 969 if (unlikely(kprobe_ftrace(kp))) {
981 arm_kprobe_ftrace(kp); 970 arm_kprobe_ftrace(kp);
@@ -992,7 +981,7 @@ static void __kprobes arm_kprobe(struct kprobe *kp)
992} 981}
993 982
994/* Disarm a kprobe with text_mutex */ 983/* Disarm a kprobe with text_mutex */
995static void __kprobes disarm_kprobe(struct kprobe *kp, bool reopt) 984static void disarm_kprobe(struct kprobe *kp, bool reopt)
996{ 985{
997 if (unlikely(kprobe_ftrace(kp))) { 986 if (unlikely(kprobe_ftrace(kp))) {
998 disarm_kprobe_ftrace(kp); 987 disarm_kprobe_ftrace(kp);
@@ -1008,7 +997,7 @@ static void __kprobes disarm_kprobe(struct kprobe *kp, bool reopt)
1008 * Aggregate handlers for multiple kprobes support - these handlers 997 * Aggregate handlers for multiple kprobes support - these handlers
1009 * take care of invoking the individual kprobe handlers on p->list 998 * take care of invoking the individual kprobe handlers on p->list
1010 */ 999 */
1011static int __kprobes aggr_pre_handler(struct kprobe *p, struct pt_regs *regs) 1000static int aggr_pre_handler(struct kprobe *p, struct pt_regs *regs)
1012{ 1001{
1013 struct kprobe *kp; 1002 struct kprobe *kp;
1014 1003
@@ -1022,9 +1011,10 @@ static int __kprobes aggr_pre_handler(struct kprobe *p, struct pt_regs *regs)
1022 } 1011 }
1023 return 0; 1012 return 0;
1024} 1013}
1014NOKPROBE_SYMBOL(aggr_pre_handler);
1025 1015
1026static void __kprobes aggr_post_handler(struct kprobe *p, struct pt_regs *regs, 1016static void aggr_post_handler(struct kprobe *p, struct pt_regs *regs,
1027 unsigned long flags) 1017 unsigned long flags)
1028{ 1018{
1029 struct kprobe *kp; 1019 struct kprobe *kp;
1030 1020
@@ -1036,9 +1026,10 @@ static void __kprobes aggr_post_handler(struct kprobe *p, struct pt_regs *regs,
1036 } 1026 }
1037 } 1027 }
1038} 1028}
1029NOKPROBE_SYMBOL(aggr_post_handler);
1039 1030
1040static int __kprobes aggr_fault_handler(struct kprobe *p, struct pt_regs *regs, 1031static int aggr_fault_handler(struct kprobe *p, struct pt_regs *regs,
1041 int trapnr) 1032 int trapnr)
1042{ 1033{
1043 struct kprobe *cur = __this_cpu_read(kprobe_instance); 1034 struct kprobe *cur = __this_cpu_read(kprobe_instance);
1044 1035
@@ -1052,8 +1043,9 @@ static int __kprobes aggr_fault_handler(struct kprobe *p, struct pt_regs *regs,
1052 } 1043 }
1053 return 0; 1044 return 0;
1054} 1045}
1046NOKPROBE_SYMBOL(aggr_fault_handler);
1055 1047
1056static int __kprobes aggr_break_handler(struct kprobe *p, struct pt_regs *regs) 1048static int aggr_break_handler(struct kprobe *p, struct pt_regs *regs)
1057{ 1049{
1058 struct kprobe *cur = __this_cpu_read(kprobe_instance); 1050 struct kprobe *cur = __this_cpu_read(kprobe_instance);
1059 int ret = 0; 1051 int ret = 0;
@@ -1065,9 +1057,10 @@ static int __kprobes aggr_break_handler(struct kprobe *p, struct pt_regs *regs)
1065 reset_kprobe_instance(); 1057 reset_kprobe_instance();
1066 return ret; 1058 return ret;
1067} 1059}
1060NOKPROBE_SYMBOL(aggr_break_handler);
1068 1061
1069/* Walks the list and increments nmissed count for multiprobe case */ 1062/* Walks the list and increments nmissed count for multiprobe case */
1070void __kprobes kprobes_inc_nmissed_count(struct kprobe *p) 1063void kprobes_inc_nmissed_count(struct kprobe *p)
1071{ 1064{
1072 struct kprobe *kp; 1065 struct kprobe *kp;
1073 if (!kprobe_aggrprobe(p)) { 1066 if (!kprobe_aggrprobe(p)) {
@@ -1078,9 +1071,10 @@ void __kprobes kprobes_inc_nmissed_count(struct kprobe *p)
1078 } 1071 }
1079 return; 1072 return;
1080} 1073}
1074NOKPROBE_SYMBOL(kprobes_inc_nmissed_count);
1081 1075
1082void __kprobes recycle_rp_inst(struct kretprobe_instance *ri, 1076void recycle_rp_inst(struct kretprobe_instance *ri,
1083 struct hlist_head *head) 1077 struct hlist_head *head)
1084{ 1078{
1085 struct kretprobe *rp = ri->rp; 1079 struct kretprobe *rp = ri->rp;
1086 1080
@@ -1095,8 +1089,9 @@ void __kprobes recycle_rp_inst(struct kretprobe_instance *ri,
1095 /* Unregistering */ 1089 /* Unregistering */
1096 hlist_add_head(&ri->hlist, head); 1090 hlist_add_head(&ri->hlist, head);
1097} 1091}
1092NOKPROBE_SYMBOL(recycle_rp_inst);
1098 1093
1099void __kprobes kretprobe_hash_lock(struct task_struct *tsk, 1094void kretprobe_hash_lock(struct task_struct *tsk,
1100 struct hlist_head **head, unsigned long *flags) 1095 struct hlist_head **head, unsigned long *flags)
1101__acquires(hlist_lock) 1096__acquires(hlist_lock)
1102{ 1097{
@@ -1107,17 +1102,19 @@ __acquires(hlist_lock)
1107 hlist_lock = kretprobe_table_lock_ptr(hash); 1102 hlist_lock = kretprobe_table_lock_ptr(hash);
1108 raw_spin_lock_irqsave(hlist_lock, *flags); 1103 raw_spin_lock_irqsave(hlist_lock, *flags);
1109} 1104}
1105NOKPROBE_SYMBOL(kretprobe_hash_lock);
1110 1106
1111static void __kprobes kretprobe_table_lock(unsigned long hash, 1107static void kretprobe_table_lock(unsigned long hash,
1112 unsigned long *flags) 1108 unsigned long *flags)
1113__acquires(hlist_lock) 1109__acquires(hlist_lock)
1114{ 1110{
1115 raw_spinlock_t *hlist_lock = kretprobe_table_lock_ptr(hash); 1111 raw_spinlock_t *hlist_lock = kretprobe_table_lock_ptr(hash);
1116 raw_spin_lock_irqsave(hlist_lock, *flags); 1112 raw_spin_lock_irqsave(hlist_lock, *flags);
1117} 1113}
1114NOKPROBE_SYMBOL(kretprobe_table_lock);
1118 1115
1119void __kprobes kretprobe_hash_unlock(struct task_struct *tsk, 1116void kretprobe_hash_unlock(struct task_struct *tsk,
1120 unsigned long *flags) 1117 unsigned long *flags)
1121__releases(hlist_lock) 1118__releases(hlist_lock)
1122{ 1119{
1123 unsigned long hash = hash_ptr(tsk, KPROBE_HASH_BITS); 1120 unsigned long hash = hash_ptr(tsk, KPROBE_HASH_BITS);
@@ -1126,14 +1123,16 @@ __releases(hlist_lock)
1126 hlist_lock = kretprobe_table_lock_ptr(hash); 1123 hlist_lock = kretprobe_table_lock_ptr(hash);
1127 raw_spin_unlock_irqrestore(hlist_lock, *flags); 1124 raw_spin_unlock_irqrestore(hlist_lock, *flags);
1128} 1125}
1126NOKPROBE_SYMBOL(kretprobe_hash_unlock);
1129 1127
1130static void __kprobes kretprobe_table_unlock(unsigned long hash, 1128static void kretprobe_table_unlock(unsigned long hash,
1131 unsigned long *flags) 1129 unsigned long *flags)
1132__releases(hlist_lock) 1130__releases(hlist_lock)
1133{ 1131{
1134 raw_spinlock_t *hlist_lock = kretprobe_table_lock_ptr(hash); 1132 raw_spinlock_t *hlist_lock = kretprobe_table_lock_ptr(hash);
1135 raw_spin_unlock_irqrestore(hlist_lock, *flags); 1133 raw_spin_unlock_irqrestore(hlist_lock, *flags);
1136} 1134}
1135NOKPROBE_SYMBOL(kretprobe_table_unlock);
1137 1136
1138/* 1137/*
1139 * This function is called from finish_task_switch when task tk becomes dead, 1138 * This function is called from finish_task_switch when task tk becomes dead,
@@ -1141,7 +1140,7 @@ __releases(hlist_lock)
1141 * with this task. These left over instances represent probed functions 1140 * with this task. These left over instances represent probed functions
1142 * that have been called but will never return. 1141 * that have been called but will never return.
1143 */ 1142 */
1144void __kprobes kprobe_flush_task(struct task_struct *tk) 1143void kprobe_flush_task(struct task_struct *tk)
1145{ 1144{
1146 struct kretprobe_instance *ri; 1145 struct kretprobe_instance *ri;
1147 struct hlist_head *head, empty_rp; 1146 struct hlist_head *head, empty_rp;
@@ -1166,6 +1165,7 @@ void __kprobes kprobe_flush_task(struct task_struct *tk)
1166 kfree(ri); 1165 kfree(ri);
1167 } 1166 }
1168} 1167}
1168NOKPROBE_SYMBOL(kprobe_flush_task);
1169 1169
1170static inline void free_rp_inst(struct kretprobe *rp) 1170static inline void free_rp_inst(struct kretprobe *rp)
1171{ 1171{
@@ -1178,7 +1178,7 @@ static inline void free_rp_inst(struct kretprobe *rp)
1178 } 1178 }
1179} 1179}
1180 1180
1181static void __kprobes cleanup_rp_inst(struct kretprobe *rp) 1181static void cleanup_rp_inst(struct kretprobe *rp)
1182{ 1182{
1183 unsigned long flags, hash; 1183 unsigned long flags, hash;
1184 struct kretprobe_instance *ri; 1184 struct kretprobe_instance *ri;
@@ -1197,12 +1197,13 @@ static void __kprobes cleanup_rp_inst(struct kretprobe *rp)
1197 } 1197 }
1198 free_rp_inst(rp); 1198 free_rp_inst(rp);
1199} 1199}
1200NOKPROBE_SYMBOL(cleanup_rp_inst);
1200 1201
1201/* 1202/*
1202* Add the new probe to ap->list. Fail if this is the 1203* Add the new probe to ap->list. Fail if this is the
1203* second jprobe at the address - two jprobes can't coexist 1204* second jprobe at the address - two jprobes can't coexist
1204*/ 1205*/
1205static int __kprobes add_new_kprobe(struct kprobe *ap, struct kprobe *p) 1206static int add_new_kprobe(struct kprobe *ap, struct kprobe *p)
1206{ 1207{
1207 BUG_ON(kprobe_gone(ap) || kprobe_gone(p)); 1208 BUG_ON(kprobe_gone(ap) || kprobe_gone(p));
1208 1209
@@ -1226,7 +1227,7 @@ static int __kprobes add_new_kprobe(struct kprobe *ap, struct kprobe *p)
1226 * Fill in the required fields of the "manager kprobe". Replace the 1227 * Fill in the required fields of the "manager kprobe". Replace the
1227 * earlier kprobe in the hlist with the manager kprobe 1228 * earlier kprobe in the hlist with the manager kprobe
1228 */ 1229 */
1229static void __kprobes init_aggr_kprobe(struct kprobe *ap, struct kprobe *p) 1230static void init_aggr_kprobe(struct kprobe *ap, struct kprobe *p)
1230{ 1231{
1231 /* Copy p's insn slot to ap */ 1232 /* Copy p's insn slot to ap */
1232 copy_kprobe(p, ap); 1233 copy_kprobe(p, ap);
@@ -1252,8 +1253,7 @@ static void __kprobes init_aggr_kprobe(struct kprobe *ap, struct kprobe *p)
1252 * This is the second or subsequent kprobe at the address - handle 1253 * This is the second or subsequent kprobe at the address - handle
1253 * the intricacies 1254 * the intricacies
1254 */ 1255 */
1255static int __kprobes register_aggr_kprobe(struct kprobe *orig_p, 1256static int register_aggr_kprobe(struct kprobe *orig_p, struct kprobe *p)
1256 struct kprobe *p)
1257{ 1257{
1258 int ret = 0; 1258 int ret = 0;
1259 struct kprobe *ap = orig_p; 1259 struct kprobe *ap = orig_p;
@@ -1324,25 +1324,29 @@ out:
1324 return ret; 1324 return ret;
1325} 1325}
1326 1326
1327static int __kprobes in_kprobes_functions(unsigned long addr) 1327bool __weak arch_within_kprobe_blacklist(unsigned long addr)
1328{ 1328{
1329 struct kprobe_blackpoint *kb; 1329 /* The __kprobes marked functions and entry code must not be probed */
1330 return addr >= (unsigned long)__kprobes_text_start &&
1331 addr < (unsigned long)__kprobes_text_end;
1332}
1330 1333
1331 if (addr >= (unsigned long)__kprobes_text_start && 1334static bool within_kprobe_blacklist(unsigned long addr)
1332 addr < (unsigned long)__kprobes_text_end) 1335{
1333 return -EINVAL; 1336 struct kprobe_blacklist_entry *ent;
1337
1338 if (arch_within_kprobe_blacklist(addr))
1339 return true;
1334 /* 1340 /*
1335 * If there exists a kprobe_blacklist, verify and 1341 * If there exists a kprobe_blacklist, verify and
1336 * fail any probe registration in the prohibited area 1342 * fail any probe registration in the prohibited area
1337 */ 1343 */
1338 for (kb = kprobe_blacklist; kb->name != NULL; kb++) { 1344 list_for_each_entry(ent, &kprobe_blacklist, list) {
1339 if (kb->start_addr) { 1345 if (addr >= ent->start_addr && addr < ent->end_addr)
1340 if (addr >= kb->start_addr && 1346 return true;
1341 addr < (kb->start_addr + kb->range))
1342 return -EINVAL;
1343 }
1344 } 1347 }
1345 return 0; 1348
1349 return false;
1346} 1350}
1347 1351
1348/* 1352/*
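
within_kprobe_blacklist() now checks address ranges instead of comparing symbol names. A reduced stand-alone version of the lookup is shown below; the addresses and the fixed array are invented for the example, while the kernel keeps the entries on a list that is filled at boot.

#include <stdbool.h>
#include <stdio.h>

struct blacklist_entry {
	unsigned long start_addr;
	unsigned long end_addr;   /* exclusive */
};

/* Each entry covers one function that must not be probed. */
static const struct blacklist_entry blacklist[] = {
	{ 0x1000, 0x1080 },
	{ 0x2040, 0x20c0 },
};

static bool within_blacklist(unsigned long addr)
{
	for (size_t i = 0; i < sizeof(blacklist) / sizeof(blacklist[0]); i++)
		if (addr >= blacklist[i].start_addr &&
		    addr <  blacklist[i].end_addr)
			return true;
	return false;
}

int main(void)
{
	printf("%d %d\n", within_blacklist(0x1010), within_blacklist(0x3000));
	return 0;
}
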
@@ -1351,7 +1355,7 @@ static int __kprobes in_kprobes_functions(unsigned long addr)
1351 * This returns encoded errors if it fails to look up symbol or invalid 1355 * This returns encoded errors if it fails to look up symbol or invalid
1352 * combination of parameters. 1356 * combination of parameters.
1353 */ 1357 */
1354static kprobe_opcode_t __kprobes *kprobe_addr(struct kprobe *p) 1358static kprobe_opcode_t *kprobe_addr(struct kprobe *p)
1355{ 1359{
1356 kprobe_opcode_t *addr = p->addr; 1360 kprobe_opcode_t *addr = p->addr;
1357 1361
@@ -1374,7 +1378,7 @@ invalid:
1374} 1378}
1375 1379
1376/* Check passed kprobe is valid and return kprobe in kprobe_table. */ 1380/* Check passed kprobe is valid and return kprobe in kprobe_table. */
1377static struct kprobe * __kprobes __get_valid_kprobe(struct kprobe *p) 1381static struct kprobe *__get_valid_kprobe(struct kprobe *p)
1378{ 1382{
1379 struct kprobe *ap, *list_p; 1383 struct kprobe *ap, *list_p;
1380 1384
@@ -1406,8 +1410,8 @@ static inline int check_kprobe_rereg(struct kprobe *p)
1406 return ret; 1410 return ret;
1407} 1411}
1408 1412
1409static __kprobes int check_kprobe_address_safe(struct kprobe *p, 1413static int check_kprobe_address_safe(struct kprobe *p,
1410 struct module **probed_mod) 1414 struct module **probed_mod)
1411{ 1415{
1412 int ret = 0; 1416 int ret = 0;
1413 unsigned long ftrace_addr; 1417 unsigned long ftrace_addr;
@@ -1433,7 +1437,7 @@ static __kprobes int check_kprobe_address_safe(struct kprobe *p,
1433 1437
1434 /* Ensure it is not in reserved area nor out of text */ 1438 /* Ensure it is not in reserved area nor out of text */
1435 if (!kernel_text_address((unsigned long) p->addr) || 1439 if (!kernel_text_address((unsigned long) p->addr) ||
1436 in_kprobes_functions((unsigned long) p->addr) || 1440 within_kprobe_blacklist((unsigned long) p->addr) ||
1437 jump_label_text_reserved(p->addr, p->addr)) { 1441 jump_label_text_reserved(p->addr, p->addr)) {
1438 ret = -EINVAL; 1442 ret = -EINVAL;
1439 goto out; 1443 goto out;
@@ -1469,7 +1473,7 @@ out:
1469 return ret; 1473 return ret;
1470} 1474}
1471 1475
1472int __kprobes register_kprobe(struct kprobe *p) 1476int register_kprobe(struct kprobe *p)
1473{ 1477{
1474 int ret; 1478 int ret;
1475 struct kprobe *old_p; 1479 struct kprobe *old_p;
@@ -1531,7 +1535,7 @@ out:
1531EXPORT_SYMBOL_GPL(register_kprobe); 1535EXPORT_SYMBOL_GPL(register_kprobe);
1532 1536
1533/* Check if all probes on the aggrprobe are disabled */ 1537/* Check if all probes on the aggrprobe are disabled */
1534static int __kprobes aggr_kprobe_disabled(struct kprobe *ap) 1538static int aggr_kprobe_disabled(struct kprobe *ap)
1535{ 1539{
1536 struct kprobe *kp; 1540 struct kprobe *kp;
1537 1541
@@ -1547,7 +1551,7 @@ static int __kprobes aggr_kprobe_disabled(struct kprobe *ap)
1547} 1551}
1548 1552
1549/* Disable one kprobe: Make sure called under kprobe_mutex is locked */ 1553/* Disable one kprobe: Make sure called under kprobe_mutex is locked */
1550static struct kprobe *__kprobes __disable_kprobe(struct kprobe *p) 1554static struct kprobe *__disable_kprobe(struct kprobe *p)
1551{ 1555{
1552 struct kprobe *orig_p; 1556 struct kprobe *orig_p;
1553 1557
@@ -1574,7 +1578,7 @@ static struct kprobe *__kprobes __disable_kprobe(struct kprobe *p)
1574/* 1578/*
1575 * Unregister a kprobe without a scheduler synchronization. 1579 * Unregister a kprobe without a scheduler synchronization.
1576 */ 1580 */
1577static int __kprobes __unregister_kprobe_top(struct kprobe *p) 1581static int __unregister_kprobe_top(struct kprobe *p)
1578{ 1582{
1579 struct kprobe *ap, *list_p; 1583 struct kprobe *ap, *list_p;
1580 1584
@@ -1631,7 +1635,7 @@ disarmed:
1631 return 0; 1635 return 0;
1632} 1636}
1633 1637
1634static void __kprobes __unregister_kprobe_bottom(struct kprobe *p) 1638static void __unregister_kprobe_bottom(struct kprobe *p)
1635{ 1639{
1636 struct kprobe *ap; 1640 struct kprobe *ap;
1637 1641
@@ -1647,7 +1651,7 @@ static void __kprobes __unregister_kprobe_bottom(struct kprobe *p)
1647 /* Otherwise, do nothing. */ 1651 /* Otherwise, do nothing. */
1648} 1652}
1649 1653
1650int __kprobes register_kprobes(struct kprobe **kps, int num) 1654int register_kprobes(struct kprobe **kps, int num)
1651{ 1655{
1652 int i, ret = 0; 1656 int i, ret = 0;
1653 1657
@@ -1665,13 +1669,13 @@ int __kprobes register_kprobes(struct kprobe **kps, int num)
1665} 1669}
1666EXPORT_SYMBOL_GPL(register_kprobes); 1670EXPORT_SYMBOL_GPL(register_kprobes);
1667 1671
1668void __kprobes unregister_kprobe(struct kprobe *p) 1672void unregister_kprobe(struct kprobe *p)
1669{ 1673{
1670 unregister_kprobes(&p, 1); 1674 unregister_kprobes(&p, 1);
1671} 1675}
1672EXPORT_SYMBOL_GPL(unregister_kprobe); 1676EXPORT_SYMBOL_GPL(unregister_kprobe);
1673 1677
1674void __kprobes unregister_kprobes(struct kprobe **kps, int num) 1678void unregister_kprobes(struct kprobe **kps, int num)
1675{ 1679{
1676 int i; 1680 int i;
1677 1681
@@ -1700,7 +1704,7 @@ unsigned long __weak arch_deref_entry_point(void *entry)
1700 return (unsigned long)entry; 1704 return (unsigned long)entry;
1701} 1705}
1702 1706
1703int __kprobes register_jprobes(struct jprobe **jps, int num) 1707int register_jprobes(struct jprobe **jps, int num)
1704{ 1708{
1705 struct jprobe *jp; 1709 struct jprobe *jp;
1706 int ret = 0, i; 1710 int ret = 0, i;
@@ -1731,19 +1735,19 @@ int __kprobes register_jprobes(struct jprobe **jps, int num)
1731} 1735}
1732EXPORT_SYMBOL_GPL(register_jprobes); 1736EXPORT_SYMBOL_GPL(register_jprobes);
1733 1737
1734int __kprobes register_jprobe(struct jprobe *jp) 1738int register_jprobe(struct jprobe *jp)
1735{ 1739{
1736 return register_jprobes(&jp, 1); 1740 return register_jprobes(&jp, 1);
1737} 1741}
1738EXPORT_SYMBOL_GPL(register_jprobe); 1742EXPORT_SYMBOL_GPL(register_jprobe);
1739 1743
1740void __kprobes unregister_jprobe(struct jprobe *jp) 1744void unregister_jprobe(struct jprobe *jp)
1741{ 1745{
1742 unregister_jprobes(&jp, 1); 1746 unregister_jprobes(&jp, 1);
1743} 1747}
1744EXPORT_SYMBOL_GPL(unregister_jprobe); 1748EXPORT_SYMBOL_GPL(unregister_jprobe);
1745 1749
1746void __kprobes unregister_jprobes(struct jprobe **jps, int num) 1750void unregister_jprobes(struct jprobe **jps, int num)
1747{ 1751{
1748 int i; 1752 int i;
1749 1753
@@ -1768,8 +1772,7 @@ EXPORT_SYMBOL_GPL(unregister_jprobes);
1768 * This kprobe pre_handler is registered with every kretprobe. When probe 1772 * This kprobe pre_handler is registered with every kretprobe. When probe
1769 * hits it will set up the return probe. 1773 * hits it will set up the return probe.
1770 */ 1774 */
1771static int __kprobes pre_handler_kretprobe(struct kprobe *p, 1775static int pre_handler_kretprobe(struct kprobe *p, struct pt_regs *regs)
1772 struct pt_regs *regs)
1773{ 1776{
1774 struct kretprobe *rp = container_of(p, struct kretprobe, kp); 1777 struct kretprobe *rp = container_of(p, struct kretprobe, kp);
1775 unsigned long hash, flags = 0; 1778 unsigned long hash, flags = 0;
@@ -1807,8 +1810,9 @@ static int __kprobes pre_handler_kretprobe(struct kprobe *p,
1807 } 1810 }
1808 return 0; 1811 return 0;
1809} 1812}
1813NOKPROBE_SYMBOL(pre_handler_kretprobe);
1810 1814
1811int __kprobes register_kretprobe(struct kretprobe *rp) 1815int register_kretprobe(struct kretprobe *rp)
1812{ 1816{
1813 int ret = 0; 1817 int ret = 0;
1814 struct kretprobe_instance *inst; 1818 struct kretprobe_instance *inst;
@@ -1861,7 +1865,7 @@ int __kprobes register_kretprobe(struct kretprobe *rp)
1861} 1865}
1862EXPORT_SYMBOL_GPL(register_kretprobe); 1866EXPORT_SYMBOL_GPL(register_kretprobe);
1863 1867
1864int __kprobes register_kretprobes(struct kretprobe **rps, int num) 1868int register_kretprobes(struct kretprobe **rps, int num)
1865{ 1869{
1866 int ret = 0, i; 1870 int ret = 0, i;
1867 1871
@@ -1879,13 +1883,13 @@ int __kprobes register_kretprobes(struct kretprobe **rps, int num)
1879} 1883}
1880EXPORT_SYMBOL_GPL(register_kretprobes); 1884EXPORT_SYMBOL_GPL(register_kretprobes);
1881 1885
1882void __kprobes unregister_kretprobe(struct kretprobe *rp) 1886void unregister_kretprobe(struct kretprobe *rp)
1883{ 1887{
1884 unregister_kretprobes(&rp, 1); 1888 unregister_kretprobes(&rp, 1);
1885} 1889}
1886EXPORT_SYMBOL_GPL(unregister_kretprobe); 1890EXPORT_SYMBOL_GPL(unregister_kretprobe);
1887 1891
1888void __kprobes unregister_kretprobes(struct kretprobe **rps, int num) 1892void unregister_kretprobes(struct kretprobe **rps, int num)
1889{ 1893{
1890 int i; 1894 int i;
1891 1895
@@ -1908,38 +1912,38 @@ void __kprobes unregister_kretprobes(struct kretprobe **rps, int num)
1908EXPORT_SYMBOL_GPL(unregister_kretprobes); 1912EXPORT_SYMBOL_GPL(unregister_kretprobes);
1909 1913
1910#else /* CONFIG_KRETPROBES */ 1914#else /* CONFIG_KRETPROBES */
1911int __kprobes register_kretprobe(struct kretprobe *rp) 1915int register_kretprobe(struct kretprobe *rp)
1912{ 1916{
1913 return -ENOSYS; 1917 return -ENOSYS;
1914} 1918}
1915EXPORT_SYMBOL_GPL(register_kretprobe); 1919EXPORT_SYMBOL_GPL(register_kretprobe);
1916 1920
1917int __kprobes register_kretprobes(struct kretprobe **rps, int num) 1921int register_kretprobes(struct kretprobe **rps, int num)
1918{ 1922{
1919 return -ENOSYS; 1923 return -ENOSYS;
1920} 1924}
1921EXPORT_SYMBOL_GPL(register_kretprobes); 1925EXPORT_SYMBOL_GPL(register_kretprobes);
1922 1926
1923void __kprobes unregister_kretprobe(struct kretprobe *rp) 1927void unregister_kretprobe(struct kretprobe *rp)
1924{ 1928{
1925} 1929}
1926EXPORT_SYMBOL_GPL(unregister_kretprobe); 1930EXPORT_SYMBOL_GPL(unregister_kretprobe);
1927 1931
1928void __kprobes unregister_kretprobes(struct kretprobe **rps, int num) 1932void unregister_kretprobes(struct kretprobe **rps, int num)
1929{ 1933{
1930} 1934}
1931EXPORT_SYMBOL_GPL(unregister_kretprobes); 1935EXPORT_SYMBOL_GPL(unregister_kretprobes);
1932 1936
1933static int __kprobes pre_handler_kretprobe(struct kprobe *p, 1937static int pre_handler_kretprobe(struct kprobe *p, struct pt_regs *regs)
1934 struct pt_regs *regs)
1935{ 1938{
1936 return 0; 1939 return 0;
1937} 1940}
1941NOKPROBE_SYMBOL(pre_handler_kretprobe);
1938 1942
1939#endif /* CONFIG_KRETPROBES */ 1943#endif /* CONFIG_KRETPROBES */
1940 1944
1941/* Set the kprobe gone and remove its instruction buffer. */ 1945/* Set the kprobe gone and remove its instruction buffer. */
1942static void __kprobes kill_kprobe(struct kprobe *p) 1946static void kill_kprobe(struct kprobe *p)
1943{ 1947{
1944 struct kprobe *kp; 1948 struct kprobe *kp;
1945 1949
@@ -1963,7 +1967,7 @@ static void __kprobes kill_kprobe(struct kprobe *p)
1963} 1967}
1964 1968
1965/* Disable one kprobe */ 1969/* Disable one kprobe */
1966int __kprobes disable_kprobe(struct kprobe *kp) 1970int disable_kprobe(struct kprobe *kp)
1967{ 1971{
1968 int ret = 0; 1972 int ret = 0;
1969 1973
@@ -1979,7 +1983,7 @@ int __kprobes disable_kprobe(struct kprobe *kp)
1979EXPORT_SYMBOL_GPL(disable_kprobe); 1983EXPORT_SYMBOL_GPL(disable_kprobe);
1980 1984
1981/* Enable one kprobe */ 1985/* Enable one kprobe */
1982int __kprobes enable_kprobe(struct kprobe *kp) 1986int enable_kprobe(struct kprobe *kp)
1983{ 1987{
1984 int ret = 0; 1988 int ret = 0;
1985 struct kprobe *p; 1989 struct kprobe *p;
@@ -2012,16 +2016,49 @@ out:
2012} 2016}
2013EXPORT_SYMBOL_GPL(enable_kprobe); 2017EXPORT_SYMBOL_GPL(enable_kprobe);
2014 2018
2015void __kprobes dump_kprobe(struct kprobe *kp) 2019void dump_kprobe(struct kprobe *kp)
2016{ 2020{
2017 printk(KERN_WARNING "Dumping kprobe:\n"); 2021 printk(KERN_WARNING "Dumping kprobe:\n");
2018 printk(KERN_WARNING "Name: %s\nAddress: %p\nOffset: %x\n", 2022 printk(KERN_WARNING "Name: %s\nAddress: %p\nOffset: %x\n",
2019 kp->symbol_name, kp->addr, kp->offset); 2023 kp->symbol_name, kp->addr, kp->offset);
2020} 2024}
2025NOKPROBE_SYMBOL(dump_kprobe);
2026
2027/*
2028 * Lookup and populate the kprobe_blacklist.
2029 *
2030 * Unlike the kretprobe blacklist, we'll need to determine
2031 * the range of addresses that belong to the said functions,
2032 * since a kprobe need not necessarily be at the beginning
2033 * of a function.
2034 */
2035static int __init populate_kprobe_blacklist(unsigned long *start,
2036 unsigned long *end)
2037{
2038 unsigned long *iter;
2039 struct kprobe_blacklist_entry *ent;
2040 unsigned long offset = 0, size = 0;
2041
2042 for (iter = start; iter < end; iter++) {
2043 if (!kallsyms_lookup_size_offset(*iter, &size, &offset)) {
2044 pr_err("Failed to find blacklist %p\n", (void *)*iter);
2045 continue;
2046 }
2047
2048 ent = kmalloc(sizeof(*ent), GFP_KERNEL);
2049 if (!ent)
2050 return -ENOMEM;
2051 ent->start_addr = *iter;
2052 ent->end_addr = *iter + size;
2053 INIT_LIST_HEAD(&ent->list);
2054 list_add_tail(&ent->list, &kprobe_blacklist);
2055 }
2056 return 0;
2057}
2021 2058
2022/* Module notifier call back, checking kprobes on the module */ 2059/* Module notifier call back, checking kprobes on the module */
2023static int __kprobes kprobes_module_callback(struct notifier_block *nb, 2060static int kprobes_module_callback(struct notifier_block *nb,
2024 unsigned long val, void *data) 2061 unsigned long val, void *data)
2025{ 2062{
2026 struct module *mod = data; 2063 struct module *mod = data;
2027 struct hlist_head *head; 2064 struct hlist_head *head;
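
populate_kprobe_blacklist() turns each raw symbol address recorded in the _kprobe_blacklist section into a [start, start + size) range via kallsyms_lookup_size_offset(), because a kprobe can land anywhere inside a function, not just at its entry. The sketch below models that step with a fake symbol table standing in for kallsyms; all names, addresses and sizes are invented.

#include <stdio.h>

struct sym { unsigned long addr, size; };
struct range { unsigned long start, end; };

/* Fake symbol table: function address -> function size. */
static const struct sym symtab[] = {
	{ 0x1000, 0x80 },
	{ 0x2040, 0x80 },
};

static int lookup_size(unsigned long addr, unsigned long *size)
{
	for (size_t i = 0; i < sizeof(symtab) / sizeof(symtab[0]); i++)
		if (symtab[i].addr == addr) {
			*size = symtab[i].size;
			return 1;
		}
	return 0;
}

/* Expand section contents (raw addresses) into address ranges. */
static size_t populate(const unsigned long *start, const unsigned long *stop,
		       struct range *out)
{
	size_t n = 0;

	for (const unsigned long *iter = start; iter < stop; iter++) {
		unsigned long size;

		if (!lookup_size(*iter, &size))
			continue;       /* the kernel logs and skips too */
		out[n].start = *iter;
		out[n].end = *iter + size;
		n++;
	}
	return n;
}

int main(void)
{
	unsigned long addrs[] = { 0x1000, 0x2040, 0x9999 };
	struct range ranges[3];
	size_t n = populate(addrs, addrs + 3, ranges);

	for (size_t i = 0; i < n; i++)
		printf("%#lx-%#lx\n", ranges[i].start, ranges[i].end);
	return 0;
}
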
@@ -2062,14 +2099,13 @@ static struct notifier_block kprobe_module_nb = {
2062 .priority = 0 2099 .priority = 0
2063}; 2100};
2064 2101
2102/* Markers of _kprobe_blacklist section */
2103extern unsigned long __start_kprobe_blacklist[];
2104extern unsigned long __stop_kprobe_blacklist[];
2105
2065static int __init init_kprobes(void) 2106static int __init init_kprobes(void)
2066{ 2107{
2067 int i, err = 0; 2108 int i, err = 0;
2068 unsigned long offset = 0, size = 0;
2069 char *modname, namebuf[KSYM_NAME_LEN];
2070 const char *symbol_name;
2071 void *addr;
2072 struct kprobe_blackpoint *kb;
2073 2109
2074 /* FIXME allocate the probe table, currently defined statically */ 2110 /* FIXME allocate the probe table, currently defined statically */
2075 /* initialize all list heads */ 2111 /* initialize all list heads */
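
The __start_kprobe_blacklist/__stop_kprobe_blacklist markers are not defined anywhere in C: NOKPROBE_SYMBOL() places each address in the _kprobe_blacklist section, and because the section name is a valid C identifier the linker brackets it with start/stop symbols. The user-space sketch below reproduces the trick with an invented macro and section name; only the linker convention itself is the real mechanism (GNU ld and compatible linkers).

#include <stdio.h>

typedef void (*blacklist_fn)(void);

/* Record the address of an off-limits function in a dedicated
 * ELF section, mimicking what NOKPROBE_SYMBOL() does. */
#define MY_NOKPROBE(fn)						\
	static const blacklist_fn __bl_##fn			\
	__attribute__((section("my_blacklist"), used)) = fn

static void do_not_probe_me(void) { }
static void also_off_limits(void) { }

MY_NOKPROBE(do_not_probe_me);
MY_NOKPROBE(also_off_limits);

/* Synthesized by the linker for sections with C-identifier names. */
extern const blacklist_fn __start_my_blacklist[];
extern const blacklist_fn __stop_my_blacklist[];

int main(void)
{
	for (const blacklist_fn *p = __start_my_blacklist;
	     p < __stop_my_blacklist; p++)
		printf("blacklisted: %#lx\n", (unsigned long)*p);
	return 0;
}
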
@@ -2079,26 +2115,11 @@ static int __init init_kprobes(void)
2079 raw_spin_lock_init(&(kretprobe_table_locks[i].lock)); 2115 raw_spin_lock_init(&(kretprobe_table_locks[i].lock));
2080 } 2116 }
2081 2117
2082 /* 2118 err = populate_kprobe_blacklist(__start_kprobe_blacklist,
2083 * Lookup and populate the kprobe_blacklist. 2119 __stop_kprobe_blacklist);
2084 * 2120 if (err) {
2085 * Unlike the kretprobe blacklist, we'll need to determine 2121 pr_err("kprobes: failed to populate blacklist: %d\n", err);
2086 * the range of addresses that belong to the said functions, 2122 pr_err("Please take care of using kprobes.\n");
2087 * since a kprobe need not necessarily be at the beginning
2088 * of a function.
2089 */
2090 for (kb = kprobe_blacklist; kb->name != NULL; kb++) {
2091 kprobe_lookup_name(kb->name, addr);
2092 if (!addr)
2093 continue;
2094
2095 kb->start_addr = (unsigned long)addr;
2096 symbol_name = kallsyms_lookup(kb->start_addr,
2097 &size, &offset, &modname, namebuf);
2098 if (!symbol_name)
2099 kb->range = 0;
2100 else
2101 kb->range = size;
2102 } 2123 }
2103 2124
2104 if (kretprobe_blacklist_size) { 2125 if (kretprobe_blacklist_size) {
@@ -2138,7 +2159,7 @@ static int __init init_kprobes(void)
2138} 2159}
2139 2160
2140#ifdef CONFIG_DEBUG_FS 2161#ifdef CONFIG_DEBUG_FS
2141static void __kprobes report_probe(struct seq_file *pi, struct kprobe *p, 2162static void report_probe(struct seq_file *pi, struct kprobe *p,
2142 const char *sym, int offset, char *modname, struct kprobe *pp) 2163 const char *sym, int offset, char *modname, struct kprobe *pp)
2143{ 2164{
2144 char *kprobe_type; 2165 char *kprobe_type;
@@ -2167,12 +2188,12 @@ static void __kprobes report_probe(struct seq_file *pi, struct kprobe *p,
2167 (kprobe_ftrace(pp) ? "[FTRACE]" : "")); 2188 (kprobe_ftrace(pp) ? "[FTRACE]" : ""));
2168} 2189}
2169 2190
2170static void __kprobes *kprobe_seq_start(struct seq_file *f, loff_t *pos) 2191static void *kprobe_seq_start(struct seq_file *f, loff_t *pos)
2171{ 2192{
2172 return (*pos < KPROBE_TABLE_SIZE) ? pos : NULL; 2193 return (*pos < KPROBE_TABLE_SIZE) ? pos : NULL;
2173} 2194}
2174 2195
2175static void __kprobes *kprobe_seq_next(struct seq_file *f, void *v, loff_t *pos) 2196static void *kprobe_seq_next(struct seq_file *f, void *v, loff_t *pos)
2176{ 2197{
2177 (*pos)++; 2198 (*pos)++;
2178 if (*pos >= KPROBE_TABLE_SIZE) 2199 if (*pos >= KPROBE_TABLE_SIZE)
@@ -2180,12 +2201,12 @@ static void __kprobes *kprobe_seq_next(struct seq_file *f, void *v, loff_t *pos)
2180 return pos; 2201 return pos;
2181} 2202}
2182 2203
2183static void __kprobes kprobe_seq_stop(struct seq_file *f, void *v) 2204static void kprobe_seq_stop(struct seq_file *f, void *v)
2184{ 2205{
2185 /* Nothing to do */ 2206 /* Nothing to do */
2186} 2207}
2187 2208
2188static int __kprobes show_kprobe_addr(struct seq_file *pi, void *v) 2209static int show_kprobe_addr(struct seq_file *pi, void *v)
2189{ 2210{
2190 struct hlist_head *head; 2211 struct hlist_head *head;
2191 struct kprobe *p, *kp; 2212 struct kprobe *p, *kp;
@@ -2216,7 +2237,7 @@ static const struct seq_operations kprobes_seq_ops = {
2216 .show = show_kprobe_addr 2237 .show = show_kprobe_addr
2217}; 2238};
2218 2239
2219static int __kprobes kprobes_open(struct inode *inode, struct file *filp) 2240static int kprobes_open(struct inode *inode, struct file *filp)
2220{ 2241{
2221 return seq_open(filp, &kprobes_seq_ops); 2242 return seq_open(filp, &kprobes_seq_ops);
2222} 2243}
@@ -2228,7 +2249,47 @@ static const struct file_operations debugfs_kprobes_operations = {
2228 .release = seq_release, 2249 .release = seq_release,
2229}; 2250};
2230 2251
2231static void __kprobes arm_all_kprobes(void) 2252/* kprobes/blacklist -- shows which functions can not be probed */
2253static void *kprobe_blacklist_seq_start(struct seq_file *m, loff_t *pos)
2254{
2255 return seq_list_start(&kprobe_blacklist, *pos);
2256}
2257
2258static void *kprobe_blacklist_seq_next(struct seq_file *m, void *v, loff_t *pos)
2259{
2260 return seq_list_next(v, &kprobe_blacklist, pos);
2261}
2262
2263static int kprobe_blacklist_seq_show(struct seq_file *m, void *v)
2264{
2265 struct kprobe_blacklist_entry *ent =
2266 list_entry(v, struct kprobe_blacklist_entry, list);
2267
2268 seq_printf(m, "0x%p-0x%p\t%ps\n", (void *)ent->start_addr,
2269 (void *)ent->end_addr, (void *)ent->start_addr);
2270 return 0;
2271}
2272
2273static const struct seq_operations kprobe_blacklist_seq_ops = {
2274 .start = kprobe_blacklist_seq_start,
2275 .next = kprobe_blacklist_seq_next,
2276 .stop = kprobe_seq_stop, /* Reuse void function */
2277 .show = kprobe_blacklist_seq_show,
2278};
2279
2280static int kprobe_blacklist_open(struct inode *inode, struct file *filp)
2281{
2282 return seq_open(filp, &kprobe_blacklist_seq_ops);
2283}
2284
2285static const struct file_operations debugfs_kprobe_blacklist_ops = {
2286 .open = kprobe_blacklist_open,
2287 .read = seq_read,
2288 .llseek = seq_lseek,
2289 .release = seq_release,
2290};
2291
2292static void arm_all_kprobes(void)
2232{ 2293{
2233 struct hlist_head *head; 2294 struct hlist_head *head;
2234 struct kprobe *p; 2295 struct kprobe *p;
@@ -2256,7 +2317,7 @@ already_enabled:
2256 return; 2317 return;
2257} 2318}
2258 2319
2259static void __kprobes disarm_all_kprobes(void) 2320static void disarm_all_kprobes(void)
2260{ 2321{
2261 struct hlist_head *head; 2322 struct hlist_head *head;
2262 struct kprobe *p; 2323 struct kprobe *p;
@@ -2340,7 +2401,7 @@ static const struct file_operations fops_kp = {
2340 .llseek = default_llseek, 2401 .llseek = default_llseek,
2341}; 2402};
2342 2403
2343static int __kprobes debugfs_kprobe_init(void) 2404static int __init debugfs_kprobe_init(void)
2344{ 2405{
2345 struct dentry *dir, *file; 2406 struct dentry *dir, *file;
2346 unsigned int value = 1; 2407 unsigned int value = 1;
@@ -2351,19 +2412,24 @@ static int __kprobes debugfs_kprobe_init(void)
2351 2412
2352 file = debugfs_create_file("list", 0444, dir, NULL, 2413 file = debugfs_create_file("list", 0444, dir, NULL,
2353 &debugfs_kprobes_operations); 2414 &debugfs_kprobes_operations);
2354 if (!file) { 2415 if (!file)
2355 debugfs_remove(dir); 2416 goto error;
2356 return -ENOMEM;
2357 }
2358 2417
2359 file = debugfs_create_file("enabled", 0600, dir, 2418 file = debugfs_create_file("enabled", 0600, dir,
2360 &value, &fops_kp); 2419 &value, &fops_kp);
2361 if (!file) { 2420 if (!file)
2362 debugfs_remove(dir); 2421 goto error;
2363 return -ENOMEM; 2422
2364 } 2423 file = debugfs_create_file("blacklist", 0444, dir, NULL,
2424 &debugfs_kprobe_blacklist_ops);
2425 if (!file)
2426 goto error;
2365 2427
2366 return 0; 2428 return 0;
2429
2430error:
2431 debugfs_remove(dir);
2432 return -ENOMEM;
2367} 2433}
2368 2434
2369late_initcall(debugfs_kprobe_init); 2435late_initcall(debugfs_kprobe_init);
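The kprobes hunks above replace the old name-based blacklist lookup with populate_kprobe_blacklist() walking the linker-provided _kprobe_blacklist section, and expose the result through a new debugfs "blacklist" file whose lines are produced by kprobe_blacklist_seq_show(). A minimal userspace sketch for reading that file, assuming debugfs is mounted at the conventional /sys/kernel/debug and that the parent directory created earlier in this file is named "kprobes":

/* Not part of the patch: dump the new blacklist file from userspace. */
#include <stdio.h>

int main(void)
{
	char line[256];
	FILE *f = fopen("/sys/kernel/debug/kprobes/blacklist", "r");

	if (!f) {
		perror("fopen");
		return 1;
	}
	/* Each line is "0x<start>-0x<end>\t<symbol>", matching the
	 * seq_printf() in kprobe_blacklist_seq_show() above. */
	while (fgets(line, sizeof(line), f))
		fputs(line, stdout);
	fclose(f);
	return 0;
}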
diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c
index 2495a9b14ac8..6683ccef9fff 100644
--- a/kernel/ksysfs.c
+++ b/kernel/ksysfs.c
@@ -37,6 +37,7 @@ static ssize_t uevent_seqnum_show(struct kobject *kobj,
37} 37}
38KERNEL_ATTR_RO(uevent_seqnum); 38KERNEL_ATTR_RO(uevent_seqnum);
39 39
40#ifdef CONFIG_UEVENT_HELPER
40/* uevent helper program, used during early boot */ 41/* uevent helper program, used during early boot */
41static ssize_t uevent_helper_show(struct kobject *kobj, 42static ssize_t uevent_helper_show(struct kobject *kobj,
42 struct kobj_attribute *attr, char *buf) 43 struct kobj_attribute *attr, char *buf)
@@ -56,7 +57,7 @@ static ssize_t uevent_helper_store(struct kobject *kobj,
56 return count; 57 return count;
57} 58}
58KERNEL_ATTR_RW(uevent_helper); 59KERNEL_ATTR_RW(uevent_helper);
59 60#endif
60 61
61#ifdef CONFIG_PROFILING 62#ifdef CONFIG_PROFILING
62static ssize_t profiling_show(struct kobject *kobj, 63static ssize_t profiling_show(struct kobject *kobj,
@@ -189,7 +190,9 @@ EXPORT_SYMBOL_GPL(kernel_kobj);
189static struct attribute * kernel_attrs[] = { 190static struct attribute * kernel_attrs[] = {
190 &fscaps_attr.attr, 191 &fscaps_attr.attr,
191 &uevent_seqnum_attr.attr, 192 &uevent_seqnum_attr.attr,
193#ifdef CONFIG_UEVENT_HELPER
192 &uevent_helper_attr.attr, 194 &uevent_helper_attr.attr,
195#endif
193#ifdef CONFIG_PROFILING 196#ifdef CONFIG_PROFILING
194 &profiling_attr.attr, 197 &profiling_attr.attr,
195#endif 198#endif
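The ksysfs hunks compile the uevent_helper attribute only when CONFIG_UEVENT_HELPER is enabled, so the sysfs file disappears entirely instead of remaining as a dead knob. A sketch of the same guarding pattern for a made-up attribute (CONFIG_FOO, foo_show and example_attrs are illustrative names, not from this patch):

#include <linux/kernel.h>
#include <linux/kobject.h>
#include <linux/sysfs.h>

#ifdef CONFIG_FOO
/* Both the attribute and its array slot are guarded, as done above
 * for uevent_helper, so no stale node shows up when FOO is off. */
static ssize_t foo_show(struct kobject *kobj, struct kobj_attribute *attr,
			char *buf)
{
	return sprintf(buf, "%d\n", 42);
}
static struct kobj_attribute foo_attr = __ATTR_RO(foo);
#endif

static struct attribute *example_attrs[] = {
#ifdef CONFIG_FOO
	&foo_attr.attr,
#endif
	NULL,
};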
diff --git a/kernel/kthread.c b/kernel/kthread.c
index 9a130ec06f7a..c2390f41307b 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -262,7 +262,7 @@ static void create_kthread(struct kthread_create_info *create)
262 * kthread_stop() has been called). The return value should be zero 262 * kthread_stop() has been called). The return value should be zero
263 * or a negative error number; it will be passed to kthread_stop(). 263 * or a negative error number; it will be passed to kthread_stop().
264 * 264 *
265 * Returns a task_struct or ERR_PTR(-ENOMEM). 265 * Returns a task_struct or ERR_PTR(-ENOMEM) or ERR_PTR(-EINTR).
266 */ 266 */
267struct task_struct *kthread_create_on_node(int (*threadfn)(void *data), 267struct task_struct *kthread_create_on_node(int (*threadfn)(void *data),
268 void *data, int node, 268 void *data, int node,
@@ -298,7 +298,7 @@ struct task_struct *kthread_create_on_node(int (*threadfn)(void *data),
298 * that thread. 298 * that thread.
299 */ 299 */
300 if (xchg(&create->done, NULL)) 300 if (xchg(&create->done, NULL))
301 return ERR_PTR(-ENOMEM); 301 return ERR_PTR(-EINTR);
302 /* 302 /*
303 * kthreadd (or new kernel thread) will call complete() 303 * kthreadd (or new kernel thread) will call complete()
304 * shortly. 304 * shortly.
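The kthread fix makes kthread_create_on_node() return ERR_PTR(-EINTR), rather than a misleading -ENOMEM, when the requesting task is killed before kthreadd services the request, and documents that in the kerneldoc. A caller-side sketch of the error handling this implies (my_worker_fn and start_my_worker are hypothetical, not from the patch):

#include <linux/err.h>
#include <linux/kthread.h>
#include <linux/numa.h>
#include <linux/sched.h>

static int my_worker_fn(void *data)	/* hypothetical worker body */
{
	return 0;
}

static int start_my_worker(void *data)
{
	struct task_struct *tsk;

	tsk = kthread_create_on_node(my_worker_fn, data, NUMA_NO_NODE,
				     "my_worker");
	if (IS_ERR(tsk))
		return PTR_ERR(tsk);	/* now -EINTR as well as -ENOMEM */

	wake_up_process(tsk);
	return 0;
}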
diff --git a/kernel/latencytop.c b/kernel/latencytop.c
index a462b317f9a0..a02812743a7e 100644
--- a/kernel/latencytop.c
+++ b/kernel/latencytop.c
@@ -88,7 +88,8 @@ static void clear_global_latency_tracing(void)
88} 88}
89 89
90static void __sched 90static void __sched
91account_global_scheduler_latency(struct task_struct *tsk, struct latency_record *lat) 91account_global_scheduler_latency(struct task_struct *tsk,
92 struct latency_record *lat)
92{ 93{
93 int firstnonnull = MAXLR + 1; 94 int firstnonnull = MAXLR + 1;
94 int i; 95 int i;
@@ -255,7 +256,7 @@ static int lstats_show(struct seq_file *m, void *v)
255 break; 256 break;
256 seq_printf(m, " %ps", (void *)bt); 257 seq_printf(m, " %ps", (void *)bt);
257 } 258 }
258 seq_printf(m, "\n"); 259 seq_puts(m, "\n");
259 } 260 }
260 } 261 }
261 return 0; 262 return 0;
diff --git a/kernel/locking/Makefile b/kernel/locking/Makefile
index b8bdcd4785b7..8541bfdfd232 100644
--- a/kernel/locking/Makefile
+++ b/kernel/locking/Makefile
@@ -24,4 +24,5 @@ obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock_debug.o
24obj-$(CONFIG_RWSEM_GENERIC_SPINLOCK) += rwsem-spinlock.o 24obj-$(CONFIG_RWSEM_GENERIC_SPINLOCK) += rwsem-spinlock.o
25obj-$(CONFIG_RWSEM_XCHGADD_ALGORITHM) += rwsem-xadd.o 25obj-$(CONFIG_RWSEM_XCHGADD_ALGORITHM) += rwsem-xadd.o
26obj-$(CONFIG_PERCPU_RWSEM) += percpu-rwsem.o 26obj-$(CONFIG_PERCPU_RWSEM) += percpu-rwsem.o
27obj-$(CONFIG_QUEUE_RWLOCK) += qrwlock.o
27obj-$(CONFIG_LOCK_TORTURE_TEST) += locktorture.o 28obj-$(CONFIG_LOCK_TORTURE_TEST) += locktorture.o
diff --git a/kernel/locking/lockdep_internals.h b/kernel/locking/lockdep_internals.h
index 4f560cfedc8f..51c4b24b6328 100644
--- a/kernel/locking/lockdep_internals.h
+++ b/kernel/locking/lockdep_internals.h
@@ -54,9 +54,9 @@ enum {
54 * table (if it's not there yet), and we check it for lock order 54 * table (if it's not there yet), and we check it for lock order
55 * conflicts and deadlocks. 55 * conflicts and deadlocks.
56 */ 56 */
57#define MAX_LOCKDEP_ENTRIES 16384UL 57#define MAX_LOCKDEP_ENTRIES 32768UL
58 58
59#define MAX_LOCKDEP_CHAINS_BITS 15 59#define MAX_LOCKDEP_CHAINS_BITS 16
60#define MAX_LOCKDEP_CHAINS (1UL << MAX_LOCKDEP_CHAINS_BITS) 60#define MAX_LOCKDEP_CHAINS (1UL << MAX_LOCKDEP_CHAINS_BITS)
61 61
62#define MAX_LOCKDEP_CHAIN_HLOCKS (MAX_LOCKDEP_CHAINS*5) 62#define MAX_LOCKDEP_CHAIN_HLOCKS (MAX_LOCKDEP_CHAINS*5)
@@ -65,7 +65,7 @@ enum {
65 * Stack-trace: tightly packed array of stack backtrace 65 * Stack-trace: tightly packed array of stack backtrace
66 * addresses. Protected by the hash_lock. 66 * addresses. Protected by the hash_lock.
67 */ 67 */
68#define MAX_STACK_TRACE_ENTRIES 262144UL 68#define MAX_STACK_TRACE_ENTRIES 524288UL
69 69
70extern struct list_head all_lock_classes; 70extern struct list_head all_lock_classes;
71extern struct lock_chain lock_chains[]; 71extern struct lock_chain lock_chains[];
diff --git a/kernel/locking/locktorture.c b/kernel/locking/locktorture.c
index f26b1a18e34e..0955b885d0dc 100644
--- a/kernel/locking/locktorture.c
+++ b/kernel/locking/locktorture.c
@@ -82,14 +82,14 @@ struct lock_writer_stress_stats {
82}; 82};
83static struct lock_writer_stress_stats *lwsa; 83static struct lock_writer_stress_stats *lwsa;
84 84
85#if defined(MODULE) || defined(CONFIG_LOCK_TORTURE_TEST_RUNNABLE) 85#if defined(MODULE)
86#define LOCKTORTURE_RUNNABLE_INIT 1 86#define LOCKTORTURE_RUNNABLE_INIT 1
87#else 87#else
88#define LOCKTORTURE_RUNNABLE_INIT 0 88#define LOCKTORTURE_RUNNABLE_INIT 0
89#endif 89#endif
90int locktorture_runnable = LOCKTORTURE_RUNNABLE_INIT; 90int locktorture_runnable = LOCKTORTURE_RUNNABLE_INIT;
91module_param(locktorture_runnable, int, 0444); 91module_param(locktorture_runnable, int, 0444);
92MODULE_PARM_DESC(locktorture_runnable, "Start locktorture at boot"); 92MODULE_PARM_DESC(locktorture_runnable, "Start locktorture at module init");
93 93
94/* Forward reference. */ 94/* Forward reference. */
95static void lock_torture_cleanup(void); 95static void lock_torture_cleanup(void);
@@ -216,10 +216,11 @@ static int lock_torture_writer(void *arg)
216 static DEFINE_TORTURE_RANDOM(rand); 216 static DEFINE_TORTURE_RANDOM(rand);
217 217
218 VERBOSE_TOROUT_STRING("lock_torture_writer task started"); 218 VERBOSE_TOROUT_STRING("lock_torture_writer task started");
219 set_user_nice(current, 19); 219 set_user_nice(current, MAX_NICE);
220 220
221 do { 221 do {
222 schedule_timeout_uninterruptible(1); 222 if ((torture_random(&rand) & 0xfffff) == 0)
223 schedule_timeout_uninterruptible(1);
223 cur_ops->writelock(); 224 cur_ops->writelock();
224 if (WARN_ON_ONCE(lock_is_write_held)) 225 if (WARN_ON_ONCE(lock_is_write_held))
225 lwsp->n_write_lock_fail++; 226 lwsp->n_write_lock_fail++;
@@ -354,7 +355,8 @@ static int __init lock_torture_init(void)
354 &lock_busted_ops, &spin_lock_ops, &spin_lock_irq_ops, 355 &lock_busted_ops, &spin_lock_ops, &spin_lock_irq_ops,
355 }; 356 };
356 357
357 torture_init_begin(torture_type, verbose, &locktorture_runnable); 358 if (!torture_init_begin(torture_type, verbose, &locktorture_runnable))
359 return -EBUSY;
358 360
359 /* Process args and tell the world that the torturer is on the job. */ 361 /* Process args and tell the world that the torturer is on the job. */
360 for (i = 0; i < ARRAY_SIZE(torture_ops); i++) { 362 for (i = 0; i < ARRAY_SIZE(torture_ops); i++) {
diff --git a/kernel/locking/mcs_spinlock.c b/kernel/locking/mcs_spinlock.c
index 838dc9e00669..be9ee1559fca 100644
--- a/kernel/locking/mcs_spinlock.c
+++ b/kernel/locking/mcs_spinlock.c
@@ -14,21 +14,47 @@
14 * called from interrupt context and we have preemption disabled while 14 * called from interrupt context and we have preemption disabled while
15 * spinning. 15 * spinning.
16 */ 16 */
17static DEFINE_PER_CPU_SHARED_ALIGNED(struct optimistic_spin_queue, osq_node); 17static DEFINE_PER_CPU_SHARED_ALIGNED(struct optimistic_spin_node, osq_node);
18
19/*
20 * We use the value 0 to represent "no CPU", thus the encoded value
21 * will be the CPU number incremented by 1.
22 */
23static inline int encode_cpu(int cpu_nr)
24{
25 return cpu_nr + 1;
26}
27
28static inline struct optimistic_spin_node *decode_cpu(int encoded_cpu_val)
29{
30 int cpu_nr = encoded_cpu_val - 1;
31
32 return per_cpu_ptr(&osq_node, cpu_nr);
33}
18 34
19/* 35/*
20 * Get a stable @node->next pointer, either for unlock() or unqueue() purposes. 36 * Get a stable @node->next pointer, either for unlock() or unqueue() purposes.
21 * Can return NULL in case we were the last queued and we updated @lock instead. 37 * Can return NULL in case we were the last queued and we updated @lock instead.
22 */ 38 */
23static inline struct optimistic_spin_queue * 39static inline struct optimistic_spin_node *
24osq_wait_next(struct optimistic_spin_queue **lock, 40osq_wait_next(struct optimistic_spin_queue *lock,
25 struct optimistic_spin_queue *node, 41 struct optimistic_spin_node *node,
26 struct optimistic_spin_queue *prev) 42 struct optimistic_spin_node *prev)
27{ 43{
28 struct optimistic_spin_queue *next = NULL; 44 struct optimistic_spin_node *next = NULL;
45 int curr = encode_cpu(smp_processor_id());
46 int old;
47
48 /*
49 * If there is a prev node in queue, then the 'old' value will be
50 * the prev node's CPU #, else it's set to OSQ_UNLOCKED_VAL since if
51 * we're currently last in queue, then the queue will then become empty.
52 */
53 old = prev ? prev->cpu : OSQ_UNLOCKED_VAL;
29 54
30 for (;;) { 55 for (;;) {
31 if (*lock == node && cmpxchg(lock, node, prev) == node) { 56 if (atomic_read(&lock->tail) == curr &&
57 atomic_cmpxchg(&lock->tail, curr, old) == curr) {
32 /* 58 /*
33 * We were the last queued, we moved @lock back. @prev 59 * We were the last queued, we moved @lock back. @prev
34 * will now observe @lock and will complete its 60 * will now observe @lock and will complete its
@@ -59,18 +85,23 @@ osq_wait_next(struct optimistic_spin_queue **lock,
59 return next; 85 return next;
60} 86}
61 87
62bool osq_lock(struct optimistic_spin_queue **lock) 88bool osq_lock(struct optimistic_spin_queue *lock)
63{ 89{
64 struct optimistic_spin_queue *node = this_cpu_ptr(&osq_node); 90 struct optimistic_spin_node *node = this_cpu_ptr(&osq_node);
65 struct optimistic_spin_queue *prev, *next; 91 struct optimistic_spin_node *prev, *next;
92 int curr = encode_cpu(smp_processor_id());
93 int old;
66 94
67 node->locked = 0; 95 node->locked = 0;
68 node->next = NULL; 96 node->next = NULL;
97 node->cpu = curr;
69 98
70 node->prev = prev = xchg(lock, node); 99 old = atomic_xchg(&lock->tail, curr);
71 if (likely(prev == NULL)) 100 if (old == OSQ_UNLOCKED_VAL)
72 return true; 101 return true;
73 102
103 prev = decode_cpu(old);
104 node->prev = prev;
74 ACCESS_ONCE(prev->next) = node; 105 ACCESS_ONCE(prev->next) = node;
75 106
76 /* 107 /*
@@ -149,20 +180,21 @@ unqueue:
149 return false; 180 return false;
150} 181}
151 182
152void osq_unlock(struct optimistic_spin_queue **lock) 183void osq_unlock(struct optimistic_spin_queue *lock)
153{ 184{
154 struct optimistic_spin_queue *node = this_cpu_ptr(&osq_node); 185 struct optimistic_spin_node *node, *next;
155 struct optimistic_spin_queue *next; 186 int curr = encode_cpu(smp_processor_id());
156 187
157 /* 188 /*
158 * Fast path for the uncontended case. 189 * Fast path for the uncontended case.
159 */ 190 */
160 if (likely(cmpxchg(lock, node, NULL) == node)) 191 if (likely(atomic_cmpxchg(&lock->tail, curr, OSQ_UNLOCKED_VAL) == curr))
161 return; 192 return;
162 193
163 /* 194 /*
164 * Second most likely case. 195 * Second most likely case.
165 */ 196 */
197 node = this_cpu_ptr(&osq_node);
166 next = xchg(&node->next, NULL); 198 next = xchg(&node->next, NULL);
167 if (next) { 199 if (next) {
168 ACCESS_ONCE(next->locked) = 1; 200 ACCESS_ONCE(next->locked) = 1;
diff --git a/kernel/locking/mcs_spinlock.h b/kernel/locking/mcs_spinlock.h
index a2dbac4aca6b..74356dc0ce29 100644
--- a/kernel/locking/mcs_spinlock.h
+++ b/kernel/locking/mcs_spinlock.h
@@ -118,12 +118,13 @@ void mcs_spin_unlock(struct mcs_spinlock **lock, struct mcs_spinlock *node)
118 * mutex_lock()/rwsem_down_{read,write}() etc. 118 * mutex_lock()/rwsem_down_{read,write}() etc.
119 */ 119 */
120 120
121struct optimistic_spin_queue { 121struct optimistic_spin_node {
122 struct optimistic_spin_queue *next, *prev; 122 struct optimistic_spin_node *next, *prev;
123 int locked; /* 1 if lock acquired */ 123 int locked; /* 1 if lock acquired */
124 int cpu; /* encoded CPU # value */
124}; 125};
125 126
126extern bool osq_lock(struct optimistic_spin_queue **lock); 127extern bool osq_lock(struct optimistic_spin_queue *lock);
127extern void osq_unlock(struct optimistic_spin_queue **lock); 128extern void osq_unlock(struct optimistic_spin_queue *lock);
128 129
129#endif /* __LINUX_MCS_SPINLOCK_H */ 130#endif /* __LINUX_MCS_SPINLOCK_H */
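The optimistic-spin queue rework above stops storing a node pointer in the lock word and instead keeps an encoded CPU number in an atomic tail: 0 means no CPU is queued, otherwise the value is cpu + 1, decoded back to the per-CPU osq_node on demand. A stand-alone sketch of just that encoding convention (plain C model, with OSQ_UNLOCKED_VAL assumed to be 0, consistent with the comment above that 0 represents "no CPU"):

#include <assert.h>

#define OSQ_UNLOCKED_VAL	0

static int encode_cpu(int cpu_nr)
{
	return cpu_nr + 1;	/* CPU 0 must not collide with "unlocked" */
}

static int decode_cpu(int encoded)
{
	return encoded - 1;
}

int main(void)
{
	int tail = OSQ_UNLOCKED_VAL;		/* empty queue             */

	tail = encode_cpu(0);			/* CPU 0 takes the tail    */
	assert(tail != OSQ_UNLOCKED_VAL);	/* distinguishable from empty */
	assert(decode_cpu(tail) == 0);		/* round-trips back to CPU 0  */
	return 0;
}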
diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c
index bc73d33c6760..acca2c1a3c5e 100644
--- a/kernel/locking/mutex.c
+++ b/kernel/locking/mutex.c
@@ -60,7 +60,7 @@ __mutex_init(struct mutex *lock, const char *name, struct lock_class_key *key)
60 INIT_LIST_HEAD(&lock->wait_list); 60 INIT_LIST_HEAD(&lock->wait_list);
61 mutex_clear_owner(lock); 61 mutex_clear_owner(lock);
62#ifdef CONFIG_MUTEX_SPIN_ON_OWNER 62#ifdef CONFIG_MUTEX_SPIN_ON_OWNER
63 lock->osq = NULL; 63 osq_lock_init(&lock->osq);
64#endif 64#endif
65 65
66 debug_mutex_init(lock, name, key); 66 debug_mutex_init(lock, name, key);
diff --git a/kernel/locking/qrwlock.c b/kernel/locking/qrwlock.c
new file mode 100644
index 000000000000..fb5b8ac411a5
--- /dev/null
+++ b/kernel/locking/qrwlock.c
@@ -0,0 +1,133 @@
1/*
2 * Queue read/write lock
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by
6 * the Free Software Foundation; either version 2 of the License, or
7 * (at your option) any later version.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * (C) Copyright 2013-2014 Hewlett-Packard Development Company, L.P.
15 *
16 * Authors: Waiman Long <waiman.long@hp.com>
17 */
18#include <linux/smp.h>
19#include <linux/bug.h>
20#include <linux/cpumask.h>
21#include <linux/percpu.h>
22#include <linux/hardirq.h>
23#include <linux/mutex.h>
24#include <asm/qrwlock.h>
25
26/**
27 * rspin_until_writer_unlock - inc reader count & spin until writer is gone
28 * @lock : Pointer to queue rwlock structure
29 * @writer: Current queue rwlock writer status byte
30 *
31 * In interrupt context or at the head of the queue, the reader will just
32 * increment the reader count & wait until the writer releases the lock.
33 */
34static __always_inline void
35rspin_until_writer_unlock(struct qrwlock *lock, u32 cnts)
36{
37 while ((cnts & _QW_WMASK) == _QW_LOCKED) {
38 arch_mutex_cpu_relax();
39 cnts = smp_load_acquire((u32 *)&lock->cnts);
40 }
41}
42
43/**
44 * queue_read_lock_slowpath - acquire read lock of a queue rwlock
45 * @lock: Pointer to queue rwlock structure
46 */
47void queue_read_lock_slowpath(struct qrwlock *lock)
48{
49 u32 cnts;
50
51 /*
52 * Readers come here when they cannot get the lock without waiting
53 */
54 if (unlikely(in_interrupt())) {
55 /*
56 * Readers in interrupt context will spin until the lock is
57 * available without waiting in the queue.
58 */
59 cnts = smp_load_acquire((u32 *)&lock->cnts);
60 rspin_until_writer_unlock(lock, cnts);
61 return;
62 }
63 atomic_sub(_QR_BIAS, &lock->cnts);
64
65 /*
66 * Put the reader into the wait queue
67 */
68 arch_spin_lock(&lock->lock);
69
70 /*
71 * At the head of the wait queue now, wait until the writer state
72 * goes to 0 and then try to increment the reader count and get
73 * the lock. It is possible that an incoming writer may steal the
74 * lock in the interim, so it is necessary to check the writer byte
75 * to make sure that the write lock isn't taken.
76 */
77 while (atomic_read(&lock->cnts) & _QW_WMASK)
78 arch_mutex_cpu_relax();
79
80 cnts = atomic_add_return(_QR_BIAS, &lock->cnts) - _QR_BIAS;
81 rspin_until_writer_unlock(lock, cnts);
82
83 /*
84 * Signal the next one in queue to become queue head
85 */
86 arch_spin_unlock(&lock->lock);
87}
88EXPORT_SYMBOL(queue_read_lock_slowpath);
89
90/**
91 * queue_write_lock_slowpath - acquire write lock of a queue rwlock
92 * @lock : Pointer to queue rwlock structure
93 */
94void queue_write_lock_slowpath(struct qrwlock *lock)
95{
96 u32 cnts;
97
98 /* Put the writer into the wait queue */
99 arch_spin_lock(&lock->lock);
100
101 /* Try to acquire the lock directly if no reader is present */
102 if (!atomic_read(&lock->cnts) &&
103 (atomic_cmpxchg(&lock->cnts, 0, _QW_LOCKED) == 0))
104 goto unlock;
105
106 /*
107 * Set the waiting flag to notify readers that a writer is pending,
108 * or wait for a previous writer to go away.
109 */
110 for (;;) {
111 cnts = atomic_read(&lock->cnts);
112 if (!(cnts & _QW_WMASK) &&
113 (atomic_cmpxchg(&lock->cnts, cnts,
114 cnts | _QW_WAITING) == cnts))
115 break;
116
117 arch_mutex_cpu_relax();
118 }
119
120 /* When no more readers, set the locked flag */
121 for (;;) {
122 cnts = atomic_read(&lock->cnts);
123 if ((cnts == _QW_WAITING) &&
124 (atomic_cmpxchg(&lock->cnts, _QW_WAITING,
125 _QW_LOCKED) == _QW_WAITING))
126 break;
127
128 arch_mutex_cpu_relax();
129 }
130unlock:
131 arch_spin_unlock(&lock->lock);
132}
133EXPORT_SYMBOL(queue_write_lock_slowpath);
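The new queue rwlock packs its state into one word: the low byte holds the writer state (_QW_WAITING while a writer is queued, _QW_LOCKED once it owns the lock) and the upper bits count readers in units of _QR_BIAS, while lock->lock serializes waiters into a queue. The constants live in the companion asm-generic header rather than in this file; the sketch below uses the customary values, so treat them as assumptions for illustration only:

#include <stdio.h>

#define _QW_WAITING	0x01u		/* writer queued, not yet owner */
#define _QW_LOCKED	0xffu		/* writer owns the lock         */
#define _QW_WMASK	0xffu		/* writer byte mask             */
#define _QR_SHIFT	8
#define _QR_BIAS	(1u << _QR_SHIFT)

int main(void)
{
	unsigned int cnts = 0;			/* unlocked                   */

	cnts += 2 * _QR_BIAS;			/* two readers hold the lock  */
	printf("readers=%u writer=0x%02x\n",
	       cnts >> _QR_SHIFT, cnts & _QW_WMASK);

	cnts |= _QW_WAITING;			/* a writer announces itself  */
	/* The reader-side test used by rspin_until_writer_unlock(): */
	printf("writer fully locked? %s\n",
	       (cnts & _QW_WMASK) == _QW_LOCKED ? "yes" : "no");
	return 0;
}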
diff --git a/kernel/locking/rtmutex-debug.h b/kernel/locking/rtmutex-debug.h
index 14193d596d78..ab29b6a22669 100644
--- a/kernel/locking/rtmutex-debug.h
+++ b/kernel/locking/rtmutex-debug.h
@@ -31,3 +31,8 @@ static inline int debug_rt_mutex_detect_deadlock(struct rt_mutex_waiter *waiter,
31{ 31{
32 return (waiter != NULL); 32 return (waiter != NULL);
33} 33}
34
35static inline void rt_mutex_print_deadlock(struct rt_mutex_waiter *w)
36{
37 debug_rt_mutex_print_deadlock(w);
38}
diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c
index aa4dff04b594..fc605941b9b8 100644
--- a/kernel/locking/rtmutex.c
+++ b/kernel/locking/rtmutex.c
@@ -83,6 +83,47 @@ static inline void mark_rt_mutex_waiters(struct rt_mutex *lock)
83 owner = *p; 83 owner = *p;
84 } while (cmpxchg(p, owner, owner | RT_MUTEX_HAS_WAITERS) != owner); 84 } while (cmpxchg(p, owner, owner | RT_MUTEX_HAS_WAITERS) != owner);
85} 85}
86
87/*
88 * Safe fastpath aware unlock:
89 * 1) Clear the waiters bit
90 * 2) Drop lock->wait_lock
91 * 3) Try to unlock the lock with cmpxchg
92 */
93static inline bool unlock_rt_mutex_safe(struct rt_mutex *lock)
94 __releases(lock->wait_lock)
95{
96 struct task_struct *owner = rt_mutex_owner(lock);
97
98 clear_rt_mutex_waiters(lock);
99 raw_spin_unlock(&lock->wait_lock);
100 /*
101 * If a new waiter comes in between the unlock and the cmpxchg
102 * we have two situations:
103 *
104 * unlock(wait_lock);
105 * lock(wait_lock);
106 * cmpxchg(p, owner, 0) == owner
107 * mark_rt_mutex_waiters(lock);
108 * acquire(lock);
109 * or:
110 *
111 * unlock(wait_lock);
112 * lock(wait_lock);
113 * mark_rt_mutex_waiters(lock);
114 *
115 * cmpxchg(p, owner, 0) != owner
116 * enqueue_waiter();
117 * unlock(wait_lock);
118 * lock(wait_lock);
119 * wake waiter();
120 * unlock(wait_lock);
121 * lock(wait_lock);
122 * acquire(lock);
123 */
124 return rt_mutex_cmpxchg(lock, owner, NULL);
125}
126
86#else 127#else
87# define rt_mutex_cmpxchg(l,c,n) (0) 128# define rt_mutex_cmpxchg(l,c,n) (0)
88static inline void mark_rt_mutex_waiters(struct rt_mutex *lock) 129static inline void mark_rt_mutex_waiters(struct rt_mutex *lock)
@@ -90,6 +131,17 @@ static inline void mark_rt_mutex_waiters(struct rt_mutex *lock)
90 lock->owner = (struct task_struct *) 131 lock->owner = (struct task_struct *)
91 ((unsigned long)lock->owner | RT_MUTEX_HAS_WAITERS); 132 ((unsigned long)lock->owner | RT_MUTEX_HAS_WAITERS);
92} 133}
134
135/*
136 * Simple slow path only version: lock->owner is protected by lock->wait_lock.
137 */
138static inline bool unlock_rt_mutex_safe(struct rt_mutex *lock)
139 __releases(lock->wait_lock)
140{
141 lock->owner = NULL;
142 raw_spin_unlock(&lock->wait_lock);
143 return true;
144}
93#endif 145#endif
94 146
95static inline int 147static inline int
@@ -260,27 +312,36 @@ static void rt_mutex_adjust_prio(struct task_struct *task)
260 */ 312 */
261int max_lock_depth = 1024; 313int max_lock_depth = 1024;
262 314
315static inline struct rt_mutex *task_blocked_on_lock(struct task_struct *p)
316{
317 return p->pi_blocked_on ? p->pi_blocked_on->lock : NULL;
318}
319
263/* 320/*
264 * Adjust the priority chain. Also used for deadlock detection. 321 * Adjust the priority chain. Also used for deadlock detection.
265 * Decreases task's usage by one - may thus free the task. 322 * Decreases task's usage by one - may thus free the task.
266 * 323 *
267 * @task: the task owning the mutex (owner) for which a chain walk is probably 324 * @task: the task owning the mutex (owner) for which a chain walk is
268 * needed 325 * probably needed
269 * @deadlock_detect: do we have to carry out deadlock detection? 326 * @deadlock_detect: do we have to carry out deadlock detection?
270 * @orig_lock: the mutex (can be NULL if we are walking the chain to recheck 327 * @orig_lock: the mutex (can be NULL if we are walking the chain to recheck
271 * things for a task that has just got its priority adjusted, and 328 * things for a task that has just got its priority adjusted, and
272 * is waiting on a mutex) 329 * is waiting on a mutex)
330 * @next_lock: the mutex on which the owner of @orig_lock was blocked before
331 * we dropped its pi_lock. Is never dereferenced, only used for
332 * comparison to detect lock chain changes.
273 * @orig_waiter: rt_mutex_waiter struct for the task that has just donated 333 * @orig_waiter: rt_mutex_waiter struct for the task that has just donated
274 * its priority to the mutex owner (can be NULL in the case 334 * its priority to the mutex owner (can be NULL in the case
275 * depicted above or if the top waiter is gone away and we are 335 * depicted above or if the top waiter is gone away and we are
276 * actually deboosting the owner) 336 * actually deboosting the owner)
277 * @top_task: the current top waiter 337 * @top_task: the current top waiter
278 * 338 *
279 * Returns 0 or -EDEADLK. 339 * Returns 0 or -EDEADLK.
280 */ 340 */
281static int rt_mutex_adjust_prio_chain(struct task_struct *task, 341static int rt_mutex_adjust_prio_chain(struct task_struct *task,
282 int deadlock_detect, 342 int deadlock_detect,
283 struct rt_mutex *orig_lock, 343 struct rt_mutex *orig_lock,
344 struct rt_mutex *next_lock,
284 struct rt_mutex_waiter *orig_waiter, 345 struct rt_mutex_waiter *orig_waiter,
285 struct task_struct *top_task) 346 struct task_struct *top_task)
286{ 347{
@@ -314,7 +375,7 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
314 } 375 }
315 put_task_struct(task); 376 put_task_struct(task);
316 377
317 return deadlock_detect ? -EDEADLK : 0; 378 return -EDEADLK;
318 } 379 }
319 retry: 380 retry:
320 /* 381 /*
@@ -339,13 +400,32 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
339 goto out_unlock_pi; 400 goto out_unlock_pi;
340 401
341 /* 402 /*
403 * We dropped all locks after taking a refcount on @task, so
404 * the task might have moved on in the lock chain or even left
405 * the chain completely and blocks now on an unrelated lock or
406 * on @orig_lock.
407 *
408 * We stored the lock on which @task was blocked in @next_lock,
409 * so we can detect the chain change.
410 */
411 if (next_lock != waiter->lock)
412 goto out_unlock_pi;
413
414 /*
342 * Drop out, when the task has no waiters. Note, 415 * Drop out, when the task has no waiters. Note,
343 * top_waiter can be NULL, when we are in the deboosting 416 * top_waiter can be NULL, when we are in the deboosting
344 * mode! 417 * mode!
345 */ 418 */
346 if (top_waiter && (!task_has_pi_waiters(task) || 419 if (top_waiter) {
347 top_waiter != task_top_pi_waiter(task))) 420 if (!task_has_pi_waiters(task))
348 goto out_unlock_pi; 421 goto out_unlock_pi;
422 /*
423 * If deadlock detection is off, we stop here if we
424 * are not the top pi waiter of the task.
425 */
426 if (!detect_deadlock && top_waiter != task_top_pi_waiter(task))
427 goto out_unlock_pi;
428 }
349 429
350 /* 430 /*
351 * When deadlock detection is off then we check, if further 431 * When deadlock detection is off then we check, if further
@@ -361,11 +441,16 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
361 goto retry; 441 goto retry;
362 } 442 }
363 443
364 /* Deadlock detection */ 444 /*
445 * Deadlock detection. If the lock is the same as the original
446 * lock which caused us to walk the lock chain or if the
447 * current lock is owned by the task which initiated the chain
448 * walk, we detected a deadlock.
449 */
365 if (lock == orig_lock || rt_mutex_owner(lock) == top_task) { 450 if (lock == orig_lock || rt_mutex_owner(lock) == top_task) {
366 debug_rt_mutex_deadlock(deadlock_detect, orig_waiter, lock); 451 debug_rt_mutex_deadlock(deadlock_detect, orig_waiter, lock);
367 raw_spin_unlock(&lock->wait_lock); 452 raw_spin_unlock(&lock->wait_lock);
368 ret = deadlock_detect ? -EDEADLK : 0; 453 ret = -EDEADLK;
369 goto out_unlock_pi; 454 goto out_unlock_pi;
370 } 455 }
371 456
@@ -410,11 +495,26 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
410 __rt_mutex_adjust_prio(task); 495 __rt_mutex_adjust_prio(task);
411 } 496 }
412 497
498 /*
499 * Check whether the task which owns the current lock is pi
500 * blocked itself. If yes we store a pointer to the lock for
501 * the lock chain change detection above. After we dropped
502 * task->pi_lock next_lock cannot be dereferenced anymore.
503 */
504 next_lock = task_blocked_on_lock(task);
505
413 raw_spin_unlock_irqrestore(&task->pi_lock, flags); 506 raw_spin_unlock_irqrestore(&task->pi_lock, flags);
414 507
415 top_waiter = rt_mutex_top_waiter(lock); 508 top_waiter = rt_mutex_top_waiter(lock);
416 raw_spin_unlock(&lock->wait_lock); 509 raw_spin_unlock(&lock->wait_lock);
417 510
511 /*
512 * We reached the end of the lock chain. Stop right here. No
513 * point to go back just to figure that out.
514 */
515 if (!next_lock)
516 goto out_put_task;
517
418 if (!detect_deadlock && waiter != top_waiter) 518 if (!detect_deadlock && waiter != top_waiter)
419 goto out_put_task; 519 goto out_put_task;
420 520
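The chain-walk rework threads a @next_lock snapshot through rt_mutex_adjust_prio_chain(): it records which lock the owner was blocked on before the pi_lock was dropped and is later only compared, never dereferenced, so a stale pointer is harmless and a mismatch simply aborts the walk. A simplified sketch of that comparison-token pattern (the types here are hypothetical stand-ins, not the kernel's):

struct lock;
struct waiter { struct lock *lock; };
struct task { struct waiter *pi_blocked_on; };

static struct lock *task_blocked_on(struct task *p)
{
	return p->pi_blocked_on ? p->pi_blocked_on->lock : NULL;
}

static int chain_still_intact(struct task *p, struct lock *next_lock)
{
	/* Never dereference next_lock: it may already have been freed.
	 * A pointer comparison is enough to detect a chain change. */
	return task_blocked_on(p) == next_lock;
}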
@@ -524,8 +624,21 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock,
524{ 624{
525 struct task_struct *owner = rt_mutex_owner(lock); 625 struct task_struct *owner = rt_mutex_owner(lock);
526 struct rt_mutex_waiter *top_waiter = waiter; 626 struct rt_mutex_waiter *top_waiter = waiter;
527 unsigned long flags; 627 struct rt_mutex *next_lock;
528 int chain_walk = 0, res; 628 int chain_walk = 0, res;
629 unsigned long flags;
630
631 /*
632 * Early deadlock detection. We really don't want the task to
633 * enqueue on itself just to untangle the mess later. It's not
634 * only an optimization. We drop the locks, so another waiter
635 * can come in before the chain walk detects the deadlock. So
636 * the other will detect the deadlock and return -EDEADLOCK,
637 * which is wrong, as the other waiter is not in a deadlock
638 * situation.
639 */
640 if (owner == task)
641 return -EDEADLK;
529 642
530 raw_spin_lock_irqsave(&task->pi_lock, flags); 643 raw_spin_lock_irqsave(&task->pi_lock, flags);
531 __rt_mutex_adjust_prio(task); 644 __rt_mutex_adjust_prio(task);
@@ -545,20 +658,28 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock,
545 if (!owner) 658 if (!owner)
546 return 0; 659 return 0;
547 660
661 raw_spin_lock_irqsave(&owner->pi_lock, flags);
548 if (waiter == rt_mutex_top_waiter(lock)) { 662 if (waiter == rt_mutex_top_waiter(lock)) {
549 raw_spin_lock_irqsave(&owner->pi_lock, flags);
550 rt_mutex_dequeue_pi(owner, top_waiter); 663 rt_mutex_dequeue_pi(owner, top_waiter);
551 rt_mutex_enqueue_pi(owner, waiter); 664 rt_mutex_enqueue_pi(owner, waiter);
552 665
553 __rt_mutex_adjust_prio(owner); 666 __rt_mutex_adjust_prio(owner);
554 if (owner->pi_blocked_on) 667 if (owner->pi_blocked_on)
555 chain_walk = 1; 668 chain_walk = 1;
556 raw_spin_unlock_irqrestore(&owner->pi_lock, flags); 669 } else if (debug_rt_mutex_detect_deadlock(waiter, detect_deadlock)) {
557 }
558 else if (debug_rt_mutex_detect_deadlock(waiter, detect_deadlock))
559 chain_walk = 1; 670 chain_walk = 1;
671 }
672
673 /* Store the lock on which owner is blocked or NULL */
674 next_lock = task_blocked_on_lock(owner);
560 675
561 if (!chain_walk) 676 raw_spin_unlock_irqrestore(&owner->pi_lock, flags);
677 /*
678 * Even if full deadlock detection is on, if the owner is not
679 * blocked itself, we can avoid finding this out in the chain
680 * walk.
681 */
682 if (!chain_walk || !next_lock)
562 return 0; 683 return 0;
563 684
564 /* 685 /*
@@ -570,8 +691,8 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock,
570 691
571 raw_spin_unlock(&lock->wait_lock); 692 raw_spin_unlock(&lock->wait_lock);
572 693
573 res = rt_mutex_adjust_prio_chain(owner, detect_deadlock, lock, waiter, 694 res = rt_mutex_adjust_prio_chain(owner, detect_deadlock, lock,
574 task); 695 next_lock, waiter, task);
575 696
576 raw_spin_lock(&lock->wait_lock); 697 raw_spin_lock(&lock->wait_lock);
577 698
@@ -581,7 +702,8 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock,
581/* 702/*
582 * Wake up the next waiter on the lock. 703 * Wake up the next waiter on the lock.
583 * 704 *
584 * Remove the top waiter from the current tasks waiter list and wake it up. 705 * Remove the top waiter from the current tasks pi waiter list and
706 * wake it up.
585 * 707 *
586 * Called with lock->wait_lock held. 708 * Called with lock->wait_lock held.
587 */ 709 */
@@ -602,10 +724,23 @@ static void wakeup_next_waiter(struct rt_mutex *lock)
602 */ 724 */
603 rt_mutex_dequeue_pi(current, waiter); 725 rt_mutex_dequeue_pi(current, waiter);
604 726
605 rt_mutex_set_owner(lock, NULL); 727 /*
728 * As we are waking up the top waiter, and the waiter stays
729 * queued on the lock until it gets the lock, this lock
730 * obviously has waiters. Just set the bit here and this has
731 * the added benefit of forcing all new tasks into the
732 * slow path making sure no task of lower priority than
733 * the top waiter can steal this lock.
734 */
735 lock->owner = (void *) RT_MUTEX_HAS_WAITERS;
606 736
607 raw_spin_unlock_irqrestore(&current->pi_lock, flags); 737 raw_spin_unlock_irqrestore(&current->pi_lock, flags);
608 738
739 /*
740 * It's safe to dereference waiter as it cannot go away as
741 * long as we hold lock->wait_lock. The waiter task needs to
742 * acquire it in order to dequeue the waiter.
743 */
609 wake_up_process(waiter->task); 744 wake_up_process(waiter->task);
610} 745}
611 746
@@ -620,8 +755,8 @@ static void remove_waiter(struct rt_mutex *lock,
620{ 755{
621 int first = (waiter == rt_mutex_top_waiter(lock)); 756 int first = (waiter == rt_mutex_top_waiter(lock));
622 struct task_struct *owner = rt_mutex_owner(lock); 757 struct task_struct *owner = rt_mutex_owner(lock);
758 struct rt_mutex *next_lock = NULL;
623 unsigned long flags; 759 unsigned long flags;
624 int chain_walk = 0;
625 760
626 raw_spin_lock_irqsave(&current->pi_lock, flags); 761 raw_spin_lock_irqsave(&current->pi_lock, flags);
627 rt_mutex_dequeue(lock, waiter); 762 rt_mutex_dequeue(lock, waiter);
@@ -645,13 +780,13 @@ static void remove_waiter(struct rt_mutex *lock,
645 } 780 }
646 __rt_mutex_adjust_prio(owner); 781 __rt_mutex_adjust_prio(owner);
647 782
648 if (owner->pi_blocked_on) 783 /* Store the lock on which owner is blocked or NULL */
649 chain_walk = 1; 784 next_lock = task_blocked_on_lock(owner);
650 785
651 raw_spin_unlock_irqrestore(&owner->pi_lock, flags); 786 raw_spin_unlock_irqrestore(&owner->pi_lock, flags);
652 } 787 }
653 788
654 if (!chain_walk) 789 if (!next_lock)
655 return; 790 return;
656 791
657 /* gets dropped in rt_mutex_adjust_prio_chain()! */ 792 /* gets dropped in rt_mutex_adjust_prio_chain()! */
@@ -659,7 +794,7 @@ static void remove_waiter(struct rt_mutex *lock,
659 794
660 raw_spin_unlock(&lock->wait_lock); 795 raw_spin_unlock(&lock->wait_lock);
661 796
662 rt_mutex_adjust_prio_chain(owner, 0, lock, NULL, current); 797 rt_mutex_adjust_prio_chain(owner, 0, lock, next_lock, NULL, current);
663 798
664 raw_spin_lock(&lock->wait_lock); 799 raw_spin_lock(&lock->wait_lock);
665} 800}
@@ -672,6 +807,7 @@ static void remove_waiter(struct rt_mutex *lock,
672void rt_mutex_adjust_pi(struct task_struct *task) 807void rt_mutex_adjust_pi(struct task_struct *task)
673{ 808{
674 struct rt_mutex_waiter *waiter; 809 struct rt_mutex_waiter *waiter;
810 struct rt_mutex *next_lock;
675 unsigned long flags; 811 unsigned long flags;
676 812
677 raw_spin_lock_irqsave(&task->pi_lock, flags); 813 raw_spin_lock_irqsave(&task->pi_lock, flags);
@@ -682,12 +818,13 @@ void rt_mutex_adjust_pi(struct task_struct *task)
682 raw_spin_unlock_irqrestore(&task->pi_lock, flags); 818 raw_spin_unlock_irqrestore(&task->pi_lock, flags);
683 return; 819 return;
684 } 820 }
685 821 next_lock = waiter->lock;
686 raw_spin_unlock_irqrestore(&task->pi_lock, flags); 822 raw_spin_unlock_irqrestore(&task->pi_lock, flags);
687 823
688 /* gets dropped in rt_mutex_adjust_prio_chain()! */ 824 /* gets dropped in rt_mutex_adjust_prio_chain()! */
689 get_task_struct(task); 825 get_task_struct(task);
690 rt_mutex_adjust_prio_chain(task, 0, NULL, NULL, task); 826
827 rt_mutex_adjust_prio_chain(task, 0, NULL, next_lock, NULL, task);
691} 828}
692 829
693/** 830/**
@@ -739,6 +876,26 @@ __rt_mutex_slowlock(struct rt_mutex *lock, int state,
739 return ret; 876 return ret;
740} 877}
741 878
879static void rt_mutex_handle_deadlock(int res, int detect_deadlock,
880 struct rt_mutex_waiter *w)
881{
882 /*
883 * If the result is not -EDEADLOCK or the caller requested
884 * deadlock detection, nothing to do here.
885 */
886 if (res != -EDEADLOCK || detect_deadlock)
887 return;
888
889 /*
 890 * Yell loudly and stop the task right here.
891 */
892 rt_mutex_print_deadlock(w);
893 while (1) {
894 set_current_state(TASK_INTERRUPTIBLE);
895 schedule();
896 }
897}
898
742/* 899/*
743 * Slow path lock function: 900 * Slow path lock function:
744 */ 901 */
@@ -778,8 +935,10 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state,
778 935
779 set_current_state(TASK_RUNNING); 936 set_current_state(TASK_RUNNING);
780 937
781 if (unlikely(ret)) 938 if (unlikely(ret)) {
782 remove_waiter(lock, &waiter); 939 remove_waiter(lock, &waiter);
940 rt_mutex_handle_deadlock(ret, detect_deadlock, &waiter);
941 }
783 942
784 /* 943 /*
785 * try_to_take_rt_mutex() sets the waiter bit 944 * try_to_take_rt_mutex() sets the waiter bit
@@ -835,12 +994,49 @@ rt_mutex_slowunlock(struct rt_mutex *lock)
835 994
836 rt_mutex_deadlock_account_unlock(current); 995 rt_mutex_deadlock_account_unlock(current);
837 996
838 if (!rt_mutex_has_waiters(lock)) { 997 /*
839 lock->owner = NULL; 998 * We must be careful here if the fast path is enabled. If we
840 raw_spin_unlock(&lock->wait_lock); 999 * have no waiters queued we cannot set owner to NULL here
841 return; 1000 * because of:
1001 *
1002 * foo->lock->owner = NULL;
1003 * rtmutex_lock(foo->lock); <- fast path
1004 * free = atomic_dec_and_test(foo->refcnt);
1005 * rtmutex_unlock(foo->lock); <- fast path
1006 * if (free)
1007 * kfree(foo);
1008 * raw_spin_unlock(foo->lock->wait_lock);
1009 *
1010 * So for the fastpath enabled kernel:
1011 *
1012 * Nothing can set the waiters bit as long as we hold
1013 * lock->wait_lock. So we do the following sequence:
1014 *
1015 * owner = rt_mutex_owner(lock);
1016 * clear_rt_mutex_waiters(lock);
1017 * raw_spin_unlock(&lock->wait_lock);
1018 * if (cmpxchg(&lock->owner, owner, 0) == owner)
1019 * return;
1020 * goto retry;
1021 *
1022 * The fastpath disabled variant is simple as all access to
1023 * lock->owner is serialized by lock->wait_lock:
1024 *
1025 * lock->owner = NULL;
1026 * raw_spin_unlock(&lock->wait_lock);
1027 */
1028 while (!rt_mutex_has_waiters(lock)) {
1029 /* Drops lock->wait_lock ! */
1030 if (unlock_rt_mutex_safe(lock) == true)
1031 return;
1032 /* Relock the rtmutex and try again */
1033 raw_spin_lock(&lock->wait_lock);
842 } 1034 }
843 1035
1036 /*
1037 * The wakeup next waiter path does not suffer from the above
1038 * race. See the comments there.
1039 */
844 wakeup_next_waiter(lock); 1040 wakeup_next_waiter(lock);
845 1041
846 raw_spin_unlock(&lock->wait_lock); 1042 raw_spin_unlock(&lock->wait_lock);
@@ -1088,7 +1284,8 @@ int rt_mutex_start_proxy_lock(struct rt_mutex *lock,
1088 return 1; 1284 return 1;
1089 } 1285 }
1090 1286
1091 ret = task_blocks_on_rt_mutex(lock, waiter, task, detect_deadlock); 1287 /* We enforce deadlock detection for futexes */
1288 ret = task_blocks_on_rt_mutex(lock, waiter, task, 1);
1092 1289
1093 if (ret && !rt_mutex_owner(lock)) { 1290 if (ret && !rt_mutex_owner(lock)) {
1094 /* 1291 /*
diff --git a/kernel/locking/rtmutex.h b/kernel/locking/rtmutex.h
index a1a1dd06421d..f6a1f3c133b1 100644
--- a/kernel/locking/rtmutex.h
+++ b/kernel/locking/rtmutex.h
@@ -24,3 +24,8 @@
24#define debug_rt_mutex_print_deadlock(w) do { } while (0) 24#define debug_rt_mutex_print_deadlock(w) do { } while (0)
25#define debug_rt_mutex_detect_deadlock(w,d) (d) 25#define debug_rt_mutex_detect_deadlock(w,d) (d)
26#define debug_rt_mutex_reset_waiter(w) do { } while (0) 26#define debug_rt_mutex_reset_waiter(w) do { } while (0)
27
28static inline void rt_mutex_print_deadlock(struct rt_mutex_waiter *w)
29{
30 WARN(1, "rtmutex deadlock detected\n");
31}
diff --git a/kernel/locking/rwsem-spinlock.c b/kernel/locking/rwsem-spinlock.c
index 9be8a9144978..2c93571162cb 100644
--- a/kernel/locking/rwsem-spinlock.c
+++ b/kernel/locking/rwsem-spinlock.c
@@ -26,7 +26,7 @@ int rwsem_is_locked(struct rw_semaphore *sem)
26 unsigned long flags; 26 unsigned long flags;
27 27
28 if (raw_spin_trylock_irqsave(&sem->wait_lock, flags)) { 28 if (raw_spin_trylock_irqsave(&sem->wait_lock, flags)) {
29 ret = (sem->activity != 0); 29 ret = (sem->count != 0);
30 raw_spin_unlock_irqrestore(&sem->wait_lock, flags); 30 raw_spin_unlock_irqrestore(&sem->wait_lock, flags);
31 } 31 }
32 return ret; 32 return ret;
@@ -46,7 +46,7 @@ void __init_rwsem(struct rw_semaphore *sem, const char *name,
46 debug_check_no_locks_freed((void *)sem, sizeof(*sem)); 46 debug_check_no_locks_freed((void *)sem, sizeof(*sem));
47 lockdep_init_map(&sem->dep_map, name, key, 0); 47 lockdep_init_map(&sem->dep_map, name, key, 0);
48#endif 48#endif
49 sem->activity = 0; 49 sem->count = 0;
50 raw_spin_lock_init(&sem->wait_lock); 50 raw_spin_lock_init(&sem->wait_lock);
51 INIT_LIST_HEAD(&sem->wait_list); 51 INIT_LIST_HEAD(&sem->wait_list);
52} 52}
@@ -95,7 +95,7 @@ __rwsem_do_wake(struct rw_semaphore *sem, int wakewrite)
95 waiter = list_entry(next, struct rwsem_waiter, list); 95 waiter = list_entry(next, struct rwsem_waiter, list);
96 } while (waiter->type != RWSEM_WAITING_FOR_WRITE); 96 } while (waiter->type != RWSEM_WAITING_FOR_WRITE);
97 97
98 sem->activity += woken; 98 sem->count += woken;
99 99
100 out: 100 out:
101 return sem; 101 return sem;
@@ -126,9 +126,9 @@ void __sched __down_read(struct rw_semaphore *sem)
126 126
127 raw_spin_lock_irqsave(&sem->wait_lock, flags); 127 raw_spin_lock_irqsave(&sem->wait_lock, flags);
128 128
129 if (sem->activity >= 0 && list_empty(&sem->wait_list)) { 129 if (sem->count >= 0 && list_empty(&sem->wait_list)) {
130 /* granted */ 130 /* granted */
131 sem->activity++; 131 sem->count++;
132 raw_spin_unlock_irqrestore(&sem->wait_lock, flags); 132 raw_spin_unlock_irqrestore(&sem->wait_lock, flags);
133 goto out; 133 goto out;
134 } 134 }
@@ -170,9 +170,9 @@ int __down_read_trylock(struct rw_semaphore *sem)
170 170
171 raw_spin_lock_irqsave(&sem->wait_lock, flags); 171 raw_spin_lock_irqsave(&sem->wait_lock, flags);
172 172
173 if (sem->activity >= 0 && list_empty(&sem->wait_list)) { 173 if (sem->count >= 0 && list_empty(&sem->wait_list)) {
174 /* granted */ 174 /* granted */
175 sem->activity++; 175 sem->count++;
176 ret = 1; 176 ret = 1;
177 } 177 }
178 178
@@ -206,7 +206,7 @@ void __sched __down_write_nested(struct rw_semaphore *sem, int subclass)
206 * itself into sleep and waiting for system woke it or someone 206 * itself into sleep and waiting for system woke it or someone
207 * else in the head of the wait list up. 207 * else in the head of the wait list up.
208 */ 208 */
209 if (sem->activity == 0) 209 if (sem->count == 0)
210 break; 210 break;
211 set_task_state(tsk, TASK_UNINTERRUPTIBLE); 211 set_task_state(tsk, TASK_UNINTERRUPTIBLE);
212 raw_spin_unlock_irqrestore(&sem->wait_lock, flags); 212 raw_spin_unlock_irqrestore(&sem->wait_lock, flags);
@@ -214,7 +214,7 @@ void __sched __down_write_nested(struct rw_semaphore *sem, int subclass)
214 raw_spin_lock_irqsave(&sem->wait_lock, flags); 214 raw_spin_lock_irqsave(&sem->wait_lock, flags);
215 } 215 }
216 /* got the lock */ 216 /* got the lock */
217 sem->activity = -1; 217 sem->count = -1;
218 list_del(&waiter.list); 218 list_del(&waiter.list);
219 219
220 raw_spin_unlock_irqrestore(&sem->wait_lock, flags); 220 raw_spin_unlock_irqrestore(&sem->wait_lock, flags);
@@ -235,9 +235,9 @@ int __down_write_trylock(struct rw_semaphore *sem)
235 235
236 raw_spin_lock_irqsave(&sem->wait_lock, flags); 236 raw_spin_lock_irqsave(&sem->wait_lock, flags);
237 237
238 if (sem->activity == 0) { 238 if (sem->count == 0) {
239 /* got the lock */ 239 /* got the lock */
240 sem->activity = -1; 240 sem->count = -1;
241 ret = 1; 241 ret = 1;
242 } 242 }
243 243
@@ -255,7 +255,7 @@ void __up_read(struct rw_semaphore *sem)
255 255
256 raw_spin_lock_irqsave(&sem->wait_lock, flags); 256 raw_spin_lock_irqsave(&sem->wait_lock, flags);
257 257
258 if (--sem->activity == 0 && !list_empty(&sem->wait_list)) 258 if (--sem->count == 0 && !list_empty(&sem->wait_list))
259 sem = __rwsem_wake_one_writer(sem); 259 sem = __rwsem_wake_one_writer(sem);
260 260
261 raw_spin_unlock_irqrestore(&sem->wait_lock, flags); 261 raw_spin_unlock_irqrestore(&sem->wait_lock, flags);
@@ -270,7 +270,7 @@ void __up_write(struct rw_semaphore *sem)
270 270
271 raw_spin_lock_irqsave(&sem->wait_lock, flags); 271 raw_spin_lock_irqsave(&sem->wait_lock, flags);
272 272
273 sem->activity = 0; 273 sem->count = 0;
274 if (!list_empty(&sem->wait_list)) 274 if (!list_empty(&sem->wait_list))
275 sem = __rwsem_do_wake(sem, 1); 275 sem = __rwsem_do_wake(sem, 1);
276 276
@@ -287,7 +287,7 @@ void __downgrade_write(struct rw_semaphore *sem)
287 287
288 raw_spin_lock_irqsave(&sem->wait_lock, flags); 288 raw_spin_lock_irqsave(&sem->wait_lock, flags);
289 289
290 sem->activity = 1; 290 sem->count = 1;
291 if (!list_empty(&sem->wait_list)) 291 if (!list_empty(&sem->wait_list))
292 sem = __rwsem_do_wake(sem, 0); 292 sem = __rwsem_do_wake(sem, 0);
293 293
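The rwsem-spinlock hunks are a mechanical rename of ->activity to ->count so both rwsem flavours share the same field name; the encoding is unchanged: 0 is unlocked, a positive value counts active readers, and -1 marks an active writer, with __downgrade_write() dropping a writer to a single reader. A toy model of those states (plain C, not kernel code):

#include <stdio.h>

int main(void)
{
	int count = 0;			/* unlocked                          */

	count++;			/* __down_read() granted             */
	count++;			/* a second reader                   */
	printf("%d readers active\n", count);

	count = 0;			/* both readers called __up_read()   */
	count = -1;			/* __down_write() granted            */
	printf("writer holds lock: %s\n", count == -1 ? "yes" : "no");

	count = 1;			/* __downgrade_write(): writer becomes
					 * the sole reader                   */
	return 0;
}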
diff --git a/kernel/locking/rwsem-xadd.c b/kernel/locking/rwsem-xadd.c
index 1d66e08e897d..a2391ac135c8 100644
--- a/kernel/locking/rwsem-xadd.c
+++ b/kernel/locking/rwsem-xadd.c
@@ -5,11 +5,66 @@
5 * 5 *
6 * Writer lock-stealing by Alex Shi <alex.shi@intel.com> 6 * Writer lock-stealing by Alex Shi <alex.shi@intel.com>
7 * and Michel Lespinasse <walken@google.com> 7 * and Michel Lespinasse <walken@google.com>
8 *
9 * Optimistic spinning by Tim Chen <tim.c.chen@intel.com>
10 * and Davidlohr Bueso <davidlohr@hp.com>. Based on mutexes.
8 */ 11 */
9#include <linux/rwsem.h> 12#include <linux/rwsem.h>
10#include <linux/sched.h> 13#include <linux/sched.h>
11#include <linux/init.h> 14#include <linux/init.h>
12#include <linux/export.h> 15#include <linux/export.h>
16#include <linux/sched/rt.h>
17
18#include "mcs_spinlock.h"
19
20/*
21 * Guide to the rw_semaphore's count field for common values.
22 * (32-bit case illustrated, similar for 64-bit)
23 *
24 * 0x0000000X (1) X readers active or attempting lock, no writer waiting
25 * X = #active_readers + #readers attempting to lock
26 * (X*ACTIVE_BIAS)
27 *
28 * 0x00000000 rwsem is unlocked, and no one is waiting for the lock or
29 * attempting to read lock or write lock.
30 *
31 * 0xffff000X (1) X readers active or attempting lock, with waiters for lock
32 * X = #active readers + # readers attempting lock
33 * (X*ACTIVE_BIAS + WAITING_BIAS)
34 * (2) 1 writer attempting lock, no waiters for lock
35 * X-1 = #active readers + #readers attempting lock
36 * ((X-1)*ACTIVE_BIAS + ACTIVE_WRITE_BIAS)
37 * (3) 1 writer active, no waiters for lock
38 * X-1 = #active readers + #readers attempting lock
39 * ((X-1)*ACTIVE_BIAS + ACTIVE_WRITE_BIAS)
40 *
41 * 0xffff0001 (1) 1 reader active or attempting lock, waiters for lock
42 * (WAITING_BIAS + ACTIVE_BIAS)
43 * (2) 1 writer active or attempting lock, no waiters for lock
44 * (ACTIVE_WRITE_BIAS)
45 *
46 * 0xffff0000 (1) There are writers or readers queued but none active
47 * or in the process of attempting lock.
48 * (WAITING_BIAS)
49 * Note: writer can attempt to steal lock for this count by adding
50 * ACTIVE_WRITE_BIAS in cmpxchg and checking the old count
51 *
52 * 0xfffe0001 (1) 1 writer active, or attempting lock. Waiters on queue.
53 * (ACTIVE_WRITE_BIAS + WAITING_BIAS)
54 *
55 * Note: Readers attempt to lock by adding ACTIVE_BIAS in down_read and checking
56 * the count becomes more than 0 for successful lock acquisition,
57 * i.e. the case where there are only readers or nobody has lock.
58 * (1st and 2nd case above).
59 *
60 * Writers attempt to lock by adding ACTIVE_WRITE_BIAS in down_write and
61 * checking the count becomes ACTIVE_WRITE_BIAS for successful lock
62 * acquisition (i.e. nobody else has lock or attempts lock). If
63 * unsuccessful, in rwsem_down_write_failed, we'll check to see if there
64 * are only waiters but none active (5th case above), and attempt to
65 * steal the lock.
66 *
67 */
13 68
14/* 69/*
15 * Initialize an rwsem: 70 * Initialize an rwsem:
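The new block comment documents the xadd rwsem count encodings. Below is a worked numeric check of those rows, using the 32-bit bias values the rwsem headers of this era are generally built with; they live outside this diff, so treat them as assumptions here:

#include <stdio.h>
#include <stdint.h>

#define RWSEM_ACTIVE_BIAS	0x00000001
#define RWSEM_ACTIVE_MASK	0x0000ffff
#define RWSEM_WAITING_BIAS	(-RWSEM_ACTIVE_MASK - 1)	/* 0xffff0000 */
#define RWSEM_ACTIVE_WRITE_BIAS	(RWSEM_WAITING_BIAS + RWSEM_ACTIVE_BIAS)

int main(void)
{
	int32_t c;

	c = 3 * RWSEM_ACTIVE_BIAS;			/* 3 readers, no waiters  */
	printf("0x%08x readers only\n", (unsigned int)c);

	c = 3 * RWSEM_ACTIVE_BIAS + RWSEM_WAITING_BIAS;	/* 3 readers plus waiters */
	printf("0x%08x readers with waiters\n", (unsigned int)c);

	c = RWSEM_ACTIVE_WRITE_BIAS;			/* 1 writer, no waiters   */
	printf("0x%08x writer, no waiters\n", (unsigned int)c);

	c = RWSEM_ACTIVE_WRITE_BIAS + RWSEM_WAITING_BIAS; /* writer plus waiters  */
	printf("0x%08x writer with waiters\n", (unsigned int)c);
	return 0;
}

Compiled and run, it prints 0x00000003, 0xffff0003, 0xffff0001 and 0xfffe0001, matching the rows of the comment above.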
@@ -27,6 +82,10 @@ void __init_rwsem(struct rw_semaphore *sem, const char *name,
27 sem->count = RWSEM_UNLOCKED_VALUE; 82 sem->count = RWSEM_UNLOCKED_VALUE;
28 raw_spin_lock_init(&sem->wait_lock); 83 raw_spin_lock_init(&sem->wait_lock);
29 INIT_LIST_HEAD(&sem->wait_list); 84 INIT_LIST_HEAD(&sem->wait_list);
85#ifdef CONFIG_RWSEM_SPIN_ON_OWNER
86 sem->owner = NULL;
87 osq_lock_init(&sem->osq);
88#endif
30} 89}
31 90
32EXPORT_SYMBOL(__init_rwsem); 91EXPORT_SYMBOL(__init_rwsem);
@@ -141,7 +200,7 @@ __rwsem_do_wake(struct rw_semaphore *sem, enum rwsem_wake_type wake_type)
141} 200}
142 201
143/* 202/*
144 * wait for the read lock to be granted 203 * Wait for the read lock to be granted
145 */ 204 */
146__visible 205__visible
147struct rw_semaphore __sched *rwsem_down_read_failed(struct rw_semaphore *sem) 206struct rw_semaphore __sched *rwsem_down_read_failed(struct rw_semaphore *sem)
@@ -188,64 +247,221 @@ struct rw_semaphore __sched *rwsem_down_read_failed(struct rw_semaphore *sem)
188 return sem; 247 return sem;
189} 248}
190 249
250static inline bool rwsem_try_write_lock(long count, struct rw_semaphore *sem)
251{
252 if (!(count & RWSEM_ACTIVE_MASK)) {
253 /* try acquiring the write lock */
254 if (sem->count == RWSEM_WAITING_BIAS &&
255 cmpxchg(&sem->count, RWSEM_WAITING_BIAS,
256 RWSEM_ACTIVE_WRITE_BIAS) == RWSEM_WAITING_BIAS) {
257 if (!list_is_singular(&sem->wait_list))
258 rwsem_atomic_update(RWSEM_WAITING_BIAS, sem);
259 return true;
260 }
261 }
262 return false;
263}
264
265#ifdef CONFIG_RWSEM_SPIN_ON_OWNER
191/* 266/*
192 * wait until we successfully acquire the write lock 267 * Try to acquire write lock before the writer has been put on wait queue.
268 */
269static inline bool rwsem_try_write_lock_unqueued(struct rw_semaphore *sem)
270{
271 long old, count = ACCESS_ONCE(sem->count);
272
273 while (true) {
274 if (!(count == 0 || count == RWSEM_WAITING_BIAS))
275 return false;
276
277 old = cmpxchg(&sem->count, count, count + RWSEM_ACTIVE_WRITE_BIAS);
278 if (old == count)
279 return true;
280
281 count = old;
282 }
283}
284
285static inline bool rwsem_can_spin_on_owner(struct rw_semaphore *sem)
286{
287 struct task_struct *owner;
288 bool on_cpu = false;
289
290 if (need_resched())
291 return false;
292
293 rcu_read_lock();
294 owner = ACCESS_ONCE(sem->owner);
295 if (owner)
296 on_cpu = owner->on_cpu;
297 rcu_read_unlock();
298
299 /*
300 * If sem->owner is not set, yet we have just recently entered the
301 * slowpath, then there is a possibility reader(s) may have the lock.
302 * To be safe, avoid spinning in these situations.
303 */
304 return on_cpu;
305}
306
307static inline bool owner_running(struct rw_semaphore *sem,
308 struct task_struct *owner)
309{
310 if (sem->owner != owner)
311 return false;
312
313 /*
314 * Ensure we emit the owner->on_cpu, dereference _after_ checking
315 * sem->owner still matches owner, if that fails, owner might
316 * point to free()d memory, if it still matches, the rcu_read_lock()
317 * ensures the memory stays valid.
318 */
319 barrier();
320
321 return owner->on_cpu;
322}
323
324static noinline
325bool rwsem_spin_on_owner(struct rw_semaphore *sem, struct task_struct *owner)
326{
327 rcu_read_lock();
328 while (owner_running(sem, owner)) {
329 if (need_resched())
330 break;
331
332 arch_mutex_cpu_relax();
333 }
334 rcu_read_unlock();
335
336 /*
337 * We break out the loop above on need_resched() or when the
338 * owner changed, which is a sign for heavy contention. Return
339 * success only when sem->owner is NULL.
340 */
341 return sem->owner == NULL;
342}
343
344static bool rwsem_optimistic_spin(struct rw_semaphore *sem)
345{
346 struct task_struct *owner;
347 bool taken = false;
348
349 preempt_disable();
350
351 /* sem->wait_lock should not be held when doing optimistic spinning */
352 if (!rwsem_can_spin_on_owner(sem))
353 goto done;
354
355 if (!osq_lock(&sem->osq))
356 goto done;
357
358 while (true) {
359 owner = ACCESS_ONCE(sem->owner);
360 if (owner && !rwsem_spin_on_owner(sem, owner))
361 break;
362
363 /* wait_lock will be acquired if write_lock is obtained */
364 if (rwsem_try_write_lock_unqueued(sem)) {
365 taken = true;
366 break;
367 }
368
369 /*
370 * When there's no owner, we might have preempted between the
 371 * owner acquiring the lock and setting the owner field. If we
 372 * are an RT task we will live-lock, because we won't let the
 373 * owner complete.
374 */
375 if (!owner && (need_resched() || rt_task(current)))
376 break;
377
378 /*
379 * The cpu_relax() call is a compiler barrier which forces
380 * everything in this loop to be re-loaded. We don't need
381 * memory barriers as we'll eventually observe the right
382 * values at the cost of a few extra spins.
383 */
384 arch_mutex_cpu_relax();
385 }
386 osq_unlock(&sem->osq);
387done:
388 preempt_enable();
389 return taken;
390}
391
392#else
393static bool rwsem_optimistic_spin(struct rw_semaphore *sem)
394{
395 return false;
396}
397#endif
398
399/*
400 * Wait until we successfully acquire the write lock
193 */ 401 */
194__visible 402__visible
195struct rw_semaphore __sched *rwsem_down_write_failed(struct rw_semaphore *sem) 403struct rw_semaphore __sched *rwsem_down_write_failed(struct rw_semaphore *sem)
196{ 404{
197 long count, adjustment = -RWSEM_ACTIVE_WRITE_BIAS; 405 long count;
406 bool waiting = true; /* any queued threads before us */
198 struct rwsem_waiter waiter; 407 struct rwsem_waiter waiter;
199 struct task_struct *tsk = current;
200 408
201 /* set up my own style of waitqueue */ 409 /* undo write bias from down_write operation, stop active locking */
202 waiter.task = tsk; 410 count = rwsem_atomic_update(-RWSEM_ACTIVE_WRITE_BIAS, sem);
411
412 /* do optimistic spinning and steal lock if possible */
413 if (rwsem_optimistic_spin(sem))
414 return sem;
415
416 /*
417 * Optimistic spinning failed, proceed to the slowpath
418 * and block until we can acquire the sem.
419 */
420 waiter.task = current;
203 waiter.type = RWSEM_WAITING_FOR_WRITE; 421 waiter.type = RWSEM_WAITING_FOR_WRITE;
204 422
205 raw_spin_lock_irq(&sem->wait_lock); 423 raw_spin_lock_irq(&sem->wait_lock);
424
425 /* account for this before adding a new element to the list */
206 if (list_empty(&sem->wait_list)) 426 if (list_empty(&sem->wait_list))
207 adjustment += RWSEM_WAITING_BIAS; 427 waiting = false;
428
208 list_add_tail(&waiter.list, &sem->wait_list); 429 list_add_tail(&waiter.list, &sem->wait_list);
209 430
210 /* we're now waiting on the lock, but no longer actively locking */ 431 /* we're now waiting on the lock, but no longer actively locking */
211 count = rwsem_atomic_update(adjustment, sem); 432 if (waiting) {
433 count = ACCESS_ONCE(sem->count);
434
435 /*
436 * If there were already threads queued before us and there are
437 * no active writers, the lock must be read owned; so we try to
438 * wake any read locks that were queued ahead of us.
439 */
440 if (count > RWSEM_WAITING_BIAS)
441 sem = __rwsem_do_wake(sem, RWSEM_WAKE_READERS);
212 442
213 /* If there were already threads queued before us and there are no 443 } else
214 * active writers, the lock must be read owned; so we try to wake 444 count = rwsem_atomic_update(RWSEM_WAITING_BIAS, sem);
215 * any read locks that were queued ahead of us. */
216 if (count > RWSEM_WAITING_BIAS &&
217 adjustment == -RWSEM_ACTIVE_WRITE_BIAS)
218 sem = __rwsem_do_wake(sem, RWSEM_WAKE_READERS);
219 445
220 /* wait until we successfully acquire the lock */ 446 /* wait until we successfully acquire the lock */
221 set_task_state(tsk, TASK_UNINTERRUPTIBLE); 447 set_current_state(TASK_UNINTERRUPTIBLE);
222 while (true) { 448 while (true) {
223 if (!(count & RWSEM_ACTIVE_MASK)) { 449 if (rwsem_try_write_lock(count, sem))
224 /* Try acquiring the write lock. */ 450 break;
225 count = RWSEM_ACTIVE_WRITE_BIAS;
226 if (!list_is_singular(&sem->wait_list))
227 count += RWSEM_WAITING_BIAS;
228
229 if (sem->count == RWSEM_WAITING_BIAS &&
230 cmpxchg(&sem->count, RWSEM_WAITING_BIAS, count) ==
231 RWSEM_WAITING_BIAS)
232 break;
233 }
234
235 raw_spin_unlock_irq(&sem->wait_lock); 451 raw_spin_unlock_irq(&sem->wait_lock);
236 452
237 /* Block until there are no active lockers. */ 453 /* Block until there are no active lockers. */
238 do { 454 do {
239 schedule(); 455 schedule();
240 set_task_state(tsk, TASK_UNINTERRUPTIBLE); 456 set_current_state(TASK_UNINTERRUPTIBLE);
241 } while ((count = sem->count) & RWSEM_ACTIVE_MASK); 457 } while ((count = sem->count) & RWSEM_ACTIVE_MASK);
242 458
243 raw_spin_lock_irq(&sem->wait_lock); 459 raw_spin_lock_irq(&sem->wait_lock);
244 } 460 }
461 __set_current_state(TASK_RUNNING);
245 462
246 list_del(&waiter.list); 463 list_del(&waiter.list);
247 raw_spin_unlock_irq(&sem->wait_lock); 464 raw_spin_unlock_irq(&sem->wait_lock);
248 tsk->state = TASK_RUNNING;
249 465
250 return sem; 466 return sem;
251} 467}
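Editor's note: the optimistic-spinning machinery above lives entirely in the writer slowpath, so callers are unaffected. A minimal usage sketch follows; the semaphore and function names are illustrative assumptions, not part of the patch.

	#include <linux/rwsem.h>

	static DECLARE_RWSEM(example_sem);	/* hypothetical lock */

	static void example_writer(void)
	{
		/* may spin on the current owner before sleeping in the slowpath */
		down_write(&example_sem);
		/* ... modify data protected by example_sem ... */
		up_write(&example_sem);		/* clears sem->owner, wakes waiters */
	}
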
diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c
index cfff1435bdfb..e2d3bc7f03b4 100644
--- a/kernel/locking/rwsem.c
+++ b/kernel/locking/rwsem.c
@@ -12,6 +12,27 @@
12 12
13#include <linux/atomic.h> 13#include <linux/atomic.h>
14 14
15#ifdef CONFIG_RWSEM_SPIN_ON_OWNER
16static inline void rwsem_set_owner(struct rw_semaphore *sem)
17{
18 sem->owner = current;
19}
20
21static inline void rwsem_clear_owner(struct rw_semaphore *sem)
22{
23 sem->owner = NULL;
24}
25
26#else
27static inline void rwsem_set_owner(struct rw_semaphore *sem)
28{
29}
30
31static inline void rwsem_clear_owner(struct rw_semaphore *sem)
32{
33}
34#endif
35
15/* 36/*
16 * lock for reading 37 * lock for reading
17 */ 38 */
@@ -48,6 +69,7 @@ void __sched down_write(struct rw_semaphore *sem)
48 rwsem_acquire(&sem->dep_map, 0, 0, _RET_IP_); 69 rwsem_acquire(&sem->dep_map, 0, 0, _RET_IP_);
49 70
50 LOCK_CONTENDED(sem, __down_write_trylock, __down_write); 71 LOCK_CONTENDED(sem, __down_write_trylock, __down_write);
72 rwsem_set_owner(sem);
51} 73}
52 74
53EXPORT_SYMBOL(down_write); 75EXPORT_SYMBOL(down_write);
@@ -59,8 +81,11 @@ int down_write_trylock(struct rw_semaphore *sem)
59{ 81{
60 int ret = __down_write_trylock(sem); 82 int ret = __down_write_trylock(sem);
61 83
62 if (ret == 1) 84 if (ret == 1) {
63 rwsem_acquire(&sem->dep_map, 0, 1, _RET_IP_); 85 rwsem_acquire(&sem->dep_map, 0, 1, _RET_IP_);
86 rwsem_set_owner(sem);
87 }
88
64 return ret; 89 return ret;
65} 90}
66 91
@@ -85,6 +110,7 @@ void up_write(struct rw_semaphore *sem)
85{ 110{
86 rwsem_release(&sem->dep_map, 1, _RET_IP_); 111 rwsem_release(&sem->dep_map, 1, _RET_IP_);
87 112
113 rwsem_clear_owner(sem);
88 __up_write(sem); 114 __up_write(sem);
89} 115}
90 116
@@ -99,6 +125,7 @@ void downgrade_write(struct rw_semaphore *sem)
99 * lockdep: a downgraded write will live on as a write 125 * lockdep: a downgraded write will live on as a write
100 * dependency. 126 * dependency.
101 */ 127 */
128 rwsem_clear_owner(sem);
102 __downgrade_write(sem); 129 __downgrade_write(sem);
103} 130}
104 131
@@ -122,6 +149,7 @@ void _down_write_nest_lock(struct rw_semaphore *sem, struct lockdep_map *nest)
122 rwsem_acquire_nest(&sem->dep_map, 0, 0, nest, _RET_IP_); 149 rwsem_acquire_nest(&sem->dep_map, 0, 0, nest, _RET_IP_);
123 150
124 LOCK_CONTENDED(sem, __down_write_trylock, __down_write); 151 LOCK_CONTENDED(sem, __down_write_trylock, __down_write);
152 rwsem_set_owner(sem);
125} 153}
126 154
127EXPORT_SYMBOL(_down_write_nest_lock); 155EXPORT_SYMBOL(_down_write_nest_lock);
@@ -141,6 +169,7 @@ void down_write_nested(struct rw_semaphore *sem, int subclass)
141 rwsem_acquire(&sem->dep_map, subclass, 0, _RET_IP_); 169 rwsem_acquire(&sem->dep_map, subclass, 0, _RET_IP_);
142 170
143 LOCK_CONTENDED(sem, __down_write_trylock, __down_write); 171 LOCK_CONTENDED(sem, __down_write_trylock, __down_write);
172 rwsem_set_owner(sem);
144} 173}
145 174
146EXPORT_SYMBOL(down_write_nested); 175EXPORT_SYMBOL(down_write_nested);
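For context, here is a hedged sketch of how the new owner tracking interacts with downgrading; the identifiers are assumptions for illustration. down_write() records current as the owner, and downgrade_write() clears it because the lock becomes reader-owned, so spinners must stop targeting a writer that no longer exists.

	#include <linux/rwsem.h>

	static DECLARE_RWSEM(example_sem);	/* hypothetical lock */

	static void example_publish_then_read(void)
	{
		down_write(&example_sem);	/* rwsem_set_owner(): owner = current */
		/* ... publish new data while excluding readers ... */
		downgrade_write(&example_sem);	/* rwsem_clear_owner(): reader-owned now */
		/* ... keep reading alongside other readers ... */
		up_read(&example_sem);
	}
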
diff --git a/kernel/module.c b/kernel/module.c
index 079c4615607d..81e727cf6df9 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -3020,21 +3020,6 @@ static int do_init_module(struct module *mod)
3020 */ 3020 */
3021 current->flags &= ~PF_USED_ASYNC; 3021 current->flags &= ~PF_USED_ASYNC;
3022 3022
3023 blocking_notifier_call_chain(&module_notify_list,
3024 MODULE_STATE_COMING, mod);
3025
3026 /* Set RO and NX regions for core */
3027 set_section_ro_nx(mod->module_core,
3028 mod->core_text_size,
3029 mod->core_ro_size,
3030 mod->core_size);
3031
3032 /* Set RO and NX regions for init */
3033 set_section_ro_nx(mod->module_init,
3034 mod->init_text_size,
3035 mod->init_ro_size,
3036 mod->init_size);
3037
3038 do_mod_ctors(mod); 3023 do_mod_ctors(mod);
3039 /* Start the module */ 3024 /* Start the module */
3040 if (mod->init != NULL) 3025 if (mod->init != NULL)
@@ -3165,9 +3150,26 @@ static int complete_formation(struct module *mod, struct load_info *info)
3165 /* This relies on module_mutex for list integrity. */ 3150 /* This relies on module_mutex for list integrity. */
3166 module_bug_finalize(info->hdr, info->sechdrs, mod); 3151 module_bug_finalize(info->hdr, info->sechdrs, mod);
3167 3152
3153 /* Set RO and NX regions for core */
3154 set_section_ro_nx(mod->module_core,
3155 mod->core_text_size,
3156 mod->core_ro_size,
3157 mod->core_size);
3158
3159 /* Set RO and NX regions for init */
3160 set_section_ro_nx(mod->module_init,
3161 mod->init_text_size,
3162 mod->init_ro_size,
3163 mod->init_size);
3164
3168 /* Mark state as coming so strong_try_module_get() ignores us, 3165 /* Mark state as coming so strong_try_module_get() ignores us,
3169 * but kallsyms etc. can see us. */ 3166 * but kallsyms etc. can see us. */
3170 mod->state = MODULE_STATE_COMING; 3167 mod->state = MODULE_STATE_COMING;
3168 mutex_unlock(&module_mutex);
3169
3170 blocking_notifier_call_chain(&module_notify_list,
3171 MODULE_STATE_COMING, mod);
3172 return 0;
3171 3173
3172out: 3174out:
3173 mutex_unlock(&module_mutex); 3175 mutex_unlock(&module_mutex);
@@ -3190,6 +3192,7 @@ static int load_module(struct load_info *info, const char __user *uargs,
3190{ 3192{
3191 struct module *mod; 3193 struct module *mod;
3192 long err; 3194 long err;
3195 char *after_dashes;
3193 3196
3194 err = module_sig_check(info); 3197 err = module_sig_check(info);
3195 if (err) 3198 if (err)
@@ -3277,10 +3280,15 @@ static int load_module(struct load_info *info, const char __user *uargs,
3277 goto ddebug_cleanup; 3280 goto ddebug_cleanup;
3278 3281
3279 /* Module is ready to execute: parsing args may do that. */ 3282 /* Module is ready to execute: parsing args may do that. */
3280 err = parse_args(mod->name, mod->args, mod->kp, mod->num_kp, 3283 after_dashes = parse_args(mod->name, mod->args, mod->kp, mod->num_kp,
3281 -32768, 32767, unknown_module_param_cb); 3284 -32768, 32767, unknown_module_param_cb);
3282 if (err < 0) 3285 if (IS_ERR(after_dashes)) {
3286 err = PTR_ERR(after_dashes);
3283 goto bug_cleanup; 3287 goto bug_cleanup;
3288 } else if (after_dashes) {
3289 pr_warn("%s: parameters '%s' after `--' ignored\n",
3290 mod->name, after_dashes);
3291 }
3284 3292
3285 /* Link in to sysfs. */ 3293 /* Link in to sysfs. */
3286 err = mod_sysfs_setup(mod, info, mod->kp, mod->num_kp); 3294 err = mod_sysfs_setup(mod, info, mod->kp, mod->num_kp);
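As a hedged illustration of the reordering above (callback and variable names are hypothetical): a MODULE_STATE_COMING notifier now runs after the RO/NX protections have been applied and outside module_mutex, so a callback such as the one below already sees the module sections in their final protection state.

	#include <linux/module.h>
	#include <linux/notifier.h>

	static int example_coming_cb(struct notifier_block *nb,
				     unsigned long action, void *data)
	{
		struct module *mod = data;

		if (action == MODULE_STATE_COMING)
			pr_info("module %s is coming\n", mod->name);
		return NOTIFY_OK;
	}

	static struct notifier_block example_coming_nb = {
		.notifier_call = example_coming_cb,
	};

	/* registered elsewhere with register_module_notifier(&example_coming_nb) */
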
diff --git a/kernel/notifier.c b/kernel/notifier.c
index db4c8b08a50c..4803da6eab62 100644
--- a/kernel/notifier.c
+++ b/kernel/notifier.c
@@ -71,9 +71,9 @@ static int notifier_chain_unregister(struct notifier_block **nl,
71 * @returns: notifier_call_chain returns the value returned by the 71 * @returns: notifier_call_chain returns the value returned by the
72 * last notifier function called. 72 * last notifier function called.
73 */ 73 */
74static int __kprobes notifier_call_chain(struct notifier_block **nl, 74static int notifier_call_chain(struct notifier_block **nl,
75 unsigned long val, void *v, 75 unsigned long val, void *v,
76 int nr_to_call, int *nr_calls) 76 int nr_to_call, int *nr_calls)
77{ 77{
78 int ret = NOTIFY_DONE; 78 int ret = NOTIFY_DONE;
79 struct notifier_block *nb, *next_nb; 79 struct notifier_block *nb, *next_nb;
@@ -102,6 +102,7 @@ static int __kprobes notifier_call_chain(struct notifier_block **nl,
102 } 102 }
103 return ret; 103 return ret;
104} 104}
105NOKPROBE_SYMBOL(notifier_call_chain);
105 106
106/* 107/*
107 * Atomic notifier chain routines. Registration and unregistration 108 * Atomic notifier chain routines. Registration and unregistration
@@ -172,9 +173,9 @@ EXPORT_SYMBOL_GPL(atomic_notifier_chain_unregister);
172 * Otherwise the return value is the return value 173 * Otherwise the return value is the return value
173 * of the last notifier function called. 174 * of the last notifier function called.
174 */ 175 */
175int __kprobes __atomic_notifier_call_chain(struct atomic_notifier_head *nh, 176int __atomic_notifier_call_chain(struct atomic_notifier_head *nh,
176 unsigned long val, void *v, 177 unsigned long val, void *v,
177 int nr_to_call, int *nr_calls) 178 int nr_to_call, int *nr_calls)
178{ 179{
179 int ret; 180 int ret;
180 181
@@ -184,13 +185,15 @@ int __kprobes __atomic_notifier_call_chain(struct atomic_notifier_head *nh,
184 return ret; 185 return ret;
185} 186}
186EXPORT_SYMBOL_GPL(__atomic_notifier_call_chain); 187EXPORT_SYMBOL_GPL(__atomic_notifier_call_chain);
188NOKPROBE_SYMBOL(__atomic_notifier_call_chain);
187 189
188int __kprobes atomic_notifier_call_chain(struct atomic_notifier_head *nh, 190int atomic_notifier_call_chain(struct atomic_notifier_head *nh,
189 unsigned long val, void *v) 191 unsigned long val, void *v)
190{ 192{
191 return __atomic_notifier_call_chain(nh, val, v, -1, NULL); 193 return __atomic_notifier_call_chain(nh, val, v, -1, NULL);
192} 194}
193EXPORT_SYMBOL_GPL(atomic_notifier_call_chain); 195EXPORT_SYMBOL_GPL(atomic_notifier_call_chain);
196NOKPROBE_SYMBOL(atomic_notifier_call_chain);
194 197
195/* 198/*
196 * Blocking notifier chain routines. All access to the chain is 199 * Blocking notifier chain routines. All access to the chain is
@@ -527,7 +530,7 @@ EXPORT_SYMBOL_GPL(srcu_init_notifier_head);
527 530
528static ATOMIC_NOTIFIER_HEAD(die_chain); 531static ATOMIC_NOTIFIER_HEAD(die_chain);
529 532
530int notrace __kprobes notify_die(enum die_val val, const char *str, 533int notrace notify_die(enum die_val val, const char *str,
531 struct pt_regs *regs, long err, int trap, int sig) 534 struct pt_regs *regs, long err, int trap, int sig)
532{ 535{
533 struct die_args args = { 536 struct die_args args = {
@@ -540,6 +543,7 @@ int notrace __kprobes notify_die(enum die_val val, const char *str,
540 }; 543 };
541 return atomic_notifier_call_chain(&die_chain, val, &args); 544 return atomic_notifier_call_chain(&die_chain, val, &args);
542} 545}
546NOKPROBE_SYMBOL(notify_die);
543 547
544int register_die_notifier(struct notifier_block *nb) 548int register_die_notifier(struct notifier_block *nb)
545{ 549{
diff --git a/kernel/panic.c b/kernel/panic.c
index d02fa9fef46a..62e16cef9cc2 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -32,6 +32,7 @@ static unsigned long tainted_mask;
32static int pause_on_oops; 32static int pause_on_oops;
33static int pause_on_oops_flag; 33static int pause_on_oops_flag;
34static DEFINE_SPINLOCK(pause_on_oops_lock); 34static DEFINE_SPINLOCK(pause_on_oops_lock);
35static bool crash_kexec_post_notifiers;
35 36
36int panic_timeout = CONFIG_PANIC_TIMEOUT; 37int panic_timeout = CONFIG_PANIC_TIMEOUT;
37EXPORT_SYMBOL_GPL(panic_timeout); 38EXPORT_SYMBOL_GPL(panic_timeout);
@@ -112,9 +113,11 @@ void panic(const char *fmt, ...)
112 /* 113 /*
113 * If we have crashed and we have a crash kernel loaded let it handle 114 * If we have crashed and we have a crash kernel loaded let it handle
114 * everything else. 115 * everything else.
115 * Do we want to call this before we try to display a message? 116 * If we want to run this after calling panic_notifiers, pass
117 * the "crash_kexec_post_notifiers" option to the kernel.
116 */ 118 */
117 crash_kexec(NULL); 119 if (!crash_kexec_post_notifiers)
120 crash_kexec(NULL);
118 121
119 /* 122 /*
120 * Note smp_send_stop is the usual smp shutdown function, which 123 * Note smp_send_stop is the usual smp shutdown function, which
@@ -131,6 +134,15 @@ void panic(const char *fmt, ...)
131 134
132 kmsg_dump(KMSG_DUMP_PANIC); 135 kmsg_dump(KMSG_DUMP_PANIC);
133 136
137 /*
138 * If you doubt that kdump always works correctly in every situation,
139 * "crash_kexec_post_notifiers" offers you a chance to run the panic
140 * notifiers and dump kmsg before kdump.
141 * Note: since some panic notifiers can make the crashed kernel even
142 * more unstable, this can also increase the risk of kdump failing.
143 */
144 crash_kexec(NULL);
145
134 bust_spinlocks(0); 146 bust_spinlocks(0);
135 147
136 if (!panic_blink) 148 if (!panic_blink)
@@ -472,6 +484,13 @@ EXPORT_SYMBOL(__stack_chk_fail);
472core_param(panic, panic_timeout, int, 0644); 484core_param(panic, panic_timeout, int, 0644);
473core_param(pause_on_oops, pause_on_oops, int, 0644); 485core_param(pause_on_oops, pause_on_oops, int, 0644);
474 486
487static int __init setup_crash_kexec_post_notifiers(char *s)
488{
489 crash_kexec_post_notifiers = true;
490 return 0;
491}
492early_param("crash_kexec_post_notifiers", setup_crash_kexec_post_notifiers);
493
475static int __init oops_setup(char *s) 494static int __init oops_setup(char *s)
476{ 495{
477 if (!s) 496 if (!s)
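A self-contained userspace model of the resulting panic() ordering, with stub functions standing in for the real calls; this is a simplification of the hunks above, not additional behavior.

	#include <stdbool.h>
	#include <stdio.h>

	static bool crash_kexec_post_notifiers;	/* set by the boot parameter */

	static void crash_kexec_stub(void)    { puts("kdump (if a crash kernel is loaded)"); }
	static void run_panic_notifiers(void) { puts("panic notifier chain"); }
	static void kmsg_dump_stub(void)      { puts("kmsg_dump(KMSG_DUMP_PANIC)"); }

	int main(void)
	{
		if (!crash_kexec_post_notifiers)
			crash_kexec_stub();	/* default: kdump runs first */

		run_panic_notifiers();
		kmsg_dump_stub();

		/* with the option set, kdump only runs here, after the notifiers */
		if (crash_kexec_post_notifiers)
			crash_kexec_stub();

		return 0;
	}
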
diff --git a/kernel/params.c b/kernel/params.c
index b00142e7f3ba..1e52ca233fd9 100644
--- a/kernel/params.c
+++ b/kernel/params.c
@@ -177,13 +177,13 @@ static char *next_arg(char *args, char **param, char **val)
177} 177}
178 178
179/* Args looks like "foo=bar,bar2 baz=fuz wiz". */ 179/* Args looks like "foo=bar,bar2 baz=fuz wiz". */
180int parse_args(const char *doing, 180char *parse_args(const char *doing,
181 char *args, 181 char *args,
182 const struct kernel_param *params, 182 const struct kernel_param *params,
183 unsigned num, 183 unsigned num,
184 s16 min_level, 184 s16 min_level,
185 s16 max_level, 185 s16 max_level,
186 int (*unknown)(char *param, char *val, const char *doing)) 186 int (*unknown)(char *param, char *val, const char *doing))
187{ 187{
188 char *param, *val; 188 char *param, *val;
189 189
@@ -198,6 +198,9 @@ int parse_args(const char *doing,
198 int irq_was_disabled; 198 int irq_was_disabled;
199 199
200 args = next_arg(args, &param, &val); 200 args = next_arg(args, &param, &val);
201 /* Stop at -- */
202 if (!val && strcmp(param, "--") == 0)
203 return args;
201 irq_was_disabled = irqs_disabled(); 204 irq_was_disabled = irqs_disabled();
202 ret = parse_one(param, val, doing, params, num, 205 ret = parse_one(param, val, doing, params, num,
203 min_level, max_level, unknown); 206 min_level, max_level, unknown);
@@ -208,22 +211,22 @@ int parse_args(const char *doing,
208 switch (ret) { 211 switch (ret) {
209 case -ENOENT: 212 case -ENOENT:
210 pr_err("%s: Unknown parameter `%s'\n", doing, param); 213 pr_err("%s: Unknown parameter `%s'\n", doing, param);
211 return ret; 214 return ERR_PTR(ret);
212 case -ENOSPC: 215 case -ENOSPC:
213 pr_err("%s: `%s' too large for parameter `%s'\n", 216 pr_err("%s: `%s' too large for parameter `%s'\n",
214 doing, val ?: "", param); 217 doing, val ?: "", param);
215 return ret; 218 return ERR_PTR(ret);
216 case 0: 219 case 0:
217 break; 220 break;
218 default: 221 default:
219 pr_err("%s: `%s' invalid for parameter `%s'\n", 222 pr_err("%s: `%s' invalid for parameter `%s'\n",
220 doing, val ?: "", param); 223 doing, val ?: "", param);
221 return ret; 224 return ERR_PTR(ret);
222 } 225 }
223 } 226 }
224 227
225 /* All parsed OK. */ 228 /* All parsed OK. */
226 return 0; 229 return NULL;
227} 230}
228 231
229/* Lazy bastard, eh? */ 232/* Lazy bastard, eh? */
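A hedged caller-side sketch of the new return contract (the surrounding variables are assumptions): NULL means every parameter was parsed, an ERR_PTR() encodes a parse failure, and any other pointer is the remainder of the command line after a bare "--".

	char *after_dashes;

	after_dashes = parse_args("example", cmdline, params, num_params,
				  -32768, 32767, NULL);
	if (IS_ERR(after_dashes))
		return PTR_ERR(after_dashes);
	if (after_dashes)
		pr_warn("example: parameters '%s' after `--' ignored\n",
			after_dashes);
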
diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig
index 2fac9cc79b3d..9a83d780facd 100644
--- a/kernel/power/Kconfig
+++ b/kernel/power/Kconfig
@@ -257,8 +257,7 @@ config ARCH_HAS_OPP
257 bool 257 bool
258 258
259config PM_OPP 259config PM_OPP
260 bool "Operating Performance Point (OPP) Layer library" 260 bool
261 depends on ARCH_HAS_OPP
262 ---help--- 261 ---help---
263 SOCs have a standard set of tuples consisting of frequency and 262 SOCs have a standard set of tuples consisting of frequency and
264 voltage pairs that the device will support per voltage domain. This 263 voltage pairs that the device will support per voltage domain. This
diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c
index f4f2073711d3..fcc2611d3f14 100644
--- a/kernel/power/hibernate.c
+++ b/kernel/power/hibernate.c
@@ -28,14 +28,16 @@
28#include <linux/syscore_ops.h> 28#include <linux/syscore_ops.h>
29#include <linux/ctype.h> 29#include <linux/ctype.h>
30#include <linux/genhd.h> 30#include <linux/genhd.h>
31#include <trace/events/power.h>
31 32
32#include "power.h" 33#include "power.h"
33 34
34 35
35static int nocompress; 36static int nocompress;
36static int noresume; 37static int noresume;
38static int nohibernate;
37static int resume_wait; 39static int resume_wait;
38static int resume_delay; 40static unsigned int resume_delay;
39static char resume_file[256] = CONFIG_PM_STD_PARTITION; 41static char resume_file[256] = CONFIG_PM_STD_PARTITION;
40dev_t swsusp_resume_device; 42dev_t swsusp_resume_device;
41sector_t swsusp_resume_block; 43sector_t swsusp_resume_block;
@@ -61,6 +63,11 @@ bool freezer_test_done;
61 63
62static const struct platform_hibernation_ops *hibernation_ops; 64static const struct platform_hibernation_ops *hibernation_ops;
63 65
66bool hibernation_available(void)
67{
68 return (nohibernate == 0);
69}
70
64/** 71/**
65 * hibernation_set_ops - Set the global hibernate operations. 72 * hibernation_set_ops - Set the global hibernate operations.
66 * @ops: Hibernation operations to use in subsequent hibernation transitions. 73 * @ops: Hibernation operations to use in subsequent hibernation transitions.
@@ -228,19 +235,23 @@ static void platform_recover(int platform_mode)
228void swsusp_show_speed(struct timeval *start, struct timeval *stop, 235void swsusp_show_speed(struct timeval *start, struct timeval *stop,
229 unsigned nr_pages, char *msg) 236 unsigned nr_pages, char *msg)
230{ 237{
231 s64 elapsed_centisecs64; 238 u64 elapsed_centisecs64;
232 int centisecs; 239 unsigned int centisecs;
233 int k; 240 unsigned int k;
234 int kps; 241 unsigned int kps;
235 242
236 elapsed_centisecs64 = timeval_to_ns(stop) - timeval_to_ns(start); 243 elapsed_centisecs64 = timeval_to_ns(stop) - timeval_to_ns(start);
244 /*
245 * If "(s64)elapsed_centisecs64 < 0", an absurdly long elapsed time is
246 * printed, which is obvious enough to show that something went wrong.
247 */
237 do_div(elapsed_centisecs64, NSEC_PER_SEC / 100); 248 do_div(elapsed_centisecs64, NSEC_PER_SEC / 100);
238 centisecs = elapsed_centisecs64; 249 centisecs = elapsed_centisecs64;
239 if (centisecs == 0) 250 if (centisecs == 0)
240 centisecs = 1; /* avoid div-by-zero */ 251 centisecs = 1; /* avoid div-by-zero */
241 k = nr_pages * (PAGE_SIZE / 1024); 252 k = nr_pages * (PAGE_SIZE / 1024);
242 kps = (k * 100) / centisecs; 253 kps = (k * 100) / centisecs;
243 printk(KERN_INFO "PM: %s %d kbytes in %d.%02d seconds (%d.%02d MB/s)\n", 254 printk(KERN_INFO "PM: %s %u kbytes in %u.%02u seconds (%u.%02u MB/s)\n",
244 msg, k, 255 msg, k,
245 centisecs / 100, centisecs % 100, 256 centisecs / 100, centisecs % 100,
246 kps / 1000, (kps % 1000) / 10); 257 kps / 1000, (kps % 1000) / 10);
@@ -288,7 +299,9 @@ static int create_image(int platform_mode)
288 299
289 in_suspend = 1; 300 in_suspend = 1;
290 save_processor_state(); 301 save_processor_state();
302 trace_suspend_resume(TPS("machine_suspend"), PM_EVENT_HIBERNATE, true);
291 error = swsusp_arch_suspend(); 303 error = swsusp_arch_suspend();
304 trace_suspend_resume(TPS("machine_suspend"), PM_EVENT_HIBERNATE, false);
292 if (error) 305 if (error)
293 printk(KERN_ERR "PM: Error %d creating hibernation image\n", 306 printk(KERN_ERR "PM: Error %d creating hibernation image\n",
294 error); 307 error);
@@ -595,7 +608,8 @@ static void power_down(void)
595 case HIBERNATION_PLATFORM: 608 case HIBERNATION_PLATFORM:
596 hibernation_platform_enter(); 609 hibernation_platform_enter();
597 case HIBERNATION_SHUTDOWN: 610 case HIBERNATION_SHUTDOWN:
598 kernel_power_off(); 611 if (pm_power_off)
612 kernel_power_off();
599 break; 613 break;
600#ifdef CONFIG_SUSPEND 614#ifdef CONFIG_SUSPEND
601 case HIBERNATION_SUSPEND: 615 case HIBERNATION_SUSPEND:
@@ -623,7 +637,8 @@ static void power_down(void)
623 * corruption after resume. 637 * corruption after resume.
624 */ 638 */
625 printk(KERN_CRIT "PM: Please power down manually\n"); 639 printk(KERN_CRIT "PM: Please power down manually\n");
626 while(1); 640 while (1)
641 cpu_relax();
627} 642}
628 643
629/** 644/**
@@ -633,6 +648,11 @@ int hibernate(void)
633{ 648{
634 int error; 649 int error;
635 650
651 if (!hibernation_available()) {
652 pr_debug("PM: Hibernation not available.\n");
653 return -EPERM;
654 }
655
636 lock_system_sleep(); 656 lock_system_sleep();
637 /* The snapshot device should not be opened while we're running */ 657 /* The snapshot device should not be opened while we're running */
638 if (!atomic_add_unless(&snapshot_device_available, -1, 0)) { 658 if (!atomic_add_unless(&snapshot_device_available, -1, 0)) {
@@ -725,7 +745,7 @@ static int software_resume(void)
725 /* 745 /*
726 * If the user said "noresume".. bail out early. 746 * If the user said "noresume".. bail out early.
727 */ 747 */
728 if (noresume) 748 if (noresume || !hibernation_available())
729 return 0; 749 return 0;
730 750
731 /* 751 /*
@@ -891,6 +911,9 @@ static ssize_t disk_show(struct kobject *kobj, struct kobj_attribute *attr,
891 int i; 911 int i;
892 char *start = buf; 912 char *start = buf;
893 913
914 if (!hibernation_available())
915 return sprintf(buf, "[disabled]\n");
916
894 for (i = HIBERNATION_FIRST; i <= HIBERNATION_MAX; i++) { 917 for (i = HIBERNATION_FIRST; i <= HIBERNATION_MAX; i++) {
895 if (!hibernation_modes[i]) 918 if (!hibernation_modes[i])
896 continue; 919 continue;
@@ -925,6 +948,9 @@ static ssize_t disk_store(struct kobject *kobj, struct kobj_attribute *attr,
925 char *p; 948 char *p;
926 int mode = HIBERNATION_INVALID; 949 int mode = HIBERNATION_INVALID;
927 950
951 if (!hibernation_available())
952 return -EPERM;
953
928 p = memchr(buf, '\n', n); 954 p = memchr(buf, '\n', n);
929 len = p ? p - buf : n; 955 len = p ? p - buf : n;
930 956
@@ -1092,6 +1118,10 @@ static int __init hibernate_setup(char *str)
1092 noresume = 1; 1118 noresume = 1;
1093 else if (!strncmp(str, "nocompress", 10)) 1119 else if (!strncmp(str, "nocompress", 10))
1094 nocompress = 1; 1120 nocompress = 1;
1121 else if (!strncmp(str, "no", 2)) {
1122 noresume = 1;
1123 nohibernate = 1;
1124 }
1095 return 1; 1125 return 1;
1096} 1126}
1097 1127
@@ -1109,13 +1139,30 @@ static int __init resumewait_setup(char *str)
1109 1139
1110static int __init resumedelay_setup(char *str) 1140static int __init resumedelay_setup(char *str)
1111{ 1141{
1112 resume_delay = simple_strtoul(str, NULL, 0); 1142 int rc = kstrtouint(str, 0, &resume_delay);
1143
1144 if (rc)
1145 return rc;
1146 return 1;
1147}
1148
1149static int __init nohibernate_setup(char *str)
1150{
1151 noresume = 1;
1152 nohibernate = 1;
1113 return 1; 1153 return 1;
1114} 1154}
1115 1155
1156static int __init kaslr_nohibernate_setup(char *str)
1157{
1158 return nohibernate_setup(str);
1159}
1160
1116__setup("noresume", noresume_setup); 1161__setup("noresume", noresume_setup);
1117__setup("resume_offset=", resume_offset_setup); 1162__setup("resume_offset=", resume_offset_setup);
1118__setup("resume=", resume_setup); 1163__setup("resume=", resume_setup);
1119__setup("hibernate=", hibernate_setup); 1164__setup("hibernate=", hibernate_setup);
1120__setup("resumewait", resumewait_setup); 1165__setup("resumewait", resumewait_setup);
1121__setup("resumedelay=", resumedelay_setup); 1166__setup("resumedelay=", resumedelay_setup);
1167__setup("nohibernate", nohibernate_setup);
1168__setup("kaslr", kaslr_nohibernate_setup);
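A minimal sketch following the same pattern as the reworked resumedelay= handler; the parameter name and variable are hypothetical. kstrtouint() rejects malformed values instead of silently accepting whatever simple_strtoul() could salvage.

	#include <linux/init.h>
	#include <linux/kernel.h>

	static unsigned int example_delay;

	static int __init example_delay_setup(char *str)
	{
		int rc = kstrtouint(str, 0, &example_delay);

		if (rc)
			return rc;
		return 1;
	}
	__setup("example_delay=", example_delay_setup);
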
diff --git a/kernel/power/main.c b/kernel/power/main.c
index 6271bc4073ef..8e90f330f139 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -279,34 +279,32 @@ static inline void pm_print_times_init(void) {}
279struct kobject *power_kobj; 279struct kobject *power_kobj;
280 280
281/** 281/**
282 * state - control system power state. 282 * state - control system sleep states.
283 * 283 *
284 * show() returns what states are supported, which is hard-coded to 284 * show() returns available sleep state labels, which may be "mem", "standby",
285 * 'freeze' (Low-Power Idle), 'standby' (Power-On Suspend), 285 * "freeze" and "disk" (hibernation). See Documentation/power/states.txt for a
286 * 'mem' (Suspend-to-RAM), and 'disk' (Suspend-to-Disk). 286 * description of what they mean.
287 * 287 *
288 * store() accepts one of those strings, translates it into the 288 * store() accepts one of those strings, translates it into the proper
289 * proper enumerated value, and initiates a suspend transition. 289 * enumerated value, and initiates a suspend transition.
290 */ 290 */
291static ssize_t state_show(struct kobject *kobj, struct kobj_attribute *attr, 291static ssize_t state_show(struct kobject *kobj, struct kobj_attribute *attr,
292 char *buf) 292 char *buf)
293{ 293{
294 char *s = buf; 294 char *s = buf;
295#ifdef CONFIG_SUSPEND 295#ifdef CONFIG_SUSPEND
296 int i; 296 suspend_state_t i;
297
298 for (i = PM_SUSPEND_MIN; i < PM_SUSPEND_MAX; i++)
299 if (pm_states[i].state)
300 s += sprintf(s,"%s ", pm_states[i].label);
297 301
298 for (i = 0; i < PM_SUSPEND_MAX; i++) {
299 if (pm_states[i] && valid_state(i))
300 s += sprintf(s,"%s ", pm_states[i]);
301 }
302#endif 302#endif
303#ifdef CONFIG_HIBERNATION 303 if (hibernation_available())
304 s += sprintf(s, "%s\n", "disk"); 304 s += sprintf(s, "disk ");
305#else
306 if (s != buf) 305 if (s != buf)
307 /* convert the last space to a newline */ 306 /* convert the last space to a newline */
308 *(s-1) = '\n'; 307 *(s-1) = '\n';
309#endif
310 return (s - buf); 308 return (s - buf);
311} 309}
312 310
@@ -314,7 +312,7 @@ static suspend_state_t decode_state(const char *buf, size_t n)
314{ 312{
315#ifdef CONFIG_SUSPEND 313#ifdef CONFIG_SUSPEND
316 suspend_state_t state = PM_SUSPEND_MIN; 314 suspend_state_t state = PM_SUSPEND_MIN;
317 const char * const *s; 315 struct pm_sleep_state *s;
318#endif 316#endif
319 char *p; 317 char *p;
320 int len; 318 int len;
@@ -328,8 +326,9 @@ static suspend_state_t decode_state(const char *buf, size_t n)
328 326
329#ifdef CONFIG_SUSPEND 327#ifdef CONFIG_SUSPEND
330 for (s = &pm_states[state]; state < PM_SUSPEND_MAX; s++, state++) 328 for (s = &pm_states[state]; state < PM_SUSPEND_MAX; s++, state++)
331 if (*s && len == strlen(*s) && !strncmp(buf, *s, len)) 329 if (s->state && len == strlen(s->label)
332 return state; 330 && !strncmp(buf, s->label, len))
331 return s->state;
333#endif 332#endif
334 333
335 return PM_SUSPEND_ON; 334 return PM_SUSPEND_ON;
@@ -447,8 +446,8 @@ static ssize_t autosleep_show(struct kobject *kobj,
447 446
448#ifdef CONFIG_SUSPEND 447#ifdef CONFIG_SUSPEND
449 if (state < PM_SUSPEND_MAX) 448 if (state < PM_SUSPEND_MAX)
450 return sprintf(buf, "%s\n", valid_state(state) ? 449 return sprintf(buf, "%s\n", pm_states[state].state ?
451 pm_states[state] : "error"); 450 pm_states[state].label : "error");
452#endif 451#endif
453#ifdef CONFIG_HIBERNATION 452#ifdef CONFIG_HIBERNATION
454 return sprintf(buf, "disk\n"); 453 return sprintf(buf, "disk\n");
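A small userspace sketch (an assumption, not part of the patch) of what the reworked state_show() now exposes: only labels whose states are actually valid, plus "disk" when hibernation is available.

	#include <stdio.h>

	int main(void)
	{
		char buf[64];
		FILE *f = fopen("/sys/power/state", "r");

		if (f && fgets(buf, sizeof(buf), f))
			printf("supported sleep states: %s", buf);  /* e.g. "freeze mem disk" */
		if (f)
			fclose(f);
		return 0;
	}
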
diff --git a/kernel/power/power.h b/kernel/power/power.h
index 15f37ea08719..c60f13b5270a 100644
--- a/kernel/power/power.h
+++ b/kernel/power/power.h
@@ -178,17 +178,20 @@ extern void swsusp_show_speed(struct timeval *, struct timeval *,
178 unsigned int, char *); 178 unsigned int, char *);
179 179
180#ifdef CONFIG_SUSPEND 180#ifdef CONFIG_SUSPEND
181struct pm_sleep_state {
182 const char *label;
183 suspend_state_t state;
184};
185
181/* kernel/power/suspend.c */ 186/* kernel/power/suspend.c */
182extern const char *const pm_states[]; 187extern struct pm_sleep_state pm_states[];
183 188
184extern bool valid_state(suspend_state_t state);
185extern int suspend_devices_and_enter(suspend_state_t state); 189extern int suspend_devices_and_enter(suspend_state_t state);
186#else /* !CONFIG_SUSPEND */ 190#else /* !CONFIG_SUSPEND */
187static inline int suspend_devices_and_enter(suspend_state_t state) 191static inline int suspend_devices_and_enter(suspend_state_t state)
188{ 192{
189 return -ENOSYS; 193 return -ENOSYS;
190} 194}
191static inline bool valid_state(suspend_state_t state) { return false; }
192#endif /* !CONFIG_SUSPEND */ 195#endif /* !CONFIG_SUSPEND */
193 196
194#ifdef CONFIG_PM_TEST_SUSPEND 197#ifdef CONFIG_PM_TEST_SUSPEND
diff --git a/kernel/power/process.c b/kernel/power/process.c
index 06ec8869dbf1..4ee194eb524b 100644
--- a/kernel/power/process.c
+++ b/kernel/power/process.c
@@ -17,6 +17,7 @@
17#include <linux/delay.h> 17#include <linux/delay.h>
18#include <linux/workqueue.h> 18#include <linux/workqueue.h>
19#include <linux/kmod.h> 19#include <linux/kmod.h>
20#include <trace/events/power.h>
20 21
21/* 22/*
22 * Timeout for stopping processes 23 * Timeout for stopping processes
@@ -175,6 +176,7 @@ void thaw_processes(void)
175 struct task_struct *g, *p; 176 struct task_struct *g, *p;
176 struct task_struct *curr = current; 177 struct task_struct *curr = current;
177 178
179 trace_suspend_resume(TPS("thaw_processes"), 0, true);
178 if (pm_freezing) 180 if (pm_freezing)
179 atomic_dec(&system_freezing_cnt); 181 atomic_dec(&system_freezing_cnt);
180 pm_freezing = false; 182 pm_freezing = false;
@@ -184,6 +186,7 @@ void thaw_processes(void)
184 186
185 printk("Restarting tasks ... "); 187 printk("Restarting tasks ... ");
186 188
189 __usermodehelper_set_disable_depth(UMH_FREEZING);
187 thaw_workqueues(); 190 thaw_workqueues();
188 191
189 read_lock(&tasklist_lock); 192 read_lock(&tasklist_lock);
@@ -201,6 +204,7 @@ void thaw_processes(void)
201 204
202 schedule(); 205 schedule();
203 printk("done.\n"); 206 printk("done.\n");
207 trace_suspend_resume(TPS("thaw_processes"), 0, false);
204} 208}
205 209
206void thaw_kernel_threads(void) 210void thaw_kernel_threads(void)
diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c
index 8233cd4047d7..ed35a4790afe 100644
--- a/kernel/power/suspend.c
+++ b/kernel/power/suspend.c
@@ -31,13 +31,14 @@
31 31
32#include "power.h" 32#include "power.h"
33 33
34const char *const pm_states[PM_SUSPEND_MAX] = { 34struct pm_sleep_state pm_states[PM_SUSPEND_MAX] = {
35 [PM_SUSPEND_FREEZE] = "freeze", 35 [PM_SUSPEND_FREEZE] = { .label = "freeze", .state = PM_SUSPEND_FREEZE },
36 [PM_SUSPEND_STANDBY] = "standby", 36 [PM_SUSPEND_STANDBY] = { .label = "standby", },
37 [PM_SUSPEND_MEM] = "mem", 37 [PM_SUSPEND_MEM] = { .label = "mem", },
38}; 38};
39 39
40static const struct platform_suspend_ops *suspend_ops; 40static const struct platform_suspend_ops *suspend_ops;
41static const struct platform_freeze_ops *freeze_ops;
41 42
42static bool need_suspend_ops(suspend_state_t state) 43static bool need_suspend_ops(suspend_state_t state)
43{ 44{
@@ -47,6 +48,13 @@ static bool need_suspend_ops(suspend_state_t state)
47static DECLARE_WAIT_QUEUE_HEAD(suspend_freeze_wait_head); 48static DECLARE_WAIT_QUEUE_HEAD(suspend_freeze_wait_head);
48static bool suspend_freeze_wake; 49static bool suspend_freeze_wake;
49 50
51void freeze_set_ops(const struct platform_freeze_ops *ops)
52{
53 lock_system_sleep();
54 freeze_ops = ops;
55 unlock_system_sleep();
56}
57
50static void freeze_begin(void) 58static void freeze_begin(void)
51{ 59{
52 suspend_freeze_wake = false; 60 suspend_freeze_wake = false;
@@ -54,9 +62,11 @@ static void freeze_begin(void)
54 62
55static void freeze_enter(void) 63static void freeze_enter(void)
56{ 64{
65 cpuidle_use_deepest_state(true);
57 cpuidle_resume(); 66 cpuidle_resume();
58 wait_event(suspend_freeze_wait_head, suspend_freeze_wake); 67 wait_event(suspend_freeze_wait_head, suspend_freeze_wake);
59 cpuidle_pause(); 68 cpuidle_pause();
69 cpuidle_use_deepest_state(false);
60} 70}
61 71
62void freeze_wake(void) 72void freeze_wake(void)
@@ -66,42 +76,62 @@ void freeze_wake(void)
66} 76}
67EXPORT_SYMBOL_GPL(freeze_wake); 77EXPORT_SYMBOL_GPL(freeze_wake);
68 78
79static bool valid_state(suspend_state_t state)
80{
81 /*
82 * The PM_SUSPEND_STANDBY and PM_SUSPEND_MEM states need low-level
83 * support and must be reported as valid by the low-level
84 * implementation; no valid() callback implies that none are valid.
85 */
86 return suspend_ops && suspend_ops->valid && suspend_ops->valid(state);
87}
88
89/*
90 * If this is set, the "mem" label always corresponds to the deepest sleep state
91 * available, the "standby" label corresponds to the second deepest sleep state
92 * available (if any), and the "freeze" label corresponds to the remaining
93 * available sleep state (if there is one).
94 */
95static bool relative_states;
96
97static int __init sleep_states_setup(char *str)
98{
99 relative_states = !strncmp(str, "1", 1);
100 if (relative_states) {
101 pm_states[PM_SUSPEND_MEM].state = PM_SUSPEND_FREEZE;
102 pm_states[PM_SUSPEND_FREEZE].state = 0;
103 }
104 return 1;
105}
106
107__setup("relative_sleep_states=", sleep_states_setup);
108
69/** 109/**
70 * suspend_set_ops - Set the global suspend method table. 110 * suspend_set_ops - Set the global suspend method table.
71 * @ops: Suspend operations to use. 111 * @ops: Suspend operations to use.
72 */ 112 */
73void suspend_set_ops(const struct platform_suspend_ops *ops) 113void suspend_set_ops(const struct platform_suspend_ops *ops)
74{ 114{
115 suspend_state_t i;
116 int j = PM_SUSPEND_MAX - 1;
117
75 lock_system_sleep(); 118 lock_system_sleep();
119
76 suspend_ops = ops; 120 suspend_ops = ops;
121 for (i = PM_SUSPEND_MEM; i >= PM_SUSPEND_STANDBY; i--)
122 if (valid_state(i))
123 pm_states[j--].state = i;
124 else if (!relative_states)
125 pm_states[j--].state = 0;
126
127 pm_states[j--].state = PM_SUSPEND_FREEZE;
128 while (j >= PM_SUSPEND_MIN)
129 pm_states[j--].state = 0;
130
77 unlock_system_sleep(); 131 unlock_system_sleep();
78} 132}
79EXPORT_SYMBOL_GPL(suspend_set_ops); 133EXPORT_SYMBOL_GPL(suspend_set_ops);
80 134
81bool valid_state(suspend_state_t state)
82{
83 if (state == PM_SUSPEND_FREEZE) {
84#ifdef CONFIG_PM_DEBUG
85 if (pm_test_level != TEST_NONE &&
86 pm_test_level != TEST_FREEZER &&
87 pm_test_level != TEST_DEVICES &&
88 pm_test_level != TEST_PLATFORM) {
89 printk(KERN_WARNING "Unsupported pm_test mode for "
90 "freeze state, please choose "
91 "none/freezer/devices/platform.\n");
92 return false;
93 }
94#endif
95 return true;
96 }
97 /*
98 * PM_SUSPEND_STANDBY and PM_SUSPEND_MEMORY states need lowlevel
99 * support and need to be valid to the lowlevel
100 * implementation, no valid callback implies that none are valid.
101 */
102 return suspend_ops && suspend_ops->valid && suspend_ops->valid(state);
103}
104
105/** 135/**
106 * suspend_valid_only_mem - Generic memory-only valid callback. 136 * suspend_valid_only_mem - Generic memory-only valid callback.
107 * 137 *
@@ -147,7 +177,9 @@ static int suspend_prepare(suspend_state_t state)
147 if (error) 177 if (error)
148 goto Finish; 178 goto Finish;
149 179
180 trace_suspend_resume(TPS("freeze_processes"), 0, true);
150 error = suspend_freeze_processes(); 181 error = suspend_freeze_processes();
182 trace_suspend_resume(TPS("freeze_processes"), 0, false);
151 if (!error) 183 if (!error)
152 return 0; 184 return 0;
153 185
@@ -210,7 +242,9 @@ static int suspend_enter(suspend_state_t state, bool *wakeup)
210 * all the devices are suspended. 242 * all the devices are suspended.
211 */ 243 */
212 if (state == PM_SUSPEND_FREEZE) { 244 if (state == PM_SUSPEND_FREEZE) {
245 trace_suspend_resume(TPS("machine_suspend"), state, true);
213 freeze_enter(); 246 freeze_enter();
247 trace_suspend_resume(TPS("machine_suspend"), state, false);
214 goto Platform_wake; 248 goto Platform_wake;
215 } 249 }
216 250
@@ -226,7 +260,11 @@ static int suspend_enter(suspend_state_t state, bool *wakeup)
226 if (!error) { 260 if (!error) {
227 *wakeup = pm_wakeup_pending(); 261 *wakeup = pm_wakeup_pending();
228 if (!(suspend_test(TEST_CORE) || *wakeup)) { 262 if (!(suspend_test(TEST_CORE) || *wakeup)) {
263 trace_suspend_resume(TPS("machine_suspend"),
264 state, true);
229 error = suspend_ops->enter(state); 265 error = suspend_ops->enter(state);
266 trace_suspend_resume(TPS("machine_suspend"),
267 state, false);
230 events_check_enabled = false; 268 events_check_enabled = false;
231 } 269 }
232 syscore_resume(); 270 syscore_resume();
@@ -264,11 +302,14 @@ int suspend_devices_and_enter(suspend_state_t state)
264 if (need_suspend_ops(state) && !suspend_ops) 302 if (need_suspend_ops(state) && !suspend_ops)
265 return -ENOSYS; 303 return -ENOSYS;
266 304
267 trace_machine_suspend(state);
268 if (need_suspend_ops(state) && suspend_ops->begin) { 305 if (need_suspend_ops(state) && suspend_ops->begin) {
269 error = suspend_ops->begin(state); 306 error = suspend_ops->begin(state);
270 if (error) 307 if (error)
271 goto Close; 308 goto Close;
309 } else if (state == PM_SUSPEND_FREEZE && freeze_ops && freeze_ops->begin) {
310 error = freeze_ops->begin();
311 if (error)
312 goto Close;
272 } 313 }
273 suspend_console(); 314 suspend_console();
274 suspend_test_start(); 315 suspend_test_start();
@@ -294,7 +335,9 @@ int suspend_devices_and_enter(suspend_state_t state)
294 Close: 335 Close:
295 if (need_suspend_ops(state) && suspend_ops->end) 336 if (need_suspend_ops(state) && suspend_ops->end)
296 suspend_ops->end(); 337 suspend_ops->end();
297 trace_machine_suspend(PWR_EVENT_EXIT); 338 else if (state == PM_SUSPEND_FREEZE && freeze_ops && freeze_ops->end)
339 freeze_ops->end();
340
298 return error; 341 return error;
299 342
300 Recover_platform: 343 Recover_platform:
@@ -328,20 +371,31 @@ static int enter_state(suspend_state_t state)
328{ 371{
329 int error; 372 int error;
330 373
331 if (!valid_state(state)) 374 trace_suspend_resume(TPS("suspend_enter"), state, true);
332 return -ENODEV; 375 if (state == PM_SUSPEND_FREEZE) {
333 376#ifdef CONFIG_PM_DEBUG
377 if (pm_test_level != TEST_NONE && pm_test_level <= TEST_CPUS) {
378 pr_warning("PM: Unsupported test mode for freeze state, "
379 "please choose none/freezer/devices/platform.\n");
380 return -EAGAIN;
381 }
382#endif
383 } else if (!valid_state(state)) {
384 return -EINVAL;
385 }
334 if (!mutex_trylock(&pm_mutex)) 386 if (!mutex_trylock(&pm_mutex))
335 return -EBUSY; 387 return -EBUSY;
336 388
337 if (state == PM_SUSPEND_FREEZE) 389 if (state == PM_SUSPEND_FREEZE)
338 freeze_begin(); 390 freeze_begin();
339 391
392 trace_suspend_resume(TPS("sync_filesystems"), 0, true);
340 printk(KERN_INFO "PM: Syncing filesystems ... "); 393 printk(KERN_INFO "PM: Syncing filesystems ... ");
341 sys_sync(); 394 sys_sync();
342 printk("done.\n"); 395 printk("done.\n");
396 trace_suspend_resume(TPS("sync_filesystems"), 0, false);
343 397
344 pr_debug("PM: Preparing system for %s sleep\n", pm_states[state]); 398 pr_debug("PM: Preparing system for %s sleep\n", pm_states[state].label);
345 error = suspend_prepare(state); 399 error = suspend_prepare(state);
346 if (error) 400 if (error)
347 goto Unlock; 401 goto Unlock;
@@ -349,7 +403,8 @@ static int enter_state(suspend_state_t state)
349 if (suspend_test(TEST_FREEZER)) 403 if (suspend_test(TEST_FREEZER))
350 goto Finish; 404 goto Finish;
351 405
352 pr_debug("PM: Entering %s sleep\n", pm_states[state]); 406 trace_suspend_resume(TPS("suspend_enter"), state, false);
407 pr_debug("PM: Entering %s sleep\n", pm_states[state].label);
353 pm_restrict_gfp_mask(); 408 pm_restrict_gfp_mask();
354 error = suspend_devices_and_enter(state); 409 error = suspend_devices_and_enter(state);
355 pm_restore_gfp_mask(); 410 pm_restore_gfp_mask();
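A hedged sketch of how a platform might use the new freeze_set_ops() hook introduced above; every identifier beginning with example_ is an assumption. The begin()/end() callbacks bracket suspend-to-idle ("freeze") the same way suspend_ops->begin()/end() bracket the other sleep states.

	#include <linux/suspend.h>

	static int example_freeze_begin(void)
	{
		/* e.g. prepare firmware or wakeup hardware for suspend-to-idle */
		return 0;
	}

	static void example_freeze_end(void)
	{
		/* undo whatever example_freeze_begin() set up */
	}

	static const struct platform_freeze_ops example_freeze_ops = {
		.begin	= example_freeze_begin,
		.end	= example_freeze_end,
	};

	/* from platform init code: freeze_set_ops(&example_freeze_ops); */
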
diff --git a/kernel/power/suspend_test.c b/kernel/power/suspend_test.c
index 9b2a1d58558d..269b097e78ea 100644
--- a/kernel/power/suspend_test.c
+++ b/kernel/power/suspend_test.c
@@ -92,13 +92,13 @@ static void __init test_wakealarm(struct rtc_device *rtc, suspend_state_t state)
92 } 92 }
93 93
94 if (state == PM_SUSPEND_MEM) { 94 if (state == PM_SUSPEND_MEM) {
95 printk(info_test, pm_states[state]); 95 printk(info_test, pm_states[state].label);
96 status = pm_suspend(state); 96 status = pm_suspend(state);
97 if (status == -ENODEV) 97 if (status == -ENODEV)
98 state = PM_SUSPEND_STANDBY; 98 state = PM_SUSPEND_STANDBY;
99 } 99 }
100 if (state == PM_SUSPEND_STANDBY) { 100 if (state == PM_SUSPEND_STANDBY) {
101 printk(info_test, pm_states[state]); 101 printk(info_test, pm_states[state].label);
102 status = pm_suspend(state); 102 status = pm_suspend(state);
103 } 103 }
104 if (status < 0) 104 if (status < 0)
@@ -136,18 +136,16 @@ static char warn_bad_state[] __initdata =
136 136
137static int __init setup_test_suspend(char *value) 137static int __init setup_test_suspend(char *value)
138{ 138{
139 unsigned i; 139 suspend_state_t i;
140 140
141 /* "=mem" ==> "mem" */ 141 /* "=mem" ==> "mem" */
142 value++; 142 value++;
143 for (i = 0; i < PM_SUSPEND_MAX; i++) { 143 for (i = PM_SUSPEND_MIN; i < PM_SUSPEND_MAX; i++)
144 if (!pm_states[i]) 144 if (!strcmp(pm_states[i].label, value)) {
145 continue; 145 test_state = pm_states[i].state;
146 if (strcmp(pm_states[i], value) != 0) 146 return 0;
147 continue; 147 }
148 test_state = (__force suspend_state_t) i; 148
149 return 0;
150 }
151 printk(warn_bad_state, value); 149 printk(warn_bad_state, value);
152 return 0; 150 return 0;
153} 151}
@@ -164,8 +162,8 @@ static int __init test_suspend(void)
164 /* PM is initialized by now; is that state testable? */ 162 /* PM is initialized by now; is that state testable? */
165 if (test_state == PM_SUSPEND_ON) 163 if (test_state == PM_SUSPEND_ON)
166 goto done; 164 goto done;
167 if (!valid_state(test_state)) { 165 if (!pm_states[test_state].state) {
168 printk(warn_bad_state, pm_states[test_state]); 166 printk(warn_bad_state, pm_states[test_state].label);
169 goto done; 167 goto done;
170 } 168 }
171 169
diff --git a/kernel/power/swap.c b/kernel/power/swap.c
index 8c9a4819f798..aaa3261dea5d 100644
--- a/kernel/power/swap.c
+++ b/kernel/power/swap.c
@@ -567,7 +567,7 @@ static int lzo_compress_threadfn(void *data)
567 567
568/** 568/**
569 * save_image_lzo - Save the suspend image data compressed with LZO. 569 * save_image_lzo - Save the suspend image data compressed with LZO.
570 * @handle: Swap mam handle to use for saving the image. 570 * @handle: Swap map handle to use for saving the image.
571 * @snapshot: Image to read data from. 571 * @snapshot: Image to read data from.
572 * @nr_to_write: Number of pages to save. 572 * @nr_to_write: Number of pages to save.
573 */ 573 */
diff --git a/kernel/power/user.c b/kernel/power/user.c
index 98d357584cd6..526e8911460a 100644
--- a/kernel/power/user.c
+++ b/kernel/power/user.c
@@ -49,6 +49,9 @@ static int snapshot_open(struct inode *inode, struct file *filp)
49 struct snapshot_data *data; 49 struct snapshot_data *data;
50 int error; 50 int error;
51 51
52 if (!hibernation_available())
53 return -EPERM;
54
52 lock_system_sleep(); 55 lock_system_sleep();
53 56
54 if (!atomic_add_unless(&snapshot_device_available, -1, 0)) { 57 if (!atomic_add_unless(&snapshot_device_available, -1, 0)) {
diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index 7228258b85ec..13e839dbca07 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -54,20 +54,16 @@
54#include "console_cmdline.h" 54#include "console_cmdline.h"
55#include "braille.h" 55#include "braille.h"
56 56
57/* printk's without a loglevel use this.. */
58#define DEFAULT_MESSAGE_LOGLEVEL CONFIG_DEFAULT_MESSAGE_LOGLEVEL
59
60/* We show everything that is MORE important than this.. */
61#define MINIMUM_CONSOLE_LOGLEVEL 1 /* Minimum loglevel we let people use */
62#define DEFAULT_CONSOLE_LOGLEVEL 7 /* anything MORE serious than KERN_DEBUG */
63
64int console_printk[4] = { 57int console_printk[4] = {
65 DEFAULT_CONSOLE_LOGLEVEL, /* console_loglevel */ 58 CONSOLE_LOGLEVEL_DEFAULT, /* console_loglevel */
66 DEFAULT_MESSAGE_LOGLEVEL, /* default_message_loglevel */ 59 DEFAULT_MESSAGE_LOGLEVEL, /* default_message_loglevel */
67 MINIMUM_CONSOLE_LOGLEVEL, /* minimum_console_loglevel */ 60 CONSOLE_LOGLEVEL_MIN, /* minimum_console_loglevel */
68 DEFAULT_CONSOLE_LOGLEVEL, /* default_console_loglevel */ 61 CONSOLE_LOGLEVEL_DEFAULT, /* default_console_loglevel */
69}; 62};
70 63
64/* Deferred messages from sched code are marked by this special level */
65#define SCHED_MESSAGE_LOGLEVEL -2
66
71/* 67/*
72 * Low level drivers may need that to know if they can schedule in 68 * Low level drivers may need that to know if they can schedule in
73 * their unblank() callback or not. So let's export it. 69 * their unblank() callback or not. So let's export it.
@@ -91,6 +87,29 @@ static struct lockdep_map console_lock_dep_map = {
91#endif 87#endif
92 88
93/* 89/*
90 * Helper macros to handle lockdep when locking/unlocking console_sem. We use
91 * macros instead of functions so that _RET_IP_ contains useful information.
92 */
93#define down_console_sem() do { \
94 down(&console_sem);\
95 mutex_acquire(&console_lock_dep_map, 0, 0, _RET_IP_);\
96} while (0)
97
98static int __down_trylock_console_sem(unsigned long ip)
99{
100 if (down_trylock(&console_sem))
101 return 1;
102 mutex_acquire(&console_lock_dep_map, 0, 1, ip);
103 return 0;
104}
105#define down_trylock_console_sem() __down_trylock_console_sem(_RET_IP_)
106
107#define up_console_sem() do { \
108 mutex_release(&console_lock_dep_map, 1, _RET_IP_);\
109 up(&console_sem);\
110} while (0)
111
112/*
94 * This is used for debugging the mess that is the VT code by 113 * This is used for debugging the mess that is the VT code by
95 * keeping track if we have the console semaphore held. It's 114 * keeping track if we have the console semaphore held. It's
96 * definitely not the perfect debug tool (we don't know if _WE_ 115 * definitely not the perfect debug tool (we don't know if _WE_
@@ -206,8 +225,9 @@ struct printk_log {
206}; 225};
207 226
208/* 227/*
209 * The logbuf_lock protects kmsg buffer, indices, counters. It is also 228 * The logbuf_lock protects kmsg buffer, indices, counters. This can be taken
210 * used in interesting ways to provide interlocking in console_unlock(); 229 * within the scheduler's rq lock. It must be released before calling
230 * console_unlock() or anything else that might wake up a process.
211 */ 231 */
212static DEFINE_RAW_SPINLOCK(logbuf_lock); 232static DEFINE_RAW_SPINLOCK(logbuf_lock);
213 233
@@ -250,9 +270,6 @@ static char __log_buf[__LOG_BUF_LEN] __aligned(LOG_ALIGN);
250static char *log_buf = __log_buf; 270static char *log_buf = __log_buf;
251static u32 log_buf_len = __LOG_BUF_LEN; 271static u32 log_buf_len = __LOG_BUF_LEN;
252 272
253/* cpu currently holding logbuf_lock */
254static volatile unsigned int logbuf_cpu = UINT_MAX;
255
256/* human readable text of the record */ 273/* human readable text of the record */
257static char *log_text(const struct printk_log *msg) 274static char *log_text(const struct printk_log *msg)
258{ 275{
@@ -297,34 +314,106 @@ static u32 log_next(u32 idx)
297 return idx + msg->len; 314 return idx + msg->len;
298} 315}
299 316
300/* insert record into the buffer, discard old ones, update heads */ 317/*
301static void log_store(int facility, int level, 318 * Check whether there is enough free space for the given message.
302 enum log_flags flags, u64 ts_nsec, 319 *
303 const char *dict, u16 dict_len, 320 * The same values of first_idx and next_idx mean that the buffer
304 const char *text, u16 text_len) 321 * is either empty or full.
322 *
323 * If the buffer is empty, we must respect the position of the indexes.
324 * They cannot be reset to the beginning of the buffer.
325 */
326static int logbuf_has_space(u32 msg_size, bool empty)
305{ 327{
306 struct printk_log *msg; 328 u32 free;
307 u32 size, pad_len;
308 329
309 /* number of '\0' padding bytes to next message */ 330 if (log_next_idx > log_first_idx || empty)
310 size = sizeof(struct printk_log) + text_len + dict_len; 331 free = max(log_buf_len - log_next_idx, log_first_idx);
311 pad_len = (-size) & (LOG_ALIGN - 1); 332 else
312 size += pad_len; 333 free = log_first_idx - log_next_idx;
334
335 /*
336 * We also need space for an empty header that signals wrapping
337 * of the buffer.
338 */
339 return free >= msg_size + sizeof(struct printk_log);
340}
313 341
342static int log_make_free_space(u32 msg_size)
343{
314 while (log_first_seq < log_next_seq) { 344 while (log_first_seq < log_next_seq) {
315 u32 free; 345 if (logbuf_has_space(msg_size, false))
346 return 0;
347 /* drop old messages until we have enough continuous space */
348 log_first_idx = log_next(log_first_idx);
349 log_first_seq++;
350 }
316 351
317 if (log_next_idx > log_first_idx) 352 /* sequence numbers are equal, so the log buffer is empty */
318 free = max(log_buf_len - log_next_idx, log_first_idx); 353 if (logbuf_has_space(msg_size, true))
319 else 354 return 0;
320 free = log_first_idx - log_next_idx;
321 355
322 if (free >= size + sizeof(struct printk_log)) 356 return -ENOMEM;
323 break; 357}
324 358
325 /* drop old messages until we have enough contiuous space */ 359/* compute the message size including the padding bytes */
326 log_first_idx = log_next(log_first_idx); 360static u32 msg_used_size(u16 text_len, u16 dict_len, u32 *pad_len)
327 log_first_seq++; 361{
362 u32 size;
363
364 size = sizeof(struct printk_log) + text_len + dict_len;
365 *pad_len = (-size) & (LOG_ALIGN - 1);
366 size += *pad_len;
367
368 return size;
369}
370
371/*
372 * Define how much of the log buffer we could take at maximum. The value
373 * must be greater than two. Note that only half of the buffer is available
374 * when the index points to the middle.
375 */
376#define MAX_LOG_TAKE_PART 4
377static const char trunc_msg[] = "<truncated>";
378
379static u32 truncate_msg(u16 *text_len, u16 *trunc_msg_len,
380 u16 *dict_len, u32 *pad_len)
381{
382 /*
383 * The message should not take the whole buffer. Otherwise, it might
384 * get removed too soon.
385 */
386 u32 max_text_len = log_buf_len / MAX_LOG_TAKE_PART;
387 if (*text_len > max_text_len)
388 *text_len = max_text_len;
389 /* enable the warning message */
390 *trunc_msg_len = strlen(trunc_msg);
391 /* disable the "dict" completely */
392 *dict_len = 0;
393 /* compute the size again, this time also counting the warning message */
394 return msg_used_size(*text_len + *trunc_msg_len, 0, pad_len);
395}
396
397/* insert record into the buffer, discard old ones, update heads */
398static int log_store(int facility, int level,
399 enum log_flags flags, u64 ts_nsec,
400 const char *dict, u16 dict_len,
401 const char *text, u16 text_len)
402{
403 struct printk_log *msg;
404 u32 size, pad_len;
405 u16 trunc_msg_len = 0;
406
407 /* number of '\0' padding bytes to next message */
408 size = msg_used_size(text_len, dict_len, &pad_len);
409
410 if (log_make_free_space(size)) {
411 /* truncate the message if it is too long for empty buffer */
412 size = truncate_msg(&text_len, &trunc_msg_len,
413 &dict_len, &pad_len);
414 /* survive when the log buffer is too small for trunc_msg */
415 if (log_make_free_space(size))
416 return 0;
328 } 417 }
329 418
330 if (log_next_idx + size + sizeof(struct printk_log) > log_buf_len) { 419 if (log_next_idx + size + sizeof(struct printk_log) > log_buf_len) {
@@ -341,6 +430,10 @@ static void log_store(int facility, int level,
341 msg = (struct printk_log *)(log_buf + log_next_idx); 430 msg = (struct printk_log *)(log_buf + log_next_idx);
342 memcpy(log_text(msg), text, text_len); 431 memcpy(log_text(msg), text, text_len);
343 msg->text_len = text_len; 432 msg->text_len = text_len;
433 if (trunc_msg_len) {
434 memcpy(log_text(msg) + text_len, trunc_msg, trunc_msg_len);
435 msg->text_len += trunc_msg_len;
436 }
344 memcpy(log_dict(msg), dict, dict_len); 437 memcpy(log_dict(msg), dict, dict_len);
345 msg->dict_len = dict_len; 438 msg->dict_len = dict_len;
346 msg->facility = facility; 439 msg->facility = facility;
@@ -356,6 +449,8 @@ static void log_store(int facility, int level,
356 /* insert message */ 449 /* insert message */
357 log_next_idx += msg->len; 450 log_next_idx += msg->len;
358 log_next_seq++; 451 log_next_seq++;
452
453 return msg->text_len;
359} 454}
360 455
361#ifdef CONFIG_SECURITY_DMESG_RESTRICT 456#ifdef CONFIG_SECURITY_DMESG_RESTRICT
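A self-contained userspace model (a simplification, with assumed constants) of the free-space rule used by logbuf_has_space() above: the usable contiguous region is either the gap after next_idx or the gap before first_idx, and one extra empty header must fit so a wrap marker can be written.

	#include <stdbool.h>
	#include <stdio.h>

	#define LOG_BUF_LEN	4096u	/* stand-in for the real log_buf_len */
	#define HDR_SIZE	16u	/* stand-in for sizeof(struct printk_log) */

	static bool model_has_space(unsigned int first_idx, unsigned int next_idx,
				    bool empty, unsigned int msg_size)
	{
		unsigned int free;

		if (next_idx > first_idx || empty)
			free = (LOG_BUF_LEN - next_idx > first_idx) ?
					LOG_BUF_LEN - next_idx : first_idx;
		else
			free = first_idx - next_idx;

		/* reserve room for the empty header that marks a wrap */
		return free >= msg_size + HDR_SIZE;
	}

	int main(void)
	{
		printf("%d\n", model_has_space(100, 1000, false, 200));	/* 1: fits */
		printf("%d\n", model_has_space(300, 200, false, 200));	/* 0: only 100 bytes free */
		return 0;
	}
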
@@ -1303,7 +1398,10 @@ static void zap_locks(void)
1303 sema_init(&console_sem, 1); 1398 sema_init(&console_sem, 1);
1304} 1399}
1305 1400
1306/* Check if we have any console registered that can be called early in boot. */ 1401/*
1402 * Check if we have any console that is capable of printing while the CPU is
1403 * booting or shutting down. Requires console_sem.
1404 */
1307static int have_callable_console(void) 1405static int have_callable_console(void)
1308{ 1406{
1309 struct console *con; 1407 struct console *con;
@@ -1333,36 +1431,22 @@ static inline int can_use_console(unsigned int cpu)
1333 * messages from a 'printk'. Return true (and with the 1431 * messages from a 'printk'. Return true (and with the
1334 * console_lock held, and 'console_locked' set) if it 1432 * console_lock held, and 'console_locked' set) if it
1335 * is successful, false otherwise. 1433 * is successful, false otherwise.
1336 *
1337 * This gets called with the 'logbuf_lock' spinlock held and
1338 * interrupts disabled. It should return with 'lockbuf_lock'
1339 * released but interrupts still disabled.
1340 */ 1434 */
1341static int console_trylock_for_printk(unsigned int cpu) 1435static int console_trylock_for_printk(unsigned int cpu)
1342 __releases(&logbuf_lock)
1343{ 1436{
1344 int retval = 0, wake = 0; 1437 if (!console_trylock())
1345 1438 return 0;
1346 if (console_trylock()) { 1439 /*
1347 retval = 1; 1440 * If we can't use the console, we need to release the console
1348 1441 * semaphore by hand to avoid flushing the buffer. We need to hold the
1349 /* 1442 * console semaphore in order to do this test safely.
1350 * If we can't use the console, we need to release 1443 */
1351 * the console semaphore by hand to avoid flushing 1444 if (!can_use_console(cpu)) {
1352 * the buffer. We need to hold the console semaphore 1445 console_locked = 0;
1353 * in order to do this test safely. 1446 up_console_sem();
1354 */ 1447 return 0;
1355 if (!can_use_console(cpu)) {
1356 console_locked = 0;
1357 wake = 1;
1358 retval = 0;
1359 }
1360 } 1448 }
1361 logbuf_cpu = UINT_MAX; 1449 return 1;
1362 raw_spin_unlock(&logbuf_lock);
1363 if (wake)
1364 up(&console_sem);
1365 return retval;
1366} 1450}
1367 1451
1368int printk_delay_msec __read_mostly; 1452int printk_delay_msec __read_mostly;
@@ -1490,11 +1574,19 @@ asmlinkage int vprintk_emit(int facility, int level,
1490 static int recursion_bug; 1574 static int recursion_bug;
1491 static char textbuf[LOG_LINE_MAX]; 1575 static char textbuf[LOG_LINE_MAX];
1492 char *text = textbuf; 1576 char *text = textbuf;
1493 size_t text_len; 1577 size_t text_len = 0;
1494 enum log_flags lflags = 0; 1578 enum log_flags lflags = 0;
1495 unsigned long flags; 1579 unsigned long flags;
1496 int this_cpu; 1580 int this_cpu;
1497 int printed_len = 0; 1581 int printed_len = 0;
1582 bool in_sched = false;
1583 /* cpu currently holding logbuf_lock in this function */
1584 static volatile unsigned int logbuf_cpu = UINT_MAX;
1585
1586 if (level == SCHED_MESSAGE_LOGLEVEL) {
1587 level = -1;
1588 in_sched = true;
1589 }
1498 1590
1499 boot_delay_msec(level); 1591 boot_delay_msec(level);
1500 printk_delay(); 1592 printk_delay();
@@ -1530,17 +1622,22 @@ asmlinkage int vprintk_emit(int facility, int level,
1530 "BUG: recent printk recursion!"; 1622 "BUG: recent printk recursion!";
1531 1623
1532 recursion_bug = 0; 1624 recursion_bug = 0;
1533 printed_len += strlen(recursion_msg); 1625 text_len = strlen(recursion_msg);
1534 /* emit KERN_CRIT message */ 1626 /* emit KERN_CRIT message */
1535 log_store(0, 2, LOG_PREFIX|LOG_NEWLINE, 0, 1627 printed_len += log_store(0, 2, LOG_PREFIX|LOG_NEWLINE, 0,
1536 NULL, 0, recursion_msg, printed_len); 1628 NULL, 0, recursion_msg, text_len);
1537 } 1629 }
1538 1630
1539 /* 1631 /*
1540 * The printf needs to come first; we need the syslog 1632 * The printf needs to come first; we need the syslog
1541 * prefix which might be passed-in as a parameter. 1633 * prefix which might be passed-in as a parameter.
1542 */ 1634 */
1543 text_len = vscnprintf(text, sizeof(textbuf), fmt, args); 1635 if (in_sched)
1636 text_len = scnprintf(text, sizeof(textbuf),
1637 KERN_WARNING "[sched_delayed] ");
1638
1639 text_len += vscnprintf(text + text_len,
1640 sizeof(textbuf) - text_len, fmt, args);
1544 1641
1545 /* mark and strip a trailing newline */ 1642 /* mark and strip a trailing newline */
1546 if (text_len && text[text_len-1] == '\n') { 1643 if (text_len && text[text_len-1] == '\n') {
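Annotation: for scheduler-deferred messages the line is now assembled in two steps — a fixed KERN_WARNING "[sched_delayed] " prefix is written with scnprintf() first, and the caller's format is appended behind it with vscnprintf(). A self-contained userspace sketch of that two-step formatting (using snprintf()/vsnprintf(); the kernel's scnprintf() differs only in capping the return value at what was actually stored):

    #include <stdarg.h>
    #include <stdio.h>

    /* Illustrative helper: fixed prefix first, then the caller's format. */
    static int emit_prefixed(char *buf, size_t size, const char *fmt, ...)
    {
            va_list args;
            int len;

            len = snprintf(buf, size, "[sched_delayed] ");
            va_start(args, fmt);
            len += vsnprintf(buf + len, size - len, fmt, args);
            va_end(args);
            return len;
    }

    int main(void)
    {
            char buf[128];
            emit_prefixed(buf, sizeof(buf), "runnable tasks: %d\n", 3);
            fputs(buf, stdout);
            return 0;
    }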
@@ -1586,9 +1683,12 @@ asmlinkage int vprintk_emit(int facility, int level,
1586 cont_flush(LOG_NEWLINE); 1683 cont_flush(LOG_NEWLINE);
1587 1684
1588 /* buffer line if possible, otherwise store it right away */ 1685 /* buffer line if possible, otherwise store it right away */
1589 if (!cont_add(facility, level, text, text_len)) 1686 if (cont_add(facility, level, text, text_len))
1590 log_store(facility, level, lflags | LOG_CONT, 0, 1687 printed_len += text_len;
1591 dict, dictlen, text, text_len); 1688 else
1689 printed_len += log_store(facility, level,
1690 lflags | LOG_CONT, 0,
1691 dict, dictlen, text, text_len);
1592 } else { 1692 } else {
1593 bool stored = false; 1693 bool stored = false;
1594 1694
@@ -1607,27 +1707,30 @@ asmlinkage int vprintk_emit(int facility, int level,
1607 cont_flush(LOG_NEWLINE); 1707 cont_flush(LOG_NEWLINE);
1608 } 1708 }
1609 1709
1610 if (!stored) 1710 if (stored)
1611 log_store(facility, level, lflags, 0, 1711 printed_len += text_len;
1612 dict, dictlen, text, text_len); 1712 else
1713 printed_len += log_store(facility, level, lflags, 0,
1714 dict, dictlen, text, text_len);
1613 } 1715 }
1614 printed_len += text_len;
1615 1716
1616 /* 1717 logbuf_cpu = UINT_MAX;
1617 * Try to acquire and then immediately release the console semaphore. 1718 raw_spin_unlock(&logbuf_lock);
1618 * The release will print out buffers and wake up /dev/kmsg and syslog() 1719
1619 * users. 1720 /* If called from the scheduler, we can not call up(). */
1620 * 1721 if (!in_sched) {
1621 * The console_trylock_for_printk() function will release 'logbuf_lock' 1722 /*
1622 * regardless of whether it actually gets the console semaphore or not. 1723 * Try to acquire and then immediately release the console
1623 */ 1724 * semaphore. The release will print out buffers and wake up
1624 if (console_trylock_for_printk(this_cpu)) 1725 * /dev/kmsg and syslog() users.
1625 console_unlock(); 1726 */
1727 if (console_trylock_for_printk(this_cpu))
1728 console_unlock();
1729 }
1626 1730
1627 lockdep_on(); 1731 lockdep_on();
1628out_restore_irqs: 1732out_restore_irqs:
1629 local_irq_restore(flags); 1733 local_irq_restore(flags);
1630
1631 return printed_len; 1734 return printed_len;
1632} 1735}
1633EXPORT_SYMBOL(vprintk_emit); 1736EXPORT_SYMBOL(vprintk_emit);
@@ -1882,16 +1985,14 @@ void suspend_console(void)
1882 printk("Suspending console(s) (use no_console_suspend to debug)\n"); 1985 printk("Suspending console(s) (use no_console_suspend to debug)\n");
1883 console_lock(); 1986 console_lock();
1884 console_suspended = 1; 1987 console_suspended = 1;
1885 up(&console_sem); 1988 up_console_sem();
1886 mutex_release(&console_lock_dep_map, 1, _RET_IP_);
1887} 1989}
1888 1990
1889void resume_console(void) 1991void resume_console(void)
1890{ 1992{
1891 if (!console_suspend_enabled) 1993 if (!console_suspend_enabled)
1892 return; 1994 return;
1893 down(&console_sem); 1995 down_console_sem();
1894 mutex_acquire(&console_lock_dep_map, 0, 0, _RET_IP_);
1895 console_suspended = 0; 1996 console_suspended = 0;
1896 console_unlock(); 1997 console_unlock();
1897} 1998}
@@ -1933,12 +2034,11 @@ void console_lock(void)
1933{ 2034{
1934 might_sleep(); 2035 might_sleep();
1935 2036
1936 down(&console_sem); 2037 down_console_sem();
1937 if (console_suspended) 2038 if (console_suspended)
1938 return; 2039 return;
1939 console_locked = 1; 2040 console_locked = 1;
1940 console_may_schedule = 1; 2041 console_may_schedule = 1;
1941 mutex_acquire(&console_lock_dep_map, 0, 0, _RET_IP_);
1942} 2042}
1943EXPORT_SYMBOL(console_lock); 2043EXPORT_SYMBOL(console_lock);
1944 2044
@@ -1952,15 +2052,14 @@ EXPORT_SYMBOL(console_lock);
1952 */ 2052 */
1953int console_trylock(void) 2053int console_trylock(void)
1954{ 2054{
1955 if (down_trylock(&console_sem)) 2055 if (down_trylock_console_sem())
1956 return 0; 2056 return 0;
1957 if (console_suspended) { 2057 if (console_suspended) {
1958 up(&console_sem); 2058 up_console_sem();
1959 return 0; 2059 return 0;
1960 } 2060 }
1961 console_locked = 1; 2061 console_locked = 1;
1962 console_may_schedule = 0; 2062 console_may_schedule = 0;
1963 mutex_acquire(&console_lock_dep_map, 0, 1, _RET_IP_);
1964 return 1; 2063 return 1;
1965} 2064}
1966EXPORT_SYMBOL(console_trylock); 2065EXPORT_SYMBOL(console_trylock);
@@ -2022,7 +2121,7 @@ void console_unlock(void)
2022 bool retry; 2121 bool retry;
2023 2122
2024 if (console_suspended) { 2123 if (console_suspended) {
2025 up(&console_sem); 2124 up_console_sem();
2026 return; 2125 return;
2027 } 2126 }
2028 2127
@@ -2043,10 +2142,15 @@ again:
2043 } 2142 }
2044 2143
2045 if (console_seq < log_first_seq) { 2144 if (console_seq < log_first_seq) {
2145 len = sprintf(text, "** %u printk messages dropped ** ",
2146 (unsigned)(log_first_seq - console_seq));
2147
2046 /* messages are gone, move to first one */ 2148 /* messages are gone, move to first one */
2047 console_seq = log_first_seq; 2149 console_seq = log_first_seq;
2048 console_idx = log_first_idx; 2150 console_idx = log_first_idx;
2049 console_prev = 0; 2151 console_prev = 0;
2152 } else {
2153 len = 0;
2050 } 2154 }
2051skip: 2155skip:
2052 if (console_seq == log_next_seq) 2156 if (console_seq == log_next_seq)
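Annotation: console_unlock() now detects overwritten records purely from sequence numbers — if console_seq has fallen behind log_first_seq, the difference is exactly the number of records lost, and a one-line notice is formatted into text ahead of the next message. A small standalone sketch of that bookkeeping (types and values are illustrative):

    #include <stdio.h>

    int main(void)
    {
            unsigned long long console_seq = 100;   /* next record this reader wants   */
            unsigned long long log_first_seq = 117; /* oldest record still in the ring */

            if (console_seq < log_first_seq) {
                    printf("** %llu printk messages dropped ** ",
                           log_first_seq - console_seq);
                    console_seq = log_first_seq;    /* resync to the oldest survivor */
            }
            printf("next record to print is %llu\n", console_seq);
            return 0;
    }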
@@ -2071,8 +2175,8 @@ skip:
2071 } 2175 }
2072 2176
2073 level = msg->level; 2177 level = msg->level;
2074 len = msg_print_text(msg, console_prev, false, 2178 len += msg_print_text(msg, console_prev, false,
2075 text, sizeof(text)); 2179 text + len, sizeof(text) - len);
2076 console_idx = log_next(console_idx); 2180 console_idx = log_next(console_idx);
2077 console_seq++; 2181 console_seq++;
2078 console_prev = msg->flags; 2182 console_prev = msg->flags;
@@ -2084,7 +2188,6 @@ skip:
2084 local_irq_restore(flags); 2188 local_irq_restore(flags);
2085 } 2189 }
2086 console_locked = 0; 2190 console_locked = 0;
2087 mutex_release(&console_lock_dep_map, 1, _RET_IP_);
2088 2191
2089 /* Release the exclusive_console once it is used */ 2192 /* Release the exclusive_console once it is used */
2090 if (unlikely(exclusive_console)) 2193 if (unlikely(exclusive_console))
@@ -2092,7 +2195,7 @@ skip:
2092 2195
2093 raw_spin_unlock(&logbuf_lock); 2196 raw_spin_unlock(&logbuf_lock);
2094 2197
2095 up(&console_sem); 2198 up_console_sem();
2096 2199
2097 /* 2200 /*
2098 * Someone could have filled up the buffer again, so re-check if there's 2201 * Someone could have filled up the buffer again, so re-check if there's
@@ -2137,7 +2240,7 @@ void console_unblank(void)
2137 * oops_in_progress is set to 1.. 2240 * oops_in_progress is set to 1..
2138 */ 2241 */
2139 if (oops_in_progress) { 2242 if (oops_in_progress) {
2140 if (down_trylock(&console_sem) != 0) 2243 if (down_trylock_console_sem() != 0)
2141 return; 2244 return;
2142 } else 2245 } else
2143 console_lock(); 2246 console_lock();
@@ -2413,6 +2516,7 @@ int unregister_console(struct console *console)
2413 if (console_drivers != NULL && console->flags & CON_CONSDEV) 2516 if (console_drivers != NULL && console->flags & CON_CONSDEV)
2414 console_drivers->flags |= CON_CONSDEV; 2517 console_drivers->flags |= CON_CONSDEV;
2415 2518
2519 console->flags &= ~CON_ENABLED;
2416 console_unlock(); 2520 console_unlock();
2417 console_sysfs_notify(); 2521 console_sysfs_notify();
2418 return res; 2522 return res;
@@ -2437,21 +2541,19 @@ late_initcall(printk_late_init);
2437/* 2541/*
2438 * Delayed printk version, for scheduler-internal messages: 2542 * Delayed printk version, for scheduler-internal messages:
2439 */ 2543 */
2440#define PRINTK_BUF_SIZE 512
2441
2442#define PRINTK_PENDING_WAKEUP 0x01 2544#define PRINTK_PENDING_WAKEUP 0x01
2443#define PRINTK_PENDING_SCHED 0x02 2545#define PRINTK_PENDING_OUTPUT 0x02
2444 2546
2445static DEFINE_PER_CPU(int, printk_pending); 2547static DEFINE_PER_CPU(int, printk_pending);
2446static DEFINE_PER_CPU(char [PRINTK_BUF_SIZE], printk_sched_buf);
2447 2548
2448static void wake_up_klogd_work_func(struct irq_work *irq_work) 2549static void wake_up_klogd_work_func(struct irq_work *irq_work)
2449{ 2550{
2450 int pending = __this_cpu_xchg(printk_pending, 0); 2551 int pending = __this_cpu_xchg(printk_pending, 0);
2451 2552
2452 if (pending & PRINTK_PENDING_SCHED) { 2553 if (pending & PRINTK_PENDING_OUTPUT) {
2453 char *buf = __get_cpu_var(printk_sched_buf); 2554 /* If trylock fails, someone else is doing the printing */
2454 pr_warn("[sched_delayed] %s", buf); 2555 if (console_trylock())
2556 console_unlock();
2455 } 2557 }
2456 2558
2457 if (pending & PRINTK_PENDING_WAKEUP) 2559 if (pending & PRINTK_PENDING_WAKEUP)
@@ -2473,23 +2575,19 @@ void wake_up_klogd(void)
2473 preempt_enable(); 2575 preempt_enable();
2474} 2576}
2475 2577
2476int printk_sched(const char *fmt, ...) 2578int printk_deferred(const char *fmt, ...)
2477{ 2579{
2478 unsigned long flags;
2479 va_list args; 2580 va_list args;
2480 char *buf;
2481 int r; 2581 int r;
2482 2582
2483 local_irq_save(flags); 2583 preempt_disable();
2484 buf = __get_cpu_var(printk_sched_buf);
2485
2486 va_start(args, fmt); 2584 va_start(args, fmt);
2487 r = vsnprintf(buf, PRINTK_BUF_SIZE, fmt, args); 2585 r = vprintk_emit(0, SCHED_MESSAGE_LOGLEVEL, NULL, 0, fmt, args);
2488 va_end(args); 2586 va_end(args);
2489 2587
2490 __this_cpu_or(printk_pending, PRINTK_PENDING_SCHED); 2588 __this_cpu_or(printk_pending, PRINTK_PENDING_OUTPUT);
2491 irq_work_queue(&__get_cpu_var(wake_up_klogd_work)); 2589 irq_work_queue(&__get_cpu_var(wake_up_klogd_work));
2492 local_irq_restore(flags); 2590 preempt_enable();
2493 2591
2494 return r; 2592 return r;
2495} 2593}
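Annotation: taken together, the printk.c hunks turn the old printk_sched() (a private 512-byte per-CPU buffer printed later) into printk_deferred(): the message is formatted and stored into the main log buffer right away via vprintk_emit() at SCHED_MESSAGE_LOGLEVEL, and only the console flush is deferred through PRINTK_PENDING_OUTPUT plus an irq_work whose handler does console_trylock()/console_unlock() once it is safe. A condensed userspace sketch of that "store now, flush later" pattern (names and the stdio stand-ins are illustrative):

    #include <stdbool.h>
    #include <stdio.h>

    static bool output_pending;             /* PRINTK_PENDING_OUTPUT analogue */

    static void printk_deferred_sketch(const char *msg)
    {
            fprintf(stderr, "stored to log buffer: %s\n", msg);
            output_pending = true;          /* flush later, not from this context */
            /* the kernel queues an irq_work here rather than calling the worker */
    }

    static void flush_worker(void)          /* wake_up_klogd_work_func() analogue */
    {
            if (output_pending) {
                    output_pending = false;
                    puts("flushing the console now that locking allows it");
            }
    }

    int main(void)
    {
            printk_deferred_sketch("message from a context that cannot take console_sem");
            flush_worker();
            return 0;
    }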
diff --git a/kernel/profile.c b/kernel/profile.c
index cb980f0c731b..54bf5ba26420 100644
--- a/kernel/profile.c
+++ b/kernel/profile.c
@@ -52,9 +52,9 @@ static DEFINE_MUTEX(profile_flip_mutex);
52 52
53int profile_setup(char *str) 53int profile_setup(char *str)
54{ 54{
55 static char schedstr[] = "schedule"; 55 static const char schedstr[] = "schedule";
56 static char sleepstr[] = "sleep"; 56 static const char sleepstr[] = "sleep";
57 static char kvmstr[] = "kvm"; 57 static const char kvmstr[] = "kvm";
58 int par; 58 int par;
59 59
60 if (!strncmp(str, sleepstr, strlen(sleepstr))) { 60 if (!strncmp(str, sleepstr, strlen(sleepstr))) {
@@ -64,12 +64,10 @@ int profile_setup(char *str)
64 str += strlen(sleepstr) + 1; 64 str += strlen(sleepstr) + 1;
65 if (get_option(&str, &par)) 65 if (get_option(&str, &par))
66 prof_shift = par; 66 prof_shift = par;
67 printk(KERN_INFO 67 pr_info("kernel sleep profiling enabled (shift: %ld)\n",
68 "kernel sleep profiling enabled (shift: %ld)\n",
69 prof_shift); 68 prof_shift);
70#else 69#else
71 printk(KERN_WARNING 70 pr_warn("kernel sleep profiling requires CONFIG_SCHEDSTATS\n");
72 "kernel sleep profiling requires CONFIG_SCHEDSTATS\n");
73#endif /* CONFIG_SCHEDSTATS */ 71#endif /* CONFIG_SCHEDSTATS */
74 } else if (!strncmp(str, schedstr, strlen(schedstr))) { 72 } else if (!strncmp(str, schedstr, strlen(schedstr))) {
75 prof_on = SCHED_PROFILING; 73 prof_on = SCHED_PROFILING;
@@ -77,8 +75,7 @@ int profile_setup(char *str)
77 str += strlen(schedstr) + 1; 75 str += strlen(schedstr) + 1;
78 if (get_option(&str, &par)) 76 if (get_option(&str, &par))
79 prof_shift = par; 77 prof_shift = par;
80 printk(KERN_INFO 78 pr_info("kernel schedule profiling enabled (shift: %ld)\n",
81 "kernel schedule profiling enabled (shift: %ld)\n",
82 prof_shift); 79 prof_shift);
83 } else if (!strncmp(str, kvmstr, strlen(kvmstr))) { 80 } else if (!strncmp(str, kvmstr, strlen(kvmstr))) {
84 prof_on = KVM_PROFILING; 81 prof_on = KVM_PROFILING;
@@ -86,13 +83,12 @@ int profile_setup(char *str)
86 str += strlen(kvmstr) + 1; 83 str += strlen(kvmstr) + 1;
87 if (get_option(&str, &par)) 84 if (get_option(&str, &par))
88 prof_shift = par; 85 prof_shift = par;
89 printk(KERN_INFO 86 pr_info("kernel KVM profiling enabled (shift: %ld)\n",
90 "kernel KVM profiling enabled (shift: %ld)\n",
91 prof_shift); 87 prof_shift);
92 } else if (get_option(&str, &par)) { 88 } else if (get_option(&str, &par)) {
93 prof_shift = par; 89 prof_shift = par;
94 prof_on = CPU_PROFILING; 90 prof_on = CPU_PROFILING;
95 printk(KERN_INFO "kernel profiling enabled (shift: %ld)\n", 91 pr_info("kernel profiling enabled (shift: %ld)\n",
96 prof_shift); 92 prof_shift);
97 } 93 }
98 return 1; 94 return 1;
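Annotation: the profile.c hunks are cosmetic — each raw printk(KERN_INFO ...)/printk(KERN_WARNING ...) becomes the matching pr_info()/pr_warn() helper and the option-name strings gain const. In the kernel these helpers are thin macros, roughly of the form below (the exact definitions live in include/linux/printk.h):

    #define pr_info(fmt, ...)  printk(KERN_INFO pr_fmt(fmt), ##__VA_ARGS__)
    #define pr_warn(fmt, ...)  printk(KERN_WARNING pr_fmt(fmt), ##__VA_ARGS__)

so the change keeps the same log levels while picking up any pr_fmt() prefix the file might define.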
diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c
index bd30bc61bc05..7fa34f86e5ba 100644
--- a/kernel/rcu/rcutorture.c
+++ b/kernel/rcu/rcutorture.c
@@ -58,9 +58,11 @@ torture_param(int, fqs_duration, 0,
58 "Duration of fqs bursts (us), 0 to disable"); 58 "Duration of fqs bursts (us), 0 to disable");
59torture_param(int, fqs_holdoff, 0, "Holdoff time within fqs bursts (us)"); 59torture_param(int, fqs_holdoff, 0, "Holdoff time within fqs bursts (us)");
60torture_param(int, fqs_stutter, 3, "Wait time between fqs bursts (s)"); 60torture_param(int, fqs_stutter, 3, "Wait time between fqs bursts (s)");
61torture_param(bool, gp_cond, false, "Use conditional/async GP wait primitives");
61torture_param(bool, gp_exp, false, "Use expedited GP wait primitives"); 62torture_param(bool, gp_exp, false, "Use expedited GP wait primitives");
62torture_param(bool, gp_normal, false, 63torture_param(bool, gp_normal, false,
63 "Use normal (non-expedited) GP wait primitives"); 64 "Use normal (non-expedited) GP wait primitives");
65torture_param(bool, gp_sync, false, "Use synchronous GP wait primitives");
64torture_param(int, irqreader, 1, "Allow RCU readers from irq handlers"); 66torture_param(int, irqreader, 1, "Allow RCU readers from irq handlers");
65torture_param(int, n_barrier_cbs, 0, 67torture_param(int, n_barrier_cbs, 0,
66 "# of callbacks/kthreads for barrier testing"); 68 "# of callbacks/kthreads for barrier testing");
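Annotation: gp_cond and gp_sync are new rcutorture module parameters, so specific grace-period wait primitives can now be requested directly — for example by loading the module with something like "modprobe rcutorture gp_cond=1 gp_sync=1", or booting a built-in test with "rcutorture.gp_cond=1 rcutorture.gp_sync=1" (assuming the usual module_param naming that torture_param() provides; the rcutorture test scripts may set these differently).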
@@ -138,6 +140,18 @@ static long n_barrier_attempts;
138static long n_barrier_successes; 140static long n_barrier_successes;
139static struct list_head rcu_torture_removed; 141static struct list_head rcu_torture_removed;
140 142
143static int rcu_torture_writer_state;
144#define RTWS_FIXED_DELAY 0
145#define RTWS_DELAY 1
146#define RTWS_REPLACE 2
147#define RTWS_DEF_FREE 3
148#define RTWS_EXP_SYNC 4
149#define RTWS_COND_GET 5
150#define RTWS_COND_SYNC 6
151#define RTWS_SYNC 7
152#define RTWS_STUTTER 8
153#define RTWS_STOPPING 9
154
141#if defined(MODULE) || defined(CONFIG_RCU_TORTURE_TEST_RUNNABLE) 155#if defined(MODULE) || defined(CONFIG_RCU_TORTURE_TEST_RUNNABLE)
142#define RCUTORTURE_RUNNABLE_INIT 1 156#define RCUTORTURE_RUNNABLE_INIT 1
143#else 157#else
@@ -214,6 +228,7 @@ rcu_torture_free(struct rcu_torture *p)
214 */ 228 */
215 229
216struct rcu_torture_ops { 230struct rcu_torture_ops {
231 int ttype;
217 void (*init)(void); 232 void (*init)(void);
218 int (*readlock)(void); 233 int (*readlock)(void);
219 void (*read_delay)(struct torture_random_state *rrsp); 234 void (*read_delay)(struct torture_random_state *rrsp);
@@ -222,6 +237,8 @@ struct rcu_torture_ops {
222 void (*deferred_free)(struct rcu_torture *p); 237 void (*deferred_free)(struct rcu_torture *p);
223 void (*sync)(void); 238 void (*sync)(void);
224 void (*exp_sync)(void); 239 void (*exp_sync)(void);
240 unsigned long (*get_state)(void);
241 void (*cond_sync)(unsigned long oldstate);
225 void (*call)(struct rcu_head *head, void (*func)(struct rcu_head *rcu)); 242 void (*call)(struct rcu_head *head, void (*func)(struct rcu_head *rcu));
226 void (*cb_barrier)(void); 243 void (*cb_barrier)(void);
227 void (*fqs)(void); 244 void (*fqs)(void);
@@ -273,10 +290,48 @@ static int rcu_torture_completed(void)
273 return rcu_batches_completed(); 290 return rcu_batches_completed();
274} 291}
275 292
293/*
294 * Update callback in the pipe. This should be invoked after a grace period.
295 */
296static bool
297rcu_torture_pipe_update_one(struct rcu_torture *rp)
298{
299 int i;
300
301 i = rp->rtort_pipe_count;
302 if (i > RCU_TORTURE_PIPE_LEN)
303 i = RCU_TORTURE_PIPE_LEN;
304 atomic_inc(&rcu_torture_wcount[i]);
305 if (++rp->rtort_pipe_count >= RCU_TORTURE_PIPE_LEN) {
306 rp->rtort_mbtest = 0;
307 return true;
308 }
309 return false;
310}
311
312/*
313 * Update all callbacks in the pipe. Suitable for synchronous grace-period
314 * primitives.
315 */
316static void
317rcu_torture_pipe_update(struct rcu_torture *old_rp)
318{
319 struct rcu_torture *rp;
320 struct rcu_torture *rp1;
321
322 if (old_rp)
323 list_add(&old_rp->rtort_free, &rcu_torture_removed);
324 list_for_each_entry_safe(rp, rp1, &rcu_torture_removed, rtort_free) {
325 if (rcu_torture_pipe_update_one(rp)) {
326 list_del(&rp->rtort_free);
327 rcu_torture_free(rp);
328 }
329 }
330}
331
276static void 332static void
277rcu_torture_cb(struct rcu_head *p) 333rcu_torture_cb(struct rcu_head *p)
278{ 334{
279 int i;
280 struct rcu_torture *rp = container_of(p, struct rcu_torture, rtort_rcu); 335 struct rcu_torture *rp = container_of(p, struct rcu_torture, rtort_rcu);
281 336
282 if (torture_must_stop_irq()) { 337 if (torture_must_stop_irq()) {
@@ -284,16 +339,10 @@ rcu_torture_cb(struct rcu_head *p)
284 /* The next initialization will pick up the pieces. */ 339 /* The next initialization will pick up the pieces. */
285 return; 340 return;
286 } 341 }
287 i = rp->rtort_pipe_count; 342 if (rcu_torture_pipe_update_one(rp))
288 if (i > RCU_TORTURE_PIPE_LEN)
289 i = RCU_TORTURE_PIPE_LEN;
290 atomic_inc(&rcu_torture_wcount[i]);
291 if (++rp->rtort_pipe_count >= RCU_TORTURE_PIPE_LEN) {
292 rp->rtort_mbtest = 0;
293 rcu_torture_free(rp); 343 rcu_torture_free(rp);
294 } else { 344 else
295 cur_ops->deferred_free(rp); 345 cur_ops->deferred_free(rp);
296 }
297} 346}
298 347
299static int rcu_no_completed(void) 348static int rcu_no_completed(void)
@@ -312,6 +361,7 @@ static void rcu_sync_torture_init(void)
312} 361}
313 362
314static struct rcu_torture_ops rcu_ops = { 363static struct rcu_torture_ops rcu_ops = {
364 .ttype = RCU_FLAVOR,
315 .init = rcu_sync_torture_init, 365 .init = rcu_sync_torture_init,
316 .readlock = rcu_torture_read_lock, 366 .readlock = rcu_torture_read_lock,
317 .read_delay = rcu_read_delay, 367 .read_delay = rcu_read_delay,
@@ -320,6 +370,8 @@ static struct rcu_torture_ops rcu_ops = {
320 .deferred_free = rcu_torture_deferred_free, 370 .deferred_free = rcu_torture_deferred_free,
321 .sync = synchronize_rcu, 371 .sync = synchronize_rcu,
322 .exp_sync = synchronize_rcu_expedited, 372 .exp_sync = synchronize_rcu_expedited,
373 .get_state = get_state_synchronize_rcu,
374 .cond_sync = cond_synchronize_rcu,
323 .call = call_rcu, 375 .call = call_rcu,
324 .cb_barrier = rcu_barrier, 376 .cb_barrier = rcu_barrier,
325 .fqs = rcu_force_quiescent_state, 377 .fqs = rcu_force_quiescent_state,
@@ -355,6 +407,7 @@ static void rcu_bh_torture_deferred_free(struct rcu_torture *p)
355} 407}
356 408
357static struct rcu_torture_ops rcu_bh_ops = { 409static struct rcu_torture_ops rcu_bh_ops = {
410 .ttype = RCU_BH_FLAVOR,
358 .init = rcu_sync_torture_init, 411 .init = rcu_sync_torture_init,
359 .readlock = rcu_bh_torture_read_lock, 412 .readlock = rcu_bh_torture_read_lock,
360 .read_delay = rcu_read_delay, /* just reuse rcu's version. */ 413 .read_delay = rcu_read_delay, /* just reuse rcu's version. */
@@ -397,6 +450,7 @@ call_rcu_busted(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
397} 450}
398 451
399static struct rcu_torture_ops rcu_busted_ops = { 452static struct rcu_torture_ops rcu_busted_ops = {
453 .ttype = INVALID_RCU_FLAVOR,
400 .init = rcu_sync_torture_init, 454 .init = rcu_sync_torture_init,
401 .readlock = rcu_torture_read_lock, 455 .readlock = rcu_torture_read_lock,
402 .read_delay = rcu_read_delay, /* just reuse rcu's version. */ 456 .read_delay = rcu_read_delay, /* just reuse rcu's version. */
@@ -479,9 +533,11 @@ static void srcu_torture_stats(char *page)
479 page += sprintf(page, "%s%s per-CPU(idx=%d):", 533 page += sprintf(page, "%s%s per-CPU(idx=%d):",
480 torture_type, TORTURE_FLAG, idx); 534 torture_type, TORTURE_FLAG, idx);
481 for_each_possible_cpu(cpu) { 535 for_each_possible_cpu(cpu) {
482 page += sprintf(page, " %d(%lu,%lu)", cpu, 536 long c0, c1;
483 per_cpu_ptr(srcu_ctl.per_cpu_ref, cpu)->c[!idx], 537
484 per_cpu_ptr(srcu_ctl.per_cpu_ref, cpu)->c[idx]); 538 c0 = (long)per_cpu_ptr(srcu_ctl.per_cpu_ref, cpu)->c[!idx];
539 c1 = (long)per_cpu_ptr(srcu_ctl.per_cpu_ref, cpu)->c[idx];
540 page += sprintf(page, " %d(%ld,%ld)", cpu, c0, c1);
485 } 541 }
486 sprintf(page, "\n"); 542 sprintf(page, "\n");
487} 543}
@@ -492,6 +548,7 @@ static void srcu_torture_synchronize_expedited(void)
492} 548}
493 549
494static struct rcu_torture_ops srcu_ops = { 550static struct rcu_torture_ops srcu_ops = {
551 .ttype = SRCU_FLAVOR,
495 .init = rcu_sync_torture_init, 552 .init = rcu_sync_torture_init,
496 .readlock = srcu_torture_read_lock, 553 .readlock = srcu_torture_read_lock,
497 .read_delay = srcu_read_delay, 554 .read_delay = srcu_read_delay,
@@ -527,6 +584,7 @@ static void rcu_sched_torture_deferred_free(struct rcu_torture *p)
527} 584}
528 585
529static struct rcu_torture_ops sched_ops = { 586static struct rcu_torture_ops sched_ops = {
587 .ttype = RCU_SCHED_FLAVOR,
530 .init = rcu_sync_torture_init, 588 .init = rcu_sync_torture_init,
531 .readlock = sched_torture_read_lock, 589 .readlock = sched_torture_read_lock,
532 .read_delay = rcu_read_delay, /* just reuse rcu's version. */ 590 .read_delay = rcu_read_delay, /* just reuse rcu's version. */
@@ -688,23 +746,59 @@ rcu_torture_fqs(void *arg)
688static int 746static int
689rcu_torture_writer(void *arg) 747rcu_torture_writer(void *arg)
690{ 748{
691 bool exp; 749 unsigned long gp_snap;
750 bool gp_cond1 = gp_cond, gp_exp1 = gp_exp, gp_normal1 = gp_normal;
751 bool gp_sync1 = gp_sync;
692 int i; 752 int i;
693 struct rcu_torture *rp; 753 struct rcu_torture *rp;
694 struct rcu_torture *rp1;
695 struct rcu_torture *old_rp; 754 struct rcu_torture *old_rp;
696 static DEFINE_TORTURE_RANDOM(rand); 755 static DEFINE_TORTURE_RANDOM(rand);
756 int synctype[] = { RTWS_DEF_FREE, RTWS_EXP_SYNC,
757 RTWS_COND_GET, RTWS_SYNC };
758 int nsynctypes = 0;
697 759
698 VERBOSE_TOROUT_STRING("rcu_torture_writer task started"); 760 VERBOSE_TOROUT_STRING("rcu_torture_writer task started");
699 set_user_nice(current, MAX_NICE); 761
762 /* Initialize synctype[] array. If none set, take default. */
763 if (!gp_cond1 && !gp_exp1 && !gp_normal1 && !gp_sync)
764 gp_cond1 = gp_exp1 = gp_normal1 = gp_sync1 = true;
765 if (gp_cond1 && cur_ops->get_state && cur_ops->cond_sync)
766 synctype[nsynctypes++] = RTWS_COND_GET;
767 else if (gp_cond && (!cur_ops->get_state || !cur_ops->cond_sync))
768 pr_alert("rcu_torture_writer: gp_cond without primitives.\n");
769 if (gp_exp1 && cur_ops->exp_sync)
770 synctype[nsynctypes++] = RTWS_EXP_SYNC;
771 else if (gp_exp && !cur_ops->exp_sync)
772 pr_alert("rcu_torture_writer: gp_exp without primitives.\n");
773 if (gp_normal1 && cur_ops->deferred_free)
774 synctype[nsynctypes++] = RTWS_DEF_FREE;
775 else if (gp_normal && !cur_ops->deferred_free)
776 pr_alert("rcu_torture_writer: gp_normal without primitives.\n");
777 if (gp_sync1 && cur_ops->sync)
778 synctype[nsynctypes++] = RTWS_SYNC;
779 else if (gp_sync && !cur_ops->sync)
780 pr_alert("rcu_torture_writer: gp_sync without primitives.\n");
781 if (WARN_ONCE(nsynctypes == 0,
782 "rcu_torture_writer: No update-side primitives.\n")) {
783 /*
784 * No updates primitives, so don't try updating.
785 * The resulting test won't be testing much, hence the
786 * above WARN_ONCE().
787 */
788 rcu_torture_writer_state = RTWS_STOPPING;
789 torture_kthread_stopping("rcu_torture_writer");
790 }
700 791
701 do { 792 do {
793 rcu_torture_writer_state = RTWS_FIXED_DELAY;
702 schedule_timeout_uninterruptible(1); 794 schedule_timeout_uninterruptible(1);
703 rp = rcu_torture_alloc(); 795 rp = rcu_torture_alloc();
704 if (rp == NULL) 796 if (rp == NULL)
705 continue; 797 continue;
706 rp->rtort_pipe_count = 0; 798 rp->rtort_pipe_count = 0;
799 rcu_torture_writer_state = RTWS_DELAY;
707 udelay(torture_random(&rand) & 0x3ff); 800 udelay(torture_random(&rand) & 0x3ff);
801 rcu_torture_writer_state = RTWS_REPLACE;
708 old_rp = rcu_dereference_check(rcu_torture_current, 802 old_rp = rcu_dereference_check(rcu_torture_current,
709 current == writer_task); 803 current == writer_task);
710 rp->rtort_mbtest = 1; 804 rp->rtort_mbtest = 1;
@@ -716,35 +810,42 @@ rcu_torture_writer(void *arg)
716 i = RCU_TORTURE_PIPE_LEN; 810 i = RCU_TORTURE_PIPE_LEN;
717 atomic_inc(&rcu_torture_wcount[i]); 811 atomic_inc(&rcu_torture_wcount[i]);
718 old_rp->rtort_pipe_count++; 812 old_rp->rtort_pipe_count++;
719 if (gp_normal == gp_exp) 813 switch (synctype[torture_random(&rand) % nsynctypes]) {
720 exp = !!(torture_random(&rand) & 0x80); 814 case RTWS_DEF_FREE:
721 else 815 rcu_torture_writer_state = RTWS_DEF_FREE;
722 exp = gp_exp;
723 if (!exp) {
724 cur_ops->deferred_free(old_rp); 816 cur_ops->deferred_free(old_rp);
725 } else { 817 break;
818 case RTWS_EXP_SYNC:
819 rcu_torture_writer_state = RTWS_EXP_SYNC;
726 cur_ops->exp_sync(); 820 cur_ops->exp_sync();
727 list_add(&old_rp->rtort_free, 821 rcu_torture_pipe_update(old_rp);
728 &rcu_torture_removed); 822 break;
729 list_for_each_entry_safe(rp, rp1, 823 case RTWS_COND_GET:
730 &rcu_torture_removed, 824 rcu_torture_writer_state = RTWS_COND_GET;
731 rtort_free) { 825 gp_snap = cur_ops->get_state();
732 i = rp->rtort_pipe_count; 826 i = torture_random(&rand) % 16;
733 if (i > RCU_TORTURE_PIPE_LEN) 827 if (i != 0)
734 i = RCU_TORTURE_PIPE_LEN; 828 schedule_timeout_interruptible(i);
735 atomic_inc(&rcu_torture_wcount[i]); 829 udelay(torture_random(&rand) % 1000);
736 if (++rp->rtort_pipe_count >= 830 rcu_torture_writer_state = RTWS_COND_SYNC;
737 RCU_TORTURE_PIPE_LEN) { 831 cur_ops->cond_sync(gp_snap);
738 rp->rtort_mbtest = 0; 832 rcu_torture_pipe_update(old_rp);
739 list_del(&rp->rtort_free); 833 break;
740 rcu_torture_free(rp); 834 case RTWS_SYNC:
741 } 835 rcu_torture_writer_state = RTWS_SYNC;
742 } 836 cur_ops->sync();
837 rcu_torture_pipe_update(old_rp);
838 break;
839 default:
840 WARN_ON_ONCE(1);
841 break;
743 } 842 }
744 } 843 }
745 rcutorture_record_progress(++rcu_torture_current_version); 844 rcutorture_record_progress(++rcu_torture_current_version);
845 rcu_torture_writer_state = RTWS_STUTTER;
746 stutter_wait("rcu_torture_writer"); 846 stutter_wait("rcu_torture_writer");
747 } while (!torture_must_stop()); 847 } while (!torture_must_stop());
848 rcu_torture_writer_state = RTWS_STOPPING;
748 torture_kthread_stopping("rcu_torture_writer"); 849 torture_kthread_stopping("rcu_torture_writer");
749 return 0; 850 return 0;
750} 851}
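Annotation: the RTWS_COND_GET/RTWS_COND_SYNC arm above exercises the new conditional grace-period API — take a cheap snapshot, do unrelated work for a while, then wait only if a full grace period has not already elapsed since the snapshot. A minimal updater-side usage sketch, assuming the flavor provides get_state_synchronize_rcu()/cond_synchronize_rcu() as rcu_ops does; gp, gp_lock, newp and do_other_update_work() are illustrative names:

    struct foo *oldp;
    unsigned long snap;

    oldp = rcu_dereference_protected(gp, lockdep_is_held(&gp_lock));
    rcu_assign_pointer(gp, newp);          /* unpublish the old version first         */
    snap = get_state_synchronize_rcu();    /* record the current grace-period state   */
    do_other_update_work();                /* overlap useful work with the GP         */
    cond_synchronize_rcu(snap);            /* blocks only if no GP elapsed since snap */
    kfree(oldp);                           /* all readers that could see oldp are done */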
@@ -784,7 +885,7 @@ rcu_torture_fakewriter(void *arg)
784 return 0; 885 return 0;
785} 886}
786 887
787void rcutorture_trace_dump(void) 888static void rcutorture_trace_dump(void)
788{ 889{
789 static atomic_t beenhere = ATOMIC_INIT(0); 890 static atomic_t beenhere = ATOMIC_INIT(0);
790 891
@@ -918,11 +1019,13 @@ rcu_torture_reader(void *arg)
918 __this_cpu_inc(rcu_torture_batch[completed]); 1019 __this_cpu_inc(rcu_torture_batch[completed]);
919 preempt_enable(); 1020 preempt_enable();
920 cur_ops->readunlock(idx); 1021 cur_ops->readunlock(idx);
921 schedule(); 1022 cond_resched();
922 stutter_wait("rcu_torture_reader"); 1023 stutter_wait("rcu_torture_reader");
923 } while (!torture_must_stop()); 1024 } while (!torture_must_stop());
924 if (irqreader && cur_ops->irq_capable) 1025 if (irqreader && cur_ops->irq_capable) {
925 del_timer_sync(&t); 1026 del_timer_sync(&t);
1027 destroy_timer_on_stack(&t);
1028 }
926 torture_kthread_stopping("rcu_torture_reader"); 1029 torture_kthread_stopping("rcu_torture_reader");
927 return 0; 1030 return 0;
928} 1031}
@@ -937,6 +1040,7 @@ rcu_torture_printk(char *page)
937 int i; 1040 int i;
938 long pipesummary[RCU_TORTURE_PIPE_LEN + 1] = { 0 }; 1041 long pipesummary[RCU_TORTURE_PIPE_LEN + 1] = { 0 };
939 long batchsummary[RCU_TORTURE_PIPE_LEN + 1] = { 0 }; 1042 long batchsummary[RCU_TORTURE_PIPE_LEN + 1] = { 0 };
1043 static unsigned long rtcv_snap = ULONG_MAX;
940 1044
941 for_each_possible_cpu(cpu) { 1045 for_each_possible_cpu(cpu) {
942 for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) { 1046 for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) {
@@ -997,6 +1101,22 @@ rcu_torture_printk(char *page)
997 page += sprintf(page, "\n"); 1101 page += sprintf(page, "\n");
998 if (cur_ops->stats) 1102 if (cur_ops->stats)
999 cur_ops->stats(page); 1103 cur_ops->stats(page);
1104 if (rtcv_snap == rcu_torture_current_version &&
1105 rcu_torture_current != NULL) {
1106 int __maybe_unused flags;
1107 unsigned long __maybe_unused gpnum;
1108 unsigned long __maybe_unused completed;
1109
1110 rcutorture_get_gp_data(cur_ops->ttype,
1111 &flags, &gpnum, &completed);
1112 page += sprintf(page,
1113 "??? Writer stall state %d g%lu c%lu f%#x\n",
1114 rcu_torture_writer_state,
1115 gpnum, completed, flags);
1116 show_rcu_gp_kthreads();
1117 rcutorture_trace_dump();
1118 }
1119 rtcv_snap = rcu_torture_current_version;
1000} 1120}
1001 1121
1002/* 1122/*
@@ -1146,7 +1266,7 @@ static int __init rcu_torture_stall_init(void)
1146} 1266}
1147 1267
1148/* Callback function for RCU barrier testing. */ 1268/* Callback function for RCU barrier testing. */
1149void rcu_torture_barrier_cbf(struct rcu_head *rcu) 1269static void rcu_torture_barrier_cbf(struct rcu_head *rcu)
1150{ 1270{
1151 atomic_inc(&barrier_cbs_invoked); 1271 atomic_inc(&barrier_cbs_invoked);
1152} 1272}
@@ -1416,7 +1536,8 @@ rcu_torture_init(void)
1416 &rcu_ops, &rcu_bh_ops, &rcu_busted_ops, &srcu_ops, &sched_ops, 1536 &rcu_ops, &rcu_bh_ops, &rcu_busted_ops, &srcu_ops, &sched_ops,
1417 }; 1537 };
1418 1538
1419 torture_init_begin(torture_type, verbose, &rcutorture_runnable); 1539 if (!torture_init_begin(torture_type, verbose, &rcutorture_runnable))
1540 return -EBUSY;
1420 1541
1421 /* Process args and tell the world that the torturer is on the job. */ 1542 /* Process args and tell the world that the torturer is on the job. */
1422 for (i = 0; i < ARRAY_SIZE(torture_ops); i++) { 1543 for (i = 0; i < ARRAY_SIZE(torture_ops); i++) {
@@ -1441,10 +1562,13 @@ rcu_torture_init(void)
1441 if (cur_ops->init) 1562 if (cur_ops->init)
1442 cur_ops->init(); /* no "goto unwind" prior to this point!!! */ 1563 cur_ops->init(); /* no "goto unwind" prior to this point!!! */
1443 1564
1444 if (nreaders >= 0) 1565 if (nreaders >= 0) {
1445 nrealreaders = nreaders; 1566 nrealreaders = nreaders;
1446 else 1567 } else {
1447 nrealreaders = 2 * num_online_cpus(); 1568 nrealreaders = num_online_cpus() - 1;
1569 if (nrealreaders <= 0)
1570 nrealreaders = 1;
1571 }
1448 rcu_torture_print_module_parms(cur_ops, "Start of test"); 1572 rcu_torture_print_module_parms(cur_ops, "Start of test");
1449 1573
1450 /* Set up the freelist. */ 1574 /* Set up the freelist. */
@@ -1533,7 +1657,8 @@ rcu_torture_init(void)
1533 fqs_duration = 0; 1657 fqs_duration = 0;
1534 if (fqs_duration) { 1658 if (fqs_duration) {
1535 /* Create the fqs thread */ 1659 /* Create the fqs thread */
1536 torture_create_kthread(rcu_torture_fqs, NULL, fqs_task); 1660 firsterr = torture_create_kthread(rcu_torture_fqs, NULL,
1661 fqs_task);
1537 if (firsterr) 1662 if (firsterr)
1538 goto unwind; 1663 goto unwind;
1539 } 1664 }
diff --git a/kernel/rcu/tiny_plugin.h b/kernel/rcu/tiny_plugin.h
index 431528520562..858c56569127 100644
--- a/kernel/rcu/tiny_plugin.h
+++ b/kernel/rcu/tiny_plugin.h
@@ -144,7 +144,7 @@ static void check_cpu_stall(struct rcu_ctrlblk *rcp)
144 return; 144 return;
145 rcp->ticks_this_gp++; 145 rcp->ticks_this_gp++;
146 j = jiffies; 146 j = jiffies;
147 js = rcp->jiffies_stall; 147 js = ACCESS_ONCE(rcp->jiffies_stall);
148 if (*rcp->curtail && ULONG_CMP_GE(j, js)) { 148 if (*rcp->curtail && ULONG_CMP_GE(j, js)) {
149 pr_err("INFO: %s stall on CPU (%lu ticks this GP) idle=%llx (t=%lu jiffies q=%ld)\n", 149 pr_err("INFO: %s stall on CPU (%lu ticks this GP) idle=%llx (t=%lu jiffies q=%ld)\n",
150 rcp->name, rcp->ticks_this_gp, rcu_dynticks_nesting, 150 rcp->name, rcp->ticks_this_gp, rcu_dynticks_nesting,
@@ -152,17 +152,17 @@ static void check_cpu_stall(struct rcu_ctrlblk *rcp)
152 dump_stack(); 152 dump_stack();
153 } 153 }
154 if (*rcp->curtail && ULONG_CMP_GE(j, js)) 154 if (*rcp->curtail && ULONG_CMP_GE(j, js))
155 rcp->jiffies_stall = jiffies + 155 ACCESS_ONCE(rcp->jiffies_stall) = jiffies +
156 3 * rcu_jiffies_till_stall_check() + 3; 156 3 * rcu_jiffies_till_stall_check() + 3;
157 else if (ULONG_CMP_GE(j, js)) 157 else if (ULONG_CMP_GE(j, js))
158 rcp->jiffies_stall = jiffies + rcu_jiffies_till_stall_check(); 158 ACCESS_ONCE(rcp->jiffies_stall) = jiffies + rcu_jiffies_till_stall_check();
159} 159}
160 160
161static void reset_cpu_stall_ticks(struct rcu_ctrlblk *rcp) 161static void reset_cpu_stall_ticks(struct rcu_ctrlblk *rcp)
162{ 162{
163 rcp->ticks_this_gp = 0; 163 rcp->ticks_this_gp = 0;
164 rcp->gp_start = jiffies; 164 rcp->gp_start = jiffies;
165 rcp->jiffies_stall = jiffies + rcu_jiffies_till_stall_check(); 165 ACCESS_ONCE(rcp->jiffies_stall) = jiffies + rcu_jiffies_till_stall_check();
166} 166}
167 167
168static void check_cpu_stalls(void) 168static void check_cpu_stalls(void)
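Annotation: the tiny_plugin.h hunks wrap every load and store of ->jiffies_stall in ACCESS_ONCE() so the compiler cannot tear, refetch, or cache a value that stall detection reads while other paths update it. In kernels of this vintage ACCESS_ONCE() is essentially a volatile cast, roughly:

    #define ACCESS_ONCE(x) (*(volatile typeof(x) *)&(x))

which is why it can appear on either side of an assignment: ACCESS_ONCE(rcp->jiffies_stall) = ... forces a single plain store, and js = ACCESS_ONCE(rcp->jiffies_stall) forces a single plain load.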
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 0c47e300210a..625d0b0cd75a 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -101,7 +101,7 @@ DEFINE_PER_CPU(struct rcu_data, sname##_data)
101RCU_STATE_INITIALIZER(rcu_sched, 's', call_rcu_sched); 101RCU_STATE_INITIALIZER(rcu_sched, 's', call_rcu_sched);
102RCU_STATE_INITIALIZER(rcu_bh, 'b', call_rcu_bh); 102RCU_STATE_INITIALIZER(rcu_bh, 'b', call_rcu_bh);
103 103
104static struct rcu_state *rcu_state; 104static struct rcu_state *rcu_state_p;
105LIST_HEAD(rcu_struct_flavors); 105LIST_HEAD(rcu_struct_flavors);
106 106
107/* Increase (but not decrease) the CONFIG_RCU_FANOUT_LEAF at boot time. */ 107/* Increase (but not decrease) the CONFIG_RCU_FANOUT_LEAF at boot time. */
@@ -206,6 +206,70 @@ void rcu_bh_qs(int cpu)
206 rdp->passed_quiesce = 1; 206 rdp->passed_quiesce = 1;
207} 207}
208 208
209static DEFINE_PER_CPU(int, rcu_sched_qs_mask);
210
211static DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks) = {
212 .dynticks_nesting = DYNTICK_TASK_EXIT_IDLE,
213 .dynticks = ATOMIC_INIT(1),
214#ifdef CONFIG_NO_HZ_FULL_SYSIDLE
215 .dynticks_idle_nesting = DYNTICK_TASK_NEST_VALUE,
216 .dynticks_idle = ATOMIC_INIT(1),
217#endif /* #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */
218};
219
220/*
221 * Let the RCU core know that this CPU has gone through the scheduler,
222 * which is a quiescent state. This is called when the need for a
223 * quiescent state is urgent, so we burn an atomic operation and full
224 * memory barriers to let the RCU core know about it, regardless of what
225 * this CPU might (or might not) do in the near future.
226 *
227 * We inform the RCU core by emulating a zero-duration dyntick-idle
228 * period, which we in turn do by incrementing the ->dynticks counter
229 * by two.
230 */
231static void rcu_momentary_dyntick_idle(void)
232{
233 unsigned long flags;
234 struct rcu_data *rdp;
235 struct rcu_dynticks *rdtp;
236 int resched_mask;
237 struct rcu_state *rsp;
238
239 local_irq_save(flags);
240
241 /*
242 * Yes, we can lose flag-setting operations. This is OK, because
243 * the flag will be set again after some delay.
244 */
245 resched_mask = raw_cpu_read(rcu_sched_qs_mask);
246 raw_cpu_write(rcu_sched_qs_mask, 0);
247
248 /* Find the flavor that needs a quiescent state. */
249 for_each_rcu_flavor(rsp) {
250 rdp = raw_cpu_ptr(rsp->rda);
251 if (!(resched_mask & rsp->flavor_mask))
252 continue;
253 smp_mb(); /* rcu_sched_qs_mask before cond_resched_completed. */
254 if (ACCESS_ONCE(rdp->mynode->completed) !=
255 ACCESS_ONCE(rdp->cond_resched_completed))
256 continue;
257
258 /*
259 * Pretend to be momentarily idle for the quiescent state.
260 * This allows the grace-period kthread to record the
261 * quiescent state, with no need for this CPU to do anything
262 * further.
263 */
264 rdtp = this_cpu_ptr(&rcu_dynticks);
265 smp_mb__before_atomic(); /* Earlier stuff before QS. */
266 atomic_add(2, &rdtp->dynticks); /* QS. */
267 smp_mb__after_atomic(); /* Later stuff after QS. */
268 break;
269 }
270 local_irq_restore(flags);
271}
272
209/* 273/*
210 * Note a context switch. This is a quiescent state for RCU-sched, 274 * Note a context switch. This is a quiescent state for RCU-sched,
211 * and requires special handling for preemptible RCU. 275 * and requires special handling for preemptible RCU.
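Annotation: the grace-period machinery decides whether a CPU passed through a quiescent state by snapshotting its ->dynticks counter and later checking whether the counter is even (idle) or has advanced; bumping it by two therefore reports "I was momentarily idle" without the value ever looking idle while the CPU keeps running. A standalone sketch of that snapshot-and-compare protocol, simplified from dyntick_save_progress_counter()/rcu_implicit_dynticks_qs():

    #include <stdio.h>

    int main(void)
    {
            unsigned int dynticks = 1;      /* odd: CPU is currently non-idle        */
            unsigned int snap = dynticks;   /* grace-period kthread takes a snapshot */

            dynticks += 2;                  /* rcu_momentary_dyntick_idle(): changes
                                               the value but keeps it odd            */

            if ((snap & 0x1) == 0 || dynticks - snap >= 2)
                    puts("CPU reported a quiescent state since the snapshot");
            return 0;
    }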
@@ -216,19 +280,12 @@ void rcu_note_context_switch(int cpu)
216 trace_rcu_utilization(TPS("Start context switch")); 280 trace_rcu_utilization(TPS("Start context switch"));
217 rcu_sched_qs(cpu); 281 rcu_sched_qs(cpu);
218 rcu_preempt_note_context_switch(cpu); 282 rcu_preempt_note_context_switch(cpu);
283 if (unlikely(raw_cpu_read(rcu_sched_qs_mask)))
284 rcu_momentary_dyntick_idle();
219 trace_rcu_utilization(TPS("End context switch")); 285 trace_rcu_utilization(TPS("End context switch"));
220} 286}
221EXPORT_SYMBOL_GPL(rcu_note_context_switch); 287EXPORT_SYMBOL_GPL(rcu_note_context_switch);
222 288
223static DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks) = {
224 .dynticks_nesting = DYNTICK_TASK_EXIT_IDLE,
225 .dynticks = ATOMIC_INIT(1),
226#ifdef CONFIG_NO_HZ_FULL_SYSIDLE
227 .dynticks_idle_nesting = DYNTICK_TASK_NEST_VALUE,
228 .dynticks_idle = ATOMIC_INIT(1),
229#endif /* #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */
230};
231
232static long blimit = 10; /* Maximum callbacks per rcu_do_batch. */ 289static long blimit = 10; /* Maximum callbacks per rcu_do_batch. */
233static long qhimark = 10000; /* If this many pending, ignore blimit. */ 290static long qhimark = 10000; /* If this many pending, ignore blimit. */
234static long qlowmark = 100; /* Once only this many pending, use blimit. */ 291static long qlowmark = 100; /* Once only this many pending, use blimit. */
@@ -243,7 +300,14 @@ static ulong jiffies_till_next_fqs = ULONG_MAX;
243module_param(jiffies_till_first_fqs, ulong, 0644); 300module_param(jiffies_till_first_fqs, ulong, 0644);
244module_param(jiffies_till_next_fqs, ulong, 0644); 301module_param(jiffies_till_next_fqs, ulong, 0644);
245 302
246static void rcu_start_gp_advanced(struct rcu_state *rsp, struct rcu_node *rnp, 303/*
304 * How long the grace period must be before we start recruiting
305 * quiescent-state help from rcu_note_context_switch().
306 */
307static ulong jiffies_till_sched_qs = HZ / 20;
308module_param(jiffies_till_sched_qs, ulong, 0644);
309
310static bool rcu_start_gp_advanced(struct rcu_state *rsp, struct rcu_node *rnp,
247 struct rcu_data *rdp); 311 struct rcu_data *rdp);
248static void force_qs_rnp(struct rcu_state *rsp, 312static void force_qs_rnp(struct rcu_state *rsp,
249 int (*f)(struct rcu_data *rsp, bool *isidle, 313 int (*f)(struct rcu_data *rsp, bool *isidle,
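Annotation: jiffies_till_sched_qs is exported as a writable module parameter, so the point at which a grace period starts recruiting help from rcu_note_context_switch() can be tuned; the default of HZ/20 jiffies corresponds to roughly 50 ms. On the boot command line this would presumably be something like rcutree.jiffies_till_sched_qs=50 (assuming the rcutree. prefix used by the other tree.c parameters), and the 0644 permissions make it adjustable at run time through the matching file under /sys/module/.../parameters/.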
@@ -271,6 +335,15 @@ long rcu_batches_completed_bh(void)
271EXPORT_SYMBOL_GPL(rcu_batches_completed_bh); 335EXPORT_SYMBOL_GPL(rcu_batches_completed_bh);
272 336
273/* 337/*
338 * Force a quiescent state.
339 */
340void rcu_force_quiescent_state(void)
341{
342 force_quiescent_state(rcu_state_p);
343}
344EXPORT_SYMBOL_GPL(rcu_force_quiescent_state);
345
346/*
274 * Force a quiescent state for RCU BH. 347 * Force a quiescent state for RCU BH.
275 */ 348 */
276void rcu_bh_force_quiescent_state(void) 349void rcu_bh_force_quiescent_state(void)
@@ -280,6 +353,21 @@ void rcu_bh_force_quiescent_state(void)
280EXPORT_SYMBOL_GPL(rcu_bh_force_quiescent_state); 353EXPORT_SYMBOL_GPL(rcu_bh_force_quiescent_state);
281 354
282/* 355/*
356 * Show the state of the grace-period kthreads.
357 */
358void show_rcu_gp_kthreads(void)
359{
360 struct rcu_state *rsp;
361
362 for_each_rcu_flavor(rsp) {
363 pr_info("%s: wait state: %d ->state: %#lx\n",
364 rsp->name, rsp->gp_state, rsp->gp_kthread->state);
365 /* sched_show_task(rsp->gp_kthread); */
366 }
367}
368EXPORT_SYMBOL_GPL(show_rcu_gp_kthreads);
369
370/*
283 * Record the number of times rcutorture tests have been initiated and 371 * Record the number of times rcutorture tests have been initiated and
284 * terminated. This information allows the debugfs tracing stats to be 372 * terminated. This information allows the debugfs tracing stats to be
285 * correlated to the rcutorture messages, even when the rcutorture module 373 * correlated to the rcutorture messages, even when the rcutorture module
@@ -294,6 +382,39 @@ void rcutorture_record_test_transition(void)
294EXPORT_SYMBOL_GPL(rcutorture_record_test_transition); 382EXPORT_SYMBOL_GPL(rcutorture_record_test_transition);
295 383
296/* 384/*
385 * Send along grace-period-related data for rcutorture diagnostics.
386 */
387void rcutorture_get_gp_data(enum rcutorture_type test_type, int *flags,
388 unsigned long *gpnum, unsigned long *completed)
389{
390 struct rcu_state *rsp = NULL;
391
392 switch (test_type) {
393 case RCU_FLAVOR:
394 rsp = rcu_state_p;
395 break;
396 case RCU_BH_FLAVOR:
397 rsp = &rcu_bh_state;
398 break;
399 case RCU_SCHED_FLAVOR:
400 rsp = &rcu_sched_state;
401 break;
402 default:
403 break;
404 }
405 if (rsp != NULL) {
406 *flags = ACCESS_ONCE(rsp->gp_flags);
407 *gpnum = ACCESS_ONCE(rsp->gpnum);
408 *completed = ACCESS_ONCE(rsp->completed);
409 return;
410 }
411 *flags = 0;
412 *gpnum = 0;
413 *completed = 0;
414}
415EXPORT_SYMBOL_GPL(rcutorture_get_gp_data);
416
417/*
297 * Record the number of writer passes through the current rcutorture test. 418 * Record the number of writer passes through the current rcutorture test.
298 * This is also used to correlate debugfs tracing stats with the rcutorture 419 * This is also used to correlate debugfs tracing stats with the rcutorture
299 * messages. 420 * messages.
@@ -324,6 +445,28 @@ cpu_has_callbacks_ready_to_invoke(struct rcu_data *rdp)
324} 445}
325 446
326/* 447/*
448 * Return the root node of the specified rcu_state structure.
449 */
450static struct rcu_node *rcu_get_root(struct rcu_state *rsp)
451{
452 return &rsp->node[0];
453}
454
455/*
456 * Is there any need for future grace periods?
457 * Interrupts must be disabled. If the caller does not hold the root
458 * rnp_node structure's ->lock, the results are advisory only.
459 */
460static int rcu_future_needs_gp(struct rcu_state *rsp)
461{
462 struct rcu_node *rnp = rcu_get_root(rsp);
463 int idx = (ACCESS_ONCE(rnp->completed) + 1) & 0x1;
464 int *fp = &rnp->need_future_gp[idx];
465
466 return ACCESS_ONCE(*fp);
467}
468
469/*
327 * Does the current CPU require a not-yet-started grace period? 470 * Does the current CPU require a not-yet-started grace period?
328 * The caller must have disabled interrupts to prevent races with 471 * The caller must have disabled interrupts to prevent races with
329 * normal callback registry. 472 * normal callback registry.
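Annotation: rcu_future_needs_gp() works because each rcu_node carries a two-slot need_future_gp[] array indexed by grace-period number parity — the two slots cover the next two possible grace periods, and (completed + 1) & 0x1 picks the slot for the grace period immediately after the last one that finished. A worked example of the indexing (the numbers are illustrative):

    /* If rnp->completed == 6, the next grace period is number 7, so its
     * requests live in need_future_gp[(6 + 1) & 0x1] == need_future_gp[1];
     * once GP 7 completes, (7 + 1) & 0x1 == 0 serves GP 8, and the two
     * slots keep alternating.
     */
    int idx = (ACCESS_ONCE(rnp->completed) + 1) & 0x1;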
@@ -335,7 +478,7 @@ cpu_needs_another_gp(struct rcu_state *rsp, struct rcu_data *rdp)
335 478
336 if (rcu_gp_in_progress(rsp)) 479 if (rcu_gp_in_progress(rsp))
337 return 0; /* No, a grace period is already in progress. */ 480 return 0; /* No, a grace period is already in progress. */
338 if (rcu_nocb_needs_gp(rsp)) 481 if (rcu_future_needs_gp(rsp))
339 return 1; /* Yes, a no-CBs CPU needs one. */ 482 return 1; /* Yes, a no-CBs CPU needs one. */
340 if (!rdp->nxttail[RCU_NEXT_TAIL]) 483 if (!rdp->nxttail[RCU_NEXT_TAIL])
341 return 0; /* No, this is a no-CBs (or offline) CPU. */ 484 return 0; /* No, this is a no-CBs (or offline) CPU. */
@@ -350,14 +493,6 @@ cpu_needs_another_gp(struct rcu_state *rsp, struct rcu_data *rdp)
350} 493}
351 494
352/* 495/*
353 * Return the root node of the specified rcu_state structure.
354 */
355static struct rcu_node *rcu_get_root(struct rcu_state *rsp)
356{
357 return &rsp->node[0];
358}
359
360/*
361 * rcu_eqs_enter_common - current CPU is moving towards extended quiescent state 496 * rcu_eqs_enter_common - current CPU is moving towards extended quiescent state
362 * 497 *
363 * If the new value of the ->dynticks_nesting counter now is zero, 498 * If the new value of the ->dynticks_nesting counter now is zero,
@@ -387,9 +522,9 @@ static void rcu_eqs_enter_common(struct rcu_dynticks *rdtp, long long oldval,
387 } 522 }
388 rcu_prepare_for_idle(smp_processor_id()); 523 rcu_prepare_for_idle(smp_processor_id());
389 /* CPUs seeing atomic_inc() must see prior RCU read-side crit sects */ 524 /* CPUs seeing atomic_inc() must see prior RCU read-side crit sects */
390 smp_mb__before_atomic_inc(); /* See above. */ 525 smp_mb__before_atomic(); /* See above. */
391 atomic_inc(&rdtp->dynticks); 526 atomic_inc(&rdtp->dynticks);
392 smp_mb__after_atomic_inc(); /* Force ordering with next sojourn. */ 527 smp_mb__after_atomic(); /* Force ordering with next sojourn. */
393 WARN_ON_ONCE(atomic_read(&rdtp->dynticks) & 0x1); 528 WARN_ON_ONCE(atomic_read(&rdtp->dynticks) & 0x1);
394 529
395 /* 530 /*
@@ -507,10 +642,10 @@ void rcu_irq_exit(void)
507static void rcu_eqs_exit_common(struct rcu_dynticks *rdtp, long long oldval, 642static void rcu_eqs_exit_common(struct rcu_dynticks *rdtp, long long oldval,
508 int user) 643 int user)
509{ 644{
510 smp_mb__before_atomic_inc(); /* Force ordering w/previous sojourn. */ 645 smp_mb__before_atomic(); /* Force ordering w/previous sojourn. */
511 atomic_inc(&rdtp->dynticks); 646 atomic_inc(&rdtp->dynticks);
512 /* CPUs seeing atomic_inc() must see later RCU read-side crit sects */ 647 /* CPUs seeing atomic_inc() must see later RCU read-side crit sects */
513 smp_mb__after_atomic_inc(); /* See above. */ 648 smp_mb__after_atomic(); /* See above. */
514 WARN_ON_ONCE(!(atomic_read(&rdtp->dynticks) & 0x1)); 649 WARN_ON_ONCE(!(atomic_read(&rdtp->dynticks) & 0x1));
515 rcu_cleanup_after_idle(smp_processor_id()); 650 rcu_cleanup_after_idle(smp_processor_id());
516 trace_rcu_dyntick(TPS("End"), oldval, rdtp->dynticks_nesting); 651 trace_rcu_dyntick(TPS("End"), oldval, rdtp->dynticks_nesting);
@@ -635,10 +770,10 @@ void rcu_nmi_enter(void)
635 (atomic_read(&rdtp->dynticks) & 0x1)) 770 (atomic_read(&rdtp->dynticks) & 0x1))
636 return; 771 return;
637 rdtp->dynticks_nmi_nesting++; 772 rdtp->dynticks_nmi_nesting++;
638 smp_mb__before_atomic_inc(); /* Force delay from prior write. */ 773 smp_mb__before_atomic(); /* Force delay from prior write. */
639 atomic_inc(&rdtp->dynticks); 774 atomic_inc(&rdtp->dynticks);
640 /* CPUs seeing atomic_inc() must see later RCU read-side crit sects */ 775 /* CPUs seeing atomic_inc() must see later RCU read-side crit sects */
641 smp_mb__after_atomic_inc(); /* See above. */ 776 smp_mb__after_atomic(); /* See above. */
642 WARN_ON_ONCE(!(atomic_read(&rdtp->dynticks) & 0x1)); 777 WARN_ON_ONCE(!(atomic_read(&rdtp->dynticks) & 0x1));
643} 778}
644 779
@@ -657,9 +792,9 @@ void rcu_nmi_exit(void)
657 --rdtp->dynticks_nmi_nesting != 0) 792 --rdtp->dynticks_nmi_nesting != 0)
658 return; 793 return;
659 /* CPUs seeing atomic_inc() must see prior RCU read-side crit sects */ 794 /* CPUs seeing atomic_inc() must see prior RCU read-side crit sects */
660 smp_mb__before_atomic_inc(); /* See above. */ 795 smp_mb__before_atomic(); /* See above. */
661 atomic_inc(&rdtp->dynticks); 796 atomic_inc(&rdtp->dynticks);
662 smp_mb__after_atomic_inc(); /* Force delay to next write. */ 797 smp_mb__after_atomic(); /* Force delay to next write. */
663 WARN_ON_ONCE(atomic_read(&rdtp->dynticks) & 0x1); 798 WARN_ON_ONCE(atomic_read(&rdtp->dynticks) & 0x1);
664} 799}
665 800
@@ -758,7 +893,12 @@ static int dyntick_save_progress_counter(struct rcu_data *rdp,
758{ 893{
759 rdp->dynticks_snap = atomic_add_return(0, &rdp->dynticks->dynticks); 894 rdp->dynticks_snap = atomic_add_return(0, &rdp->dynticks->dynticks);
760 rcu_sysidle_check_cpu(rdp, isidle, maxj); 895 rcu_sysidle_check_cpu(rdp, isidle, maxj);
761 return (rdp->dynticks_snap & 0x1) == 0; 896 if ((rdp->dynticks_snap & 0x1) == 0) {
897 trace_rcu_fqs(rdp->rsp->name, rdp->gpnum, rdp->cpu, TPS("dti"));
898 return 1;
899 } else {
900 return 0;
901 }
762} 902}
763 903
764/* 904/*
@@ -777,6 +917,7 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp,
777 bool *isidle, unsigned long *maxj) 917 bool *isidle, unsigned long *maxj)
778{ 918{
779 unsigned int curr; 919 unsigned int curr;
920 int *rcrmp;
780 unsigned int snap; 921 unsigned int snap;
781 922
782 curr = (unsigned int)atomic_add_return(0, &rdp->dynticks->dynticks); 923 curr = (unsigned int)atomic_add_return(0, &rdp->dynticks->dynticks);
@@ -817,27 +958,43 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp,
817 } 958 }
818 959
819 /* 960 /*
820 * There is a possibility that a CPU in adaptive-ticks state 961 * A CPU running for an extended time within the kernel can
821 * might run in the kernel with the scheduling-clock tick disabled 962 * delay RCU grace periods. When the CPU is in NO_HZ_FULL mode,
822 * for an extended time period. Invoke rcu_kick_nohz_cpu() to 963 * even context-switching back and forth between a pair of
823 * force the CPU to restart the scheduling-clock tick in this 964 * in-kernel CPU-bound tasks cannot advance grace periods.
824 * CPU is in this state. 965 * So if the grace period is old enough, make the CPU pay attention.
825 */ 966 * Note that the unsynchronized assignments to the per-CPU
826 rcu_kick_nohz_cpu(rdp->cpu); 967 * rcu_sched_qs_mask variable are safe. Yes, setting of
827 968 * bits can be lost, but they will be set again on the next
828 /* 969 * force-quiescent-state pass. So lost bit sets do not result
829 * Alternatively, the CPU might be running in the kernel 970 * in incorrect behavior, merely in a grace period lasting
830 * for an extended period of time without a quiescent state. 971 * a few jiffies longer than it might otherwise. Because
831 * Attempt to force the CPU through the scheduler to gain the 972 * there are at most four threads involved, and because the
832 * needed quiescent state, but only if the grace period has gone 973 * updates are only once every few jiffies, the probability of
833 * on for an uncommonly long time. If there are many stuck CPUs, 974 * lossage (and thus of slight grace-period extension) is
834 * we will beat on the first one until it gets unstuck, then move 975 * quite low.
835 * to the next. Only do this for the primary flavor of RCU. 976 *
977 * Note that if the jiffies_till_sched_qs boot/sysfs parameter
978 * is set too high, we override with half of the RCU CPU stall
979 * warning delay.
836 */ 980 */
837 if (rdp->rsp == rcu_state && 981 rcrmp = &per_cpu(rcu_sched_qs_mask, rdp->cpu);
982 if (ULONG_CMP_GE(jiffies,
983 rdp->rsp->gp_start + jiffies_till_sched_qs) ||
838 ULONG_CMP_GE(jiffies, rdp->rsp->jiffies_resched)) { 984 ULONG_CMP_GE(jiffies, rdp->rsp->jiffies_resched)) {
839 rdp->rsp->jiffies_resched += 5; 985 if (!(ACCESS_ONCE(*rcrmp) & rdp->rsp->flavor_mask)) {
840 resched_cpu(rdp->cpu); 986 ACCESS_ONCE(rdp->cond_resched_completed) =
987 ACCESS_ONCE(rdp->mynode->completed);
988 smp_mb(); /* ->cond_resched_completed before *rcrmp. */
989 ACCESS_ONCE(*rcrmp) =
990 ACCESS_ONCE(*rcrmp) + rdp->rsp->flavor_mask;
991 resched_cpu(rdp->cpu); /* Force CPU into scheduler. */
992 rdp->rsp->jiffies_resched += 5; /* Enable beating. */
993 } else if (ULONG_CMP_GE(jiffies, rdp->rsp->jiffies_resched)) {
994 /* Time to beat on that CPU again! */
995 resched_cpu(rdp->cpu); /* Force CPU into scheduler. */
996 rdp->rsp->jiffies_resched += 5; /* Re-enable beating. */
997 }
841 } 998 }
842 999
843 return 0; 1000 return 0;
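Annotation: the long comment above describes a deliberately lossy two-sided handshake; condensed to its essentials it looks roughly like the sketch below (nudge_cpu() and context_switch_hook() are made-up names for the code paths in this hunk and in rcu_note_context_switch()):

    /* force-quiescent-state side, run when a CPU has been silent too long */
    static void nudge_cpu(struct rcu_data *rdp, int *rcrmp)
    {
            ACCESS_ONCE(rdp->cond_resched_completed) =
                    ACCESS_ONCE(rdp->mynode->completed);
            smp_mb();                               /* snapshot before the flag        */
            ACCESS_ONCE(*rcrmp) = ACCESS_ONCE(*rcrmp) + rdp->rsp->flavor_mask;
                                                    /* racy on purpose: lost sets are
                                                       simply retried on the next pass */
            resched_cpu(rdp->cpu);                  /* push the CPU into the scheduler */
    }

    /* context-switch side, on the nudged CPU */
    static void context_switch_hook(void)
    {
            if (unlikely(raw_cpu_read(rcu_sched_qs_mask)))
                    rcu_momentary_dyntick_idle();   /* report a QS via ->dynticks += 2 */
    }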
@@ -851,7 +1008,7 @@ static void record_gp_stall_check_time(struct rcu_state *rsp)
851 rsp->gp_start = j; 1008 rsp->gp_start = j;
852 smp_wmb(); /* Record start time before stall time. */ 1009 smp_wmb(); /* Record start time before stall time. */
853 j1 = rcu_jiffies_till_stall_check(); 1010 j1 = rcu_jiffies_till_stall_check();
854 rsp->jiffies_stall = j + j1; 1011 ACCESS_ONCE(rsp->jiffies_stall) = j + j1;
855 rsp->jiffies_resched = j + j1 / 2; 1012 rsp->jiffies_resched = j + j1 / 2;
856} 1013}
857 1014
@@ -890,12 +1047,12 @@ static void print_other_cpu_stall(struct rcu_state *rsp)
890 /* Only let one CPU complain about others per time interval. */ 1047 /* Only let one CPU complain about others per time interval. */
891 1048
892 raw_spin_lock_irqsave(&rnp->lock, flags); 1049 raw_spin_lock_irqsave(&rnp->lock, flags);
893 delta = jiffies - rsp->jiffies_stall; 1050 delta = jiffies - ACCESS_ONCE(rsp->jiffies_stall);
894 if (delta < RCU_STALL_RAT_DELAY || !rcu_gp_in_progress(rsp)) { 1051 if (delta < RCU_STALL_RAT_DELAY || !rcu_gp_in_progress(rsp)) {
895 raw_spin_unlock_irqrestore(&rnp->lock, flags); 1052 raw_spin_unlock_irqrestore(&rnp->lock, flags);
896 return; 1053 return;
897 } 1054 }
898 rsp->jiffies_stall = jiffies + 3 * rcu_jiffies_till_stall_check() + 3; 1055 ACCESS_ONCE(rsp->jiffies_stall) = jiffies + 3 * rcu_jiffies_till_stall_check() + 3;
899 raw_spin_unlock_irqrestore(&rnp->lock, flags); 1056 raw_spin_unlock_irqrestore(&rnp->lock, flags);
900 1057
901 /* 1058 /*
@@ -932,9 +1089,9 @@ static void print_other_cpu_stall(struct rcu_state *rsp)
932 print_cpu_stall_info_end(); 1089 print_cpu_stall_info_end();
933 for_each_possible_cpu(cpu) 1090 for_each_possible_cpu(cpu)
934 totqlen += per_cpu_ptr(rsp->rda, cpu)->qlen; 1091 totqlen += per_cpu_ptr(rsp->rda, cpu)->qlen;
935 pr_cont("(detected by %d, t=%ld jiffies, g=%lu, c=%lu, q=%lu)\n", 1092 pr_cont("(detected by %d, t=%ld jiffies, g=%ld, c=%ld, q=%lu)\n",
936 smp_processor_id(), (long)(jiffies - rsp->gp_start), 1093 smp_processor_id(), (long)(jiffies - rsp->gp_start),
937 rsp->gpnum, rsp->completed, totqlen); 1094 (long)rsp->gpnum, (long)rsp->completed, totqlen);
938 if (ndetected == 0) 1095 if (ndetected == 0)
939 pr_err("INFO: Stall ended before state dump start\n"); 1096 pr_err("INFO: Stall ended before state dump start\n");
940 else if (!trigger_all_cpu_backtrace()) 1097 else if (!trigger_all_cpu_backtrace())
@@ -947,12 +1104,6 @@ static void print_other_cpu_stall(struct rcu_state *rsp)
947 force_quiescent_state(rsp); /* Kick them all. */ 1104 force_quiescent_state(rsp); /* Kick them all. */
948} 1105}
949 1106
950/*
951 * This function really isn't for public consumption, but RCU is special in
952 * that context switches can allow the state machine to make progress.
953 */
954extern void resched_cpu(int cpu);
955
956static void print_cpu_stall(struct rcu_state *rsp) 1107static void print_cpu_stall(struct rcu_state *rsp)
957{ 1108{
958 int cpu; 1109 int cpu;
@@ -971,14 +1122,15 @@ static void print_cpu_stall(struct rcu_state *rsp)
971 print_cpu_stall_info_end(); 1122 print_cpu_stall_info_end();
972 for_each_possible_cpu(cpu) 1123 for_each_possible_cpu(cpu)
973 totqlen += per_cpu_ptr(rsp->rda, cpu)->qlen; 1124 totqlen += per_cpu_ptr(rsp->rda, cpu)->qlen;
974 pr_cont(" (t=%lu jiffies g=%lu c=%lu q=%lu)\n", 1125 pr_cont(" (t=%lu jiffies g=%ld c=%ld q=%lu)\n",
975 jiffies - rsp->gp_start, rsp->gpnum, rsp->completed, totqlen); 1126 jiffies - rsp->gp_start,
1127 (long)rsp->gpnum, (long)rsp->completed, totqlen);
976 if (!trigger_all_cpu_backtrace()) 1128 if (!trigger_all_cpu_backtrace())
977 dump_stack(); 1129 dump_stack();
978 1130
979 raw_spin_lock_irqsave(&rnp->lock, flags); 1131 raw_spin_lock_irqsave(&rnp->lock, flags);
980 if (ULONG_CMP_GE(jiffies, rsp->jiffies_stall)) 1132 if (ULONG_CMP_GE(jiffies, ACCESS_ONCE(rsp->jiffies_stall)))
981 rsp->jiffies_stall = jiffies + 1133 ACCESS_ONCE(rsp->jiffies_stall) = jiffies +
982 3 * rcu_jiffies_till_stall_check() + 3; 1134 3 * rcu_jiffies_till_stall_check() + 3;
983 raw_spin_unlock_irqrestore(&rnp->lock, flags); 1135 raw_spin_unlock_irqrestore(&rnp->lock, flags);
984 1136
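
The stall-detection changes above and below share two idioms: every lockless read or write of rsp->jiffies_stall is now wrapped in ACCESS_ONCE() so the compiler cannot tear or re-fetch it, and jiffies comparisons go through ULONG_CMP_GE(), which stays correct when the counter wraps. A minimal userspace sketch of the wrap-safe comparison, illustrative only; the macro merely mirrors the kernel's idea:

/* Illustrative: wrap-safe "a >= b" for free-running unsigned counters,
 * in the spirit of the kernel's ULONG_CMP_GE(). Interpreting the
 * difference as signed keeps the test correct across counter wrap. */
#include <assert.h>
#include <limits.h>

#define CMP_GE(a, b)    ((long)((a) - (b)) >= 0)

int main(void)
{
        unsigned long near_wrap = ULONG_MAX - 5;
        unsigned long wrapped   = near_wrap + 10;       /* wraps around to 4 */

        assert(CMP_GE(wrapped, near_wrap));     /* later in time, despite smaller value */
        assert(!CMP_GE(near_wrap, wrapped));    /* earlier in time */
        return 0;
}
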
@@ -1062,7 +1214,7 @@ void rcu_cpu_stall_reset(void)
1062 struct rcu_state *rsp; 1214 struct rcu_state *rsp;
1063 1215
1064 for_each_rcu_flavor(rsp) 1216 for_each_rcu_flavor(rsp)
1065 rsp->jiffies_stall = jiffies + ULONG_MAX / 2; 1217 ACCESS_ONCE(rsp->jiffies_stall) = jiffies + ULONG_MAX / 2;
1066} 1218}
1067 1219
1068/* 1220/*
@@ -1123,15 +1275,18 @@ static void trace_rcu_future_gp(struct rcu_node *rnp, struct rcu_data *rdp,
1123/* 1275/*
1124 * Start some future grace period, as needed to handle newly arrived 1276 * Start some future grace period, as needed to handle newly arrived
1125 * callbacks. The required future grace periods are recorded in each 1277 * callbacks. The required future grace periods are recorded in each
1126 * rcu_node structure's ->need_future_gp field. 1278 * rcu_node structure's ->need_future_gp field. Returns true if there
1279 * is reason to awaken the grace-period kthread.
1127 * 1280 *
1128 * The caller must hold the specified rcu_node structure's ->lock. 1281 * The caller must hold the specified rcu_node structure's ->lock.
1129 */ 1282 */
1130static unsigned long __maybe_unused 1283static bool __maybe_unused
1131rcu_start_future_gp(struct rcu_node *rnp, struct rcu_data *rdp) 1284rcu_start_future_gp(struct rcu_node *rnp, struct rcu_data *rdp,
1285 unsigned long *c_out)
1132{ 1286{
1133 unsigned long c; 1287 unsigned long c;
1134 int i; 1288 int i;
1289 bool ret = false;
1135 struct rcu_node *rnp_root = rcu_get_root(rdp->rsp); 1290 struct rcu_node *rnp_root = rcu_get_root(rdp->rsp);
1136 1291
1137 /* 1292 /*
@@ -1142,7 +1297,7 @@ rcu_start_future_gp(struct rcu_node *rnp, struct rcu_data *rdp)
1142 trace_rcu_future_gp(rnp, rdp, c, TPS("Startleaf")); 1297 trace_rcu_future_gp(rnp, rdp, c, TPS("Startleaf"));
1143 if (rnp->need_future_gp[c & 0x1]) { 1298 if (rnp->need_future_gp[c & 0x1]) {
1144 trace_rcu_future_gp(rnp, rdp, c, TPS("Prestartleaf")); 1299 trace_rcu_future_gp(rnp, rdp, c, TPS("Prestartleaf"));
1145 return c; 1300 goto out;
1146 } 1301 }
1147 1302
1148 /* 1303 /*
@@ -1156,7 +1311,7 @@ rcu_start_future_gp(struct rcu_node *rnp, struct rcu_data *rdp)
1156 ACCESS_ONCE(rnp->gpnum) != ACCESS_ONCE(rnp->completed)) { 1311 ACCESS_ONCE(rnp->gpnum) != ACCESS_ONCE(rnp->completed)) {
1157 rnp->need_future_gp[c & 0x1]++; 1312 rnp->need_future_gp[c & 0x1]++;
1158 trace_rcu_future_gp(rnp, rdp, c, TPS("Startedleaf")); 1313 trace_rcu_future_gp(rnp, rdp, c, TPS("Startedleaf"));
1159 return c; 1314 goto out;
1160 } 1315 }
1161 1316
1162 /* 1317 /*
@@ -1197,12 +1352,15 @@ rcu_start_future_gp(struct rcu_node *rnp, struct rcu_data *rdp)
1197 trace_rcu_future_gp(rnp, rdp, c, TPS("Startedleafroot")); 1352 trace_rcu_future_gp(rnp, rdp, c, TPS("Startedleafroot"));
1198 } else { 1353 } else {
1199 trace_rcu_future_gp(rnp, rdp, c, TPS("Startedroot")); 1354 trace_rcu_future_gp(rnp, rdp, c, TPS("Startedroot"));
1200 rcu_start_gp_advanced(rdp->rsp, rnp_root, rdp); 1355 ret = rcu_start_gp_advanced(rdp->rsp, rnp_root, rdp);
1201 } 1356 }
1202unlock_out: 1357unlock_out:
1203 if (rnp != rnp_root) 1358 if (rnp != rnp_root)
1204 raw_spin_unlock(&rnp_root->lock); 1359 raw_spin_unlock(&rnp_root->lock);
1205 return c; 1360out:
1361 if (c_out != NULL)
1362 *c_out = c;
1363 return ret;
1206} 1364}
1207 1365
1208/* 1366/*
@@ -1226,25 +1384,43 @@ static int rcu_future_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp)
1226} 1384}
1227 1385
1228/* 1386/*
1387 * Awaken the grace-period kthread for the specified flavor of RCU.
1388 * Don't do a self-awaken, and don't bother awakening when there is
1389 * nothing for the grace-period kthread to do (as in several CPUs
1390 * raced to awaken, and we lost), and finally don't try to awaken
1391 * a kthread that has not yet been created.
1392 */
1393static void rcu_gp_kthread_wake(struct rcu_state *rsp)
1394{
1395 if (current == rsp->gp_kthread ||
1396 !ACCESS_ONCE(rsp->gp_flags) ||
1397 !rsp->gp_kthread)
1398 return;
1399 wake_up(&rsp->gp_wq);
1400}
1401
1402/*
1229 * If there is room, assign a ->completed number to any callbacks on 1403 * If there is room, assign a ->completed number to any callbacks on
1230 * this CPU that have not already been assigned. Also accelerate any 1404 * this CPU that have not already been assigned. Also accelerate any
1231 * callbacks that were previously assigned a ->completed number that has 1405 * callbacks that were previously assigned a ->completed number that has
1232 * since proven to be too conservative, which can happen if callbacks get 1406 * since proven to be too conservative, which can happen if callbacks get
1233 * assigned a ->completed number while RCU is idle, but with reference to 1407 * assigned a ->completed number while RCU is idle, but with reference to
1234 * a non-root rcu_node structure. This function is idempotent, so it does 1408 * a non-root rcu_node structure. This function is idempotent, so it does
1235 * not hurt to call it repeatedly. 1409 * not hurt to call it repeatedly. Returns an flag saying that we should
1410 * awaken the RCU grace-period kthread.
1236 * 1411 *
1237 * The caller must hold rnp->lock with interrupts disabled. 1412 * The caller must hold rnp->lock with interrupts disabled.
1238 */ 1413 */
1239static void rcu_accelerate_cbs(struct rcu_state *rsp, struct rcu_node *rnp, 1414static bool rcu_accelerate_cbs(struct rcu_state *rsp, struct rcu_node *rnp,
1240 struct rcu_data *rdp) 1415 struct rcu_data *rdp)
1241{ 1416{
1242 unsigned long c; 1417 unsigned long c;
1243 int i; 1418 int i;
1419 bool ret;
1244 1420
1245 /* If the CPU has no callbacks, nothing to do. */ 1421 /* If the CPU has no callbacks, nothing to do. */
1246 if (!rdp->nxttail[RCU_NEXT_TAIL] || !*rdp->nxttail[RCU_DONE_TAIL]) 1422 if (!rdp->nxttail[RCU_NEXT_TAIL] || !*rdp->nxttail[RCU_DONE_TAIL])
1247 return; 1423 return false;
1248 1424
1249 /* 1425 /*
1250 * Starting from the sublist containing the callbacks most 1426 * Starting from the sublist containing the callbacks most
@@ -1273,7 +1449,7 @@ static void rcu_accelerate_cbs(struct rcu_state *rsp, struct rcu_node *rnp,
1273 * be grouped into. 1449 * be grouped into.
1274 */ 1450 */
1275 if (++i >= RCU_NEXT_TAIL) 1451 if (++i >= RCU_NEXT_TAIL)
1276 return; 1452 return false;
1277 1453
1278 /* 1454 /*
1279 * Assign all subsequent callbacks' ->completed number to the next 1455 * Assign all subsequent callbacks' ->completed number to the next
@@ -1285,13 +1461,14 @@ static void rcu_accelerate_cbs(struct rcu_state *rsp, struct rcu_node *rnp,
1285 rdp->nxtcompleted[i] = c; 1461 rdp->nxtcompleted[i] = c;
1286 } 1462 }
1287 /* Record any needed additional grace periods. */ 1463 /* Record any needed additional grace periods. */
1288 rcu_start_future_gp(rnp, rdp); 1464 ret = rcu_start_future_gp(rnp, rdp, NULL);
1289 1465
1290 /* Trace depending on how much we were able to accelerate. */ 1466 /* Trace depending on how much we were able to accelerate. */
1291 if (!*rdp->nxttail[RCU_WAIT_TAIL]) 1467 if (!*rdp->nxttail[RCU_WAIT_TAIL])
1292 trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("AccWaitCB")); 1468 trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("AccWaitCB"));
1293 else 1469 else
1294 trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("AccReadyCB")); 1470 trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("AccReadyCB"));
1471 return ret;
1295} 1472}
1296 1473
1297/* 1474/*
@@ -1300,17 +1477,18 @@ static void rcu_accelerate_cbs(struct rcu_state *rsp, struct rcu_node *rnp,
1300 * assign ->completed numbers to any callbacks in the RCU_NEXT_TAIL 1477 * assign ->completed numbers to any callbacks in the RCU_NEXT_TAIL
1301 * sublist. This function is idempotent, so it does not hurt to 1478 * sublist. This function is idempotent, so it does not hurt to
1302 * invoke it repeatedly. As long as it is not invoked -too- often... 1479 * invoke it repeatedly. As long as it is not invoked -too- often...
1480 * Returns true if the RCU grace-period kthread needs to be awakened.
1303 * 1481 *
1304 * The caller must hold rnp->lock with interrupts disabled. 1482 * The caller must hold rnp->lock with interrupts disabled.
1305 */ 1483 */
1306static void rcu_advance_cbs(struct rcu_state *rsp, struct rcu_node *rnp, 1484static bool rcu_advance_cbs(struct rcu_state *rsp, struct rcu_node *rnp,
1307 struct rcu_data *rdp) 1485 struct rcu_data *rdp)
1308{ 1486{
1309 int i, j; 1487 int i, j;
1310 1488
1311 /* If the CPU has no callbacks, nothing to do. */ 1489 /* If the CPU has no callbacks, nothing to do. */
1312 if (!rdp->nxttail[RCU_NEXT_TAIL] || !*rdp->nxttail[RCU_DONE_TAIL]) 1490 if (!rdp->nxttail[RCU_NEXT_TAIL] || !*rdp->nxttail[RCU_DONE_TAIL])
1313 return; 1491 return false;
1314 1492
1315 /* 1493 /*
1316 * Find all callbacks whose ->completed numbers indicate that they 1494 * Find all callbacks whose ->completed numbers indicate that they
@@ -1334,26 +1512,30 @@ static void rcu_advance_cbs(struct rcu_state *rsp, struct rcu_node *rnp,
1334 } 1512 }
1335 1513
1336 /* Classify any remaining callbacks. */ 1514 /* Classify any remaining callbacks. */
1337 rcu_accelerate_cbs(rsp, rnp, rdp); 1515 return rcu_accelerate_cbs(rsp, rnp, rdp);
1338} 1516}
1339 1517
1340/* 1518/*
1341 * Update CPU-local rcu_data state to record the beginnings and ends of 1519 * Update CPU-local rcu_data state to record the beginnings and ends of
1342 * grace periods. The caller must hold the ->lock of the leaf rcu_node 1520 * grace periods. The caller must hold the ->lock of the leaf rcu_node
1343 * structure corresponding to the current CPU, and must have irqs disabled. 1521 * structure corresponding to the current CPU, and must have irqs disabled.
1522 * Returns true if the grace-period kthread needs to be awakened.
1344 */ 1523 */
1345static void __note_gp_changes(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_data *rdp) 1524static bool __note_gp_changes(struct rcu_state *rsp, struct rcu_node *rnp,
1525 struct rcu_data *rdp)
1346{ 1526{
1527 bool ret;
1528
1347 /* Handle the ends of any preceding grace periods first. */ 1529 /* Handle the ends of any preceding grace periods first. */
1348 if (rdp->completed == rnp->completed) { 1530 if (rdp->completed == rnp->completed) {
1349 1531
1350 /* No grace period end, so just accelerate recent callbacks. */ 1532 /* No grace period end, so just accelerate recent callbacks. */
1351 rcu_accelerate_cbs(rsp, rnp, rdp); 1533 ret = rcu_accelerate_cbs(rsp, rnp, rdp);
1352 1534
1353 } else { 1535 } else {
1354 1536
1355 /* Advance callbacks. */ 1537 /* Advance callbacks. */
1356 rcu_advance_cbs(rsp, rnp, rdp); 1538 ret = rcu_advance_cbs(rsp, rnp, rdp);
1357 1539
1358 /* Remember that we saw this grace-period completion. */ 1540 /* Remember that we saw this grace-period completion. */
1359 rdp->completed = rnp->completed; 1541 rdp->completed = rnp->completed;
@@ -1372,11 +1554,13 @@ static void __note_gp_changes(struct rcu_state *rsp, struct rcu_node *rnp, struc
1372 rdp->qs_pending = !!(rnp->qsmask & rdp->grpmask); 1554 rdp->qs_pending = !!(rnp->qsmask & rdp->grpmask);
1373 zero_cpu_stall_ticks(rdp); 1555 zero_cpu_stall_ticks(rdp);
1374 } 1556 }
1557 return ret;
1375} 1558}
1376 1559
1377static void note_gp_changes(struct rcu_state *rsp, struct rcu_data *rdp) 1560static void note_gp_changes(struct rcu_state *rsp, struct rcu_data *rdp)
1378{ 1561{
1379 unsigned long flags; 1562 unsigned long flags;
1563 bool needwake;
1380 struct rcu_node *rnp; 1564 struct rcu_node *rnp;
1381 1565
1382 local_irq_save(flags); 1566 local_irq_save(flags);
@@ -1388,8 +1572,10 @@ static void note_gp_changes(struct rcu_state *rsp, struct rcu_data *rdp)
1388 return; 1572 return;
1389 } 1573 }
1390 smp_mb__after_unlock_lock(); 1574 smp_mb__after_unlock_lock();
1391 __note_gp_changes(rsp, rnp, rdp); 1575 needwake = __note_gp_changes(rsp, rnp, rdp);
1392 raw_spin_unlock_irqrestore(&rnp->lock, flags); 1576 raw_spin_unlock_irqrestore(&rnp->lock, flags);
1577 if (needwake)
1578 rcu_gp_kthread_wake(rsp);
1393} 1579}
1394 1580
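
The pattern recurring throughout this file: helpers called with rnp->lock held no longer wake the grace-period kthread themselves. They return a needwake flag, and the caller invokes rcu_gp_kthread_wake() only after dropping the lock, so that wake_up() is never issued under a lock the scheduler might also need. A small pthread sketch of the same shape, with invented names rather than kernel API:

/* Illustrative pthread version of "decide under the lock, wake after
 * dropping it": the predicate is updated while holding the mutex, but
 * pthread_cond_signal() is issued only after pthread_mutex_unlock(). */
#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t  wq   = PTHREAD_COND_INITIALIZER;
static bool work_pending;

/* Caller must hold 'lock'; returns true if the worker needs a wakeup. */
static bool queue_work_locked(void)
{
        bool needwake = !work_pending;  /* no wakeup if work was already queued */

        work_pending = true;
        return needwake;
}

static void *worker(void *arg)
{
        pthread_mutex_lock(&lock);
        while (!work_pending)
                pthread_cond_wait(&wq, &lock);
        work_pending = false;
        pthread_mutex_unlock(&lock);
        printf("worker: handled work\n");
        return arg;
}

int main(void)
{
        pthread_t tid;
        bool needwake;

        pthread_create(&tid, NULL, worker, NULL);

        pthread_mutex_lock(&lock);
        needwake = queue_work_locked();
        pthread_mutex_unlock(&lock);            /* drop the lock first ... */
        if (needwake)
                pthread_cond_signal(&wq);       /* ... then do the wakeup */

        pthread_join(tid, NULL);
        return 0;
}
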
1395/* 1581/*
@@ -1403,12 +1589,12 @@ static int rcu_gp_init(struct rcu_state *rsp)
1403 rcu_bind_gp_kthread(); 1589 rcu_bind_gp_kthread();
1404 raw_spin_lock_irq(&rnp->lock); 1590 raw_spin_lock_irq(&rnp->lock);
1405 smp_mb__after_unlock_lock(); 1591 smp_mb__after_unlock_lock();
1406 if (rsp->gp_flags == 0) { 1592 if (!ACCESS_ONCE(rsp->gp_flags)) {
1407 /* Spurious wakeup, tell caller to go back to sleep. */ 1593 /* Spurious wakeup, tell caller to go back to sleep. */
1408 raw_spin_unlock_irq(&rnp->lock); 1594 raw_spin_unlock_irq(&rnp->lock);
1409 return 0; 1595 return 0;
1410 } 1596 }
1411 rsp->gp_flags = 0; /* Clear all flags: New grace period. */ 1597 ACCESS_ONCE(rsp->gp_flags) = 0; /* Clear all flags: New grace period. */
1412 1598
1413 if (WARN_ON_ONCE(rcu_gp_in_progress(rsp))) { 1599 if (WARN_ON_ONCE(rcu_gp_in_progress(rsp))) {
1414 /* 1600 /*
@@ -1453,7 +1639,7 @@ static int rcu_gp_init(struct rcu_state *rsp)
1453 WARN_ON_ONCE(rnp->completed != rsp->completed); 1639 WARN_ON_ONCE(rnp->completed != rsp->completed);
1454 ACCESS_ONCE(rnp->completed) = rsp->completed; 1640 ACCESS_ONCE(rnp->completed) = rsp->completed;
1455 if (rnp == rdp->mynode) 1641 if (rnp == rdp->mynode)
1456 __note_gp_changes(rsp, rnp, rdp); 1642 (void)__note_gp_changes(rsp, rnp, rdp);
1457 rcu_preempt_boost_start_gp(rnp); 1643 rcu_preempt_boost_start_gp(rnp);
1458 trace_rcu_grace_period_init(rsp->name, rnp->gpnum, 1644 trace_rcu_grace_period_init(rsp->name, rnp->gpnum,
1459 rnp->level, rnp->grplo, 1645 rnp->level, rnp->grplo,
@@ -1501,7 +1687,7 @@ static int rcu_gp_fqs(struct rcu_state *rsp, int fqs_state_in)
1501 if (ACCESS_ONCE(rsp->gp_flags) & RCU_GP_FLAG_FQS) { 1687 if (ACCESS_ONCE(rsp->gp_flags) & RCU_GP_FLAG_FQS) {
1502 raw_spin_lock_irq(&rnp->lock); 1688 raw_spin_lock_irq(&rnp->lock);
1503 smp_mb__after_unlock_lock(); 1689 smp_mb__after_unlock_lock();
1504 rsp->gp_flags &= ~RCU_GP_FLAG_FQS; 1690 ACCESS_ONCE(rsp->gp_flags) &= ~RCU_GP_FLAG_FQS;
1505 raw_spin_unlock_irq(&rnp->lock); 1691 raw_spin_unlock_irq(&rnp->lock);
1506 } 1692 }
1507 return fqs_state; 1693 return fqs_state;
@@ -1513,6 +1699,7 @@ static int rcu_gp_fqs(struct rcu_state *rsp, int fqs_state_in)
1513static void rcu_gp_cleanup(struct rcu_state *rsp) 1699static void rcu_gp_cleanup(struct rcu_state *rsp)
1514{ 1700{
1515 unsigned long gp_duration; 1701 unsigned long gp_duration;
1702 bool needgp = false;
1516 int nocb = 0; 1703 int nocb = 0;
1517 struct rcu_data *rdp; 1704 struct rcu_data *rdp;
1518 struct rcu_node *rnp = rcu_get_root(rsp); 1705 struct rcu_node *rnp = rcu_get_root(rsp);
@@ -1548,7 +1735,7 @@ static void rcu_gp_cleanup(struct rcu_state *rsp)
1548 ACCESS_ONCE(rnp->completed) = rsp->gpnum; 1735 ACCESS_ONCE(rnp->completed) = rsp->gpnum;
1549 rdp = this_cpu_ptr(rsp->rda); 1736 rdp = this_cpu_ptr(rsp->rda);
1550 if (rnp == rdp->mynode) 1737 if (rnp == rdp->mynode)
1551 __note_gp_changes(rsp, rnp, rdp); 1738 needgp = __note_gp_changes(rsp, rnp, rdp) || needgp;
1552 /* smp_mb() provided by prior unlock-lock pair. */ 1739 /* smp_mb() provided by prior unlock-lock pair. */
1553 nocb += rcu_future_gp_cleanup(rsp, rnp); 1740 nocb += rcu_future_gp_cleanup(rsp, rnp);
1554 raw_spin_unlock_irq(&rnp->lock); 1741 raw_spin_unlock_irq(&rnp->lock);
@@ -1564,9 +1751,10 @@ static void rcu_gp_cleanup(struct rcu_state *rsp)
1564 trace_rcu_grace_period(rsp->name, rsp->completed, TPS("end")); 1751 trace_rcu_grace_period(rsp->name, rsp->completed, TPS("end"));
1565 rsp->fqs_state = RCU_GP_IDLE; 1752 rsp->fqs_state = RCU_GP_IDLE;
1566 rdp = this_cpu_ptr(rsp->rda); 1753 rdp = this_cpu_ptr(rsp->rda);
1567 rcu_advance_cbs(rsp, rnp, rdp); /* Reduce false positives below. */ 1754 /* Advance CBs to reduce false positives below. */
1568 if (cpu_needs_another_gp(rsp, rdp)) { 1755 needgp = rcu_advance_cbs(rsp, rnp, rdp) || needgp;
1569 rsp->gp_flags = RCU_GP_FLAG_INIT; 1756 if (needgp || cpu_needs_another_gp(rsp, rdp)) {
1757 ACCESS_ONCE(rsp->gp_flags) = RCU_GP_FLAG_INIT;
1570 trace_rcu_grace_period(rsp->name, 1758 trace_rcu_grace_period(rsp->name,
1571 ACCESS_ONCE(rsp->gpnum), 1759 ACCESS_ONCE(rsp->gpnum),
1572 TPS("newreq")); 1760 TPS("newreq"));
@@ -1593,6 +1781,7 @@ static int __noreturn rcu_gp_kthread(void *arg)
1593 trace_rcu_grace_period(rsp->name, 1781 trace_rcu_grace_period(rsp->name,
1594 ACCESS_ONCE(rsp->gpnum), 1782 ACCESS_ONCE(rsp->gpnum),
1595 TPS("reqwait")); 1783 TPS("reqwait"));
1784 rsp->gp_state = RCU_GP_WAIT_GPS;
1596 wait_event_interruptible(rsp->gp_wq, 1785 wait_event_interruptible(rsp->gp_wq,
1597 ACCESS_ONCE(rsp->gp_flags) & 1786 ACCESS_ONCE(rsp->gp_flags) &
1598 RCU_GP_FLAG_INIT); 1787 RCU_GP_FLAG_INIT);
@@ -1620,6 +1809,7 @@ static int __noreturn rcu_gp_kthread(void *arg)
1620 trace_rcu_grace_period(rsp->name, 1809 trace_rcu_grace_period(rsp->name,
1621 ACCESS_ONCE(rsp->gpnum), 1810 ACCESS_ONCE(rsp->gpnum),
1622 TPS("fqswait")); 1811 TPS("fqswait"));
1812 rsp->gp_state = RCU_GP_WAIT_FQS;
1623 ret = wait_event_interruptible_timeout(rsp->gp_wq, 1813 ret = wait_event_interruptible_timeout(rsp->gp_wq,
1624 ((gf = ACCESS_ONCE(rsp->gp_flags)) & 1814 ((gf = ACCESS_ONCE(rsp->gp_flags)) &
1625 RCU_GP_FLAG_FQS) || 1815 RCU_GP_FLAG_FQS) ||
@@ -1665,14 +1855,6 @@ static int __noreturn rcu_gp_kthread(void *arg)
1665 } 1855 }
1666} 1856}
1667 1857
1668static void rsp_wakeup(struct irq_work *work)
1669{
1670 struct rcu_state *rsp = container_of(work, struct rcu_state, wakeup_work);
1671
1672 /* Wake up rcu_gp_kthread() to start the grace period. */
1673 wake_up(&rsp->gp_wq);
1674}
1675
1676/* 1858/*
1677 * Start a new RCU grace period if warranted, re-initializing the hierarchy 1859 * Start a new RCU grace period if warranted, re-initializing the hierarchy
1678 * in preparation for detecting the next grace period. The caller must hold 1860 * in preparation for detecting the next grace period. The caller must hold
@@ -1681,8 +1863,10 @@ static void rsp_wakeup(struct irq_work *work)
1681 * Note that it is legal for a dying CPU (which is marked as offline) to 1863 * Note that it is legal for a dying CPU (which is marked as offline) to
1682 * invoke this function. This can happen when the dying CPU reports its 1864 * invoke this function. This can happen when the dying CPU reports its
1683 * quiescent state. 1865 * quiescent state.
1866 *
1867 * Returns true if the grace-period kthread must be awakened.
1684 */ 1868 */
1685static void 1869static bool
1686rcu_start_gp_advanced(struct rcu_state *rsp, struct rcu_node *rnp, 1870rcu_start_gp_advanced(struct rcu_state *rsp, struct rcu_node *rnp,
1687 struct rcu_data *rdp) 1871 struct rcu_data *rdp)
1688{ 1872{
@@ -1693,20 +1877,18 @@ rcu_start_gp_advanced(struct rcu_state *rsp, struct rcu_node *rnp,
1693 * or a grace period is already in progress. 1877 * or a grace period is already in progress.
1694 * Either way, don't start a new grace period. 1878 * Either way, don't start a new grace period.
1695 */ 1879 */
1696 return; 1880 return false;
1697 } 1881 }
1698 rsp->gp_flags = RCU_GP_FLAG_INIT; 1882 ACCESS_ONCE(rsp->gp_flags) = RCU_GP_FLAG_INIT;
1699 trace_rcu_grace_period(rsp->name, ACCESS_ONCE(rsp->gpnum), 1883 trace_rcu_grace_period(rsp->name, ACCESS_ONCE(rsp->gpnum),
1700 TPS("newreq")); 1884 TPS("newreq"));
1701 1885
1702 /* 1886 /*
1703 * We can't do wakeups while holding the rnp->lock, as that 1887 * We can't do wakeups while holding the rnp->lock, as that
1704 * could cause possible deadlocks with the rq->lock. Defer 1888 * could cause possible deadlocks with the rq->lock. Defer
1705 * the wakeup to interrupt context. And don't bother waking 1889 * the wakeup to our caller.
1706 * up the running kthread.
1707 */ 1890 */
1708 if (current != rsp->gp_kthread) 1891 return true;
1709 irq_work_queue(&rsp->wakeup_work);
1710} 1892}
1711 1893
1712/* 1894/*
@@ -1715,12 +1897,14 @@ rcu_start_gp_advanced(struct rcu_state *rsp, struct rcu_node *rnp,
1715 * is invoked indirectly from rcu_advance_cbs(), which would result in 1897 * is invoked indirectly from rcu_advance_cbs(), which would result in
1716 * endless recursion -- or would do so if it wasn't for the self-deadlock 1898 * endless recursion -- or would do so if it wasn't for the self-deadlock
1717 * that is encountered beforehand. 1899 * that is encountered beforehand.
1900 *
1901 * Returns true if the grace-period kthread needs to be awakened.
1718 */ 1902 */
1719static void 1903static bool rcu_start_gp(struct rcu_state *rsp)
1720rcu_start_gp(struct rcu_state *rsp)
1721{ 1904{
1722 struct rcu_data *rdp = this_cpu_ptr(rsp->rda); 1905 struct rcu_data *rdp = this_cpu_ptr(rsp->rda);
1723 struct rcu_node *rnp = rcu_get_root(rsp); 1906 struct rcu_node *rnp = rcu_get_root(rsp);
1907 bool ret = false;
1724 1908
1725 /* 1909 /*
1726 * If there is no grace period in progress right now, any 1910 * If there is no grace period in progress right now, any
@@ -1730,8 +1914,9 @@ rcu_start_gp(struct rcu_state *rsp)
1730 * resulting in pointless grace periods. So, advance callbacks 1914 * resulting in pointless grace periods. So, advance callbacks
1731 * then start the grace period! 1915 * then start the grace period!
1732 */ 1916 */
1733 rcu_advance_cbs(rsp, rnp, rdp); 1917 ret = rcu_advance_cbs(rsp, rnp, rdp) || ret;
1734 rcu_start_gp_advanced(rsp, rnp, rdp); 1918 ret = rcu_start_gp_advanced(rsp, rnp, rdp) || ret;
1919 return ret;
1735} 1920}
1736 1921
1737/* 1922/*
@@ -1820,6 +2005,7 @@ rcu_report_qs_rdp(int cpu, struct rcu_state *rsp, struct rcu_data *rdp)
1820{ 2005{
1821 unsigned long flags; 2006 unsigned long flags;
1822 unsigned long mask; 2007 unsigned long mask;
2008 bool needwake;
1823 struct rcu_node *rnp; 2009 struct rcu_node *rnp;
1824 2010
1825 rnp = rdp->mynode; 2011 rnp = rdp->mynode;
@@ -1848,9 +2034,11 @@ rcu_report_qs_rdp(int cpu, struct rcu_state *rsp, struct rcu_data *rdp)
1848 * This GP can't end until cpu checks in, so all of our 2034 * This GP can't end until cpu checks in, so all of our
1849 * callbacks can be processed during the next GP. 2035 * callbacks can be processed during the next GP.
1850 */ 2036 */
1851 rcu_accelerate_cbs(rsp, rnp, rdp); 2037 needwake = rcu_accelerate_cbs(rsp, rnp, rdp);
1852 2038
1853 rcu_report_qs_rnp(mask, rsp, rnp, flags); /* rlses rnp->lock */ 2039 rcu_report_qs_rnp(mask, rsp, rnp, flags); /* rlses rnp->lock */
2040 if (needwake)
2041 rcu_gp_kthread_wake(rsp);
1854 } 2042 }
1855} 2043}
1856 2044
@@ -1951,7 +2139,7 @@ rcu_send_cbs_to_orphanage(int cpu, struct rcu_state *rsp,
1951static void rcu_adopt_orphan_cbs(struct rcu_state *rsp, unsigned long flags) 2139static void rcu_adopt_orphan_cbs(struct rcu_state *rsp, unsigned long flags)
1952{ 2140{
1953 int i; 2141 int i;
1954 struct rcu_data *rdp = __this_cpu_ptr(rsp->rda); 2142 struct rcu_data *rdp = raw_cpu_ptr(rsp->rda);
1955 2143
1956 /* No-CBs CPUs are handled specially. */ 2144 /* No-CBs CPUs are handled specially. */
1957 if (rcu_nocb_adopt_orphan_cbs(rsp, rdp, flags)) 2145 if (rcu_nocb_adopt_orphan_cbs(rsp, rdp, flags))
@@ -2320,7 +2508,7 @@ static void force_quiescent_state(struct rcu_state *rsp)
2320 raw_spin_unlock_irqrestore(&rnp_old->lock, flags); 2508 raw_spin_unlock_irqrestore(&rnp_old->lock, flags);
2321 return; /* Someone beat us to it. */ 2509 return; /* Someone beat us to it. */
2322 } 2510 }
2323 rsp->gp_flags |= RCU_GP_FLAG_FQS; 2511 ACCESS_ONCE(rsp->gp_flags) |= RCU_GP_FLAG_FQS;
2324 raw_spin_unlock_irqrestore(&rnp_old->lock, flags); 2512 raw_spin_unlock_irqrestore(&rnp_old->lock, flags);
2325 wake_up(&rsp->gp_wq); /* Memory barrier implied by wake_up() path. */ 2513 wake_up(&rsp->gp_wq); /* Memory barrier implied by wake_up() path. */
2326} 2514}
@@ -2334,7 +2522,8 @@ static void
2334__rcu_process_callbacks(struct rcu_state *rsp) 2522__rcu_process_callbacks(struct rcu_state *rsp)
2335{ 2523{
2336 unsigned long flags; 2524 unsigned long flags;
2337 struct rcu_data *rdp = __this_cpu_ptr(rsp->rda); 2525 bool needwake;
2526 struct rcu_data *rdp = raw_cpu_ptr(rsp->rda);
2338 2527
2339 WARN_ON_ONCE(rdp->beenonline == 0); 2528 WARN_ON_ONCE(rdp->beenonline == 0);
2340 2529
@@ -2345,8 +2534,10 @@ __rcu_process_callbacks(struct rcu_state *rsp)
2345 local_irq_save(flags); 2534 local_irq_save(flags);
2346 if (cpu_needs_another_gp(rsp, rdp)) { 2535 if (cpu_needs_another_gp(rsp, rdp)) {
2347 raw_spin_lock(&rcu_get_root(rsp)->lock); /* irqs disabled. */ 2536 raw_spin_lock(&rcu_get_root(rsp)->lock); /* irqs disabled. */
2348 rcu_start_gp(rsp); 2537 needwake = rcu_start_gp(rsp);
2349 raw_spin_unlock_irqrestore(&rcu_get_root(rsp)->lock, flags); 2538 raw_spin_unlock_irqrestore(&rcu_get_root(rsp)->lock, flags);
2539 if (needwake)
2540 rcu_gp_kthread_wake(rsp);
2350 } else { 2541 } else {
2351 local_irq_restore(flags); 2542 local_irq_restore(flags);
2352 } 2543 }
@@ -2404,6 +2595,8 @@ static void invoke_rcu_core(void)
2404static void __call_rcu_core(struct rcu_state *rsp, struct rcu_data *rdp, 2595static void __call_rcu_core(struct rcu_state *rsp, struct rcu_data *rdp,
2405 struct rcu_head *head, unsigned long flags) 2596 struct rcu_head *head, unsigned long flags)
2406{ 2597{
2598 bool needwake;
2599
2407 /* 2600 /*
2408 * If called from an extended quiescent state, invoke the RCU 2601 * If called from an extended quiescent state, invoke the RCU
2409 * core in order to force a re-evaluation of RCU's idleness. 2602 * core in order to force a re-evaluation of RCU's idleness.
@@ -2433,8 +2626,10 @@ static void __call_rcu_core(struct rcu_state *rsp, struct rcu_data *rdp,
2433 2626
2434 raw_spin_lock(&rnp_root->lock); 2627 raw_spin_lock(&rnp_root->lock);
2435 smp_mb__after_unlock_lock(); 2628 smp_mb__after_unlock_lock();
2436 rcu_start_gp(rsp); 2629 needwake = rcu_start_gp(rsp);
2437 raw_spin_unlock(&rnp_root->lock); 2630 raw_spin_unlock(&rnp_root->lock);
2631 if (needwake)
2632 rcu_gp_kthread_wake(rsp);
2438 } else { 2633 } else {
2439 /* Give the grace period a kick. */ 2634 /* Give the grace period a kick. */
2440 rdp->blimit = LONG_MAX; 2635 rdp->blimit = LONG_MAX;
@@ -2537,6 +2732,20 @@ void call_rcu_bh(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
2537EXPORT_SYMBOL_GPL(call_rcu_bh); 2732EXPORT_SYMBOL_GPL(call_rcu_bh);
2538 2733
2539/* 2734/*
2735 * Queue an RCU callback for lazy invocation after a grace period.
2736 * This will likely be later named something like "call_rcu_lazy()",
2737 * but this change will require some way of tagging the lazy RCU
2738 * callbacks in the list of pending callbacks. Until then, this
2739 * function may only be called from __kfree_rcu().
2740 */
2741void kfree_call_rcu(struct rcu_head *head,
2742 void (*func)(struct rcu_head *rcu))
2743{
2744 __call_rcu(head, func, rcu_state_p, -1, 1);
2745}
2746EXPORT_SYMBOL_GPL(kfree_call_rcu);
2747
2748/*
2540 * Because a context switch is a grace period for RCU-sched and RCU-bh, 2749 * Because a context switch is a grace period for RCU-sched and RCU-bh,
2541 * any blocking grace-period wait automatically implies a grace period 2750 * any blocking grace-period wait automatically implies a grace period
2542 * if there is only one CPU online at any point in time during execution 2751
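
kfree_call_rcu() moves here from kernel/rcu/tree_plugin.h (both flavor-specific copies are deleted further down) and now queues on rcu_state_p, so one definition serves preemptible and non-preemptible builds alike. The callback convention it depends on, an rcu_head embedded in the object and recovered with container_of(), is sketched below in plain userspace C; the "grace period" is faked with an immediate call purely to keep the example self-contained, so this is not the kernel API:

/* Illustrative only: a local stand-in for rcu_head, not the kernel type. */
#include <stddef.h>
#include <stdio.h>
#include <stdlib.h>

struct rcu_head { void (*func)(struct rcu_head *); };

#define container_of(ptr, type, member) \
        ((type *)((char *)(ptr) - offsetof(type, member)))

struct item {
        int key;
        struct rcu_head rcu;            /* embedded callback handle */
};

static void item_free_cb(struct rcu_head *head)
{
        struct item *it = container_of(head, struct item, rcu);

        printf("freeing item %d after fake grace period\n", it->key);
        free(it);
}

/* Stand-in for call_rcu(): a real implementation would defer func until
 * all pre-existing readers had finished. Immediate invocation is used
 * here only to keep the sketch runnable. */
static void fake_call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *))
{
        head->func = func;
        head->func(head);
}

int main(void)
{
        struct item *it = malloc(sizeof(*it));

        if (!it)
                return 1;
        it->key = 42;
        fake_call_rcu(&it->rcu, item_free_cb);
        return 0;
}
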
@@ -2659,7 +2868,7 @@ unsigned long get_state_synchronize_rcu(void)
2659 * time-consuming work between get_state_synchronize_rcu() 2868 * time-consuming work between get_state_synchronize_rcu()
2660 * and cond_synchronize_rcu(). 2869 * and cond_synchronize_rcu().
2661 */ 2870 */
2662 return smp_load_acquire(&rcu_state->gpnum); 2871 return smp_load_acquire(&rcu_state_p->gpnum);
2663} 2872}
2664EXPORT_SYMBOL_GPL(get_state_synchronize_rcu); 2873EXPORT_SYMBOL_GPL(get_state_synchronize_rcu);
2665 2874
@@ -2685,7 +2894,7 @@ void cond_synchronize_rcu(unsigned long oldstate)
2685 * Ensure that this load happens before any RCU-destructive 2894 * Ensure that this load happens before any RCU-destructive
2686 * actions the caller might carry out after we return. 2895 * actions the caller might carry out after we return.
2687 */ 2896 */
2688 newstate = smp_load_acquire(&rcu_state->completed); 2897 newstate = smp_load_acquire(&rcu_state_p->completed);
2689 if (ULONG_CMP_GE(oldstate, newstate)) 2898 if (ULONG_CMP_GE(oldstate, newstate))
2690 synchronize_rcu(); 2899 synchronize_rcu();
2691} 2900}
@@ -2790,7 +2999,7 @@ void synchronize_sched_expedited(void)
2790 s = atomic_long_read(&rsp->expedited_done); 2999 s = atomic_long_read(&rsp->expedited_done);
2791 if (ULONG_CMP_GE((ulong)s, (ulong)firstsnap)) { 3000 if (ULONG_CMP_GE((ulong)s, (ulong)firstsnap)) {
2792 /* ensure test happens before caller kfree */ 3001 /* ensure test happens before caller kfree */
2793 smp_mb__before_atomic_inc(); /* ^^^ */ 3002 smp_mb__before_atomic(); /* ^^^ */
2794 atomic_long_inc(&rsp->expedited_workdone1); 3003 atomic_long_inc(&rsp->expedited_workdone1);
2795 return; 3004 return;
2796 } 3005 }
@@ -2808,7 +3017,7 @@ void synchronize_sched_expedited(void)
2808 s = atomic_long_read(&rsp->expedited_done); 3017 s = atomic_long_read(&rsp->expedited_done);
2809 if (ULONG_CMP_GE((ulong)s, (ulong)firstsnap)) { 3018 if (ULONG_CMP_GE((ulong)s, (ulong)firstsnap)) {
2810 /* ensure test happens before caller kfree */ 3019 /* ensure test happens before caller kfree */
2811 smp_mb__before_atomic_inc(); /* ^^^ */ 3020 smp_mb__before_atomic(); /* ^^^ */
2812 atomic_long_inc(&rsp->expedited_workdone2); 3021 atomic_long_inc(&rsp->expedited_workdone2);
2813 return; 3022 return;
2814 } 3023 }
@@ -2837,7 +3046,7 @@ void synchronize_sched_expedited(void)
2837 s = atomic_long_read(&rsp->expedited_done); 3046 s = atomic_long_read(&rsp->expedited_done);
2838 if (ULONG_CMP_GE((ulong)s, (ulong)snap)) { 3047 if (ULONG_CMP_GE((ulong)s, (ulong)snap)) {
2839 /* ensure test happens before caller kfree */ 3048 /* ensure test happens before caller kfree */
2840 smp_mb__before_atomic_inc(); /* ^^^ */ 3049 smp_mb__before_atomic(); /* ^^^ */
2841 atomic_long_inc(&rsp->expedited_done_lost); 3050 atomic_long_inc(&rsp->expedited_done_lost);
2842 break; 3051 break;
2843 } 3052 }
@@ -2988,7 +3197,7 @@ static void rcu_barrier_callback(struct rcu_head *rhp)
2988static void rcu_barrier_func(void *type) 3197static void rcu_barrier_func(void *type)
2989{ 3198{
2990 struct rcu_state *rsp = type; 3199 struct rcu_state *rsp = type;
2991 struct rcu_data *rdp = __this_cpu_ptr(rsp->rda); 3200 struct rcu_data *rdp = raw_cpu_ptr(rsp->rda);
2992 3201
2993 _rcu_barrier_trace(rsp, "IRQ", -1, rsp->n_barrier_done); 3202 _rcu_barrier_trace(rsp, "IRQ", -1, rsp->n_barrier_done);
2994 atomic_inc(&rsp->barrier_cpu_count); 3203 atomic_inc(&rsp->barrier_cpu_count);
@@ -3160,7 +3369,7 @@ rcu_boot_init_percpu_data(int cpu, struct rcu_state *rsp)
3160 * that this CPU cannot possibly have any RCU callbacks in flight yet. 3369 * that this CPU cannot possibly have any RCU callbacks in flight yet.
3161 */ 3370 */
3162static void 3371static void
3163rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptible) 3372rcu_init_percpu_data(int cpu, struct rcu_state *rsp)
3164{ 3373{
3165 unsigned long flags; 3374 unsigned long flags;
3166 unsigned long mask; 3375 unsigned long mask;
@@ -3173,7 +3382,6 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptible)
3173 /* Set up local state, ensuring consistent view of global state. */ 3382 /* Set up local state, ensuring consistent view of global state. */
3174 raw_spin_lock_irqsave(&rnp->lock, flags); 3383 raw_spin_lock_irqsave(&rnp->lock, flags);
3175 rdp->beenonline = 1; /* We have now been online. */ 3384 rdp->beenonline = 1; /* We have now been online. */
3176 rdp->preemptible = preemptible;
3177 rdp->qlen_last_fqs_check = 0; 3385 rdp->qlen_last_fqs_check = 0;
3178 rdp->n_force_qs_snap = rsp->n_force_qs; 3386 rdp->n_force_qs_snap = rsp->n_force_qs;
3179 rdp->blimit = blimit; 3387 rdp->blimit = blimit;
@@ -3217,8 +3425,7 @@ static void rcu_prepare_cpu(int cpu)
3217 struct rcu_state *rsp; 3425 struct rcu_state *rsp;
3218 3426
3219 for_each_rcu_flavor(rsp) 3427 for_each_rcu_flavor(rsp)
3220 rcu_init_percpu_data(cpu, rsp, 3428 rcu_init_percpu_data(cpu, rsp);
3221 strcmp(rsp->name, "rcu_preempt") == 0);
3222} 3429}
3223 3430
3224/* 3431/*
@@ -3228,7 +3435,7 @@ static int rcu_cpu_notify(struct notifier_block *self,
3228 unsigned long action, void *hcpu) 3435 unsigned long action, void *hcpu)
3229{ 3436{
3230 long cpu = (long)hcpu; 3437 long cpu = (long)hcpu;
3231 struct rcu_data *rdp = per_cpu_ptr(rcu_state->rda, cpu); 3438 struct rcu_data *rdp = per_cpu_ptr(rcu_state_p->rda, cpu);
3232 struct rcu_node *rnp = rdp->mynode; 3439 struct rcu_node *rnp = rdp->mynode;
3233 struct rcu_state *rsp; 3440 struct rcu_state *rsp;
3234 3441
@@ -3365,6 +3572,7 @@ static void __init rcu_init_one(struct rcu_state *rsp,
3365 "rcu_node_fqs_1", 3572 "rcu_node_fqs_1",
3366 "rcu_node_fqs_2", 3573 "rcu_node_fqs_2",
3367 "rcu_node_fqs_3" }; /* Match MAX_RCU_LVLS */ 3574 "rcu_node_fqs_3" }; /* Match MAX_RCU_LVLS */
3575 static u8 fl_mask = 0x1;
3368 int cpustride = 1; 3576 int cpustride = 1;
3369 int i; 3577 int i;
3370 int j; 3578 int j;
@@ -3383,6 +3591,8 @@ static void __init rcu_init_one(struct rcu_state *rsp,
3383 for (i = 1; i < rcu_num_lvls; i++) 3591 for (i = 1; i < rcu_num_lvls; i++)
3384 rsp->level[i] = rsp->level[i - 1] + rsp->levelcnt[i - 1]; 3592 rsp->level[i] = rsp->level[i - 1] + rsp->levelcnt[i - 1];
3385 rcu_init_levelspread(rsp); 3593 rcu_init_levelspread(rsp);
3594 rsp->flavor_mask = fl_mask;
3595 fl_mask <<= 1;
3386 3596
3387 /* Initialize the elements themselves, starting from the leaves. */ 3597 /* Initialize the elements themselves, starting from the leaves. */
3388 3598
@@ -3402,8 +3612,8 @@ static void __init rcu_init_one(struct rcu_state *rsp,
3402 rnp->qsmaskinit = 0; 3612 rnp->qsmaskinit = 0;
3403 rnp->grplo = j * cpustride; 3613 rnp->grplo = j * cpustride;
3404 rnp->grphi = (j + 1) * cpustride - 1; 3614 rnp->grphi = (j + 1) * cpustride - 1;
3405 if (rnp->grphi >= NR_CPUS) 3615 if (rnp->grphi >= nr_cpu_ids)
3406 rnp->grphi = NR_CPUS - 1; 3616 rnp->grphi = nr_cpu_ids - 1;
3407 if (i == 0) { 3617 if (i == 0) {
3408 rnp->grpnum = 0; 3618 rnp->grpnum = 0;
3409 rnp->grpmask = 0; 3619 rnp->grpmask = 0;
@@ -3422,7 +3632,6 @@ static void __init rcu_init_one(struct rcu_state *rsp,
3422 3632
3423 rsp->rda = rda; 3633 rsp->rda = rda;
3424 init_waitqueue_head(&rsp->gp_wq); 3634 init_waitqueue_head(&rsp->gp_wq);
3425 init_irq_work(&rsp->wakeup_work, rsp_wakeup);
3426 rnp = rsp->level[rcu_num_lvls - 1]; 3635 rnp = rsp->level[rcu_num_lvls - 1];
3427 for_each_possible_cpu(i) { 3636 for_each_possible_cpu(i) {
3428 while (i > rnp->grphi) 3637 while (i > rnp->grphi)
diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h
index 75dc3c39a02a..0f69a79c5b7d 100644
--- a/kernel/rcu/tree.h
+++ b/kernel/rcu/tree.h
@@ -252,7 +252,6 @@ struct rcu_data {
252 bool passed_quiesce; /* User-mode/idle loop etc. */ 252 bool passed_quiesce; /* User-mode/idle loop etc. */
253 bool qs_pending; /* Core waits for quiesc state. */ 253 bool qs_pending; /* Core waits for quiesc state. */
254 bool beenonline; /* CPU online at least once. */ 254 bool beenonline; /* CPU online at least once. */
255 bool preemptible; /* Preemptible RCU? */
256 struct rcu_node *mynode; /* This CPU's leaf of hierarchy */ 255 struct rcu_node *mynode; /* This CPU's leaf of hierarchy */
257 unsigned long grpmask; /* Mask to apply to leaf qsmask. */ 256 unsigned long grpmask; /* Mask to apply to leaf qsmask. */
258#ifdef CONFIG_RCU_CPU_STALL_INFO 257#ifdef CONFIG_RCU_CPU_STALL_INFO
@@ -308,6 +307,9 @@ struct rcu_data {
308 /* 4) reasons this CPU needed to be kicked by force_quiescent_state */ 307 /* 4) reasons this CPU needed to be kicked by force_quiescent_state */
309 unsigned long dynticks_fqs; /* Kicked due to dynticks idle. */ 308 unsigned long dynticks_fqs; /* Kicked due to dynticks idle. */
310 unsigned long offline_fqs; /* Kicked due to being offline. */ 309 unsigned long offline_fqs; /* Kicked due to being offline. */
310 unsigned long cond_resched_completed;
311 /* Grace period that needs help */
312 /* from cond_resched(). */
311 313
312 /* 5) __rcu_pending() statistics. */ 314 /* 5) __rcu_pending() statistics. */
313 unsigned long n_rcu_pending; /* rcu_pending() calls since boot. */ 315 unsigned long n_rcu_pending; /* rcu_pending() calls since boot. */
@@ -393,6 +395,7 @@ struct rcu_state {
393 struct rcu_node *level[RCU_NUM_LVLS]; /* Hierarchy levels. */ 395 struct rcu_node *level[RCU_NUM_LVLS]; /* Hierarchy levels. */
394 u32 levelcnt[MAX_RCU_LVLS + 1]; /* # nodes in each level. */ 396 u32 levelcnt[MAX_RCU_LVLS + 1]; /* # nodes in each level. */
395 u8 levelspread[RCU_NUM_LVLS]; /* kids/node in each level. */ 397 u8 levelspread[RCU_NUM_LVLS]; /* kids/node in each level. */
398 u8 flavor_mask; /* bit in flavor mask. */
396 struct rcu_data __percpu *rda; /* pointer of percu rcu_data. */ 399 struct rcu_data __percpu *rda; /* pointer of percu rcu_data. */
397 void (*call)(struct rcu_head *head, /* call_rcu() flavor. */ 400 void (*call)(struct rcu_head *head, /* call_rcu() flavor. */
398 void (*func)(struct rcu_head *head)); 401 void (*func)(struct rcu_head *head));
@@ -406,7 +409,8 @@ struct rcu_state {
406 unsigned long completed; /* # of last completed gp. */ 409 unsigned long completed; /* # of last completed gp. */
407 struct task_struct *gp_kthread; /* Task for grace periods. */ 410 struct task_struct *gp_kthread; /* Task for grace periods. */
408 wait_queue_head_t gp_wq; /* Where GP task waits. */ 411 wait_queue_head_t gp_wq; /* Where GP task waits. */
409 int gp_flags; /* Commands for GP task. */ 412 short gp_flags; /* Commands for GP task. */
413 short gp_state; /* GP kthread sleep state. */
410 414
411 /* End of fields guarded by root rcu_node's lock. */ 415 /* End of fields guarded by root rcu_node's lock. */
412 416
@@ -462,13 +466,17 @@ struct rcu_state {
462 const char *name; /* Name of structure. */ 466 const char *name; /* Name of structure. */
463 char abbr; /* Abbreviated name. */ 467 char abbr; /* Abbreviated name. */
464 struct list_head flavors; /* List of RCU flavors. */ 468 struct list_head flavors; /* List of RCU flavors. */
465 struct irq_work wakeup_work; /* Postponed wakeups */
466}; 469};
467 470
468/* Values for rcu_state structure's gp_flags field. */ 471/* Values for rcu_state structure's gp_flags field. */
469#define RCU_GP_FLAG_INIT 0x1 /* Need grace-period initialization. */ 472#define RCU_GP_FLAG_INIT 0x1 /* Need grace-period initialization. */
470#define RCU_GP_FLAG_FQS 0x2 /* Need grace-period quiescent-state forcing. */ 473#define RCU_GP_FLAG_FQS 0x2 /* Need grace-period quiescent-state forcing. */
471 474
475/* Values for rcu_state structure's gp_state field. */
476#define RCU_GP_WAIT_INIT 0 /* Initial state. */
477#define RCU_GP_WAIT_GPS 1 /* Wait for grace-period start. */
478#define RCU_GP_WAIT_FQS 2 /* Wait for force-quiescent-state time. */
479
472extern struct list_head rcu_struct_flavors; 480extern struct list_head rcu_struct_flavors;
473 481
474/* Sequence through rcu_state structures for each RCU flavor. */ 482/* Sequence through rcu_state structures for each RCU flavor. */
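
The new ->gp_state field, set to RCU_GP_WAIT_GPS or RCU_GP_WAIT_FQS in rcu_gp_kthread() just before each sleep, records which wait the grace-period kthread is blocked in, so a stalled kthread can be diagnosed. A loose userspace analogue of publishing a thread's sleep state, with invented names:

/* Illustrative: a worker publishes its sleep state in an atomic int so a
 * watchdog (or debugger) can tell which wait it is stuck in. Only the
 * idea mirrors ->gp_state; the names here are made up. */
#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>
#include <unistd.h>

enum worker_state { W_INIT, W_WAIT_REQUEST };

static _Atomic int worker_state = W_INIT;
static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t  wq   = PTHREAD_COND_INITIALIZER;
static int request;

static void *worker(void *arg)
{
        pthread_mutex_lock(&lock);
        worker_state = W_WAIT_REQUEST;          /* record state before sleeping */
        while (!request)
                pthread_cond_wait(&wq, &lock);
        pthread_mutex_unlock(&lock);
        return arg;
}

int main(void)
{
        pthread_t tid;

        pthread_create(&tid, NULL, worker, NULL);
        sleep(1);
        printf("watchdog: worker_state=%d\n", (int)worker_state);  /* 1 == W_WAIT_REQUEST */

        pthread_mutex_lock(&lock);
        request = 1;
        pthread_mutex_unlock(&lock);
        pthread_cond_signal(&wq);
        pthread_join(tid, NULL);
        return 0;
}
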
@@ -547,7 +555,6 @@ static void print_cpu_stall_info(struct rcu_state *rsp, int cpu);
547static void print_cpu_stall_info_end(void); 555static void print_cpu_stall_info_end(void);
548static void zero_cpu_stall_ticks(struct rcu_data *rdp); 556static void zero_cpu_stall_ticks(struct rcu_data *rdp);
549static void increment_cpu_stall_ticks(void); 557static void increment_cpu_stall_ticks(void);
550static int rcu_nocb_needs_gp(struct rcu_state *rsp);
551static void rcu_nocb_gp_set(struct rcu_node *rnp, int nrq); 558static void rcu_nocb_gp_set(struct rcu_node *rnp, int nrq);
552static void rcu_nocb_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp); 559static void rcu_nocb_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp);
553static void rcu_init_one_nocb(struct rcu_node *rnp); 560static void rcu_init_one_nocb(struct rcu_node *rnp);
@@ -560,7 +567,7 @@ static bool rcu_nocb_need_deferred_wakeup(struct rcu_data *rdp);
560static void do_nocb_deferred_wakeup(struct rcu_data *rdp); 567static void do_nocb_deferred_wakeup(struct rcu_data *rdp);
561static void rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp); 568static void rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp);
562static void rcu_spawn_nocb_kthreads(struct rcu_state *rsp); 569static void rcu_spawn_nocb_kthreads(struct rcu_state *rsp);
563static void rcu_kick_nohz_cpu(int cpu); 570static void __maybe_unused rcu_kick_nohz_cpu(int cpu);
564static bool init_nocb_callback_list(struct rcu_data *rdp); 571static bool init_nocb_callback_list(struct rcu_data *rdp);
565static void rcu_sysidle_enter(struct rcu_dynticks *rdtp, int irq); 572static void rcu_sysidle_enter(struct rcu_dynticks *rdtp, int irq);
566static void rcu_sysidle_exit(struct rcu_dynticks *rdtp, int irq); 573static void rcu_sysidle_exit(struct rcu_dynticks *rdtp, int irq);
diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
index 962d1d589929..02ac0fb186b8 100644
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -116,7 +116,7 @@ static void __init rcu_bootup_announce_oddness(void)
116#ifdef CONFIG_TREE_PREEMPT_RCU 116#ifdef CONFIG_TREE_PREEMPT_RCU
117 117
118RCU_STATE_INITIALIZER(rcu_preempt, 'p', call_rcu); 118RCU_STATE_INITIALIZER(rcu_preempt, 'p', call_rcu);
119static struct rcu_state *rcu_state = &rcu_preempt_state; 119static struct rcu_state *rcu_state_p = &rcu_preempt_state;
120 120
121static int rcu_preempted_readers_exp(struct rcu_node *rnp); 121static int rcu_preempted_readers_exp(struct rcu_node *rnp);
122 122
@@ -149,15 +149,6 @@ long rcu_batches_completed(void)
149EXPORT_SYMBOL_GPL(rcu_batches_completed); 149EXPORT_SYMBOL_GPL(rcu_batches_completed);
150 150
151/* 151/*
152 * Force a quiescent state for preemptible RCU.
153 */
154void rcu_force_quiescent_state(void)
155{
156 force_quiescent_state(&rcu_preempt_state);
157}
158EXPORT_SYMBOL_GPL(rcu_force_quiescent_state);
159
160/*
161 * Record a preemptible-RCU quiescent state for the specified CPU. Note 152 * Record a preemptible-RCU quiescent state for the specified CPU. Note
162 * that this just means that the task currently running on the CPU is 153 * that this just means that the task currently running on the CPU is
163 * not in a quiescent state. There might be any number of tasks blocked 154 * not in a quiescent state. There might be any number of tasks blocked
@@ -688,20 +679,6 @@ void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
688} 679}
689EXPORT_SYMBOL_GPL(call_rcu); 680EXPORT_SYMBOL_GPL(call_rcu);
690 681
691/*
692 * Queue an RCU callback for lazy invocation after a grace period.
693 * This will likely be later named something like "call_rcu_lazy()",
694 * but this change will require some way of tagging the lazy RCU
695 * callbacks in the list of pending callbacks. Until then, this
696 * function may only be called from __kfree_rcu().
697 */
698void kfree_call_rcu(struct rcu_head *head,
699 void (*func)(struct rcu_head *rcu))
700{
701 __call_rcu(head, func, &rcu_preempt_state, -1, 1);
702}
703EXPORT_SYMBOL_GPL(kfree_call_rcu);
704
705/** 682/**
706 * synchronize_rcu - wait until a grace period has elapsed. 683 * synchronize_rcu - wait until a grace period has elapsed.
707 * 684 *
@@ -970,7 +947,7 @@ void exit_rcu(void)
970 947
971#else /* #ifdef CONFIG_TREE_PREEMPT_RCU */ 948#else /* #ifdef CONFIG_TREE_PREEMPT_RCU */
972 949
973static struct rcu_state *rcu_state = &rcu_sched_state; 950static struct rcu_state *rcu_state_p = &rcu_sched_state;
974 951
975/* 952/*
976 * Tell them what RCU they are running. 953 * Tell them what RCU they are running.
@@ -991,16 +968,6 @@ long rcu_batches_completed(void)
991EXPORT_SYMBOL_GPL(rcu_batches_completed); 968EXPORT_SYMBOL_GPL(rcu_batches_completed);
992 969
993/* 970/*
994 * Force a quiescent state for RCU, which, because there is no preemptible
995 * RCU, becomes the same as rcu-sched.
996 */
997void rcu_force_quiescent_state(void)
998{
999 rcu_sched_force_quiescent_state();
1000}
1001EXPORT_SYMBOL_GPL(rcu_force_quiescent_state);
1002
1003/*
1004 * Because preemptible RCU does not exist, we never have to check for 971 * Because preemptible RCU does not exist, we never have to check for
1005 * CPUs being in quiescent states. 972 * CPUs being in quiescent states.
1006 */ 973 */
@@ -1080,22 +1047,6 @@ static void rcu_preempt_check_callbacks(int cpu)
1080} 1047}
1081 1048
1082/* 1049/*
1083 * Queue an RCU callback for lazy invocation after a grace period.
1084 * This will likely be later named something like "call_rcu_lazy()",
1085 * but this change will require some way of tagging the lazy RCU
1086 * callbacks in the list of pending callbacks. Until then, this
1087 * function may only be called from __kfree_rcu().
1088 *
1089 * Because there is no preemptible RCU, we use RCU-sched instead.
1090 */
1091void kfree_call_rcu(struct rcu_head *head,
1092 void (*func)(struct rcu_head *rcu))
1093{
1094 __call_rcu(head, func, &rcu_sched_state, -1, 1);
1095}
1096EXPORT_SYMBOL_GPL(kfree_call_rcu);
1097
1098/*
1099 * Wait for an rcu-preempt grace period, but make it happen quickly. 1050 * Wait for an rcu-preempt grace period, but make it happen quickly.
1100 * But because preemptible RCU does not exist, map to rcu-sched. 1051 * But because preemptible RCU does not exist, map to rcu-sched.
1101 */ 1052 */
@@ -1517,11 +1468,11 @@ static int __init rcu_spawn_kthreads(void)
1517 for_each_possible_cpu(cpu) 1468 for_each_possible_cpu(cpu)
1518 per_cpu(rcu_cpu_has_work, cpu) = 0; 1469 per_cpu(rcu_cpu_has_work, cpu) = 0;
1519 BUG_ON(smpboot_register_percpu_thread(&rcu_cpu_thread_spec)); 1470 BUG_ON(smpboot_register_percpu_thread(&rcu_cpu_thread_spec));
1520 rnp = rcu_get_root(rcu_state); 1471 rnp = rcu_get_root(rcu_state_p);
1521 (void)rcu_spawn_one_boost_kthread(rcu_state, rnp); 1472 (void)rcu_spawn_one_boost_kthread(rcu_state_p, rnp);
1522 if (NUM_RCU_NODES > 1) { 1473 if (NUM_RCU_NODES > 1) {
1523 rcu_for_each_leaf_node(rcu_state, rnp) 1474 rcu_for_each_leaf_node(rcu_state_p, rnp)
1524 (void)rcu_spawn_one_boost_kthread(rcu_state, rnp); 1475 (void)rcu_spawn_one_boost_kthread(rcu_state_p, rnp);
1525 } 1476 }
1526 return 0; 1477 return 0;
1527} 1478}
@@ -1529,12 +1480,12 @@ early_initcall(rcu_spawn_kthreads);
1529 1480
1530static void rcu_prepare_kthreads(int cpu) 1481static void rcu_prepare_kthreads(int cpu)
1531{ 1482{
1532 struct rcu_data *rdp = per_cpu_ptr(rcu_state->rda, cpu); 1483 struct rcu_data *rdp = per_cpu_ptr(rcu_state_p->rda, cpu);
1533 struct rcu_node *rnp = rdp->mynode; 1484 struct rcu_node *rnp = rdp->mynode;
1534 1485
1535 /* Fire up the incoming CPU's kthread and leaf rcu_node kthread. */ 1486 /* Fire up the incoming CPU's kthread and leaf rcu_node kthread. */
1536 if (rcu_scheduler_fully_active) 1487 if (rcu_scheduler_fully_active)
1537 (void)rcu_spawn_one_boost_kthread(rcu_state, rnp); 1488 (void)rcu_spawn_one_boost_kthread(rcu_state_p, rnp);
1538} 1489}
1539 1490
1540#else /* #ifdef CONFIG_RCU_BOOST */ 1491#else /* #ifdef CONFIG_RCU_BOOST */
@@ -1744,6 +1695,7 @@ int rcu_needs_cpu(int cpu, unsigned long *dj)
1744static void rcu_prepare_for_idle(int cpu) 1695static void rcu_prepare_for_idle(int cpu)
1745{ 1696{
1746#ifndef CONFIG_RCU_NOCB_CPU_ALL 1697#ifndef CONFIG_RCU_NOCB_CPU_ALL
1698 bool needwake;
1747 struct rcu_data *rdp; 1699 struct rcu_data *rdp;
1748 struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu); 1700 struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu);
1749 struct rcu_node *rnp; 1701 struct rcu_node *rnp;
@@ -1792,8 +1744,10 @@ static void rcu_prepare_for_idle(int cpu)
1792 rnp = rdp->mynode; 1744 rnp = rdp->mynode;
1793 raw_spin_lock(&rnp->lock); /* irqs already disabled. */ 1745 raw_spin_lock(&rnp->lock); /* irqs already disabled. */
1794 smp_mb__after_unlock_lock(); 1746 smp_mb__after_unlock_lock();
1795 rcu_accelerate_cbs(rsp, rnp, rdp); 1747 needwake = rcu_accelerate_cbs(rsp, rnp, rdp);
1796 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ 1748 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
1749 if (needwake)
1750 rcu_gp_kthread_wake(rsp);
1797 } 1751 }
1798#endif /* #ifndef CONFIG_RCU_NOCB_CPU_ALL */ 1752#endif /* #ifndef CONFIG_RCU_NOCB_CPU_ALL */
1799} 1753}
@@ -1855,7 +1809,7 @@ static void rcu_oom_notify_cpu(void *unused)
1855 struct rcu_data *rdp; 1809 struct rcu_data *rdp;
1856 1810
1857 for_each_rcu_flavor(rsp) { 1811 for_each_rcu_flavor(rsp) {
1858 rdp = __this_cpu_ptr(rsp->rda); 1812 rdp = raw_cpu_ptr(rsp->rda);
1859 if (rdp->qlen_lazy != 0) { 1813 if (rdp->qlen_lazy != 0) {
1860 atomic_inc(&oom_callback_count); 1814 atomic_inc(&oom_callback_count);
1861 rsp->call(&rdp->oom_head, rcu_oom_callback); 1815 rsp->call(&rdp->oom_head, rcu_oom_callback);
@@ -1997,7 +1951,7 @@ static void increment_cpu_stall_ticks(void)
1997 struct rcu_state *rsp; 1951 struct rcu_state *rsp;
1998 1952
1999 for_each_rcu_flavor(rsp) 1953 for_each_rcu_flavor(rsp)
2000 __this_cpu_ptr(rsp->rda)->ticks_this_gp++; 1954 raw_cpu_inc(rsp->rda->ticks_this_gp);
2001} 1955}
2002 1956
2003#else /* #ifdef CONFIG_RCU_CPU_STALL_INFO */ 1957#else /* #ifdef CONFIG_RCU_CPU_STALL_INFO */
@@ -2068,19 +2022,6 @@ static int __init parse_rcu_nocb_poll(char *arg)
2068early_param("rcu_nocb_poll", parse_rcu_nocb_poll); 2022early_param("rcu_nocb_poll", parse_rcu_nocb_poll);
2069 2023
2070/* 2024/*
2071 * Do any no-CBs CPUs need another grace period?
2072 *
2073 * Interrupts must be disabled. If the caller does not hold the root
2074 * rnp_node structure's ->lock, the results are advisory only.
2075 */
2076static int rcu_nocb_needs_gp(struct rcu_state *rsp)
2077{
2078 struct rcu_node *rnp = rcu_get_root(rsp);
2079
2080 return rnp->need_future_gp[(ACCESS_ONCE(rnp->completed) + 1) & 0x1];
2081}
2082
2083/*
2084 * Wake up any no-CBs CPUs' kthreads that were waiting on the just-ended 2025 * Wake up any no-CBs CPUs' kthreads that were waiting on the just-ended
2085 * grace period. 2026 * grace period.
2086 */ 2027 */
@@ -2109,7 +2050,7 @@ static void rcu_init_one_nocb(struct rcu_node *rnp)
2109} 2050}
2110 2051
2111#ifndef CONFIG_RCU_NOCB_CPU_ALL 2052#ifndef CONFIG_RCU_NOCB_CPU_ALL
2112/* Is the specified CPU a no-CPUs CPU? */ 2053/* Is the specified CPU a no-CBs CPU? */
2113bool rcu_is_nocb_cpu(int cpu) 2054bool rcu_is_nocb_cpu(int cpu)
2114{ 2055{
2115 if (have_rcu_nocb_mask) 2056 if (have_rcu_nocb_mask)
@@ -2243,12 +2184,15 @@ static void rcu_nocb_wait_gp(struct rcu_data *rdp)
2243 unsigned long c; 2184 unsigned long c;
2244 bool d; 2185 bool d;
2245 unsigned long flags; 2186 unsigned long flags;
2187 bool needwake;
2246 struct rcu_node *rnp = rdp->mynode; 2188 struct rcu_node *rnp = rdp->mynode;
2247 2189
2248 raw_spin_lock_irqsave(&rnp->lock, flags); 2190 raw_spin_lock_irqsave(&rnp->lock, flags);
2249 smp_mb__after_unlock_lock(); 2191 smp_mb__after_unlock_lock();
2250 c = rcu_start_future_gp(rnp, rdp); 2192 needwake = rcu_start_future_gp(rnp, rdp, &c);
2251 raw_spin_unlock_irqrestore(&rnp->lock, flags); 2193 raw_spin_unlock_irqrestore(&rnp->lock, flags);
2194 if (needwake)
2195 rcu_gp_kthread_wake(rdp->rsp);
2252 2196
2253 /* 2197 /*
2254 * Wait for the grace period. Do so interruptibly to avoid messing 2198 * Wait for the grace period. Do so interruptibly to avoid messing
@@ -2402,11 +2346,6 @@ static bool init_nocb_callback_list(struct rcu_data *rdp)
2402 2346
2403#else /* #ifdef CONFIG_RCU_NOCB_CPU */ 2347#else /* #ifdef CONFIG_RCU_NOCB_CPU */
2404 2348
2405static int rcu_nocb_needs_gp(struct rcu_state *rsp)
2406{
2407 return 0;
2408}
2409
2410static void rcu_nocb_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp) 2349static void rcu_nocb_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp)
2411{ 2350{
2412} 2351}
@@ -2465,7 +2404,7 @@ static bool init_nocb_callback_list(struct rcu_data *rdp)
2465 * if an adaptive-ticks CPU is failing to respond to the current grace 2404 * if an adaptive-ticks CPU is failing to respond to the current grace
2466 * period and has not been idle from an RCU perspective, kick it. 2405 * period and has not been idle from an RCU perspective, kick it.
2467 */ 2406 */
2468static void rcu_kick_nohz_cpu(int cpu) 2407static void __maybe_unused rcu_kick_nohz_cpu(int cpu)
2469{ 2408{
2470#ifdef CONFIG_NO_HZ_FULL 2409#ifdef CONFIG_NO_HZ_FULL
2471 if (tick_nohz_full_cpu(cpu)) 2410 if (tick_nohz_full_cpu(cpu))
@@ -2523,9 +2462,9 @@ static void rcu_sysidle_enter(struct rcu_dynticks *rdtp, int irq)
2523 /* Record start of fully idle period. */ 2462 /* Record start of fully idle period. */
2524 j = jiffies; 2463 j = jiffies;
2525 ACCESS_ONCE(rdtp->dynticks_idle_jiffies) = j; 2464 ACCESS_ONCE(rdtp->dynticks_idle_jiffies) = j;
2526 smp_mb__before_atomic_inc(); 2465 smp_mb__before_atomic();
2527 atomic_inc(&rdtp->dynticks_idle); 2466 atomic_inc(&rdtp->dynticks_idle);
2528 smp_mb__after_atomic_inc(); 2467 smp_mb__after_atomic();
2529 WARN_ON_ONCE(atomic_read(&rdtp->dynticks_idle) & 0x1); 2468 WARN_ON_ONCE(atomic_read(&rdtp->dynticks_idle) & 0x1);
2530} 2469}
2531 2470
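
The smp_mb__before_atomic_inc()/smp_mb__after_atomic_inc() pairs become the operation-agnostic smp_mb__before_atomic()/smp_mb__after_atomic(); the intent is unchanged, namely full ordering around an atomic increment that by itself carries none. In portable C11 roughly the same shape is an explicit fence on either side of a relaxed read-modify-write (a sketch, not the kernel primitives):

/* Illustrative C11 equivalent of "full barrier; relaxed increment; full
 * barrier": the increment itself is unordered, the fences provide ordering. */
#include <stdatomic.h>
#include <stdio.h>

static atomic_ulong dynticks_idle;

static void idle_enter(void)
{
        atomic_thread_fence(memory_order_seq_cst);              /* before_atomic */
        atomic_fetch_add_explicit(&dynticks_idle, 1, memory_order_relaxed);
        atomic_thread_fence(memory_order_seq_cst);              /* after_atomic */
}

int main(void)
{
        idle_enter();
        printf("counter=%lu\n", atomic_load(&dynticks_idle));
        return 0;
}
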
@@ -2590,9 +2529,9 @@ static void rcu_sysidle_exit(struct rcu_dynticks *rdtp, int irq)
2590 } 2529 }
2591 2530
2592 /* Record end of idle period. */ 2531 /* Record end of idle period. */
2593 smp_mb__before_atomic_inc(); 2532 smp_mb__before_atomic();
2594 atomic_inc(&rdtp->dynticks_idle); 2533 atomic_inc(&rdtp->dynticks_idle);
2595 smp_mb__after_atomic_inc(); 2534 smp_mb__after_atomic();
2596 WARN_ON_ONCE(!(atomic_read(&rdtp->dynticks_idle) & 0x1)); 2535 WARN_ON_ONCE(!(atomic_read(&rdtp->dynticks_idle) & 0x1));
2597 2536
2598 /* 2537 /*
@@ -2657,20 +2596,6 @@ static bool is_sysidle_rcu_state(struct rcu_state *rsp)
2657} 2596}
2658 2597
2659/* 2598/*
2660 * Bind the grace-period kthread for the sysidle flavor of RCU to the
2661 * timekeeping CPU.
2662 */
2663static void rcu_bind_gp_kthread(void)
2664{
2665 int cpu = ACCESS_ONCE(tick_do_timer_cpu);
2666
2667 if (cpu < 0 || cpu >= nr_cpu_ids)
2668 return;
2669 if (raw_smp_processor_id() != cpu)
2670 set_cpus_allowed_ptr(current, cpumask_of(cpu));
2671}
2672
2673/*
2674 * Return a delay in jiffies based on the number of CPUs, rcu_node 2599 * Return a delay in jiffies based on the number of CPUs, rcu_node
2675 * leaf fanout, and jiffies tick rate. The idea is to allow larger 2600 * leaf fanout, and jiffies tick rate. The idea is to allow larger
2676 * systems more time to transition to full-idle state in order to 2601 * systems more time to transition to full-idle state in order to
@@ -2734,7 +2659,8 @@ static void rcu_sysidle(unsigned long j)
2734static void rcu_sysidle_cancel(void) 2659static void rcu_sysidle_cancel(void)
2735{ 2660{
2736 smp_mb(); 2661 smp_mb();
2737 ACCESS_ONCE(full_sysidle_state) = RCU_SYSIDLE_NOT; 2662 if (full_sysidle_state > RCU_SYSIDLE_SHORT)
2663 ACCESS_ONCE(full_sysidle_state) = RCU_SYSIDLE_NOT;
2738} 2664}
2739 2665
2740/* 2666/*
@@ -2880,10 +2806,6 @@ static bool is_sysidle_rcu_state(struct rcu_state *rsp)
2880 return false; 2806 return false;
2881} 2807}
2882 2808
2883static void rcu_bind_gp_kthread(void)
2884{
2885}
2886
2887static void rcu_sysidle_report_gp(struct rcu_state *rsp, int isidle, 2809static void rcu_sysidle_report_gp(struct rcu_state *rsp, int isidle,
2888 unsigned long maxj) 2810 unsigned long maxj)
2889{ 2811{
@@ -2914,3 +2836,19 @@ static bool rcu_nohz_full_cpu(struct rcu_state *rsp)
2914#endif /* #ifdef CONFIG_NO_HZ_FULL */ 2836#endif /* #ifdef CONFIG_NO_HZ_FULL */
2915 return 0; 2837 return 0;
2916} 2838}
2839
2840/*
2841 * Bind the grace-period kthread for the sysidle flavor of RCU to the
2842 * timekeeping CPU.
2843 */
2844static void rcu_bind_gp_kthread(void)
2845{
2846#ifdef CONFIG_NO_HZ_FULL
2847 int cpu = ACCESS_ONCE(tick_do_timer_cpu);
2848
2849 if (cpu < 0 || cpu >= nr_cpu_ids)
2850 return;
2851 if (raw_smp_processor_id() != cpu)
2852 set_cpus_allowed_ptr(current, cpumask_of(cpu));
2853#endif /* #ifdef CONFIG_NO_HZ_FULL */
2854}
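
rcu_bind_gp_kthread() is consolidated into a single definition whose body is compiled out unless CONFIG_NO_HZ_FULL; when enabled, it pins the grace-period kthread to the timekeeping CPU. The userspace equivalent of that affinity pin, a Linux-specific sketch using pthread_setaffinity_np(), looks like this:

/* Illustrative: pin the calling thread to one CPU, roughly what the
 * kernel's set_cpus_allowed_ptr(current, cpumask_of(cpu)) achieves. */
#define _GNU_SOURCE
#include <pthread.h>
#include <sched.h>
#include <stdio.h>

static int bind_self_to_cpu(int cpu)
{
        cpu_set_t mask;

        CPU_ZERO(&mask);
        CPU_SET(cpu, &mask);
        return pthread_setaffinity_np(pthread_self(), sizeof(mask), &mask);
}

int main(void)
{
        int err = bind_self_to_cpu(0);

        if (err)
                fprintf(stderr, "pthread_setaffinity_np failed: %d\n", err);
        else
                printf("bound to CPU 0, now running on CPU %d\n", sched_getcpu());
        return err ? 1 : 0;
}
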
diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c
index 4c0a9b0af469..bc7883570530 100644
--- a/kernel/rcu/update.c
+++ b/kernel/rcu/update.c
@@ -200,12 +200,12 @@ void wait_rcu_gp(call_rcu_func_t crf)
200EXPORT_SYMBOL_GPL(wait_rcu_gp); 200EXPORT_SYMBOL_GPL(wait_rcu_gp);
201 201
202#ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD 202#ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD
203static inline void debug_init_rcu_head(struct rcu_head *head) 203void init_rcu_head(struct rcu_head *head)
204{ 204{
205 debug_object_init(head, &rcuhead_debug_descr); 205 debug_object_init(head, &rcuhead_debug_descr);
206} 206}
207 207
208static inline void debug_rcu_head_free(struct rcu_head *head) 208void destroy_rcu_head(struct rcu_head *head)
209{ 209{
210 debug_object_free(head, &rcuhead_debug_descr); 210 debug_object_free(head, &rcuhead_debug_descr);
211} 211}
@@ -320,6 +320,18 @@ int rcu_jiffies_till_stall_check(void)
320 return till_stall_check * HZ + RCU_STALL_DELAY_DELTA; 320 return till_stall_check * HZ + RCU_STALL_DELAY_DELTA;
321} 321}
322 322
323void rcu_sysrq_start(void)
324{
325 if (!rcu_cpu_stall_suppress)
326 rcu_cpu_stall_suppress = 2;
327}
328
329void rcu_sysrq_end(void)
330{
331 if (rcu_cpu_stall_suppress == 2)
332 rcu_cpu_stall_suppress = 0;
333}
334
323static int rcu_panic(struct notifier_block *this, unsigned long ev, void *ptr) 335static int rcu_panic(struct notifier_block *this, unsigned long ev, void *ptr)
324{ 336{
325 rcu_cpu_stall_suppress = 1; 337 rcu_cpu_stall_suppress = 1;
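
The new rcu_sysrq_start()/rcu_sysrq_end() pair treats rcu_cpu_stall_suppress as a small state machine: 0 leaves stall warnings enabled, the sysrq path sets 2 to mute them only for the duration of the dump, and the panic notifier just below sets 1 so a panic-time suppression is never undone by rcu_sysrq_end(). A standalone C model of that tri-state, with names of my own choosing:

#include <assert.h>

/* 0: warnings on, 1: suppressed for good (panic), 2: suppressed during a sysrq dump */
static int stall_suppress;

static void sysrq_start(void) { if (!stall_suppress) stall_suppress = 2; }
static void sysrq_end(void)   { if (stall_suppress == 2) stall_suppress = 0; }
static void on_panic(void)    { stall_suppress = 1; }

int main(void)
{
        sysrq_start();                  /* mute warnings while dumping state */
        assert(stall_suppress == 2);
        sysrq_end();                    /* back to normal */
        assert(stall_suppress == 0);

        on_panic();                     /* panic wins ... */
        sysrq_start();
        sysrq_end();                    /* ... and sysrq_end() must not re-enable warnings */
        assert(stall_suppress == 1);
        return 0;
}
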
diff --git a/kernel/reboot.c b/kernel/reboot.c
index 662c83fc16b7..a3a9e240fcdb 100644
--- a/kernel/reboot.c
+++ b/kernel/reboot.c
@@ -388,15 +388,22 @@ static int __init reboot_setup(char *str)
388 break; 388 break;
389 389
390 case 's': 390 case 's':
391 if (isdigit(*(str+1))) 391 {
392 reboot_cpu = simple_strtoul(str+1, NULL, 0); 392 int rc;
393 else if (str[1] == 'm' && str[2] == 'p' && 393
394 isdigit(*(str+3))) 394 if (isdigit(*(str+1))) {
395 reboot_cpu = simple_strtoul(str+3, NULL, 0); 395 rc = kstrtoint(str+1, 0, &reboot_cpu);
396 else 396 if (rc)
397 return rc;
398 } else if (str[1] == 'm' && str[2] == 'p' &&
399 isdigit(*(str+3))) {
400 rc = kstrtoint(str+3, 0, &reboot_cpu);
401 if (rc)
402 return rc;
403 } else
397 reboot_mode = REBOOT_SOFT; 404 reboot_mode = REBOOT_SOFT;
398 break; 405 break;
399 406 }
400 case 'g': 407 case 'g':
401 reboot_mode = REBOOT_GPIO; 408 reboot_mode = REBOOT_GPIO;
402 break; 409 break;
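
The reboot= parser now propagates parse failures instead of silently accepting whatever simple_strtoul() managed to digest; kstrtoint() fails on empty input, trailing garbage and overflow, and returns a negative errno that reboot_setup() passes on. A rough userspace equivalent of that checked parse, using strtol() (the helper name is mine, and kstrtoint() additionally tolerates a single trailing newline, which this sketch does not):

#include <errno.h>
#include <limits.h>
#include <stdio.h>
#include <stdlib.h>

/* Roughly what kstrtoint() guarantees: whole-string parse, range-checked. */
static int parse_int_strict(const char *s, int base, int *out)
{
        char *end;
        long val;

        errno = 0;
        val = strtol(s, &end, base);
        if (errno || end == s || *end != '\0')  /* overflow, empty, or trailing junk */
                return -EINVAL;
        if (val < INT_MIN || val > INT_MAX)
                return -ERANGE;
        *out = (int)val;
        return 0;
}

int main(void)
{
        int cpu = 0;

        printf("\"12\"  -> %d\n", parse_int_strict("12", 0, &cpu));   /* 0, cpu == 12 */
        printf("\"12x\" -> %d\n", parse_int_strict("12x", 0, &cpu));  /* -EINVAL */
        return 0;
}
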
diff --git a/kernel/res_counter.c b/kernel/res_counter.c
index 51dbac6a3633..e791130f85a7 100644
--- a/kernel/res_counter.c
+++ b/kernel/res_counter.c
@@ -186,8 +186,11 @@ int res_counter_memparse_write_strategy(const char *buf,
186 186
187 /* return RES_COUNTER_MAX(unlimited) if "-1" is specified */ 187 /* return RES_COUNTER_MAX(unlimited) if "-1" is specified */
188 if (*buf == '-') { 188 if (*buf == '-') {
189 res = simple_strtoull(buf + 1, &end, 10); 189 int rc = kstrtoull(buf + 1, 10, &res);
190 if (res != 1 || *end != '\0') 190
191 if (rc)
192 return rc;
193 if (res != 1)
191 return -EINVAL; 194 return -EINVAL;
192 *resp = RES_COUNTER_MAX; 195 *resp = RES_COUNTER_MAX;
193 return 0; 196 return 0;
diff --git a/kernel/resource.c b/kernel/resource.c
index 8957d686e29b..3c2237ac32db 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -1288,13 +1288,10 @@ int iomem_map_sanity_check(resource_size_t addr, unsigned long size)
1288 if (p->flags & IORESOURCE_BUSY) 1288 if (p->flags & IORESOURCE_BUSY)
1289 continue; 1289 continue;
1290 1290
1291 printk(KERN_WARNING "resource map sanity check conflict: " 1291 printk(KERN_WARNING "resource sanity check: requesting [mem %#010llx-%#010llx], which spans more than %s %pR\n",
1292 "0x%llx 0x%llx 0x%llx 0x%llx %s\n",
1293 (unsigned long long)addr, 1292 (unsigned long long)addr,
1294 (unsigned long long)(addr + size - 1), 1293 (unsigned long long)(addr + size - 1),
1295 (unsigned long long)p->start, 1294 p->name, p);
1296 (unsigned long long)p->end,
1297 p->name);
1298 err = -1; 1295 err = -1;
1299 break; 1296 break;
1300 } 1297 }
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index d9d8ece46a15..bc1638b33449 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -90,6 +90,22 @@
90#define CREATE_TRACE_POINTS 90#define CREATE_TRACE_POINTS
91#include <trace/events/sched.h> 91#include <trace/events/sched.h>
92 92
93#ifdef smp_mb__before_atomic
94void __smp_mb__before_atomic(void)
95{
96 smp_mb__before_atomic();
97}
98EXPORT_SYMBOL(__smp_mb__before_atomic);
99#endif
100
101#ifdef smp_mb__after_atomic
102void __smp_mb__after_atomic(void)
103{
104 smp_mb__after_atomic();
105}
106EXPORT_SYMBOL(__smp_mb__after_atomic);
107#endif
108
93void start_bandwidth_timer(struct hrtimer *period_timer, ktime_t period) 109void start_bandwidth_timer(struct hrtimer *period_timer, ktime_t period)
94{ 110{
95 unsigned long delta; 111 unsigned long delta;
@@ -506,6 +522,71 @@ static inline void init_hrtick(void)
506#endif /* CONFIG_SCHED_HRTICK */ 522#endif /* CONFIG_SCHED_HRTICK */
507 523
508/* 524/*
525 * cmpxchg based fetch_or, macro so it works for different integer types
526 */
527#define fetch_or(ptr, val) \
528({ typeof(*(ptr)) __old, __val = *(ptr); \
529 for (;;) { \
530 __old = cmpxchg((ptr), __val, __val | (val)); \
531 if (__old == __val) \
532 break; \
533 __val = __old; \
534 } \
535 __old; \
536})
537
538#if defined(CONFIG_SMP) && defined(TIF_POLLING_NRFLAG)
539/*
540 * Atomically set TIF_NEED_RESCHED and test for TIF_POLLING_NRFLAG,
541 * this avoids any races wrt polling state changes and thereby avoids
542 * spurious IPIs.
543 */
544static bool set_nr_and_not_polling(struct task_struct *p)
545{
546 struct thread_info *ti = task_thread_info(p);
547 return !(fetch_or(&ti->flags, _TIF_NEED_RESCHED) & _TIF_POLLING_NRFLAG);
548}
549
550/*
551 * Atomically set TIF_NEED_RESCHED if TIF_POLLING_NRFLAG is set.
552 *
553 * If this returns true, then the idle task promises to call
554 * sched_ttwu_pending() and reschedule soon.
555 */
556static bool set_nr_if_polling(struct task_struct *p)
557{
558 struct thread_info *ti = task_thread_info(p);
559 typeof(ti->flags) old, val = ACCESS_ONCE(ti->flags);
560
561 for (;;) {
562 if (!(val & _TIF_POLLING_NRFLAG))
563 return false;
564 if (val & _TIF_NEED_RESCHED)
565 return true;
566 old = cmpxchg(&ti->flags, val, val | _TIF_NEED_RESCHED);
567 if (old == val)
568 break;
569 val = old;
570 }
571 return true;
572}
573
574#else
575static bool set_nr_and_not_polling(struct task_struct *p)
576{
577 set_tsk_need_resched(p);
578 return true;
579}
580
581#ifdef CONFIG_SMP
582static bool set_nr_if_polling(struct task_struct *p)
583{
584 return false;
585}
586#endif
587#endif
588
589/*
509 * resched_task - mark a task 'to be rescheduled now'. 590 * resched_task - mark a task 'to be rescheduled now'.
510 * 591 *
511 * On UP this means the setting of the need_resched flag, on SMP it 592 * On UP this means the setting of the need_resched flag, on SMP it
@@ -521,18 +602,18 @@ void resched_task(struct task_struct *p)
521 if (test_tsk_need_resched(p)) 602 if (test_tsk_need_resched(p))
522 return; 603 return;
523 604
524 set_tsk_need_resched(p);
525
526 cpu = task_cpu(p); 605 cpu = task_cpu(p);
606
527 if (cpu == smp_processor_id()) { 607 if (cpu == smp_processor_id()) {
608 set_tsk_need_resched(p);
528 set_preempt_need_resched(); 609 set_preempt_need_resched();
529 return; 610 return;
530 } 611 }
531 612
532 /* NEED_RESCHED must be visible before we test polling */ 613 if (set_nr_and_not_polling(p))
533 smp_mb();
534 if (!tsk_is_polling(p))
535 smp_send_reschedule(cpu); 614 smp_send_reschedule(cpu);
615 else
616 trace_sched_wake_idle_without_ipi(cpu);
536} 617}
537 618
538void resched_cpu(int cpu) 619void resched_cpu(int cpu)
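
fetch_or() and set_nr_and_not_polling() above make the reschedule request and the polling test one atomic operation: the caller sets TIF_NEED_RESCHED and learns from the returned old value whether the target had TIF_POLLING_NRFLAG set, in which case its idle loop will notice the flag by itself and no IPI is needed (the new trace_sched_wake_idle_without_ipi event records exactly those saved IPIs). A self-contained C11 model of the idiom, with made-up flag values standing in for the thread_info bits:

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

#define NEED_RESCHED    0x1     /* stand-in for _TIF_NEED_RESCHED */
#define POLLING         0x2     /* stand-in for _TIF_POLLING_NRFLAG */

/* cmpxchg-based fetch_or, mirroring the kernel macro's retry loop. */
static unsigned int fetch_or(atomic_uint *p, unsigned int val)
{
        unsigned int old = atomic_load(p);

        while (!atomic_compare_exchange_weak(p, &old, old | val))
                ;       /* on failure 'old' is refreshed with the current value */
        return old;
}

/* Returns true when the caller still has to send an IPI. */
static bool set_need_resched_and_not_polling(atomic_uint *flags)
{
        return !(fetch_or(flags, NEED_RESCHED) & POLLING);
}

int main(void)
{
        atomic_uint idle_cpu_flags = POLLING;   /* idle loop advertising it will notice the flag */
        atomic_uint busy_cpu_flags = 0;

        printf("polling target: send IPI? %d\n",
               set_need_resched_and_not_polling(&idle_cpu_flags));     /* 0: no IPI needed */
        printf("busy target:    send IPI? %d\n",
               set_need_resched_and_not_polling(&busy_cpu_flags));     /* 1: IPI needed */
        return 0;
}

The same shape, with an extra early-out when NEED_RESCHED is already set, is what set_nr_if_polling() uses for the remote wakeup path further down.
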
@@ -595,27 +676,10 @@ static void wake_up_idle_cpu(int cpu)
595 if (cpu == smp_processor_id()) 676 if (cpu == smp_processor_id())
596 return; 677 return;
597 678
598 /* 679 if (set_nr_and_not_polling(rq->idle))
599 * This is safe, as this function is called with the timer
600 * wheel base lock of (cpu) held. When the CPU is on the way
601 * to idle and has not yet set rq->curr to idle then it will
602 * be serialized on the timer wheel base lock and take the new
603 * timer into account automatically.
604 */
605 if (rq->curr != rq->idle)
606 return;
607
608 /*
609 * We can set TIF_RESCHED on the idle task of the other CPU
610 * lockless. The worst case is that the other CPU runs the
611 * idle task through an additional NOOP schedule()
612 */
613 set_tsk_need_resched(rq->idle);
614
615 /* NEED_RESCHED must be visible before we test polling */
616 smp_mb();
617 if (!tsk_is_polling(rq->idle))
618 smp_send_reschedule(cpu); 680 smp_send_reschedule(cpu);
681 else
682 trace_sched_wake_idle_without_ipi(cpu);
619} 683}
620 684
621static bool wake_up_full_nohz_cpu(int cpu) 685static bool wake_up_full_nohz_cpu(int cpu)
@@ -841,7 +905,7 @@ static void update_rq_clock_task(struct rq *rq, s64 delta)
841 rq->clock_task += delta; 905 rq->clock_task += delta;
842 906
843#if defined(CONFIG_IRQ_TIME_ACCOUNTING) || defined(CONFIG_PARAVIRT_TIME_ACCOUNTING) 907#if defined(CONFIG_IRQ_TIME_ACCOUNTING) || defined(CONFIG_PARAVIRT_TIME_ACCOUNTING)
844 if ((irq_delta + steal) && sched_feat(NONTASK_POWER)) 908 if ((irq_delta + steal) && sched_feat(NONTASK_CAPACITY))
845 sched_rt_avg_update(rq, irq_delta + steal); 909 sched_rt_avg_update(rq, irq_delta + steal);
846#endif 910#endif
847} 911}
@@ -1320,7 +1384,7 @@ out:
1320 * leave kernel. 1384 * leave kernel.
1321 */ 1385 */
1322 if (p->mm && printk_ratelimit()) { 1386 if (p->mm && printk_ratelimit()) {
1323 printk_sched("process %d (%s) no longer affine to cpu%d\n", 1387 printk_deferred("process %d (%s) no longer affine to cpu%d\n",
1324 task_pid_nr(p), p->comm, cpu); 1388 task_pid_nr(p), p->comm, cpu);
1325 } 1389 }
1326 } 1390 }
@@ -1474,13 +1538,17 @@ static int ttwu_remote(struct task_struct *p, int wake_flags)
1474} 1538}
1475 1539
1476#ifdef CONFIG_SMP 1540#ifdef CONFIG_SMP
1477static void sched_ttwu_pending(void) 1541void sched_ttwu_pending(void)
1478{ 1542{
1479 struct rq *rq = this_rq(); 1543 struct rq *rq = this_rq();
1480 struct llist_node *llist = llist_del_all(&rq->wake_list); 1544 struct llist_node *llist = llist_del_all(&rq->wake_list);
1481 struct task_struct *p; 1545 struct task_struct *p;
1546 unsigned long flags;
1482 1547
1483 raw_spin_lock(&rq->lock); 1548 if (!llist)
1549 return;
1550
1551 raw_spin_lock_irqsave(&rq->lock, flags);
1484 1552
1485 while (llist) { 1553 while (llist) {
1486 p = llist_entry(llist, struct task_struct, wake_entry); 1554 p = llist_entry(llist, struct task_struct, wake_entry);
@@ -1488,7 +1556,7 @@ static void sched_ttwu_pending(void)
1488 ttwu_do_activate(rq, p, 0); 1556 ttwu_do_activate(rq, p, 0);
1489 } 1557 }
1490 1558
1491 raw_spin_unlock(&rq->lock); 1559 raw_spin_unlock_irqrestore(&rq->lock, flags);
1492} 1560}
1493 1561
1494void scheduler_ipi(void) 1562void scheduler_ipi(void)
@@ -1534,8 +1602,14 @@ void scheduler_ipi(void)
1534 1602
1535static void ttwu_queue_remote(struct task_struct *p, int cpu) 1603static void ttwu_queue_remote(struct task_struct *p, int cpu)
1536{ 1604{
1537 if (llist_add(&p->wake_entry, &cpu_rq(cpu)->wake_list)) 1605 struct rq *rq = cpu_rq(cpu);
1538 smp_send_reschedule(cpu); 1606
1607 if (llist_add(&p->wake_entry, &cpu_rq(cpu)->wake_list)) {
1608 if (!set_nr_if_polling(rq->idle))
1609 smp_send_reschedule(cpu);
1610 else
1611 trace_sched_wake_idle_without_ipi(cpu);
1612 }
1539} 1613}
1540 1614
1541bool cpus_share_cache(int this_cpu, int that_cpu) 1615bool cpus_share_cache(int this_cpu, int that_cpu)
@@ -2480,7 +2554,7 @@ notrace unsigned long get_parent_ip(unsigned long addr)
2480#if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \ 2554#if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \
2481 defined(CONFIG_PREEMPT_TRACER)) 2555 defined(CONFIG_PREEMPT_TRACER))
2482 2556
2483void __kprobes preempt_count_add(int val) 2557void preempt_count_add(int val)
2484{ 2558{
2485#ifdef CONFIG_DEBUG_PREEMPT 2559#ifdef CONFIG_DEBUG_PREEMPT
2486 /* 2560 /*
@@ -2506,8 +2580,9 @@ void __kprobes preempt_count_add(int val)
2506 } 2580 }
2507} 2581}
2508EXPORT_SYMBOL(preempt_count_add); 2582EXPORT_SYMBOL(preempt_count_add);
2583NOKPROBE_SYMBOL(preempt_count_add);
2509 2584
2510void __kprobes preempt_count_sub(int val) 2585void preempt_count_sub(int val)
2511{ 2586{
2512#ifdef CONFIG_DEBUG_PREEMPT 2587#ifdef CONFIG_DEBUG_PREEMPT
2513 /* 2588 /*
@@ -2528,6 +2603,7 @@ void __kprobes preempt_count_sub(int val)
2528 __preempt_count_sub(val); 2603 __preempt_count_sub(val);
2529} 2604}
2530EXPORT_SYMBOL(preempt_count_sub); 2605EXPORT_SYMBOL(preempt_count_sub);
2606NOKPROBE_SYMBOL(preempt_count_sub);
2531 2607
2532#endif 2608#endif
2533 2609
@@ -2592,8 +2668,14 @@ pick_next_task(struct rq *rq, struct task_struct *prev)
2592 if (likely(prev->sched_class == class && 2668 if (likely(prev->sched_class == class &&
2593 rq->nr_running == rq->cfs.h_nr_running)) { 2669 rq->nr_running == rq->cfs.h_nr_running)) {
2594 p = fair_sched_class.pick_next_task(rq, prev); 2670 p = fair_sched_class.pick_next_task(rq, prev);
2595 if (likely(p && p != RETRY_TASK)) 2671 if (unlikely(p == RETRY_TASK))
2596 return p; 2672 goto again;
2673
2674 /* assumes fair_sched_class->next == idle_sched_class */
2675 if (unlikely(!p))
2676 p = idle_sched_class.pick_next_task(rq, prev);
2677
2678 return p;
2597 } 2679 }
2598 2680
2599again: 2681again:
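
The fast path in pick_next_task() used to drop into the full class walk whenever pick_next_task_fair() came back empty; now it falls straight through to the idle class, which is only valid because the fair class is assumed to sit directly above idle in the class ordering (the comment in the hunk states that assumption). A toy standalone model of that shortcut:

#include <stdio.h>

struct task { const char *name; };

static struct task idle_task = { "swapper/0" };

/* Toy pick functions; returning NULL means "this class has nothing to run". */
static struct task *fair_pick(int nr_fair_runnable)
{
        static struct task t = { "cfs_task" };
        return nr_fair_runnable ? &t : NULL;
}

static struct task *idle_pick(void)
{
        return &idle_task;      /* the idle class always has something */
}

/* Fast path: only fair tasks are on the runqueue, so skip the full class walk. */
static struct task *pick_next_fast(int nr_fair_runnable)
{
        struct task *p = fair_pick(nr_fair_runnable);

        /* assumes the class right below fair is idle, as the hunk notes */
        return p ? p : idle_pick();
}

int main(void)
{
        printf("%s\n", pick_next_fast(1)->name);        /* cfs_task  */
        printf("%s\n", pick_next_fast(0)->name);        /* swapper/0 */
        return 0;
}
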
@@ -2804,6 +2886,7 @@ asmlinkage __visible void __sched notrace preempt_schedule(void)
2804 barrier(); 2886 barrier();
2805 } while (need_resched()); 2887 } while (need_resched());
2806} 2888}
2889NOKPROBE_SYMBOL(preempt_schedule);
2807EXPORT_SYMBOL(preempt_schedule); 2890EXPORT_SYMBOL(preempt_schedule);
2808#endif /* CONFIG_PREEMPT */ 2891#endif /* CONFIG_PREEMPT */
2809 2892
@@ -2996,7 +3079,7 @@ EXPORT_SYMBOL(set_user_nice);
2996int can_nice(const struct task_struct *p, const int nice) 3079int can_nice(const struct task_struct *p, const int nice)
2997{ 3080{
2998 /* convert nice value [19,-20] to rlimit style value [1,40] */ 3081 /* convert nice value [19,-20] to rlimit style value [1,40] */
2999 int nice_rlim = 20 - nice; 3082 int nice_rlim = nice_to_rlimit(nice);
3000 3083
3001 return (nice_rlim <= task_rlimit(p, RLIMIT_NICE) || 3084 return (nice_rlim <= task_rlimit(p, RLIMIT_NICE) ||
3002 capable(CAP_SYS_NICE)); 3085 capable(CAP_SYS_NICE));
@@ -3020,17 +3103,10 @@ SYSCALL_DEFINE1(nice, int, increment)
3020 * We don't have to worry. Conceptually one call occurs first 3103 * We don't have to worry. Conceptually one call occurs first
3021 * and we have a single winner. 3104 * and we have a single winner.
3022 */ 3105 */
3023 if (increment < -40) 3106 increment = clamp(increment, -NICE_WIDTH, NICE_WIDTH);
3024 increment = -40;
3025 if (increment > 40)
3026 increment = 40;
3027
3028 nice = task_nice(current) + increment; 3107 nice = task_nice(current) + increment;
3029 if (nice < MIN_NICE)
3030 nice = MIN_NICE;
3031 if (nice > MAX_NICE)
3032 nice = MAX_NICE;
3033 3108
3109 nice = clamp_val(nice, MIN_NICE, MAX_NICE);
3034 if (increment < 0 && !can_nice(current, nice)) 3110 if (increment < 0 && !can_nice(current, nice))
3035 return -EPERM; 3111 return -EPERM;
3036 3112
@@ -3124,6 +3200,7 @@ __setparam_dl(struct task_struct *p, const struct sched_attr *attr)
3124 dl_se->dl_bw = to_ratio(dl_se->dl_period, dl_se->dl_runtime); 3200 dl_se->dl_bw = to_ratio(dl_se->dl_period, dl_se->dl_runtime);
3125 dl_se->dl_throttled = 0; 3201 dl_se->dl_throttled = 0;
3126 dl_se->dl_new = 1; 3202 dl_se->dl_new = 1;
3203 dl_se->dl_yielded = 0;
3127} 3204}
3128 3205
3129static void __setscheduler_params(struct task_struct *p, 3206static void __setscheduler_params(struct task_struct *p,
@@ -3188,17 +3265,40 @@ __getparam_dl(struct task_struct *p, struct sched_attr *attr)
3188 * We ask for the deadline not being zero, and greater or equal 3265 * We ask for the deadline not being zero, and greater or equal
3189 * than the runtime, as well as the period of being zero or 3266 * than the runtime, as well as the period of being zero or
3190 * greater than deadline. Furthermore, we have to be sure that 3267 * greater than deadline. Furthermore, we have to be sure that
3191 * user parameters are above the internal resolution (1us); we 3268 * user parameters are above the internal resolution of 1us (we
3192 * check sched_runtime only since it is always the smaller one. 3269 * check sched_runtime only since it is always the smaller one) and
3270 * below 2^63 ns (we have to check both sched_deadline and
3271 * sched_period, as the latter can be zero).
3193 */ 3272 */
3194static bool 3273static bool
3195__checkparam_dl(const struct sched_attr *attr) 3274__checkparam_dl(const struct sched_attr *attr)
3196{ 3275{
3197 return attr && attr->sched_deadline != 0 && 3276 /* deadline != 0 */
3198 (attr->sched_period == 0 || 3277 if (attr->sched_deadline == 0)
3199 (s64)(attr->sched_period - attr->sched_deadline) >= 0) && 3278 return false;
3200 (s64)(attr->sched_deadline - attr->sched_runtime ) >= 0 && 3279
3201 attr->sched_runtime >= (2 << (DL_SCALE - 1)); 3280 /*
3281 * Since we truncate DL_SCALE bits, make sure we're at least
3282 * that big.
3283 */
3284 if (attr->sched_runtime < (1ULL << DL_SCALE))
3285 return false;
3286
3287 /*
3288 * Since we use the MSB for wrap-around and sign issues, make
3289 * sure it's not set (mind that period can be equal to zero).
3290 */
3291 if (attr->sched_deadline & (1ULL << 63) ||
3292 attr->sched_period & (1ULL << 63))
3293 return false;
3294
3295 /* runtime <= deadline <= period (if period != 0) */
3296 if ((attr->sched_period != 0 &&
3297 attr->sched_period < attr->sched_deadline) ||
3298 attr->sched_deadline < attr->sched_runtime)
3299 return false;
3300
3301 return true;
3202} 3302}
3203 3303
3204/* 3304/*
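
The rewritten __checkparam_dl() spells the SCHED_DEADLINE admission rules out one by one: a non-zero deadline, a runtime of at least 1<<DL_SCALE nanoseconds (the low bits that later get truncated), bit 63 clear in both deadline and period so signed wrap-around comparisons stay meaningful, and runtime <= deadline <= period, where a zero period means the period equals the deadline. A standalone version of the same predicate with a couple of worked parameter sets; the DL_SCALE value of 10 is my reading of the scheduler headers and should be treated as an assumption:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define DL_SCALE 10     /* assumed value; the kernel truncates this many low bits */

struct dl_attr {
        uint64_t runtime;       /* nanoseconds */
        uint64_t deadline;
        uint64_t period;        /* 0 means "period == deadline" */
};

static bool checkparam_dl(const struct dl_attr *a)
{
        if (a->deadline == 0)
                return false;
        if (a->runtime < (1ULL << DL_SCALE))            /* below internal resolution */
                return false;
        if ((a->deadline | a->period) & (1ULL << 63))   /* MSB reserved for wrap/sign */
                return false;
        if ((a->period != 0 && a->period < a->deadline) ||
            a->deadline < a->runtime)                   /* runtime <= deadline <= period */
                return false;
        return true;
}

int main(void)
{
        struct dl_attr ok  = { .runtime = 10000000, .deadline = 30000000, .period = 100000000 };
        struct dl_attr bad = { .runtime = 30000000, .deadline = 10000000, .period = 100000000 };

        printf("10ms/30ms/100ms: %s\n", checkparam_dl(&ok)  ? "accepted" : "rejected");
        printf("30ms/10ms/100ms: %s\n", checkparam_dl(&bad) ? "accepted" : "rejected");
        return 0;
}
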
@@ -3596,13 +3696,11 @@ static int sched_copy_attr(struct sched_attr __user *uattr,
3596 */ 3696 */
3597 attr->sched_nice = clamp(attr->sched_nice, MIN_NICE, MAX_NICE); 3697 attr->sched_nice = clamp(attr->sched_nice, MIN_NICE, MAX_NICE);
3598 3698
3599out: 3699 return 0;
3600 return ret;
3601 3700
3602err_size: 3701err_size:
3603 put_user(sizeof(*attr), &uattr->size); 3702 put_user(sizeof(*attr), &uattr->size);
3604 ret = -E2BIG; 3703 return -E2BIG;
3605 goto out;
3606} 3704}
3607 3705
3608/** 3706/**
@@ -3639,6 +3737,7 @@ SYSCALL_DEFINE2(sched_setparam, pid_t, pid, struct sched_param __user *, param)
3639 * sys_sched_setattr - same as above, but with extended sched_attr 3737 * sys_sched_setattr - same as above, but with extended sched_attr
3640 * @pid: the pid in question. 3738 * @pid: the pid in question.
3641 * @uattr: structure containing the extended parameters. 3739 * @uattr: structure containing the extended parameters.
3740 * @flags: for future extension.
3642 */ 3741 */
3643SYSCALL_DEFINE3(sched_setattr, pid_t, pid, struct sched_attr __user *, uattr, 3742SYSCALL_DEFINE3(sched_setattr, pid_t, pid, struct sched_attr __user *, uattr,
3644 unsigned int, flags) 3743 unsigned int, flags)
@@ -3650,8 +3749,12 @@ SYSCALL_DEFINE3(sched_setattr, pid_t, pid, struct sched_attr __user *, uattr,
3650 if (!uattr || pid < 0 || flags) 3749 if (!uattr || pid < 0 || flags)
3651 return -EINVAL; 3750 return -EINVAL;
3652 3751
3653 if (sched_copy_attr(uattr, &attr)) 3752 retval = sched_copy_attr(uattr, &attr);
3654 return -EFAULT; 3753 if (retval)
3754 return retval;
3755
3756 if ((int)attr.sched_policy < 0)
3757 return -EINVAL;
3655 3758
3656 rcu_read_lock(); 3759 rcu_read_lock();
3657 retval = -ESRCH; 3760 retval = -ESRCH;
@@ -3701,7 +3804,7 @@ SYSCALL_DEFINE1(sched_getscheduler, pid_t, pid)
3701 */ 3804 */
3702SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param) 3805SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param)
3703{ 3806{
3704 struct sched_param lp; 3807 struct sched_param lp = { .sched_priority = 0 };
3705 struct task_struct *p; 3808 struct task_struct *p;
3706 int retval; 3809 int retval;
3707 3810
@@ -3718,11 +3821,8 @@ SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param)
3718 if (retval) 3821 if (retval)
3719 goto out_unlock; 3822 goto out_unlock;
3720 3823
3721 if (task_has_dl_policy(p)) { 3824 if (task_has_rt_policy(p))
3722 retval = -EINVAL; 3825 lp.sched_priority = p->rt_priority;
3723 goto out_unlock;
3724 }
3725 lp.sched_priority = p->rt_priority;
3726 rcu_read_unlock(); 3826 rcu_read_unlock();
3727 3827
3728 /* 3828 /*
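
sched_getparam() now starts from struct sched_param lp = { .sched_priority = 0 } and only overwrites the priority for RT tasks, so SCHED_DEADLINE and SCHED_NORMAL tasks get a well-defined 0 copied out instead of the old -EINVAL (deadline) or whatever happened to be on the stack. A small sketch of why initializing the whole struct matters when it is later copied to the caller wholesale; the struct and helper below are illustrative only:

#include <stdio.h>

struct sched_param_like {
        int sched_priority;
        /* a real ABI struct may grow members; none of them must leak garbage */
};

static void fill_param(struct sched_param_like *lp, int is_rt, int rt_priority)
{
        if (is_rt)
                lp->sched_priority = rt_priority;
        /* non-RT: nothing written here, so the caller's initializer is what the user sees */
}

int main(void)
{
        /* a designated initializer zero-initializes every member not named */
        struct sched_param_like lp = { .sched_priority = 0 };

        fill_param(&lp, 0 /* a SCHED_DEADLINE or SCHED_NORMAL task */, 0);
        printf("reported priority: %d\n", lp.sched_priority);   /* always 0, never junk */
        return 0;
}
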
@@ -3760,7 +3860,7 @@ static int sched_read_attr(struct sched_attr __user *uattr,
3760 3860
3761 for (; addr < end; addr++) { 3861 for (; addr < end; addr++) {
3762 if (*addr) 3862 if (*addr)
3763 goto err_size; 3863 return -EFBIG;
3764 } 3864 }
3765 3865
3766 attr->size = usize; 3866 attr->size = usize;
@@ -3770,12 +3870,7 @@ static int sched_read_attr(struct sched_attr __user *uattr,
3770 if (ret) 3870 if (ret)
3771 return -EFAULT; 3871 return -EFAULT;
3772 3872
3773out: 3873 return 0;
3774 return ret;
3775
3776err_size:
3777 ret = -E2BIG;
3778 goto out;
3779} 3874}
3780 3875
3781/** 3876/**
@@ -3783,6 +3878,7 @@ err_size:
3783 * @pid: the pid in question. 3878 * @pid: the pid in question.
3784 * @uattr: structure containing the extended parameters. 3879 * @uattr: structure containing the extended parameters.
3785 * @size: sizeof(attr) for fwd/bwd comp. 3880 * @size: sizeof(attr) for fwd/bwd comp.
3881 * @flags: for future extension.
3786 */ 3882 */
3787SYSCALL_DEFINE4(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr, 3883SYSCALL_DEFINE4(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr,
3788 unsigned int, size, unsigned int, flags) 3884 unsigned int, size, unsigned int, flags)
@@ -4145,7 +4241,7 @@ EXPORT_SYMBOL(yield);
4145 * false (0) if we failed to boost the target. 4241 * false (0) if we failed to boost the target.
4146 * -ESRCH if there's no task to yield to. 4242 * -ESRCH if there's no task to yield to.
4147 */ 4243 */
4148bool __sched yield_to(struct task_struct *p, bool preempt) 4244int __sched yield_to(struct task_struct *p, bool preempt)
4149{ 4245{
4150 struct task_struct *curr = current; 4246 struct task_struct *curr = current;
4151 struct rq *rq, *p_rq; 4247 struct rq *rq, *p_rq;
@@ -5039,11 +5135,20 @@ static struct notifier_block migration_notifier = {
5039 .priority = CPU_PRI_MIGRATION, 5135 .priority = CPU_PRI_MIGRATION,
5040}; 5136};
5041 5137
5138static void __cpuinit set_cpu_rq_start_time(void)
5139{
5140 int cpu = smp_processor_id();
5141 struct rq *rq = cpu_rq(cpu);
5142 rq->age_stamp = sched_clock_cpu(cpu);
5143}
5144
5042static int sched_cpu_active(struct notifier_block *nfb, 5145static int sched_cpu_active(struct notifier_block *nfb,
5043 unsigned long action, void *hcpu) 5146 unsigned long action, void *hcpu)
5044{ 5147{
5045 switch (action & ~CPU_TASKS_FROZEN) { 5148 switch (action & ~CPU_TASKS_FROZEN) {
5046 case CPU_STARTING: 5149 case CPU_STARTING:
5150 set_cpu_rq_start_time();
5151 return NOTIFY_OK;
5047 case CPU_DOWN_FAILED: 5152 case CPU_DOWN_FAILED:
5048 set_cpu_active((long)hcpu, true); 5153 set_cpu_active((long)hcpu, true);
5049 return NOTIFY_OK; 5154 return NOTIFY_OK;
@@ -5162,14 +5267,13 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
5162 } 5267 }
5163 5268
5164 /* 5269 /*
5165 * Even though we initialize ->power to something semi-sane, 5270 * Even though we initialize ->capacity to something semi-sane,
5166 * we leave power_orig unset. This allows us to detect if 5271 * we leave capacity_orig unset. This allows us to detect if
5167 * domain iteration is still funny without causing /0 traps. 5272 * domain iteration is still funny without causing /0 traps.
5168 */ 5273 */
5169 if (!group->sgp->power_orig) { 5274 if (!group->sgc->capacity_orig) {
5170 printk(KERN_CONT "\n"); 5275 printk(KERN_CONT "\n");
5171 printk(KERN_ERR "ERROR: domain->cpu_power not " 5276 printk(KERN_ERR "ERROR: domain->cpu_capacity not set\n");
5172 "set\n");
5173 break; 5277 break;
5174 } 5278 }
5175 5279
@@ -5191,9 +5295,9 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
5191 cpulist_scnprintf(str, sizeof(str), sched_group_cpus(group)); 5295 cpulist_scnprintf(str, sizeof(str), sched_group_cpus(group));
5192 5296
5193 printk(KERN_CONT " %s", str); 5297 printk(KERN_CONT " %s", str);
5194 if (group->sgp->power != SCHED_POWER_SCALE) { 5298 if (group->sgc->capacity != SCHED_CAPACITY_SCALE) {
5195 printk(KERN_CONT " (cpu_power = %d)", 5299 printk(KERN_CONT " (cpu_capacity = %d)",
5196 group->sgp->power); 5300 group->sgc->capacity);
5197 } 5301 }
5198 5302
5199 group = group->next; 5303 group = group->next;
@@ -5251,8 +5355,9 @@ static int sd_degenerate(struct sched_domain *sd)
5251 SD_BALANCE_NEWIDLE | 5355 SD_BALANCE_NEWIDLE |
5252 SD_BALANCE_FORK | 5356 SD_BALANCE_FORK |
5253 SD_BALANCE_EXEC | 5357 SD_BALANCE_EXEC |
5254 SD_SHARE_CPUPOWER | 5358 SD_SHARE_CPUCAPACITY |
5255 SD_SHARE_PKG_RESOURCES)) { 5359 SD_SHARE_PKG_RESOURCES |
5360 SD_SHARE_POWERDOMAIN)) {
5256 if (sd->groups != sd->groups->next) 5361 if (sd->groups != sd->groups->next)
5257 return 0; 5362 return 0;
5258 } 5363 }
@@ -5281,9 +5386,10 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent)
5281 SD_BALANCE_NEWIDLE | 5386 SD_BALANCE_NEWIDLE |
5282 SD_BALANCE_FORK | 5387 SD_BALANCE_FORK |
5283 SD_BALANCE_EXEC | 5388 SD_BALANCE_EXEC |
5284 SD_SHARE_CPUPOWER | 5389 SD_SHARE_CPUCAPACITY |
5285 SD_SHARE_PKG_RESOURCES | 5390 SD_SHARE_PKG_RESOURCES |
5286 SD_PREFER_SIBLING); 5391 SD_PREFER_SIBLING |
5392 SD_SHARE_POWERDOMAIN);
5287 if (nr_node_ids == 1) 5393 if (nr_node_ids == 1)
5288 pflags &= ~SD_SERIALIZE; 5394 pflags &= ~SD_SERIALIZE;
5289 } 5395 }
@@ -5405,7 +5511,7 @@ static struct root_domain *alloc_rootdomain(void)
5405 return rd; 5511 return rd;
5406} 5512}
5407 5513
5408static void free_sched_groups(struct sched_group *sg, int free_sgp) 5514static void free_sched_groups(struct sched_group *sg, int free_sgc)
5409{ 5515{
5410 struct sched_group *tmp, *first; 5516 struct sched_group *tmp, *first;
5411 5517
@@ -5416,8 +5522,8 @@ static void free_sched_groups(struct sched_group *sg, int free_sgp)
5416 do { 5522 do {
5417 tmp = sg->next; 5523 tmp = sg->next;
5418 5524
5419 if (free_sgp && atomic_dec_and_test(&sg->sgp->ref)) 5525 if (free_sgc && atomic_dec_and_test(&sg->sgc->ref))
5420 kfree(sg->sgp); 5526 kfree(sg->sgc);
5421 5527
5422 kfree(sg); 5528 kfree(sg);
5423 sg = tmp; 5529 sg = tmp;
@@ -5435,7 +5541,7 @@ static void free_sched_domain(struct rcu_head *rcu)
5435 if (sd->flags & SD_OVERLAP) { 5541 if (sd->flags & SD_OVERLAP) {
5436 free_sched_groups(sd->groups, 1); 5542 free_sched_groups(sd->groups, 1);
5437 } else if (atomic_dec_and_test(&sd->groups->ref)) { 5543 } else if (atomic_dec_and_test(&sd->groups->ref)) {
5438 kfree(sd->groups->sgp); 5544 kfree(sd->groups->sgc);
5439 kfree(sd->groups); 5545 kfree(sd->groups);
5440 } 5546 }
5441 kfree(sd); 5547 kfree(sd);
@@ -5557,17 +5663,6 @@ static int __init isolated_cpu_setup(char *str)
5557 5663
5558__setup("isolcpus=", isolated_cpu_setup); 5664__setup("isolcpus=", isolated_cpu_setup);
5559 5665
5560static const struct cpumask *cpu_cpu_mask(int cpu)
5561{
5562 return cpumask_of_node(cpu_to_node(cpu));
5563}
5564
5565struct sd_data {
5566 struct sched_domain **__percpu sd;
5567 struct sched_group **__percpu sg;
5568 struct sched_group_power **__percpu sgp;
5569};
5570
5571struct s_data { 5666struct s_data {
5572 struct sched_domain ** __percpu sd; 5667 struct sched_domain ** __percpu sd;
5573 struct root_domain *rd; 5668 struct root_domain *rd;
@@ -5580,21 +5675,6 @@ enum s_alloc {
5580 sa_none, 5675 sa_none,
5581}; 5676};
5582 5677
5583struct sched_domain_topology_level;
5584
5585typedef struct sched_domain *(*sched_domain_init_f)(struct sched_domain_topology_level *tl, int cpu);
5586typedef const struct cpumask *(*sched_domain_mask_f)(int cpu);
5587
5588#define SDTL_OVERLAP 0x01
5589
5590struct sched_domain_topology_level {
5591 sched_domain_init_f init;
5592 sched_domain_mask_f mask;
5593 int flags;
5594 int numa_level;
5595 struct sd_data data;
5596};
5597
5598/* 5678/*
5599 * Build an iteration mask that can exclude certain CPUs from the upwards 5679 * Build an iteration mask that can exclude certain CPUs from the upwards
5600 * domain traversal. 5680 * domain traversal.
@@ -5672,17 +5752,17 @@ build_overlap_sched_groups(struct sched_domain *sd, int cpu)
5672 5752
5673 cpumask_or(covered, covered, sg_span); 5753 cpumask_or(covered, covered, sg_span);
5674 5754
5675 sg->sgp = *per_cpu_ptr(sdd->sgp, i); 5755 sg->sgc = *per_cpu_ptr(sdd->sgc, i);
5676 if (atomic_inc_return(&sg->sgp->ref) == 1) 5756 if (atomic_inc_return(&sg->sgc->ref) == 1)
5677 build_group_mask(sd, sg); 5757 build_group_mask(sd, sg);
5678 5758
5679 /* 5759 /*
5680 * Initialize sgp->power such that even if we mess up the 5760 * Initialize sgc->capacity such that even if we mess up the
5681 * domains and no possible iteration will get us here, we won't 5761 * domains and no possible iteration will get us here, we won't
5682 * die on a /0 trap. 5762 * die on a /0 trap.
5683 */ 5763 */
5684 sg->sgp->power = SCHED_POWER_SCALE * cpumask_weight(sg_span); 5764 sg->sgc->capacity = SCHED_CAPACITY_SCALE * cpumask_weight(sg_span);
5685 sg->sgp->power_orig = sg->sgp->power; 5765 sg->sgc->capacity_orig = sg->sgc->capacity;
5686 5766
5687 /* 5767 /*
5688 * Make sure the first group of this domain contains the 5768 * Make sure the first group of this domain contains the
@@ -5720,8 +5800,8 @@ static int get_group(int cpu, struct sd_data *sdd, struct sched_group **sg)
5720 5800
5721 if (sg) { 5801 if (sg) {
5722 *sg = *per_cpu_ptr(sdd->sg, cpu); 5802 *sg = *per_cpu_ptr(sdd->sg, cpu);
5723 (*sg)->sgp = *per_cpu_ptr(sdd->sgp, cpu); 5803 (*sg)->sgc = *per_cpu_ptr(sdd->sgc, cpu);
5724 atomic_set(&(*sg)->sgp->ref, 1); /* for claim_allocations */ 5804 atomic_set(&(*sg)->sgc->ref, 1); /* for claim_allocations */
5725 } 5805 }
5726 5806
5727 return cpu; 5807 return cpu;
@@ -5730,7 +5810,7 @@ static int get_group(int cpu, struct sd_data *sdd, struct sched_group **sg)
5730/* 5810/*
5731 * build_sched_groups will build a circular linked list of the groups 5811 * build_sched_groups will build a circular linked list of the groups
5732 * covered by the given span, and will set each group's ->cpumask correctly, 5812 * covered by the given span, and will set each group's ->cpumask correctly,
5733 * and ->cpu_power to 0. 5813 * and ->cpu_capacity to 0.
5734 * 5814 *
5735 * Assumes the sched_domain tree is fully constructed 5815 * Assumes the sched_domain tree is fully constructed
5736 */ 5816 */
@@ -5762,8 +5842,6 @@ build_sched_groups(struct sched_domain *sd, int cpu)
5762 continue; 5842 continue;
5763 5843
5764 group = get_group(i, sdd, &sg); 5844 group = get_group(i, sdd, &sg);
5765 cpumask_clear(sched_group_cpus(sg));
5766 sg->sgp->power = 0;
5767 cpumask_setall(sched_group_mask(sg)); 5845 cpumask_setall(sched_group_mask(sg));
5768 5846
5769 for_each_cpu(j, span) { 5847 for_each_cpu(j, span) {
@@ -5786,16 +5864,16 @@ build_sched_groups(struct sched_domain *sd, int cpu)
5786} 5864}
5787 5865
5788/* 5866/*
5789 * Initialize sched groups cpu_power. 5867 * Initialize sched groups cpu_capacity.
5790 * 5868 *
5791 * cpu_power indicates the capacity of sched group, which is used while 5869 * cpu_capacity indicates the capacity of sched group, which is used while
5792 * distributing the load between different sched groups in a sched domain. 5870 * distributing the load between different sched groups in a sched domain.
5793 * Typically cpu_power for all the groups in a sched domain will be same unless 5871 * Typically cpu_capacity for all the groups in a sched domain will be same
5794 * there are asymmetries in the topology. If there are asymmetries, group 5872 * unless there are asymmetries in the topology. If there are asymmetries,
5795 * having more cpu_power will pickup more load compared to the group having 5873 * group having more cpu_capacity will pickup more load compared to the
5796 * less cpu_power. 5874 * group having less cpu_capacity.
5797 */ 5875 */
5798static void init_sched_groups_power(int cpu, struct sched_domain *sd) 5876static void init_sched_groups_capacity(int cpu, struct sched_domain *sd)
5799{ 5877{
5800 struct sched_group *sg = sd->groups; 5878 struct sched_group *sg = sd->groups;
5801 5879
@@ -5809,13 +5887,8 @@ static void init_sched_groups_power(int cpu, struct sched_domain *sd)
5809 if (cpu != group_balance_cpu(sg)) 5887 if (cpu != group_balance_cpu(sg))
5810 return; 5888 return;
5811 5889
5812 update_group_power(sd, cpu); 5890 update_group_capacity(sd, cpu);
5813 atomic_set(&sg->sgp->nr_busy_cpus, sg->group_weight); 5891 atomic_set(&sg->sgc->nr_busy_cpus, sg->group_weight);
5814}
5815
5816int __weak arch_sd_sibling_asym_packing(void)
5817{
5818 return 0*SD_ASYM_PACKING;
5819} 5892}
5820 5893
5821/* 5894/*
@@ -5823,34 +5896,6 @@ int __weak arch_sd_sibling_asym_packing(void)
5823 * Non-inlined to reduce accumulated stack pressure in build_sched_domains() 5896 * Non-inlined to reduce accumulated stack pressure in build_sched_domains()
5824 */ 5897 */
5825 5898
5826#ifdef CONFIG_SCHED_DEBUG
5827# define SD_INIT_NAME(sd, type) sd->name = #type
5828#else
5829# define SD_INIT_NAME(sd, type) do { } while (0)
5830#endif
5831
5832#define SD_INIT_FUNC(type) \
5833static noinline struct sched_domain * \
5834sd_init_##type(struct sched_domain_topology_level *tl, int cpu) \
5835{ \
5836 struct sched_domain *sd = *per_cpu_ptr(tl->data.sd, cpu); \
5837 *sd = SD_##type##_INIT; \
5838 SD_INIT_NAME(sd, type); \
5839 sd->private = &tl->data; \
5840 return sd; \
5841}
5842
5843SD_INIT_FUNC(CPU)
5844#ifdef CONFIG_SCHED_SMT
5845 SD_INIT_FUNC(SIBLING)
5846#endif
5847#ifdef CONFIG_SCHED_MC
5848 SD_INIT_FUNC(MC)
5849#endif
5850#ifdef CONFIG_SCHED_BOOK
5851 SD_INIT_FUNC(BOOK)
5852#endif
5853
5854static int default_relax_domain_level = -1; 5899static int default_relax_domain_level = -1;
5855int sched_domain_level_max; 5900int sched_domain_level_max;
5856 5901
@@ -5934,101 +5979,158 @@ static void claim_allocations(int cpu, struct sched_domain *sd)
5934 if (atomic_read(&(*per_cpu_ptr(sdd->sg, cpu))->ref)) 5979 if (atomic_read(&(*per_cpu_ptr(sdd->sg, cpu))->ref))
5935 *per_cpu_ptr(sdd->sg, cpu) = NULL; 5980 *per_cpu_ptr(sdd->sg, cpu) = NULL;
5936 5981
5937 if (atomic_read(&(*per_cpu_ptr(sdd->sgp, cpu))->ref)) 5982 if (atomic_read(&(*per_cpu_ptr(sdd->sgc, cpu))->ref))
5938 *per_cpu_ptr(sdd->sgp, cpu) = NULL; 5983 *per_cpu_ptr(sdd->sgc, cpu) = NULL;
5939} 5984}
5940 5985
5941#ifdef CONFIG_SCHED_SMT
5942static const struct cpumask *cpu_smt_mask(int cpu)
5943{
5944 return topology_thread_cpumask(cpu);
5945}
5946#endif
5947
5948/*
5949 * Topology list, bottom-up.
5950 */
5951static struct sched_domain_topology_level default_topology[] = {
5952#ifdef CONFIG_SCHED_SMT
5953 { sd_init_SIBLING, cpu_smt_mask, },
5954#endif
5955#ifdef CONFIG_SCHED_MC
5956 { sd_init_MC, cpu_coregroup_mask, },
5957#endif
5958#ifdef CONFIG_SCHED_BOOK
5959 { sd_init_BOOK, cpu_book_mask, },
5960#endif
5961 { sd_init_CPU, cpu_cpu_mask, },
5962 { NULL, },
5963};
5964
5965static struct sched_domain_topology_level *sched_domain_topology = default_topology;
5966
5967#define for_each_sd_topology(tl) \
5968 for (tl = sched_domain_topology; tl->init; tl++)
5969
5970#ifdef CONFIG_NUMA 5986#ifdef CONFIG_NUMA
5971
5972static int sched_domains_numa_levels; 5987static int sched_domains_numa_levels;
5973static int *sched_domains_numa_distance; 5988static int *sched_domains_numa_distance;
5974static struct cpumask ***sched_domains_numa_masks; 5989static struct cpumask ***sched_domains_numa_masks;
5975static int sched_domains_curr_level; 5990static int sched_domains_curr_level;
5991#endif
5976 5992
5977static inline int sd_local_flags(int level) 5993/*
5978{ 5994 * SD_flags allowed in topology descriptions.
5979 if (sched_domains_numa_distance[level] > RECLAIM_DISTANCE) 5995 *
5980 return 0; 5996 * SD_SHARE_CPUCAPACITY - describes SMT topologies
5981 5997 * SD_SHARE_PKG_RESOURCES - describes shared caches
5982 return SD_BALANCE_EXEC | SD_BALANCE_FORK | SD_WAKE_AFFINE; 5998 * SD_NUMA - describes NUMA topologies
5983} 5999 * SD_SHARE_POWERDOMAIN - describes shared power domain
6000 *
6001 * Odd one out:
6002 * SD_ASYM_PACKING - describes SMT quirks
6003 */
6004#define TOPOLOGY_SD_FLAGS \
6005 (SD_SHARE_CPUCAPACITY | \
6006 SD_SHARE_PKG_RESOURCES | \
6007 SD_NUMA | \
6008 SD_ASYM_PACKING | \
6009 SD_SHARE_POWERDOMAIN)
5984 6010
5985static struct sched_domain * 6011static struct sched_domain *
5986sd_numa_init(struct sched_domain_topology_level *tl, int cpu) 6012sd_init(struct sched_domain_topology_level *tl, int cpu)
5987{ 6013{
5988 struct sched_domain *sd = *per_cpu_ptr(tl->data.sd, cpu); 6014 struct sched_domain *sd = *per_cpu_ptr(tl->data.sd, cpu);
5989 int level = tl->numa_level; 6015 int sd_weight, sd_flags = 0;
5990 int sd_weight = cpumask_weight( 6016
5991 sched_domains_numa_masks[level][cpu_to_node(cpu)]); 6017#ifdef CONFIG_NUMA
6018 /*
6019 * Ugly hack to pass state to sd_numa_mask()...
6020 */
6021 sched_domains_curr_level = tl->numa_level;
6022#endif
6023
6024 sd_weight = cpumask_weight(tl->mask(cpu));
6025
6026 if (tl->sd_flags)
6027 sd_flags = (*tl->sd_flags)();
6028 if (WARN_ONCE(sd_flags & ~TOPOLOGY_SD_FLAGS,
6029 "wrong sd_flags in topology description\n"))
6030 sd_flags &= ~TOPOLOGY_SD_FLAGS;
5992 6031
5993 *sd = (struct sched_domain){ 6032 *sd = (struct sched_domain){
5994 .min_interval = sd_weight, 6033 .min_interval = sd_weight,
5995 .max_interval = 2*sd_weight, 6034 .max_interval = 2*sd_weight,
5996 .busy_factor = 32, 6035 .busy_factor = 32,
5997 .imbalance_pct = 125, 6036 .imbalance_pct = 125,
5998 .cache_nice_tries = 2, 6037
5999 .busy_idx = 3, 6038 .cache_nice_tries = 0,
6000 .idle_idx = 2, 6039 .busy_idx = 0,
6040 .idle_idx = 0,
6001 .newidle_idx = 0, 6041 .newidle_idx = 0,
6002 .wake_idx = 0, 6042 .wake_idx = 0,
6003 .forkexec_idx = 0, 6043 .forkexec_idx = 0,
6004 6044
6005 .flags = 1*SD_LOAD_BALANCE 6045 .flags = 1*SD_LOAD_BALANCE
6006 | 1*SD_BALANCE_NEWIDLE 6046 | 1*SD_BALANCE_NEWIDLE
6007 | 0*SD_BALANCE_EXEC 6047 | 1*SD_BALANCE_EXEC
6008 | 0*SD_BALANCE_FORK 6048 | 1*SD_BALANCE_FORK
6009 | 0*SD_BALANCE_WAKE 6049 | 0*SD_BALANCE_WAKE
6010 | 0*SD_WAKE_AFFINE 6050 | 1*SD_WAKE_AFFINE
6011 | 0*SD_SHARE_CPUPOWER 6051 | 0*SD_SHARE_CPUCAPACITY
6012 | 0*SD_SHARE_PKG_RESOURCES 6052 | 0*SD_SHARE_PKG_RESOURCES
6013 | 1*SD_SERIALIZE 6053 | 0*SD_SERIALIZE
6014 | 0*SD_PREFER_SIBLING 6054 | 0*SD_PREFER_SIBLING
6015 | 1*SD_NUMA 6055 | 0*SD_NUMA
6016 | sd_local_flags(level) 6056 | sd_flags
6017 , 6057 ,
6058
6018 .last_balance = jiffies, 6059 .last_balance = jiffies,
6019 .balance_interval = sd_weight, 6060 .balance_interval = sd_weight,
6061 .smt_gain = 0,
6062 .max_newidle_lb_cost = 0,
6063 .next_decay_max_lb_cost = jiffies,
6064#ifdef CONFIG_SCHED_DEBUG
6065 .name = tl->name,
6066#endif
6020 }; 6067 };
6021 SD_INIT_NAME(sd, NUMA);
6022 sd->private = &tl->data;
6023 6068
6024 /* 6069 /*
6025 * Ugly hack to pass state to sd_numa_mask()... 6070 * Convert topological properties into behaviour.
6026 */ 6071 */
6027 sched_domains_curr_level = tl->numa_level; 6072
6073 if (sd->flags & SD_SHARE_CPUCAPACITY) {
6074 sd->imbalance_pct = 110;
6075 sd->smt_gain = 1178; /* ~15% */
6076
6077 } else if (sd->flags & SD_SHARE_PKG_RESOURCES) {
6078 sd->imbalance_pct = 117;
6079 sd->cache_nice_tries = 1;
6080 sd->busy_idx = 2;
6081
6082#ifdef CONFIG_NUMA
6083 } else if (sd->flags & SD_NUMA) {
6084 sd->cache_nice_tries = 2;
6085 sd->busy_idx = 3;
6086 sd->idle_idx = 2;
6087
6088 sd->flags |= SD_SERIALIZE;
6089 if (sched_domains_numa_distance[tl->numa_level] > RECLAIM_DISTANCE) {
6090 sd->flags &= ~(SD_BALANCE_EXEC |
6091 SD_BALANCE_FORK |
6092 SD_WAKE_AFFINE);
6093 }
6094
6095#endif
6096 } else {
6097 sd->flags |= SD_PREFER_SIBLING;
6098 sd->cache_nice_tries = 1;
6099 sd->busy_idx = 2;
6100 sd->idle_idx = 1;
6101 }
6102
6103 sd->private = &tl->data;
6028 6104
6029 return sd; 6105 return sd;
6030} 6106}
6031 6107
6108/*
6109 * Topology list, bottom-up.
6110 */
6111static struct sched_domain_topology_level default_topology[] = {
6112#ifdef CONFIG_SCHED_SMT
6113 { cpu_smt_mask, cpu_smt_flags, SD_INIT_NAME(SMT) },
6114#endif
6115#ifdef CONFIG_SCHED_MC
6116 { cpu_coregroup_mask, cpu_core_flags, SD_INIT_NAME(MC) },
6117#endif
6118 { cpu_cpu_mask, SD_INIT_NAME(DIE) },
6119 { NULL, },
6120};
6121
6122struct sched_domain_topology_level *sched_domain_topology = default_topology;
6123
6124#define for_each_sd_topology(tl) \
6125 for (tl = sched_domain_topology; tl->mask; tl++)
6126
6127void set_sched_topology(struct sched_domain_topology_level *tl)
6128{
6129 sched_domain_topology = tl;
6130}
6131
6132#ifdef CONFIG_NUMA
6133
6032static const struct cpumask *sd_numa_mask(int cpu) 6134static const struct cpumask *sd_numa_mask(int cpu)
6033{ 6135{
6034 return sched_domains_numa_masks[sched_domains_curr_level][cpu_to_node(cpu)]; 6136 return sched_domains_numa_masks[sched_domains_curr_level][cpu_to_node(cpu)];
@@ -6172,7 +6274,10 @@ static void sched_init_numa(void)
6172 } 6274 }
6173 } 6275 }
6174 6276
6175 tl = kzalloc((ARRAY_SIZE(default_topology) + level) * 6277 /* Compute default topology size */
6278 for (i = 0; sched_domain_topology[i].mask; i++);
6279
6280 tl = kzalloc((i + level + 1) *
6176 sizeof(struct sched_domain_topology_level), GFP_KERNEL); 6281 sizeof(struct sched_domain_topology_level), GFP_KERNEL);
6177 if (!tl) 6282 if (!tl)
6178 return; 6283 return;
@@ -6180,18 +6285,19 @@ static void sched_init_numa(void)
6180 /* 6285 /*
6181 * Copy the default topology bits.. 6286 * Copy the default topology bits..
6182 */ 6287 */
6183 for (i = 0; default_topology[i].init; i++) 6288 for (i = 0; sched_domain_topology[i].mask; i++)
6184 tl[i] = default_topology[i]; 6289 tl[i] = sched_domain_topology[i];
6185 6290
6186 /* 6291 /*
6187 * .. and append 'j' levels of NUMA goodness. 6292 * .. and append 'j' levels of NUMA goodness.
6188 */ 6293 */
6189 for (j = 0; j < level; i++, j++) { 6294 for (j = 0; j < level; i++, j++) {
6190 tl[i] = (struct sched_domain_topology_level){ 6295 tl[i] = (struct sched_domain_topology_level){
6191 .init = sd_numa_init,
6192 .mask = sd_numa_mask, 6296 .mask = sd_numa_mask,
6297 .sd_flags = cpu_numa_flags,
6193 .flags = SDTL_OVERLAP, 6298 .flags = SDTL_OVERLAP,
6194 .numa_level = j, 6299 .numa_level = j,
6300 SD_INIT_NAME(NUMA)
6195 }; 6301 };
6196 } 6302 }
6197 6303
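
Taken together, the topology hunks replace the per-level sd_init_SIBLING/MC/BOOK/CPU constructors with one sd_init() that derives behaviour (imbalance_pct, cache_nice_tries, the busy/idle indices, SD_SERIALIZE and so on) from a per-level flags callback, whitelisted by TOPOLOGY_SD_FLAGS; a level is now described by a cpumask function, an optional sd_flags function and a debug name, and set_sched_topology() lets an architecture install its own table, which sched_init_numa() then extends with NUMA levels. A compact userspace model of the "table of levels with flag callbacks, walked bottom-up" shape; every name in it is mine:

#include <stdio.h>

#define SHARE_CPUCAPACITY   0x1 /* stand-in for SD_SHARE_CPUCAPACITY (SMT)   */
#define SHARE_PKG_RESOURCES 0x2 /* stand-in for SD_SHARE_PKG_RESOURCES (LLC) */

struct topo_level {
        const char *name;
        int (*sd_flags)(void);  /* optional: topology-derived behaviour flags */
};

static int smt_flags(void)  { return SHARE_CPUCAPACITY | SHARE_PKG_RESOURCES; }
static int core_flags(void) { return SHARE_PKG_RESOURCES; }

/* Bottom-up, like default_topology[]; a NULL name terminates the table. */
static struct topo_level my_topology[] = {
        { "SMT", smt_flags  },
        { "MC",  core_flags },
        { "DIE", NULL       },  /* no extra sharing flags at the package level */
        { NULL,  NULL       },
};

int main(void)
{
        for (struct topo_level *tl = my_topology; tl->name; tl++) {
                int flags = tl->sd_flags ? tl->sd_flags() : 0;

                /* sd_init() would turn these flags into imbalance_pct, idx values, ... */
                printf("level %-3s flags=%#x\n", tl->name, flags);
        }
        return 0;
}
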
@@ -6276,14 +6382,14 @@ static int __sdt_alloc(const struct cpumask *cpu_map)
6276 if (!sdd->sg) 6382 if (!sdd->sg)
6277 return -ENOMEM; 6383 return -ENOMEM;
6278 6384
6279 sdd->sgp = alloc_percpu(struct sched_group_power *); 6385 sdd->sgc = alloc_percpu(struct sched_group_capacity *);
6280 if (!sdd->sgp) 6386 if (!sdd->sgc)
6281 return -ENOMEM; 6387 return -ENOMEM;
6282 6388
6283 for_each_cpu(j, cpu_map) { 6389 for_each_cpu(j, cpu_map) {
6284 struct sched_domain *sd; 6390 struct sched_domain *sd;
6285 struct sched_group *sg; 6391 struct sched_group *sg;
6286 struct sched_group_power *sgp; 6392 struct sched_group_capacity *sgc;
6287 6393
6288 sd = kzalloc_node(sizeof(struct sched_domain) + cpumask_size(), 6394 sd = kzalloc_node(sizeof(struct sched_domain) + cpumask_size(),
6289 GFP_KERNEL, cpu_to_node(j)); 6395 GFP_KERNEL, cpu_to_node(j));
@@ -6301,12 +6407,12 @@ static int __sdt_alloc(const struct cpumask *cpu_map)
6301 6407
6302 *per_cpu_ptr(sdd->sg, j) = sg; 6408 *per_cpu_ptr(sdd->sg, j) = sg;
6303 6409
6304 sgp = kzalloc_node(sizeof(struct sched_group_power) + cpumask_size(), 6410 sgc = kzalloc_node(sizeof(struct sched_group_capacity) + cpumask_size(),
6305 GFP_KERNEL, cpu_to_node(j)); 6411 GFP_KERNEL, cpu_to_node(j));
6306 if (!sgp) 6412 if (!sgc)
6307 return -ENOMEM; 6413 return -ENOMEM;
6308 6414
6309 *per_cpu_ptr(sdd->sgp, j) = sgp; 6415 *per_cpu_ptr(sdd->sgc, j) = sgc;
6310 } 6416 }
6311 } 6417 }
6312 6418
@@ -6333,15 +6439,15 @@ static void __sdt_free(const struct cpumask *cpu_map)
6333 6439
6334 if (sdd->sg) 6440 if (sdd->sg)
6335 kfree(*per_cpu_ptr(sdd->sg, j)); 6441 kfree(*per_cpu_ptr(sdd->sg, j));
6336 if (sdd->sgp) 6442 if (sdd->sgc)
6337 kfree(*per_cpu_ptr(sdd->sgp, j)); 6443 kfree(*per_cpu_ptr(sdd->sgc, j));
6338 } 6444 }
6339 free_percpu(sdd->sd); 6445 free_percpu(sdd->sd);
6340 sdd->sd = NULL; 6446 sdd->sd = NULL;
6341 free_percpu(sdd->sg); 6447 free_percpu(sdd->sg);
6342 sdd->sg = NULL; 6448 sdd->sg = NULL;
6343 free_percpu(sdd->sgp); 6449 free_percpu(sdd->sgc);
6344 sdd->sgp = NULL; 6450 sdd->sgc = NULL;
6345 } 6451 }
6346} 6452}
6347 6453
@@ -6349,7 +6455,7 @@ struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl,
6349 const struct cpumask *cpu_map, struct sched_domain_attr *attr, 6455 const struct cpumask *cpu_map, struct sched_domain_attr *attr,
6350 struct sched_domain *child, int cpu) 6456 struct sched_domain *child, int cpu)
6351{ 6457{
6352 struct sched_domain *sd = tl->init(tl, cpu); 6458 struct sched_domain *sd = sd_init(tl, cpu);
6353 if (!sd) 6459 if (!sd)
6354 return child; 6460 return child;
6355 6461
@@ -6411,14 +6517,14 @@ static int build_sched_domains(const struct cpumask *cpu_map,
6411 } 6517 }
6412 } 6518 }
6413 6519
6414 /* Calculate CPU power for physical packages and nodes */ 6520 /* Calculate CPU capacity for physical packages and nodes */
6415 for (i = nr_cpumask_bits-1; i >= 0; i--) { 6521 for (i = nr_cpumask_bits-1; i >= 0; i--) {
6416 if (!cpumask_test_cpu(i, cpu_map)) 6522 if (!cpumask_test_cpu(i, cpu_map))
6417 continue; 6523 continue;
6418 6524
6419 for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) { 6525 for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) {
6420 claim_allocations(i, sd); 6526 claim_allocations(i, sd);
6421 init_sched_groups_power(i, sd); 6527 init_sched_groups_capacity(i, sd);
6422 } 6528 }
6423 } 6529 }
6424 6530
@@ -6861,7 +6967,7 @@ void __init sched_init(void)
6861#ifdef CONFIG_SMP 6967#ifdef CONFIG_SMP
6862 rq->sd = NULL; 6968 rq->sd = NULL;
6863 rq->rd = NULL; 6969 rq->rd = NULL;
6864 rq->cpu_power = SCHED_POWER_SCALE; 6970 rq->cpu_capacity = SCHED_CAPACITY_SCALE;
6865 rq->post_schedule = 0; 6971 rq->post_schedule = 0;
6866 rq->active_balance = 0; 6972 rq->active_balance = 0;
6867 rq->next_balance = jiffies; 6973 rq->next_balance = jiffies;
@@ -6919,6 +7025,7 @@ void __init sched_init(void)
6919 if (cpu_isolated_map == NULL) 7025 if (cpu_isolated_map == NULL)
6920 zalloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT); 7026 zalloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT);
6921 idle_thread_set_boot_cpu(); 7027 idle_thread_set_boot_cpu();
7028 set_cpu_rq_start_time();
6922#endif 7029#endif
6923 init_sched_fair_class(); 7030 init_sched_fair_class();
6924 7031
@@ -7586,7 +7693,7 @@ cpu_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
7586static int cpu_cgroup_css_online(struct cgroup_subsys_state *css) 7693static int cpu_cgroup_css_online(struct cgroup_subsys_state *css)
7587{ 7694{
7588 struct task_group *tg = css_tg(css); 7695 struct task_group *tg = css_tg(css);
7589 struct task_group *parent = css_tg(css_parent(css)); 7696 struct task_group *parent = css_tg(css->parent);
7590 7697
7591 if (parent) 7698 if (parent)
7592 sched_online_group(tg, parent); 7699 sched_online_group(tg, parent);
@@ -7717,8 +7824,7 @@ static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota)
7717 /* restart the period timer (if active) to handle new period expiry */ 7824 /* restart the period timer (if active) to handle new period expiry */
7718 if (runtime_enabled && cfs_b->timer_active) { 7825 if (runtime_enabled && cfs_b->timer_active) {
7719 /* force a reprogram */ 7826 /* force a reprogram */
7720 cfs_b->timer_active = 0; 7827 __start_cfs_bandwidth(cfs_b, true);
7721 __start_cfs_bandwidth(cfs_b);
7722 } 7828 }
7723 raw_spin_unlock_irq(&cfs_b->lock); 7829 raw_spin_unlock_irq(&cfs_b->lock);
7724 7830
diff --git a/kernel/sched/cpuacct.c b/kernel/sched/cpuacct.c
index c143ee380e3a..9cf350c94ec4 100644
--- a/kernel/sched/cpuacct.c
+++ b/kernel/sched/cpuacct.c
@@ -46,7 +46,7 @@ static inline struct cpuacct *task_ca(struct task_struct *tsk)
46 46
47static inline struct cpuacct *parent_ca(struct cpuacct *ca) 47static inline struct cpuacct *parent_ca(struct cpuacct *ca)
48{ 48{
49 return css_ca(css_parent(&ca->css)); 49 return css_ca(ca->css.parent);
50} 50}
51 51
52static DEFINE_PER_CPU(u64, root_cpuacct_cpuusage); 52static DEFINE_PER_CPU(u64, root_cpuacct_cpuusage);
diff --git a/kernel/sched/cpudeadline.c b/kernel/sched/cpudeadline.c
index 5b9bb42b2d47..bd95963dae80 100644
--- a/kernel/sched/cpudeadline.c
+++ b/kernel/sched/cpudeadline.c
@@ -13,6 +13,7 @@
13 13
14#include <linux/gfp.h> 14#include <linux/gfp.h>
15#include <linux/kernel.h> 15#include <linux/kernel.h>
16#include <linux/slab.h>
16#include "cpudeadline.h" 17#include "cpudeadline.h"
17 18
18static inline int parent(int i) 19static inline int parent(int i)
@@ -39,8 +40,10 @@ static void cpudl_exchange(struct cpudl *cp, int a, int b)
39{ 40{
40 int cpu_a = cp->elements[a].cpu, cpu_b = cp->elements[b].cpu; 41 int cpu_a = cp->elements[a].cpu, cpu_b = cp->elements[b].cpu;
41 42
42 swap(cp->elements[a], cp->elements[b]); 43 swap(cp->elements[a].cpu, cp->elements[b].cpu);
43 swap(cp->cpu_to_idx[cpu_a], cp->cpu_to_idx[cpu_b]); 44 swap(cp->elements[a].dl , cp->elements[b].dl );
45
46 swap(cp->elements[cpu_a].idx, cp->elements[cpu_b].idx);
44} 47}
45 48
46static void cpudl_heapify(struct cpudl *cp, int idx) 49static void cpudl_heapify(struct cpudl *cp, int idx)
@@ -140,7 +143,7 @@ void cpudl_set(struct cpudl *cp, int cpu, u64 dl, int is_valid)
140 WARN_ON(!cpu_present(cpu)); 143 WARN_ON(!cpu_present(cpu));
141 144
142 raw_spin_lock_irqsave(&cp->lock, flags); 145 raw_spin_lock_irqsave(&cp->lock, flags);
143 old_idx = cp->cpu_to_idx[cpu]; 146 old_idx = cp->elements[cpu].idx;
144 if (!is_valid) { 147 if (!is_valid) {
145 /* remove item */ 148 /* remove item */
146 if (old_idx == IDX_INVALID) { 149 if (old_idx == IDX_INVALID) {
@@ -155,8 +158,8 @@ void cpudl_set(struct cpudl *cp, int cpu, u64 dl, int is_valid)
155 cp->elements[old_idx].dl = cp->elements[cp->size - 1].dl; 158 cp->elements[old_idx].dl = cp->elements[cp->size - 1].dl;
156 cp->elements[old_idx].cpu = new_cpu; 159 cp->elements[old_idx].cpu = new_cpu;
157 cp->size--; 160 cp->size--;
158 cp->cpu_to_idx[new_cpu] = old_idx; 161 cp->elements[new_cpu].idx = old_idx;
159 cp->cpu_to_idx[cpu] = IDX_INVALID; 162 cp->elements[cpu].idx = IDX_INVALID;
160 while (old_idx > 0 && dl_time_before( 163 while (old_idx > 0 && dl_time_before(
161 cp->elements[parent(old_idx)].dl, 164 cp->elements[parent(old_idx)].dl,
162 cp->elements[old_idx].dl)) { 165 cp->elements[old_idx].dl)) {
@@ -173,7 +176,7 @@ void cpudl_set(struct cpudl *cp, int cpu, u64 dl, int is_valid)
173 cp->size++; 176 cp->size++;
174 cp->elements[cp->size - 1].dl = 0; 177 cp->elements[cp->size - 1].dl = 0;
175 cp->elements[cp->size - 1].cpu = cpu; 178 cp->elements[cp->size - 1].cpu = cpu;
176 cp->cpu_to_idx[cpu] = cp->size - 1; 179 cp->elements[cpu].idx = cp->size - 1;
177 cpudl_change_key(cp, cp->size - 1, dl); 180 cpudl_change_key(cp, cp->size - 1, dl);
178 cpumask_clear_cpu(cpu, cp->free_cpus); 181 cpumask_clear_cpu(cpu, cp->free_cpus);
179 } else { 182 } else {
@@ -195,10 +198,21 @@ int cpudl_init(struct cpudl *cp)
195 memset(cp, 0, sizeof(*cp)); 198 memset(cp, 0, sizeof(*cp));
196 raw_spin_lock_init(&cp->lock); 199 raw_spin_lock_init(&cp->lock);
197 cp->size = 0; 200 cp->size = 0;
198 for (i = 0; i < NR_CPUS; i++) 201
199 cp->cpu_to_idx[i] = IDX_INVALID; 202 cp->elements = kcalloc(nr_cpu_ids,
200 if (!alloc_cpumask_var(&cp->free_cpus, GFP_KERNEL)) 203 sizeof(struct cpudl_item),
204 GFP_KERNEL);
205 if (!cp->elements)
206 return -ENOMEM;
207
208 if (!alloc_cpumask_var(&cp->free_cpus, GFP_KERNEL)) {
209 kfree(cp->elements);
201 return -ENOMEM; 210 return -ENOMEM;
211 }
212
213 for_each_possible_cpu(i)
214 cp->elements[i].idx = IDX_INVALID;
215
202 cpumask_setall(cp->free_cpus); 216 cpumask_setall(cp->free_cpus);
203 217
204 return 0; 218 return 0;
@@ -210,7 +224,6 @@ int cpudl_init(struct cpudl *cp)
210 */ 224 */
211void cpudl_cleanup(struct cpudl *cp) 225void cpudl_cleanup(struct cpudl *cp)
212{ 226{
213 /* 227 free_cpumask_var(cp->free_cpus);
214 * nothing to do for the moment 228 kfree(cp->elements);
215 */
216} 229}
diff --git a/kernel/sched/cpudeadline.h b/kernel/sched/cpudeadline.h
index a202789a412c..538c9796ad4a 100644
--- a/kernel/sched/cpudeadline.h
+++ b/kernel/sched/cpudeadline.h
@@ -5,17 +5,17 @@
5 5
6#define IDX_INVALID -1 6#define IDX_INVALID -1
7 7
8struct array_item { 8struct cpudl_item {
9 u64 dl; 9 u64 dl;
10 int cpu; 10 int cpu;
11 int idx;
11}; 12};
12 13
13struct cpudl { 14struct cpudl {
14 raw_spinlock_t lock; 15 raw_spinlock_t lock;
15 int size; 16 int size;
16 int cpu_to_idx[NR_CPUS];
17 struct array_item elements[NR_CPUS];
18 cpumask_var_t free_cpus; 17 cpumask_var_t free_cpus;
18 struct cpudl_item *elements;
19}; 19};
20 20
21 21
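
The cpudeadline changes trade two NR_CPUS-sized static arrays for a single array of cpudl_item sized by nr_cpu_ids and allocated in cpudl_init(), with the reverse heap index folded into the per-CPU element; cpudl_cleanup() now really frees things, and the cpupri diff below applies the same treatment to cpu_to_pri. A minimal userspace model of that allocate/initialize/tear-down pattern (the CPU count is faked with a constant):

#include <stdio.h>
#include <stdlib.h>

#define IDX_INVALID -1

struct cpudl_item {
        unsigned long long dl;
        int cpu;
        int idx;        /* reverse map: where this CPU currently sits in the heap */
};

struct cpudl {
        int size;
        struct cpudl_item *elements;    /* nr_cpu_ids entries, not NR_CPUS */
};

static int cpudl_init(struct cpudl *cp, int nr_cpu_ids)
{
        cp->size = 0;
        cp->elements = calloc(nr_cpu_ids, sizeof(*cp->elements));
        if (!cp->elements)
                return -1;      /* the kernel version returns -ENOMEM */

        for (int i = 0; i < nr_cpu_ids; i++)
                cp->elements[i].idx = IDX_INVALID;
        return 0;
}

static void cpudl_cleanup(struct cpudl *cp)
{
        free(cp->elements);     /* the real cleanup also frees the free_cpus mask */
        cp->elements = NULL;
}

int main(void)
{
        struct cpudl cp;

        if (cpudl_init(&cp, 8)) /* pretend this box has 8 possible CPUs */
                return 1;
        printf("cpu 3 heap index: %d\n", cp.elements[3].idx);   /* -1 */
        cpudl_cleanup(&cp);
        return 0;
}
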
diff --git a/kernel/sched/cpupri.c b/kernel/sched/cpupri.c
index 8b836b376d91..981fcd7dc394 100644
--- a/kernel/sched/cpupri.c
+++ b/kernel/sched/cpupri.c
@@ -30,6 +30,7 @@
30#include <linux/gfp.h> 30#include <linux/gfp.h>
31#include <linux/sched.h> 31#include <linux/sched.h>
32#include <linux/sched/rt.h> 32#include <linux/sched/rt.h>
33#include <linux/slab.h>
33#include "cpupri.h" 34#include "cpupri.h"
34 35
35/* Convert between a 140 based task->prio, and our 102 based cpupri */ 36/* Convert between a 140 based task->prio, and our 102 based cpupri */
@@ -70,8 +71,7 @@ int cpupri_find(struct cpupri *cp, struct task_struct *p,
70 int idx = 0; 71 int idx = 0;
71 int task_pri = convert_prio(p->prio); 72 int task_pri = convert_prio(p->prio);
72 73
73 if (task_pri >= MAX_RT_PRIO) 74 BUG_ON(task_pri >= CPUPRI_NR_PRIORITIES);
74 return 0;
75 75
76 for (idx = 0; idx < task_pri; idx++) { 76 for (idx = 0; idx < task_pri; idx++) {
77 struct cpupri_vec *vec = &cp->pri_to_cpu[idx]; 77 struct cpupri_vec *vec = &cp->pri_to_cpu[idx];
@@ -165,7 +165,7 @@ void cpupri_set(struct cpupri *cp, int cpu, int newpri)
165 * do a write memory barrier, and then update the count, to 165 * do a write memory barrier, and then update the count, to
166 * make sure the vector is visible when count is set. 166 * make sure the vector is visible when count is set.
167 */ 167 */
168 smp_mb__before_atomic_inc(); 168 smp_mb__before_atomic();
169 atomic_inc(&(vec)->count); 169 atomic_inc(&(vec)->count);
170 do_mb = 1; 170 do_mb = 1;
171 } 171 }
@@ -185,14 +185,14 @@ void cpupri_set(struct cpupri *cp, int cpu, int newpri)
185 * the new priority vec. 185 * the new priority vec.
186 */ 186 */
187 if (do_mb) 187 if (do_mb)
188 smp_mb__after_atomic_inc(); 188 smp_mb__after_atomic();
189 189
190 /* 190 /*
191 * When removing from the vector, we decrement the counter first 191 * When removing from the vector, we decrement the counter first
192 * do a memory barrier and then clear the mask. 192 * do a memory barrier and then clear the mask.
193 */ 193 */
194 atomic_dec(&(vec)->count); 194 atomic_dec(&(vec)->count);
195 smp_mb__after_atomic_inc(); 195 smp_mb__after_atomic();
196 cpumask_clear_cpu(cpu, vec->mask); 196 cpumask_clear_cpu(cpu, vec->mask);
197 } 197 }
198 198
@@ -219,8 +219,13 @@ int cpupri_init(struct cpupri *cp)
219 goto cleanup; 219 goto cleanup;
220 } 220 }
221 221
222 cp->cpu_to_pri = kcalloc(nr_cpu_ids, sizeof(int), GFP_KERNEL);
223 if (!cp->cpu_to_pri)
224 goto cleanup;
225
222 for_each_possible_cpu(i) 226 for_each_possible_cpu(i)
223 cp->cpu_to_pri[i] = CPUPRI_INVALID; 227 cp->cpu_to_pri[i] = CPUPRI_INVALID;
228
224 return 0; 229 return 0;
225 230
226cleanup: 231cleanup:
@@ -237,6 +242,7 @@ void cpupri_cleanup(struct cpupri *cp)
237{ 242{
238 int i; 243 int i;
239 244
245 kfree(cp->cpu_to_pri);
240 for (i = 0; i < CPUPRI_NR_PRIORITIES; i++) 246 for (i = 0; i < CPUPRI_NR_PRIORITIES; i++)
241 free_cpumask_var(cp->pri_to_cpu[i].mask); 247 free_cpumask_var(cp->pri_to_cpu[i].mask);
242} 248}
diff --git a/kernel/sched/cpupri.h b/kernel/sched/cpupri.h
index f6d756173491..6b033347fdfd 100644
--- a/kernel/sched/cpupri.h
+++ b/kernel/sched/cpupri.h
@@ -17,7 +17,7 @@ struct cpupri_vec {
17 17
18struct cpupri { 18struct cpupri {
19 struct cpupri_vec pri_to_cpu[CPUPRI_NR_PRIORITIES]; 19 struct cpupri_vec pri_to_cpu[CPUPRI_NR_PRIORITIES];
20 int cpu_to_pri[NR_CPUS]; 20 int *cpu_to_pri;
21}; 21};
22 22
23#ifdef CONFIG_SMP 23#ifdef CONFIG_SMP
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index a95097cb4591..72fdf06ef865 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -332,50 +332,50 @@ out:
332 * softirq as those do not count in task exec_runtime any more. 332 * softirq as those do not count in task exec_runtime any more.
333 */ 333 */
334static void irqtime_account_process_tick(struct task_struct *p, int user_tick, 334static void irqtime_account_process_tick(struct task_struct *p, int user_tick,
335 struct rq *rq) 335 struct rq *rq, int ticks)
336{ 336{
337 cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy); 337 cputime_t scaled = cputime_to_scaled(cputime_one_jiffy);
338 u64 cputime = (__force u64) cputime_one_jiffy;
338 u64 *cpustat = kcpustat_this_cpu->cpustat; 339 u64 *cpustat = kcpustat_this_cpu->cpustat;
339 340
340 if (steal_account_process_tick()) 341 if (steal_account_process_tick())
341 return; 342 return;
342 343
344 cputime *= ticks;
345 scaled *= ticks;
346
343 if (irqtime_account_hi_update()) { 347 if (irqtime_account_hi_update()) {
344 cpustat[CPUTIME_IRQ] += (__force u64) cputime_one_jiffy; 348 cpustat[CPUTIME_IRQ] += cputime;
345 } else if (irqtime_account_si_update()) { 349 } else if (irqtime_account_si_update()) {
346 cpustat[CPUTIME_SOFTIRQ] += (__force u64) cputime_one_jiffy; 350 cpustat[CPUTIME_SOFTIRQ] += cputime;
347 } else if (this_cpu_ksoftirqd() == p) { 351 } else if (this_cpu_ksoftirqd() == p) {
348 /* 352 /*
349 * ksoftirqd time does not get accounted in cpu_softirq_time. 353 * ksoftirqd time does not get accounted in cpu_softirq_time.
350 * So, we have to handle it separately here. 354 * So, we have to handle it separately here.
351 * Also, p->stime needs to be updated for ksoftirqd. 355 * Also, p->stime needs to be updated for ksoftirqd.
352 */ 356 */
353 __account_system_time(p, cputime_one_jiffy, one_jiffy_scaled, 357 __account_system_time(p, cputime, scaled, CPUTIME_SOFTIRQ);
354 CPUTIME_SOFTIRQ);
355 } else if (user_tick) { 358 } else if (user_tick) {
356 account_user_time(p, cputime_one_jiffy, one_jiffy_scaled); 359 account_user_time(p, cputime, scaled);
357 } else if (p == rq->idle) { 360 } else if (p == rq->idle) {
358 account_idle_time(cputime_one_jiffy); 361 account_idle_time(cputime);
359 } else if (p->flags & PF_VCPU) { /* System time or guest time */ 362 } else if (p->flags & PF_VCPU) { /* System time or guest time */
360 account_guest_time(p, cputime_one_jiffy, one_jiffy_scaled); 363 account_guest_time(p, cputime, scaled);
361 } else { 364 } else {
362 __account_system_time(p, cputime_one_jiffy, one_jiffy_scaled, 365 __account_system_time(p, cputime, scaled, CPUTIME_SYSTEM);
363 CPUTIME_SYSTEM);
364 } 366 }
365} 367}
366 368
367static void irqtime_account_idle_ticks(int ticks) 369static void irqtime_account_idle_ticks(int ticks)
368{ 370{
369 int i;
370 struct rq *rq = this_rq(); 371 struct rq *rq = this_rq();
371 372
372 for (i = 0; i < ticks; i++) 373 irqtime_account_process_tick(current, 0, rq, ticks);
373 irqtime_account_process_tick(current, 0, rq);
374} 374}
375#else /* CONFIG_IRQ_TIME_ACCOUNTING */ 375#else /* CONFIG_IRQ_TIME_ACCOUNTING */
376static inline void irqtime_account_idle_ticks(int ticks) {} 376static inline void irqtime_account_idle_ticks(int ticks) {}
377static inline void irqtime_account_process_tick(struct task_struct *p, int user_tick, 377static inline void irqtime_account_process_tick(struct task_struct *p, int user_tick,
378 struct rq *rq) {} 378 struct rq *rq, int nr_ticks) {}
379#endif /* CONFIG_IRQ_TIME_ACCOUNTING */ 379#endif /* CONFIG_IRQ_TIME_ACCOUNTING */
380 380
381/* 381/*
@@ -464,7 +464,7 @@ void account_process_tick(struct task_struct *p, int user_tick)
464 return; 464 return;
465 465
466 if (sched_clock_irqtime) { 466 if (sched_clock_irqtime) {
467 irqtime_account_process_tick(p, user_tick, rq); 467 irqtime_account_process_tick(p, user_tick, rq, 1);
468 return; 468 return;
469 } 469 }
470 470
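The irqtime path above now scales the one-jiffy quantum by the number of pending ticks and accounts it in a single call, instead of the idle path looping over the per-tick helper. A plain C sketch of the batching, with made-up constants:

#include <stdio.h>
#include <stdint.h>

#define ONE_JIFFY_NS 1000000ULL         /* stand-in for cputime_one_jiffy */

static uint64_t idle_ns;

static void irqtime_account_ticks(int ticks)
{
        uint64_t cputime = ONE_JIFFY_NS;
        uint64_t scaled  = ONE_JIFFY_NS;        /* cputime_to_scaled() elided */

        cputime *= ticks;                       /* scale once ... */
        scaled  *= ticks;
        /* ... then classify as irq/softirq/user/idle exactly as before */
        idle_ns += cputime;
        (void)scaled;
}

int main(void)
{
        irqtime_account_ticks(5);               /* one call, not a 5-iteration loop */
        printf("idle: %llu ns\n", (unsigned long long)idle_ns);
        return 0;
}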
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index b08095786cb8..fc4f98b1258f 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -57,8 +57,6 @@ void init_dl_bandwidth(struct dl_bandwidth *dl_b, u64 period, u64 runtime)
57 dl_b->dl_runtime = runtime; 57 dl_b->dl_runtime = runtime;
58} 58}
59 59
60extern unsigned long to_ratio(u64 period, u64 runtime);
61
62void init_dl_bw(struct dl_bw *dl_b) 60void init_dl_bw(struct dl_bw *dl_b)
63{ 61{
64 raw_spin_lock_init(&dl_b->lock); 62 raw_spin_lock_init(&dl_b->lock);
@@ -348,12 +346,7 @@ static void replenish_dl_entity(struct sched_dl_entity *dl_se,
348 * entity. 346 * entity.
349 */ 347 */
350 if (dl_time_before(dl_se->deadline, rq_clock(rq))) { 348 if (dl_time_before(dl_se->deadline, rq_clock(rq))) {
351 static bool lag_once = false; 349 printk_deferred_once("sched: DL replenish lagged to much\n");
352
353 if (!lag_once) {
354 lag_once = true;
355 printk_sched("sched: DL replenish lagged to much\n");
356 }
357 dl_se->deadline = rq_clock(rq) + pi_se->dl_deadline; 350 dl_se->deadline = rq_clock(rq) + pi_se->dl_deadline;
358 dl_se->runtime = pi_se->dl_runtime; 351 dl_se->runtime = pi_se->dl_runtime;
359 } 352 }
@@ -513,14 +506,22 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer *timer)
513 struct sched_dl_entity, 506 struct sched_dl_entity,
514 dl_timer); 507 dl_timer);
515 struct task_struct *p = dl_task_of(dl_se); 508 struct task_struct *p = dl_task_of(dl_se);
516 struct rq *rq = task_rq(p); 509 struct rq *rq;
510again:
511 rq = task_rq(p);
517 raw_spin_lock(&rq->lock); 512 raw_spin_lock(&rq->lock);
518 513
514 if (rq != task_rq(p)) {
515 /* Task was moved, retrying. */
516 raw_spin_unlock(&rq->lock);
517 goto again;
518 }
519
519 /* 520 /*
520 * We need to take care of possible races here. In fact, the 521 * We need to take care of possible races here. In fact, the
521 * task might have changed its scheduling policy to something 522 * task might have changed its scheduling policy to something
522 * different from SCHED_DEADLINE or changed its reservation 523 * different from SCHED_DEADLINE or changed its reservation
523 * parameters (through sched_setscheduler()). 524 * parameters (through sched_setattr()).
524 */ 525 */
525 if (!dl_task(p) || dl_se->dl_new) 526 if (!dl_task(p) || dl_se->dl_new)
526 goto unlock; 527 goto unlock;
@@ -528,6 +529,7 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer *timer)
528 sched_clock_tick(); 529 sched_clock_tick();
529 update_rq_clock(rq); 530 update_rq_clock(rq);
530 dl_se->dl_throttled = 0; 531 dl_se->dl_throttled = 0;
532 dl_se->dl_yielded = 0;
531 if (p->on_rq) { 533 if (p->on_rq) {
532 enqueue_task_dl(rq, p, ENQUEUE_REPLENISH); 534 enqueue_task_dl(rq, p, ENQUEUE_REPLENISH);
533 if (task_has_dl_policy(rq->curr)) 535 if (task_has_dl_policy(rq->curr))
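The retry loop added to dl_task_timer() needs the lock of the runqueue the task is currently on, so it re-checks task_rq() after acquiring the lock and starts over if the task migrated in the meantime. A userspace model of that pattern, with pthread mutexes standing in for raw_spin_lock() and GCC __atomic loads for the kernel's plain re-reads; all names are illustrative:

#include <pthread.h>

struct rq_model {
        pthread_mutex_t lock;
};

struct task_model {
        struct rq_model *rq;    /* may change under us until we hold rq->lock */
};

struct rq_model *lock_task_rq(struct task_model *p)
{
        struct rq_model *rq;

again:
        rq = __atomic_load_n(&p->rq, __ATOMIC_RELAXED);
        pthread_mutex_lock(&rq->lock);
        if (rq != __atomic_load_n(&p->rq, __ATOMIC_RELAXED)) {
                /* Task was moved, retrying. */
                pthread_mutex_unlock(&rq->lock);
                goto again;
        }
        return rq;              /* caller unlocks rq->lock when done */
}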
@@ -740,7 +742,7 @@ void inc_dl_tasks(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq)
740 742
741 WARN_ON(!dl_prio(prio)); 743 WARN_ON(!dl_prio(prio));
742 dl_rq->dl_nr_running++; 744 dl_rq->dl_nr_running++;
743 inc_nr_running(rq_of_dl_rq(dl_rq)); 745 add_nr_running(rq_of_dl_rq(dl_rq), 1);
744 746
745 inc_dl_deadline(dl_rq, deadline); 747 inc_dl_deadline(dl_rq, deadline);
746 inc_dl_migration(dl_se, dl_rq); 748 inc_dl_migration(dl_se, dl_rq);
@@ -754,7 +756,7 @@ void dec_dl_tasks(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq)
754 WARN_ON(!dl_prio(prio)); 756 WARN_ON(!dl_prio(prio));
755 WARN_ON(!dl_rq->dl_nr_running); 757 WARN_ON(!dl_rq->dl_nr_running);
756 dl_rq->dl_nr_running--; 758 dl_rq->dl_nr_running--;
757 dec_nr_running(rq_of_dl_rq(dl_rq)); 759 sub_nr_running(rq_of_dl_rq(dl_rq), 1);
758 760
759 dec_dl_deadline(dl_rq, dl_se->deadline); 761 dec_dl_deadline(dl_rq, dl_se->deadline);
760 dec_dl_migration(dl_se, dl_rq); 762 dec_dl_migration(dl_se, dl_rq);
@@ -893,10 +895,10 @@ static void yield_task_dl(struct rq *rq)
893 * We make the task go to sleep until its current deadline by 895 * We make the task go to sleep until its current deadline by
894 * forcing its runtime to zero. This way, update_curr_dl() stops 896 * forcing its runtime to zero. This way, update_curr_dl() stops
895 * it and the bandwidth timer will wake it up and will give it 897 * it and the bandwidth timer will wake it up and will give it
896 * new scheduling parameters (thanks to dl_new=1). 898 * new scheduling parameters (thanks to dl_yielded=1).
897 */ 899 */
898 if (p->dl.runtime > 0) { 900 if (p->dl.runtime > 0) {
899 rq->curr->dl.dl_new = 1; 901 rq->curr->dl.dl_yielded = 1;
900 p->dl.runtime = 0; 902 p->dl.runtime = 0;
901 } 903 }
902 update_curr_dl(rq); 904 update_curr_dl(rq);
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 695f9773bb60..627b3c34b821 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -608,7 +608,7 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
608 608
609 avg_atom = p->se.sum_exec_runtime; 609 avg_atom = p->se.sum_exec_runtime;
610 if (nr_switches) 610 if (nr_switches)
611 do_div(avg_atom, nr_switches); 611 avg_atom = div64_ul(avg_atom, nr_switches);
612 else 612 else
613 avg_atom = -1LL; 613 avg_atom = -1LL;
614 614
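The switch to div64_ul() is because do_div() expects a 32-bit divisor while nr_switches here is a 64-bit count. In plain userspace C the same computation is simply 64-bit division; the values below are illustrative:

#include <stdio.h>
#include <stdint.h>

int main(void)
{
        uint64_t sum_exec_runtime = 123456789012ULL;    /* ns, made up */
        uint64_t nr_switches = 4096;
        uint64_t avg_atom = nr_switches ? sum_exec_runtime / nr_switches
                                        : (uint64_t)-1;

        printf("avg_atom = %llu ns\n", (unsigned long long)avg_atom);
        return 0;
}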
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 7570dd969c28..fea7d3335e1f 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1017,7 +1017,7 @@ bool should_numa_migrate_memory(struct task_struct *p, struct page * page,
1017static unsigned long weighted_cpuload(const int cpu); 1017static unsigned long weighted_cpuload(const int cpu);
1018static unsigned long source_load(int cpu, int type); 1018static unsigned long source_load(int cpu, int type);
1019static unsigned long target_load(int cpu, int type); 1019static unsigned long target_load(int cpu, int type);
1020static unsigned long power_of(int cpu); 1020static unsigned long capacity_of(int cpu);
1021static long effective_load(struct task_group *tg, int cpu, long wl, long wg); 1021static long effective_load(struct task_group *tg, int cpu, long wl, long wg);
1022 1022
1023/* Cached statistics for all CPUs within a node */ 1023/* Cached statistics for all CPUs within a node */
@@ -1026,11 +1026,11 @@ struct numa_stats {
1026 unsigned long load; 1026 unsigned long load;
1027 1027
1028 /* Total compute capacity of CPUs on a node */ 1028 /* Total compute capacity of CPUs on a node */
1029 unsigned long power; 1029 unsigned long compute_capacity;
1030 1030
1031 /* Approximate capacity in terms of runnable tasks on a node */ 1031 /* Approximate capacity in terms of runnable tasks on a node */
1032 unsigned long capacity; 1032 unsigned long task_capacity;
1033 int has_capacity; 1033 int has_free_capacity;
1034}; 1034};
1035 1035
1036/* 1036/*
@@ -1046,7 +1046,7 @@ static void update_numa_stats(struct numa_stats *ns, int nid)
1046 1046
1047 ns->nr_running += rq->nr_running; 1047 ns->nr_running += rq->nr_running;
1048 ns->load += weighted_cpuload(cpu); 1048 ns->load += weighted_cpuload(cpu);
1049 ns->power += power_of(cpu); 1049 ns->compute_capacity += capacity_of(cpu);
1050 1050
1051 cpus++; 1051 cpus++;
1052 } 1052 }
@@ -1056,15 +1056,16 @@ static void update_numa_stats(struct numa_stats *ns, int nid)
1056 * the @ns structure is NULL'ed and task_numa_compare() will 1056 * the @ns structure is NULL'ed and task_numa_compare() will
1057 * not find this node attractive. 1057 * not find this node attractive.
1058 * 1058 *
1059 * We'll either bail at !has_capacity, or we'll detect a huge imbalance 1059 * We'll either bail at !has_free_capacity, or we'll detect a huge
1060 * and bail there. 1060 * imbalance and bail there.
1061 */ 1061 */
1062 if (!cpus) 1062 if (!cpus)
1063 return; 1063 return;
1064 1064
1065 ns->load = (ns->load * SCHED_POWER_SCALE) / ns->power; 1065 ns->load = (ns->load * SCHED_CAPACITY_SCALE) / ns->compute_capacity;
1066 ns->capacity = DIV_ROUND_CLOSEST(ns->power, SCHED_POWER_SCALE); 1066 ns->task_capacity =
1067 ns->has_capacity = (ns->nr_running < ns->capacity); 1067 DIV_ROUND_CLOSEST(ns->compute_capacity, SCHED_CAPACITY_SCALE);
1068 ns->has_free_capacity = (ns->nr_running < ns->task_capacity);
1068} 1069}
1069 1070
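A worked example of the per-node statistics above with the renamed capacity terms; SCHED_CAPACITY_SCALE is 1024, and the load/capacity inputs below are made up:

#include <stdio.h>

#define SCHED_CAPACITY_SCALE 1024UL

int main(void)
{
        unsigned long load = 3072;              /* sum of weighted_cpuload() */
        unsigned long compute_capacity = 4096;  /* sum of capacity_of(), four full CPUs */
        unsigned long nr_running = 3;

        unsigned long scaled_load = load * SCHED_CAPACITY_SCALE / compute_capacity;
        /* DIV_ROUND_CLOSEST(compute_capacity, SCHED_CAPACITY_SCALE) */
        unsigned long task_capacity =
                (compute_capacity + SCHED_CAPACITY_SCALE / 2) / SCHED_CAPACITY_SCALE;
        int has_free_capacity = nr_running < task_capacity;

        printf("load=%lu task_capacity=%lu has_free_capacity=%d\n",
               scaled_load, task_capacity, has_free_capacity);  /* 768, 4, 1 */
        return 0;
}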
1070struct task_numa_env { 1071struct task_numa_env {
@@ -1095,6 +1096,34 @@ static void task_numa_assign(struct task_numa_env *env,
1095 env->best_cpu = env->dst_cpu; 1096 env->best_cpu = env->dst_cpu;
1096} 1097}
1097 1098
1099static bool load_too_imbalanced(long orig_src_load, long orig_dst_load,
1100 long src_load, long dst_load,
1101 struct task_numa_env *env)
1102{
1103 long imb, old_imb;
1104
1105 /* We care about the slope of the imbalance, not the direction. */
1106 if (dst_load < src_load)
1107 swap(dst_load, src_load);
1108
1109 /* Is the difference below the threshold? */
1110 imb = dst_load * 100 - src_load * env->imbalance_pct;
1111 if (imb <= 0)
1112 return false;
1113
1114 /*
1115 * The imbalance is above the allowed threshold.
1116 * Compare it with the old imbalance.
1117 */
1118 if (orig_dst_load < orig_src_load)
1119 swap(orig_dst_load, orig_src_load);
1120
1121 old_imb = orig_dst_load * 100 - orig_src_load * env->imbalance_pct;
1122
1123 /* Would this change make things worse? */
1124 return (imb > old_imb);
1125}
1126
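The new load_too_imbalanced() helper, lifted into a standalone sketch so the threshold logic can be exercised directly; an imbalance_pct of 125 means roughly "tolerate up to 25% imbalance", and the loads in main() are illustrative:

#include <stdio.h>

static void swap_long(long *a, long *b) { long t = *a; *a = *b; *b = t; }

static int load_too_imbalanced(long orig_src_load, long orig_dst_load,
                               long src_load, long dst_load, int imbalance_pct)
{
        long imb, old_imb;

        /* We care about the slope of the imbalance, not the direction. */
        if (dst_load < src_load)
                swap_long(&dst_load, &src_load);

        /* Is the difference below the threshold? */
        imb = dst_load * 100 - src_load * imbalance_pct;
        if (imb <= 0)
                return 0;

        /* Above the threshold: would the move make things worse than before? */
        if (orig_dst_load < orig_src_load)
                swap_long(&orig_dst_load, &orig_src_load);
        old_imb = orig_dst_load * 100 - orig_src_load * imbalance_pct;

        return imb > old_imb;
}

int main(void)
{
        /* Moving a load of 300 from a 1000-load node onto a 900-load node. */
        printf("%d\n", load_too_imbalanced(1000, 900, 1000 - 300, 900 + 300, 125));
        return 0;       /* prints 1: the move would worsen the imbalance */
}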
1098/* 1127/*
1099 * This checks if the overall compute and NUMA accesses of the system would 1128 * This checks if the overall compute and NUMA accesses of the system would
1100 * be improved if the source tasks was migrated to the target dst_cpu taking 1129 * be improved if the source tasks was migrated to the target dst_cpu taking
@@ -1107,7 +1136,8 @@ static void task_numa_compare(struct task_numa_env *env,
1107 struct rq *src_rq = cpu_rq(env->src_cpu); 1136 struct rq *src_rq = cpu_rq(env->src_cpu);
1108 struct rq *dst_rq = cpu_rq(env->dst_cpu); 1137 struct rq *dst_rq = cpu_rq(env->dst_cpu);
1109 struct task_struct *cur; 1138 struct task_struct *cur;
1110 long dst_load, src_load; 1139 long orig_src_load, src_load;
1140 long orig_dst_load, dst_load;
1111 long load; 1141 long load;
1112 long imp = (groupimp > 0) ? groupimp : taskimp; 1142 long imp = (groupimp > 0) ? groupimp : taskimp;
1113 1143
@@ -1166,8 +1196,8 @@ static void task_numa_compare(struct task_numa_env *env,
1166 1196
1167 if (!cur) { 1197 if (!cur) {
1168 /* Is there capacity at our destination? */ 1198 /* Is there capacity at our destination? */
1169 if (env->src_stats.has_capacity && 1199 if (env->src_stats.has_free_capacity &&
1170 !env->dst_stats.has_capacity) 1200 !env->dst_stats.has_free_capacity)
1171 goto unlock; 1201 goto unlock;
1172 1202
1173 goto balance; 1203 goto balance;
@@ -1181,13 +1211,13 @@ static void task_numa_compare(struct task_numa_env *env,
1181 * In the overloaded case, try and keep the load balanced. 1211 * In the overloaded case, try and keep the load balanced.
1182 */ 1212 */
1183balance: 1213balance:
1184 dst_load = env->dst_stats.load; 1214 orig_dst_load = env->dst_stats.load;
1185 src_load = env->src_stats.load; 1215 orig_src_load = env->src_stats.load;
1186 1216
1187 /* XXX missing power terms */ 1217 /* XXX missing capacity terms */
1188 load = task_h_load(env->p); 1218 load = task_h_load(env->p);
1189 dst_load += load; 1219 dst_load = orig_dst_load + load;
1190 src_load -= load; 1220 src_load = orig_src_load - load;
1191 1221
1192 if (cur) { 1222 if (cur) {
1193 load = task_h_load(cur); 1223 load = task_h_load(cur);
@@ -1195,11 +1225,8 @@ balance:
1195 src_load += load; 1225 src_load += load;
1196 } 1226 }
1197 1227
1198 /* make src_load the smaller */ 1228 if (load_too_imbalanced(orig_src_load, orig_dst_load,
1199 if (dst_load < src_load) 1229 src_load, dst_load, env))
1200 swap(dst_load, src_load);
1201
1202 if (src_load * env->imbalance_pct < dst_load * 100)
1203 goto unlock; 1230 goto unlock;
1204 1231
1205assign: 1232assign:
@@ -1275,8 +1302,8 @@ static int task_numa_migrate(struct task_struct *p)
1275 groupimp = group_weight(p, env.dst_nid) - groupweight; 1302 groupimp = group_weight(p, env.dst_nid) - groupweight;
1276 update_numa_stats(&env.dst_stats, env.dst_nid); 1303 update_numa_stats(&env.dst_stats, env.dst_nid);
1277 1304
1278 /* If the preferred nid has capacity, try to use it. */ 1305 /* If the preferred nid has free capacity, try to use it. */
1279 if (env.dst_stats.has_capacity) 1306 if (env.dst_stats.has_free_capacity)
1280 task_numa_find_cpu(&env, taskimp, groupimp); 1307 task_numa_find_cpu(&env, taskimp, groupimp);
1281 1308
1282 /* No space available on the preferred nid. Look elsewhere. */ 1309 /* No space available on the preferred nid. Look elsewhere. */
@@ -1301,7 +1328,16 @@ static int task_numa_migrate(struct task_struct *p)
1301 if (env.best_cpu == -1) 1328 if (env.best_cpu == -1)
1302 return -EAGAIN; 1329 return -EAGAIN;
1303 1330
1304 sched_setnuma(p, env.dst_nid); 1331 /*
1332 * If the task is part of a workload that spans multiple NUMA nodes,
1333 * and is migrating into one of the workload's active nodes, remember
1334 * this node as the task's preferred numa node, so the workload can
1335 * settle down.
1336 * A task that migrated to a second choice node will be better off
1337 * trying for a better one later. Do not set the preferred node here.
1338 */
1339 if (p->numa_group && node_isset(env.dst_nid, p->numa_group->active_nodes))
1340 sched_setnuma(p, env.dst_nid);
1305 1341
1306 /* 1342 /*
1307 * Reset the scan period if the task is being rescheduled on an 1343 * Reset the scan period if the task is being rescheduled on an
@@ -1326,12 +1362,15 @@ static int task_numa_migrate(struct task_struct *p)
1326/* Attempt to migrate a task to a CPU on the preferred node. */ 1362/* Attempt to migrate a task to a CPU on the preferred node. */
1327static void numa_migrate_preferred(struct task_struct *p) 1363static void numa_migrate_preferred(struct task_struct *p)
1328{ 1364{
1365 unsigned long interval = HZ;
1366
1329 /* This task has no NUMA fault statistics yet */ 1367 /* This task has no NUMA fault statistics yet */
1330 if (unlikely(p->numa_preferred_nid == -1 || !p->numa_faults_memory)) 1368 if (unlikely(p->numa_preferred_nid == -1 || !p->numa_faults_memory))
1331 return; 1369 return;
1332 1370
1333 /* Periodically retry migrating the task to the preferred node */ 1371 /* Periodically retry migrating the task to the preferred node */
1334 p->numa_migrate_retry = jiffies + HZ; 1372 interval = min(interval, msecs_to_jiffies(p->numa_scan_period) / 16);
1373 p->numa_migrate_retry = jiffies + interval;
1335 1374
1336 /* Success if task is already running on preferred CPU */ 1375 /* Success if task is already running on preferred CPU */
1337 if (task_node(p) == p->numa_preferred_nid) 1376 if (task_node(p) == p->numa_preferred_nid)
@@ -1707,18 +1746,19 @@ no_join:
1707void task_numa_free(struct task_struct *p) 1746void task_numa_free(struct task_struct *p)
1708{ 1747{
1709 struct numa_group *grp = p->numa_group; 1748 struct numa_group *grp = p->numa_group;
1710 int i;
1711 void *numa_faults = p->numa_faults_memory; 1749 void *numa_faults = p->numa_faults_memory;
1750 unsigned long flags;
1751 int i;
1712 1752
1713 if (grp) { 1753 if (grp) {
1714 spin_lock_irq(&grp->lock); 1754 spin_lock_irqsave(&grp->lock, flags);
1715 for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++) 1755 for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++)
1716 grp->faults[i] -= p->numa_faults_memory[i]; 1756 grp->faults[i] -= p->numa_faults_memory[i];
1717 grp->total_faults -= p->total_numa_faults; 1757 grp->total_faults -= p->total_numa_faults;
1718 1758
1719 list_del(&p->numa_entry); 1759 list_del(&p->numa_entry);
1720 grp->nr_tasks--; 1760 grp->nr_tasks--;
1721 spin_unlock_irq(&grp->lock); 1761 spin_unlock_irqrestore(&grp->lock, flags);
1722 rcu_assign_pointer(p->numa_group, NULL); 1762 rcu_assign_pointer(p->numa_group, NULL);
1723 put_numa_group(grp); 1763 put_numa_group(grp);
1724 } 1764 }
@@ -1738,6 +1778,7 @@ void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags)
1738 struct task_struct *p = current; 1778 struct task_struct *p = current;
1739 bool migrated = flags & TNF_MIGRATED; 1779 bool migrated = flags & TNF_MIGRATED;
1740 int cpu_node = task_node(current); 1780 int cpu_node = task_node(current);
1781 int local = !!(flags & TNF_FAULT_LOCAL);
1741 int priv; 1782 int priv;
1742 1783
1743 if (!numabalancing_enabled) 1784 if (!numabalancing_enabled)
@@ -1786,6 +1827,17 @@ void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags)
1786 task_numa_group(p, last_cpupid, flags, &priv); 1827 task_numa_group(p, last_cpupid, flags, &priv);
1787 } 1828 }
1788 1829
1830 /*
1831 * If a workload spans multiple NUMA nodes, a shared fault that
1832 * occurs wholly within the set of nodes that the workload is
1833 * actively using should be counted as local. This allows the
1834 * scan rate to slow down when a workload has settled down.
1835 */
1836 if (!priv && !local && p->numa_group &&
1837 node_isset(cpu_node, p->numa_group->active_nodes) &&
1838 node_isset(mem_node, p->numa_group->active_nodes))
1839 local = 1;
1840
1789 task_numa_placement(p); 1841 task_numa_placement(p);
1790 1842
1791 /* 1843 /*
@@ -1800,7 +1852,7 @@ void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags)
1800 1852
1801 p->numa_faults_buffer_memory[task_faults_idx(mem_node, priv)] += pages; 1853 p->numa_faults_buffer_memory[task_faults_idx(mem_node, priv)] += pages;
1802 p->numa_faults_buffer_cpu[task_faults_idx(cpu_node, priv)] += pages; 1854 p->numa_faults_buffer_cpu[task_faults_idx(cpu_node, priv)] += pages;
1803 p->numa_faults_locality[!!(flags & TNF_FAULT_LOCAL)] += pages; 1855 p->numa_faults_locality[local] += pages;
1804} 1856}
1805 1857
1806static void reset_ptenuma_scan(struct task_struct *p) 1858static void reset_ptenuma_scan(struct task_struct *p)
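Sketch of the locality decision added in the task_numa_fault() hunk above: a shared fault whose CPU node and memory node both fall inside the group's active node set is counted as local, so the scan rate can back off once the workload settles. A plain bitmask stands in for nodemask_t, and the "trivially local" check below is only a rough model of TNF_FAULT_LOCAL:

#include <stdio.h>

static int fault_is_local(int cpu_node, int mem_node, int priv,
                          unsigned long active_nodes)
{
        if (cpu_node == mem_node)
                return 1;                               /* trivially local */
        if (!priv &&
            (active_nodes & (1UL << cpu_node)) &&
            (active_nodes & (1UL << mem_node)))
                return 1;                               /* local to the group's set */
        return 0;
}

int main(void)
{
        unsigned long active = (1UL << 0) | (1UL << 2); /* nodes 0 and 2 active */

        printf("%d\n", fault_is_local(0, 2, 0, active));        /* 1 */
        printf("%d\n", fault_is_local(0, 1, 0, active));        /* 0 */
        return 0;
}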
@@ -3129,7 +3181,7 @@ static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq)
3129 */ 3181 */
3130 if (!cfs_b->timer_active) { 3182 if (!cfs_b->timer_active) {
3131 __refill_cfs_bandwidth_runtime(cfs_b); 3183 __refill_cfs_bandwidth_runtime(cfs_b);
3132 __start_cfs_bandwidth(cfs_b); 3184 __start_cfs_bandwidth(cfs_b, false);
3133 } 3185 }
3134 3186
3135 if (cfs_b->runtime > 0) { 3187 if (cfs_b->runtime > 0) {
@@ -3174,10 +3226,12 @@ static void expire_cfs_rq_runtime(struct cfs_rq *cfs_rq)
3174 * has not truly expired. 3226 * has not truly expired.
3175 * 3227 *
3176 * Fortunately we can check determine whether this the case by checking 3228 * Fortunately we can check determine whether this the case by checking
3177 * whether the global deadline has advanced. 3229 * whether the global deadline has advanced. It is valid to compare
3230 * cfs_b->runtime_expires without any locks since we only care about
3231 * exact equality, so a partial write will still work.
3178 */ 3232 */
3179 3233
3180 if ((s64)(cfs_rq->runtime_expires - cfs_b->runtime_expires) >= 0) { 3234 if (cfs_rq->runtime_expires != cfs_b->runtime_expires) {
3181 /* extend local deadline, drift is bounded above by 2 ticks */ 3235 /* extend local deadline, drift is bounded above by 2 ticks */
3182 cfs_rq->runtime_expires += TICK_NSEC; 3236 cfs_rq->runtime_expires += TICK_NSEC;
3183 } else { 3237 } else {
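The predicate change in this hunk matters because cfs_b->runtime_expires may now be read without cfs_b->lock: an exact-equality test can only fail safely on a stale or partial value, whereas the old signed comparison could wrongly extend the local deadline. A small illustration with made-up values:

#include <stdio.h>
#include <stdint.h>

int main(void)
{
        uint64_t local_expires  = 3000; /* e.g. a stale view seen without the lock */
        uint64_t global_expires = 2000; /* cfs_b->runtime_expires has moved on */

        /* old check: "local deadline not behind the global one" */
        int old_check = (int64_t)(local_expires - global_expires) >= 0;
        /* new check: "still exactly the period we drew our runtime from" */
        int new_check = local_expires == global_expires;

        /* old=1 would extend the deadline; new=0 forces a fresh look instead */
        printf("old=%d new=%d\n", old_check, new_check);
        return 0;
}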
@@ -3301,14 +3355,14 @@ static void throttle_cfs_rq(struct cfs_rq *cfs_rq)
3301 } 3355 }
3302 3356
3303 if (!se) 3357 if (!se)
3304 rq->nr_running -= task_delta; 3358 sub_nr_running(rq, task_delta);
3305 3359
3306 cfs_rq->throttled = 1; 3360 cfs_rq->throttled = 1;
3307 cfs_rq->throttled_clock = rq_clock(rq); 3361 cfs_rq->throttled_clock = rq_clock(rq);
3308 raw_spin_lock(&cfs_b->lock); 3362 raw_spin_lock(&cfs_b->lock);
3309 list_add_tail_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq); 3363 list_add_tail_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq);
3310 if (!cfs_b->timer_active) 3364 if (!cfs_b->timer_active)
3311 __start_cfs_bandwidth(cfs_b); 3365 __start_cfs_bandwidth(cfs_b, false);
3312 raw_spin_unlock(&cfs_b->lock); 3366 raw_spin_unlock(&cfs_b->lock);
3313} 3367}
3314 3368
@@ -3352,7 +3406,7 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
3352 } 3406 }
3353 3407
3354 if (!se) 3408 if (!se)
3355 rq->nr_running += task_delta; 3409 add_nr_running(rq, task_delta);
3356 3410
3357 /* determine whether we need to wake up potentially idle cpu */ 3411 /* determine whether we need to wake up potentially idle cpu */
3358 if (rq->curr == rq->idle && rq->cfs.nr_running) 3412 if (rq->curr == rq->idle && rq->cfs.nr_running)
@@ -3406,21 +3460,21 @@ next:
3406static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun) 3460static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun)
3407{ 3461{
3408 u64 runtime, runtime_expires; 3462 u64 runtime, runtime_expires;
3409 int idle = 1, throttled; 3463 int throttled;
3410 3464
3411 raw_spin_lock(&cfs_b->lock);
3412 /* no need to continue the timer with no bandwidth constraint */ 3465 /* no need to continue the timer with no bandwidth constraint */
3413 if (cfs_b->quota == RUNTIME_INF) 3466 if (cfs_b->quota == RUNTIME_INF)
3414 goto out_unlock; 3467 goto out_deactivate;
3415 3468
3416 throttled = !list_empty(&cfs_b->throttled_cfs_rq); 3469 throttled = !list_empty(&cfs_b->throttled_cfs_rq);
3417 /* idle depends on !throttled (for the case of a large deficit) */
3418 idle = cfs_b->idle && !throttled;
3419 cfs_b->nr_periods += overrun; 3470 cfs_b->nr_periods += overrun;
3420 3471
3421 /* if we're going inactive then everything else can be deferred */ 3472 /*
3422 if (idle) 3473 * idle depends on !throttled (for the case of a large deficit), and if
3423 goto out_unlock; 3474 * we're going inactive then everything else can be deferred
3475 */
3476 if (cfs_b->idle && !throttled)
3477 goto out_deactivate;
3424 3478
3425 /* 3479 /*
3426 * if we have relooped after returning idle once, we need to update our 3480 * if we have relooped after returning idle once, we need to update our
@@ -3434,7 +3488,7 @@ static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun)
3434 if (!throttled) { 3488 if (!throttled) {
3435 /* mark as potentially idle for the upcoming period */ 3489 /* mark as potentially idle for the upcoming period */
3436 cfs_b->idle = 1; 3490 cfs_b->idle = 1;
3437 goto out_unlock; 3491 return 0;
3438 } 3492 }
3439 3493
3440 /* account preceding periods in which throttling occurred */ 3494 /* account preceding periods in which throttling occurred */
@@ -3474,12 +3528,12 @@ static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun)
3474 * timer to remain active while there are any throttled entities.) 3528 * timer to remain active while there are any throttled entities.)
3475 */ 3529 */
3476 cfs_b->idle = 0; 3530 cfs_b->idle = 0;
3477out_unlock:
3478 if (idle)
3479 cfs_b->timer_active = 0;
3480 raw_spin_unlock(&cfs_b->lock);
3481 3531
3482 return idle; 3532 return 0;
3533
3534out_deactivate:
3535 cfs_b->timer_active = 0;
3536 return 1;
3483} 3537}
3484 3538
3485/* a cfs_rq won't donate quota below this amount */ 3539/* a cfs_rq won't donate quota below this amount */
@@ -3656,6 +3710,7 @@ static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer)
3656 int overrun; 3710 int overrun;
3657 int idle = 0; 3711 int idle = 0;
3658 3712
3713 raw_spin_lock(&cfs_b->lock);
3659 for (;;) { 3714 for (;;) {
3660 now = hrtimer_cb_get_time(timer); 3715 now = hrtimer_cb_get_time(timer);
3661 overrun = hrtimer_forward(timer, now, cfs_b->period); 3716 overrun = hrtimer_forward(timer, now, cfs_b->period);
@@ -3665,6 +3720,7 @@ static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer)
3665 3720
3666 idle = do_sched_cfs_period_timer(cfs_b, overrun); 3721 idle = do_sched_cfs_period_timer(cfs_b, overrun);
3667 } 3722 }
3723 raw_spin_unlock(&cfs_b->lock);
3668 3724
3669 return idle ? HRTIMER_NORESTART : HRTIMER_RESTART; 3725 return idle ? HRTIMER_NORESTART : HRTIMER_RESTART;
3670} 3726}
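Net effect of this hunk and the do_sched_cfs_period_timer() hunks above: the period timer takes cfs_b->lock once around the whole forward/overrun loop, and the helper runs with the lock already held. The shape of that refactor as a pthread sketch with invented names:

#include <pthread.h>

struct bw_model {
        pthread_mutex_t lock;
        int quota_is_inf;
};

/* Called with bw->lock held; returns non-zero when the timer can stop. */
int do_period(struct bw_model *bw, int overrun)
{
        if (bw->quota_is_inf)
                return 1;       /* ~ out_deactivate */
        /* ...refill and distribute runtime... */
        return 0;
}

int period_timer(struct bw_model *bw, int overruns)
{
        int idle = 0;

        pthread_mutex_lock(&bw->lock);          /* once, not per iteration */
        while (overruns-- > 0)
                idle = do_period(bw, 1);
        pthread_mutex_unlock(&bw->lock);

        return idle;
}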
@@ -3690,7 +3746,7 @@ static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq)
3690} 3746}
3691 3747
3692/* requires cfs_b->lock, may release to reprogram timer */ 3748/* requires cfs_b->lock, may release to reprogram timer */
3693void __start_cfs_bandwidth(struct cfs_bandwidth *cfs_b) 3749void __start_cfs_bandwidth(struct cfs_bandwidth *cfs_b, bool force)
3694{ 3750{
3695 /* 3751 /*
3696 * The timer may be active because we're trying to set a new bandwidth 3752 * The timer may be active because we're trying to set a new bandwidth
@@ -3705,7 +3761,7 @@ void __start_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
3705 cpu_relax(); 3761 cpu_relax();
3706 raw_spin_lock(&cfs_b->lock); 3762 raw_spin_lock(&cfs_b->lock);
3707 /* if someone else restarted the timer then we're done */ 3763 /* if someone else restarted the timer then we're done */
3708 if (cfs_b->timer_active) 3764 if (!force && cfs_b->timer_active)
3709 return; 3765 return;
3710 } 3766 }
3711 3767
@@ -3724,8 +3780,6 @@ static void __maybe_unused unthrottle_offline_cfs_rqs(struct rq *rq)
3724 struct cfs_rq *cfs_rq; 3780 struct cfs_rq *cfs_rq;
3725 3781
3726 for_each_leaf_cfs_rq(rq, cfs_rq) { 3782 for_each_leaf_cfs_rq(rq, cfs_rq) {
3727 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
3728
3729 if (!cfs_rq->runtime_enabled) 3783 if (!cfs_rq->runtime_enabled)
3730 continue; 3784 continue;
3731 3785
@@ -3733,7 +3787,7 @@ static void __maybe_unused unthrottle_offline_cfs_rqs(struct rq *rq)
3733 * clock_task is not advancing so we just need to make sure 3787 * clock_task is not advancing so we just need to make sure
3734 * there's some valid quota amount 3788 * there's some valid quota amount
3735 */ 3789 */
3736 cfs_rq->runtime_remaining = cfs_b->quota; 3790 cfs_rq->runtime_remaining = 1;
3737 if (cfs_rq_throttled(cfs_rq)) 3791 if (cfs_rq_throttled(cfs_rq))
3738 unthrottle_cfs_rq(cfs_rq); 3792 unthrottle_cfs_rq(cfs_rq);
3739 } 3793 }
@@ -3884,7 +3938,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
3884 3938
3885 if (!se) { 3939 if (!se) {
3886 update_rq_runnable_avg(rq, rq->nr_running); 3940 update_rq_runnable_avg(rq, rq->nr_running);
3887 inc_nr_running(rq); 3941 add_nr_running(rq, 1);
3888 } 3942 }
3889 hrtick_update(rq); 3943 hrtick_update(rq);
3890} 3944}
@@ -3944,7 +3998,7 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
3944 } 3998 }
3945 3999
3946 if (!se) { 4000 if (!se) {
3947 dec_nr_running(rq); 4001 sub_nr_running(rq, 1);
3948 update_rq_runnable_avg(rq, 1); 4002 update_rq_runnable_avg(rq, 1);
3949 } 4003 }
3950 hrtick_update(rq); 4004 hrtick_update(rq);
@@ -3990,9 +4044,9 @@ static unsigned long target_load(int cpu, int type)
3990 return max(rq->cpu_load[type-1], total); 4044 return max(rq->cpu_load[type-1], total);
3991} 4045}
3992 4046
3993static unsigned long power_of(int cpu) 4047static unsigned long capacity_of(int cpu)
3994{ 4048{
3995 return cpu_rq(cpu)->cpu_power; 4049 return cpu_rq(cpu)->cpu_capacity;
3996} 4050}
3997 4051
3998static unsigned long cpu_avg_load_per_task(int cpu) 4052static unsigned long cpu_avg_load_per_task(int cpu)
@@ -4014,8 +4068,8 @@ static void record_wakee(struct task_struct *p)
4014 * about the boundary, really active task won't care 4068 * about the boundary, really active task won't care
4015 * about the loss. 4069 * about the loss.
4016 */ 4070 */
4017 if (jiffies > current->wakee_flip_decay_ts + HZ) { 4071 if (time_after(jiffies, current->wakee_flip_decay_ts + HZ)) {
4018 current->wakee_flips = 0; 4072 current->wakee_flips >>= 1;
4019 current->wakee_flip_decay_ts = jiffies; 4073 current->wakee_flip_decay_ts = jiffies;
4020 } 4074 }
4021 4075
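record_wakee() now halves wakee_flips instead of zeroing it, and uses the wrap-safe time_after() test. A small model of both changes; time_after_() below mirrors the kernel's signed-difference definition:

#include <stdio.h>

static unsigned long jiffies;                   /* toy clock */
#define HZ_ 1000UL

static int time_after_(unsigned long a, unsigned long b)
{
        return (long)(b - a) < 0;               /* wrap-safe: a is after b */
}

struct wakee_model {
        unsigned int wakee_flips;
        unsigned long wakee_flip_decay_ts;
};

static void record_wakee_sketch(struct wakee_model *w)
{
        if (time_after_(jiffies, w->wakee_flip_decay_ts + HZ_)) {
                w->wakee_flips >>= 1;           /* decay, don't discard the history */
                w->wakee_flip_decay_ts = jiffies;
        }
        w->wakee_flips++;
}

int main(void)
{
        struct wakee_model w = { .wakee_flips = 8, .wakee_flip_decay_ts = 0 };

        jiffies = 1500;                         /* more than HZ later */
        record_wakee_sketch(&w);
        printf("%u\n", w.wakee_flips);          /* (8 >> 1) + 1 = 5 */
        return 0;
}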
@@ -4235,12 +4289,12 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)
4235 s64 this_eff_load, prev_eff_load; 4289 s64 this_eff_load, prev_eff_load;
4236 4290
4237 this_eff_load = 100; 4291 this_eff_load = 100;
4238 this_eff_load *= power_of(prev_cpu); 4292 this_eff_load *= capacity_of(prev_cpu);
4239 this_eff_load *= this_load + 4293 this_eff_load *= this_load +
4240 effective_load(tg, this_cpu, weight, weight); 4294 effective_load(tg, this_cpu, weight, weight);
4241 4295
4242 prev_eff_load = 100 + (sd->imbalance_pct - 100) / 2; 4296 prev_eff_load = 100 + (sd->imbalance_pct - 100) / 2;
4243 prev_eff_load *= power_of(this_cpu); 4297 prev_eff_load *= capacity_of(this_cpu);
4244 prev_eff_load *= load + effective_load(tg, prev_cpu, 0, weight); 4298 prev_eff_load *= load + effective_load(tg, prev_cpu, 0, weight);
4245 4299
4246 balanced = this_eff_load <= prev_eff_load; 4300 balanced = this_eff_load <= prev_eff_load;
@@ -4316,8 +4370,8 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p,
4316 avg_load += load; 4370 avg_load += load;
4317 } 4371 }
4318 4372
4319 /* Adjust by relative CPU power of the group */ 4373 /* Adjust by relative CPU capacity of the group */
4320 avg_load = (avg_load * SCHED_POWER_SCALE) / group->sgp->power; 4374 avg_load = (avg_load * SCHED_CAPACITY_SCALE) / group->sgc->capacity;
4321 4375
4322 if (local_group) { 4376 if (local_group) {
4323 this_load = avg_load; 4377 this_load = avg_load;
@@ -4449,10 +4503,10 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f
4449 sd = tmp; 4503 sd = tmp;
4450 } 4504 }
4451 4505
4452 if (affine_sd) { 4506 if (affine_sd && cpu != prev_cpu && wake_affine(affine_sd, p, sync))
4453 if (cpu != prev_cpu && wake_affine(affine_sd, p, sync)) 4507 prev_cpu = cpu;
4454 prev_cpu = cpu;
4455 4508
4509 if (sd_flag & SD_BALANCE_WAKE) {
4456 new_cpu = select_idle_sibling(p, prev_cpu); 4510 new_cpu = select_idle_sibling(p, prev_cpu);
4457 goto unlock; 4511 goto unlock;
4458 } 4512 }
@@ -4520,6 +4574,9 @@ migrate_task_rq_fair(struct task_struct *p, int next_cpu)
4520 atomic_long_add(se->avg.load_avg_contrib, 4574 atomic_long_add(se->avg.load_avg_contrib,
4521 &cfs_rq->removed_load); 4575 &cfs_rq->removed_load);
4522 } 4576 }
4577
4578 /* We have migrated, no longer consider this task hot */
4579 se->exec_start = 0;
4523} 4580}
4524#endif /* CONFIG_SMP */ 4581#endif /* CONFIG_SMP */
4525 4582
@@ -4894,14 +4951,14 @@ static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preemp
4894 * 4951 *
4895 * W'_i,n = (2^n - 1) / 2^n * W_i,n + 1 / 2^n * W_i,0 (3) 4952 * W'_i,n = (2^n - 1) / 2^n * W_i,n + 1 / 2^n * W_i,0 (3)
4896 * 4953 *
4897 * P_i is the cpu power (or compute capacity) of cpu i, typically it is the 4954 * C_i is the compute capacity of cpu i, typically it is the
4898 * fraction of 'recent' time available for SCHED_OTHER task execution. But it 4955 * fraction of 'recent' time available for SCHED_OTHER task execution. But it
4899 * can also include other factors [XXX]. 4956 * can also include other factors [XXX].
4900 * 4957 *
4901 * To achieve this balance we define a measure of imbalance which follows 4958 * To achieve this balance we define a measure of imbalance which follows
4902 * directly from (1): 4959 * directly from (1):
4903 * 4960 *
4904 * imb_i,j = max{ avg(W/P), W_i/P_i } - min{ avg(W/P), W_j/P_j } (4) 4961 * imb_i,j = max{ avg(W/C), W_i/C_i } - min{ avg(W/C), W_j/C_j } (4)
4905 * 4962 *
4906 * We then move tasks around to minimize the imbalance. In the continuous 4963 * We then move tasks around to minimize the imbalance. In the continuous
4907 * function space it is obvious this converges, in the discrete case we get 4964 * function space it is obvious this converges, in the discrete case we get
@@ -5070,6 +5127,7 @@ task_hot(struct task_struct *p, u64 now)
5070/* Returns true if the destination node has incurred more faults */ 5127/* Returns true if the destination node has incurred more faults */
5071static bool migrate_improves_locality(struct task_struct *p, struct lb_env *env) 5128static bool migrate_improves_locality(struct task_struct *p, struct lb_env *env)
5072{ 5129{
5130 struct numa_group *numa_group = rcu_dereference(p->numa_group);
5073 int src_nid, dst_nid; 5131 int src_nid, dst_nid;
5074 5132
5075 if (!sched_feat(NUMA_FAVOUR_HIGHER) || !p->numa_faults_memory || 5133 if (!sched_feat(NUMA_FAVOUR_HIGHER) || !p->numa_faults_memory ||
@@ -5083,21 +5141,29 @@ static bool migrate_improves_locality(struct task_struct *p, struct lb_env *env)
5083 if (src_nid == dst_nid) 5141 if (src_nid == dst_nid)
5084 return false; 5142 return false;
5085 5143
5086 /* Always encourage migration to the preferred node. */ 5144 if (numa_group) {
5087 if (dst_nid == p->numa_preferred_nid) 5145 /* Task is already in the group's interleave set. */
5088 return true; 5146 if (node_isset(src_nid, numa_group->active_nodes))
5147 return false;
5148
5149 /* Task is moving into the group's interleave set. */
5150 if (node_isset(dst_nid, numa_group->active_nodes))
5151 return true;
5152
5153 return group_faults(p, dst_nid) > group_faults(p, src_nid);
5154 }
5089 5155
5090 /* If both task and group weight improve, this move is a winner. */ 5156 /* Encourage migration to the preferred node. */
5091 if (task_weight(p, dst_nid) > task_weight(p, src_nid) && 5157 if (dst_nid == p->numa_preferred_nid)
5092 group_weight(p, dst_nid) > group_weight(p, src_nid))
5093 return true; 5158 return true;
5094 5159
5095 return false; 5160 return task_faults(p, dst_nid) > task_faults(p, src_nid);
5096} 5161}
5097 5162
5098 5163
5099static bool migrate_degrades_locality(struct task_struct *p, struct lb_env *env) 5164static bool migrate_degrades_locality(struct task_struct *p, struct lb_env *env)
5100{ 5165{
5166 struct numa_group *numa_group = rcu_dereference(p->numa_group);
5101 int src_nid, dst_nid; 5167 int src_nid, dst_nid;
5102 5168
5103 if (!sched_feat(NUMA) || !sched_feat(NUMA_RESIST_LOWER)) 5169 if (!sched_feat(NUMA) || !sched_feat(NUMA_RESIST_LOWER))
@@ -5112,16 +5178,23 @@ static bool migrate_degrades_locality(struct task_struct *p, struct lb_env *env)
5112 if (src_nid == dst_nid) 5178 if (src_nid == dst_nid)
5113 return false; 5179 return false;
5114 5180
5181 if (numa_group) {
5182 /* Task is moving within/into the group's interleave set. */
5183 if (node_isset(dst_nid, numa_group->active_nodes))
5184 return false;
5185
5186 /* Task is moving out of the group's interleave set. */
5187 if (node_isset(src_nid, numa_group->active_nodes))
5188 return true;
5189
5190 return group_faults(p, dst_nid) < group_faults(p, src_nid);
5191 }
5192
5115 /* Migrating away from the preferred node is always bad. */ 5193 /* Migrating away from the preferred node is always bad. */
5116 if (src_nid == p->numa_preferred_nid) 5194 if (src_nid == p->numa_preferred_nid)
5117 return true; 5195 return true;
5118 5196
5119 /* If either task or group weight get worse, don't do it. */ 5197 return task_faults(p, dst_nid) < task_faults(p, src_nid);
5120 if (task_weight(p, dst_nid) < task_weight(p, src_nid) ||
5121 group_weight(p, dst_nid) < group_weight(p, src_nid))
5122 return true;
5123
5124 return false;
5125} 5198}
5126 5199
5127#else 5200#else
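A standalone sketch of the group-aware decisions above: for tasks in a numa_group the active node set is consulted first, then the per-node fault counts; tasks without a group fall back to the preferred node and task faults. The bitmask, the fault arrays and the "has a group" test (a non-empty active set) are stand-ins, not the kernel data structures:

#include <stdio.h>

static int in_set(unsigned long set, int nid) { return !!(set & (1UL << nid)); }

/* Returns 1 if moving from src_nid to dst_nid improves locality. */
static int improves_locality(int src_nid, int dst_nid,
                             unsigned long active_nodes,
                             const unsigned long *group_faults,
                             int preferred_nid,
                             const unsigned long *task_faults)
{
        if (src_nid == dst_nid)
                return 0;

        if (active_nodes) {                     /* task belongs to a numa_group */
                if (in_set(active_nodes, src_nid))
                        return 0;               /* already in the interleave set */
                if (in_set(active_nodes, dst_nid))
                        return 1;               /* moving into the interleave set */
                return group_faults[dst_nid] > group_faults[src_nid];
        }

        if (dst_nid == preferred_nid)
                return 1;
        return task_faults[dst_nid] > task_faults[src_nid];
}

int main(void)
{
        unsigned long group_faults[4] = { 10, 40, 5, 0 };
        unsigned long task_faults[4]  = { 10, 40, 5, 0 };

        /* Node 1 is in the active set: moving 0 -> 1 counts as an improvement. */
        printf("%d\n", improves_locality(0, 1, 1UL << 1, group_faults,
                                         -1, task_faults));
        return 0;
}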
@@ -5460,13 +5533,13 @@ struct sg_lb_stats {
5460 unsigned long group_load; /* Total load over the CPUs of the group */ 5533 unsigned long group_load; /* Total load over the CPUs of the group */
5461 unsigned long sum_weighted_load; /* Weighted load of group's tasks */ 5534 unsigned long sum_weighted_load; /* Weighted load of group's tasks */
5462 unsigned long load_per_task; 5535 unsigned long load_per_task;
5463 unsigned long group_power; 5536 unsigned long group_capacity;
5464 unsigned int sum_nr_running; /* Nr tasks running in the group */ 5537 unsigned int sum_nr_running; /* Nr tasks running in the group */
5465 unsigned int group_capacity; 5538 unsigned int group_capacity_factor;
5466 unsigned int idle_cpus; 5539 unsigned int idle_cpus;
5467 unsigned int group_weight; 5540 unsigned int group_weight;
5468 int group_imb; /* Is there an imbalance in the group ? */ 5541 int group_imb; /* Is there an imbalance in the group ? */
5469 int group_has_capacity; /* Is there extra capacity in the group? */ 5542 int group_has_free_capacity;
5470#ifdef CONFIG_NUMA_BALANCING 5543#ifdef CONFIG_NUMA_BALANCING
5471 unsigned int nr_numa_running; 5544 unsigned int nr_numa_running;
5472 unsigned int nr_preferred_running; 5545 unsigned int nr_preferred_running;
@@ -5481,7 +5554,7 @@ struct sd_lb_stats {
5481 struct sched_group *busiest; /* Busiest group in this sd */ 5554 struct sched_group *busiest; /* Busiest group in this sd */
5482 struct sched_group *local; /* Local group in this sd */ 5555 struct sched_group *local; /* Local group in this sd */
5483 unsigned long total_load; /* Total load of all groups in sd */ 5556 unsigned long total_load; /* Total load of all groups in sd */
5484 unsigned long total_pwr; /* Total power of all groups in sd */ 5557 unsigned long total_capacity; /* Total capacity of all groups in sd */
5485 unsigned long avg_load; /* Average load across all groups in sd */ 5558 unsigned long avg_load; /* Average load across all groups in sd */
5486 5559
5487 struct sg_lb_stats busiest_stat;/* Statistics of the busiest group */ 5560 struct sg_lb_stats busiest_stat;/* Statistics of the busiest group */
@@ -5500,7 +5573,7 @@ static inline void init_sd_lb_stats(struct sd_lb_stats *sds)
5500 .busiest = NULL, 5573 .busiest = NULL,
5501 .local = NULL, 5574 .local = NULL,
5502 .total_load = 0UL, 5575 .total_load = 0UL,
5503 .total_pwr = 0UL, 5576 .total_capacity = 0UL,
5504 .busiest_stat = { 5577 .busiest_stat = {
5505 .avg_load = 0UL, 5578 .avg_load = 0UL,
5506 }, 5579 },
@@ -5535,17 +5608,17 @@ static inline int get_sd_load_idx(struct sched_domain *sd,
5535 return load_idx; 5608 return load_idx;
5536} 5609}
5537 5610
5538static unsigned long default_scale_freq_power(struct sched_domain *sd, int cpu) 5611static unsigned long default_scale_capacity(struct sched_domain *sd, int cpu)
5539{ 5612{
5540 return SCHED_POWER_SCALE; 5613 return SCHED_CAPACITY_SCALE;
5541} 5614}
5542 5615
5543unsigned long __weak arch_scale_freq_power(struct sched_domain *sd, int cpu) 5616unsigned long __weak arch_scale_freq_capacity(struct sched_domain *sd, int cpu)
5544{ 5617{
5545 return default_scale_freq_power(sd, cpu); 5618 return default_scale_capacity(sd, cpu);
5546} 5619}
5547 5620
5548static unsigned long default_scale_smt_power(struct sched_domain *sd, int cpu) 5621static unsigned long default_scale_smt_capacity(struct sched_domain *sd, int cpu)
5549{ 5622{
5550 unsigned long weight = sd->span_weight; 5623 unsigned long weight = sd->span_weight;
5551 unsigned long smt_gain = sd->smt_gain; 5624 unsigned long smt_gain = sd->smt_gain;
@@ -5555,15 +5628,16 @@ static unsigned long default_scale_smt_power(struct sched_domain *sd, int cpu)
5555 return smt_gain; 5628 return smt_gain;
5556} 5629}
5557 5630
5558unsigned long __weak arch_scale_smt_power(struct sched_domain *sd, int cpu) 5631unsigned long __weak arch_scale_smt_capacity(struct sched_domain *sd, int cpu)
5559{ 5632{
5560 return default_scale_smt_power(sd, cpu); 5633 return default_scale_smt_capacity(sd, cpu);
5561} 5634}
5562 5635
5563static unsigned long scale_rt_power(int cpu) 5636static unsigned long scale_rt_capacity(int cpu)
5564{ 5637{
5565 struct rq *rq = cpu_rq(cpu); 5638 struct rq *rq = cpu_rq(cpu);
5566 u64 total, available, age_stamp, avg; 5639 u64 total, available, age_stamp, avg;
5640 s64 delta;
5567 5641
5568 /* 5642 /*
5569 * Since we're reading these variables without serialization make sure 5643 * Since we're reading these variables without serialization make sure
@@ -5572,74 +5646,78 @@ static unsigned long scale_rt_power(int cpu)
5572 age_stamp = ACCESS_ONCE(rq->age_stamp); 5646 age_stamp = ACCESS_ONCE(rq->age_stamp);
5573 avg = ACCESS_ONCE(rq->rt_avg); 5647 avg = ACCESS_ONCE(rq->rt_avg);
5574 5648
5575 total = sched_avg_period() + (rq_clock(rq) - age_stamp); 5649 delta = rq_clock(rq) - age_stamp;
5650 if (unlikely(delta < 0))
5651 delta = 0;
5652
5653 total = sched_avg_period() + delta;
5576 5654
5577 if (unlikely(total < avg)) { 5655 if (unlikely(total < avg)) {
5578 /* Ensures that power won't end up being negative */ 5656 /* Ensures that capacity won't end up being negative */
5579 available = 0; 5657 available = 0;
5580 } else { 5658 } else {
5581 available = total - avg; 5659 available = total - avg;
5582 } 5660 }
5583 5661
5584 if (unlikely((s64)total < SCHED_POWER_SCALE)) 5662 if (unlikely((s64)total < SCHED_CAPACITY_SCALE))
5585 total = SCHED_POWER_SCALE; 5663 total = SCHED_CAPACITY_SCALE;
5586 5664
5587 total >>= SCHED_POWER_SHIFT; 5665 total >>= SCHED_CAPACITY_SHIFT;
5588 5666
5589 return div_u64(available, total); 5667 return div_u64(available, total);
5590} 5668}
5591 5669
5592static void update_cpu_power(struct sched_domain *sd, int cpu) 5670static void update_cpu_capacity(struct sched_domain *sd, int cpu)
5593{ 5671{
5594 unsigned long weight = sd->span_weight; 5672 unsigned long weight = sd->span_weight;
5595 unsigned long power = SCHED_POWER_SCALE; 5673 unsigned long capacity = SCHED_CAPACITY_SCALE;
5596 struct sched_group *sdg = sd->groups; 5674 struct sched_group *sdg = sd->groups;
5597 5675
5598 if ((sd->flags & SD_SHARE_CPUPOWER) && weight > 1) { 5676 if ((sd->flags & SD_SHARE_CPUCAPACITY) && weight > 1) {
5599 if (sched_feat(ARCH_POWER)) 5677 if (sched_feat(ARCH_CAPACITY))
5600 power *= arch_scale_smt_power(sd, cpu); 5678 capacity *= arch_scale_smt_capacity(sd, cpu);
5601 else 5679 else
5602 power *= default_scale_smt_power(sd, cpu); 5680 capacity *= default_scale_smt_capacity(sd, cpu);
5603 5681
5604 power >>= SCHED_POWER_SHIFT; 5682 capacity >>= SCHED_CAPACITY_SHIFT;
5605 } 5683 }
5606 5684
5607 sdg->sgp->power_orig = power; 5685 sdg->sgc->capacity_orig = capacity;
5608 5686
5609 if (sched_feat(ARCH_POWER)) 5687 if (sched_feat(ARCH_CAPACITY))
5610 power *= arch_scale_freq_power(sd, cpu); 5688 capacity *= arch_scale_freq_capacity(sd, cpu);
5611 else 5689 else
5612 power *= default_scale_freq_power(sd, cpu); 5690 capacity *= default_scale_capacity(sd, cpu);
5613 5691
5614 power >>= SCHED_POWER_SHIFT; 5692 capacity >>= SCHED_CAPACITY_SHIFT;
5615 5693
5616 power *= scale_rt_power(cpu); 5694 capacity *= scale_rt_capacity(cpu);
5617 power >>= SCHED_POWER_SHIFT; 5695 capacity >>= SCHED_CAPACITY_SHIFT;
5618 5696
5619 if (!power) 5697 if (!capacity)
5620 power = 1; 5698 capacity = 1;
5621 5699
5622 cpu_rq(cpu)->cpu_power = power; 5700 cpu_rq(cpu)->cpu_capacity = capacity;
5623 sdg->sgp->power = power; 5701 sdg->sgc->capacity = capacity;
5624} 5702}
5625 5703
5626void update_group_power(struct sched_domain *sd, int cpu) 5704void update_group_capacity(struct sched_domain *sd, int cpu)
5627{ 5705{
5628 struct sched_domain *child = sd->child; 5706 struct sched_domain *child = sd->child;
5629 struct sched_group *group, *sdg = sd->groups; 5707 struct sched_group *group, *sdg = sd->groups;
5630 unsigned long power, power_orig; 5708 unsigned long capacity, capacity_orig;
5631 unsigned long interval; 5709 unsigned long interval;
5632 5710
5633 interval = msecs_to_jiffies(sd->balance_interval); 5711 interval = msecs_to_jiffies(sd->balance_interval);
5634 interval = clamp(interval, 1UL, max_load_balance_interval); 5712 interval = clamp(interval, 1UL, max_load_balance_interval);
5635 sdg->sgp->next_update = jiffies + interval; 5713 sdg->sgc->next_update = jiffies + interval;
5636 5714
5637 if (!child) { 5715 if (!child) {
5638 update_cpu_power(sd, cpu); 5716 update_cpu_capacity(sd, cpu);
5639 return; 5717 return;
5640 } 5718 }
5641 5719
5642 power_orig = power = 0; 5720 capacity_orig = capacity = 0;
5643 5721
5644 if (child->flags & SD_OVERLAP) { 5722 if (child->flags & SD_OVERLAP) {
5645 /* 5723 /*
@@ -5648,31 +5726,31 @@ void update_group_power(struct sched_domain *sd, int cpu)
5648 */ 5726 */
5649 5727
5650 for_each_cpu(cpu, sched_group_cpus(sdg)) { 5728 for_each_cpu(cpu, sched_group_cpus(sdg)) {
5651 struct sched_group_power *sgp; 5729 struct sched_group_capacity *sgc;
5652 struct rq *rq = cpu_rq(cpu); 5730 struct rq *rq = cpu_rq(cpu);
5653 5731
5654 /* 5732 /*
5655 * build_sched_domains() -> init_sched_groups_power() 5733 * build_sched_domains() -> init_sched_groups_capacity()
5656 * gets here before we've attached the domains to the 5734 * gets here before we've attached the domains to the
5657 * runqueues. 5735 * runqueues.
5658 * 5736 *
5659 * Use power_of(), which is set irrespective of domains 5737 * Use capacity_of(), which is set irrespective of domains
5660 * in update_cpu_power(). 5738 * in update_cpu_capacity().
5661 * 5739 *
5662 * This avoids power/power_orig from being 0 and 5740 * This avoids capacity/capacity_orig from being 0 and
5663 * causing divide-by-zero issues on boot. 5741 * causing divide-by-zero issues on boot.
5664 * 5742 *
5665 * Runtime updates will correct power_orig. 5743 * Runtime updates will correct capacity_orig.
5666 */ 5744 */
5667 if (unlikely(!rq->sd)) { 5745 if (unlikely(!rq->sd)) {
5668 power_orig += power_of(cpu); 5746 capacity_orig += capacity_of(cpu);
5669 power += power_of(cpu); 5747 capacity += capacity_of(cpu);
5670 continue; 5748 continue;
5671 } 5749 }
5672 5750
5673 sgp = rq->sd->groups->sgp; 5751 sgc = rq->sd->groups->sgc;
5674 power_orig += sgp->power_orig; 5752 capacity_orig += sgc->capacity_orig;
5675 power += sgp->power; 5753 capacity += sgc->capacity;
5676 } 5754 }
5677 } else { 5755 } else {
5678 /* 5756 /*
@@ -5682,14 +5760,14 @@ void update_group_power(struct sched_domain *sd, int cpu)
5682 5760
5683 group = child->groups; 5761 group = child->groups;
5684 do { 5762 do {
5685 power_orig += group->sgp->power_orig; 5763 capacity_orig += group->sgc->capacity_orig;
5686 power += group->sgp->power; 5764 capacity += group->sgc->capacity;
5687 group = group->next; 5765 group = group->next;
5688 } while (group != child->groups); 5766 } while (group != child->groups);
5689 } 5767 }
5690 5768
5691 sdg->sgp->power_orig = power_orig; 5769 sdg->sgc->capacity_orig = capacity_orig;
5692 sdg->sgp->power = power; 5770 sdg->sgc->capacity = capacity;
5693} 5771}
5694 5772
5695/* 5773/*
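A worked example of update_cpu_capacity()/scale_rt_capacity() with the renamed terms, for an SMT sibling domain: start from SCHED_CAPACITY_SCALE, apply the SMT and frequency factors, then scale by the fraction of the averaging period not consumed by RT/IRQ work. All inputs below are made up:

#include <stdio.h>

#define SCHED_CAPACITY_SHIFT 10
#define SCHED_CAPACITY_SCALE (1UL << SCHED_CAPACITY_SHIFT)

int main(void)
{
        unsigned long capacity = SCHED_CAPACITY_SCALE;
        unsigned long smt_factor  = 589;                /* ~1178/2: two HW threads per core */
        unsigned long freq_factor = SCHED_CAPACITY_SCALE;       /* full clock */

        /* rt scaling: share of the averaging period left after RT/IRQ time */
        unsigned long long total = 1000000, rt_avg = 250000;    /* ns, made up */
        unsigned long long avail = total > rt_avg ? total - rt_avg : 0;
        unsigned long rt_factor =
                (unsigned long)(avail / (total >> SCHED_CAPACITY_SHIFT));

        capacity = capacity * smt_factor  >> SCHED_CAPACITY_SHIFT;
        capacity = capacity * freq_factor >> SCHED_CAPACITY_SHIFT;
        capacity = capacity * rt_factor   >> SCHED_CAPACITY_SHIFT;
        if (!capacity)
                capacity = 1;

        printf("cpu_capacity = %lu\n", capacity);       /* 441 with these inputs */
        return 0;
}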
@@ -5703,15 +5781,15 @@ static inline int
5703fix_small_capacity(struct sched_domain *sd, struct sched_group *group) 5781fix_small_capacity(struct sched_domain *sd, struct sched_group *group)
5704{ 5782{
5705 /* 5783 /*
5706 * Only siblings can have significantly less than SCHED_POWER_SCALE 5784 * Only siblings can have significantly less than SCHED_CAPACITY_SCALE
5707 */ 5785 */
5708 if (!(sd->flags & SD_SHARE_CPUPOWER)) 5786 if (!(sd->flags & SD_SHARE_CPUCAPACITY))
5709 return 0; 5787 return 0;
5710 5788
5711 /* 5789 /*
5712 * If ~90% of the cpu_power is still there, we're good. 5790 * If ~90% of the cpu_capacity is still there, we're good.
5713 */ 5791 */
5714 if (group->sgp->power * 32 > group->sgp->power_orig * 29) 5792 if (group->sgc->capacity * 32 > group->sgc->capacity_orig * 29)
5715 return 1; 5793 return 1;
5716 5794
5717 return 0; 5795 return 0;
@@ -5748,34 +5826,35 @@ fix_small_capacity(struct sched_domain *sd, struct sched_group *group)
5748 5826
5749static inline int sg_imbalanced(struct sched_group *group) 5827static inline int sg_imbalanced(struct sched_group *group)
5750{ 5828{
5751 return group->sgp->imbalance; 5829 return group->sgc->imbalance;
5752} 5830}
5753 5831
5754/* 5832/*
5755 * Compute the group capacity. 5833 * Compute the group capacity factor.
5756 * 5834 *
5757 * Avoid the issue where N*frac(smt_power) >= 1 creates 'phantom' cores by 5835 * Avoid the issue where N*frac(smt_capacity) >= 1 creates 'phantom' cores by
5758 * first dividing out the smt factor and computing the actual number of cores 5836 * first dividing out the smt factor and computing the actual number of cores
5759 * and limit power unit capacity with that. 5837 * and limit unit capacity with that.
5760 */ 5838 */
5761static inline int sg_capacity(struct lb_env *env, struct sched_group *group) 5839static inline int sg_capacity_factor(struct lb_env *env, struct sched_group *group)
5762{ 5840{
5763 unsigned int capacity, smt, cpus; 5841 unsigned int capacity_factor, smt, cpus;
5764 unsigned int power, power_orig; 5842 unsigned int capacity, capacity_orig;
5765 5843
5766 power = group->sgp->power; 5844 capacity = group->sgc->capacity;
5767 power_orig = group->sgp->power_orig; 5845 capacity_orig = group->sgc->capacity_orig;
5768 cpus = group->group_weight; 5846 cpus = group->group_weight;
5769 5847
5770 /* smt := ceil(cpus / power), assumes: 1 < smt_power < 2 */ 5848 /* smt := ceil(cpus / capacity), assumes: 1 < smt_capacity < 2 */
5771 smt = DIV_ROUND_UP(SCHED_POWER_SCALE * cpus, power_orig); 5849 smt = DIV_ROUND_UP(SCHED_CAPACITY_SCALE * cpus, capacity_orig);
5772 capacity = cpus / smt; /* cores */ 5850 capacity_factor = cpus / smt; /* cores */
5773 5851
5774 capacity = min_t(unsigned, capacity, DIV_ROUND_CLOSEST(power, SCHED_POWER_SCALE)); 5852 capacity_factor = min_t(unsigned,
5775 if (!capacity) 5853 capacity_factor, DIV_ROUND_CLOSEST(capacity, SCHED_CAPACITY_SCALE));
5776 capacity = fix_small_capacity(env->sd, group); 5854 if (!capacity_factor)
5855 capacity_factor = fix_small_capacity(env->sd, group);
5777 5856
5778 return capacity; 5857 return capacity_factor;
5779} 5858}
5780 5859
5781/** 5860/**
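A worked example of sg_capacity_factor(): eight hardware threads on four SMT2 cores have a capacity_orig of roughly 4.6 * 1024, so naive rounding would report five "phantom" cores; dividing out the SMT factor first yields four. Numbers are illustrative:

#include <stdio.h>

#define SCHED_CAPACITY_SCALE 1024U
#define DIV_ROUND_UP(n, d)      (((n) + (d) - 1) / (d))
#define DIV_ROUND_CLOSEST(n, d) (((n) + (d) / 2) / (d))

int main(void)
{
        unsigned int cpus = 8;                  /* hardware threads in the group */
        unsigned int capacity_orig = 4712;      /* 8 * 589, i.e. four SMT2 cores */
        unsigned int capacity = 4500;           /* after freq/rt scaling */

        unsigned int naive = DIV_ROUND_CLOSEST(capacity_orig, SCHED_CAPACITY_SCALE);

        /* smt := ceil(cpus * SCALE / capacity_orig), i.e. threads per core */
        unsigned int smt = DIV_ROUND_UP(SCHED_CAPACITY_SCALE * cpus, capacity_orig);
        unsigned int factor = cpus / smt;       /* actual cores */

        if (factor > DIV_ROUND_CLOSEST(capacity, SCHED_CAPACITY_SCALE))
                factor = DIV_ROUND_CLOSEST(capacity, SCHED_CAPACITY_SCALE);

        printf("naive=%u capacity_factor=%u\n", naive, factor); /* naive=5, factor=4 */
        return 0;
}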
@@ -5815,9 +5894,9 @@ static inline void update_sg_lb_stats(struct lb_env *env,
5815 sgs->idle_cpus++; 5894 sgs->idle_cpus++;
5816 } 5895 }
5817 5896
5818 /* Adjust by relative CPU power of the group */ 5897 /* Adjust by relative CPU capacity of the group */
5819 sgs->group_power = group->sgp->power; 5898 sgs->group_capacity = group->sgc->capacity;
5820 sgs->avg_load = (sgs->group_load*SCHED_POWER_SCALE) / sgs->group_power; 5899 sgs->avg_load = (sgs->group_load*SCHED_CAPACITY_SCALE) / sgs->group_capacity;
5821 5900
5822 if (sgs->sum_nr_running) 5901 if (sgs->sum_nr_running)
5823 sgs->load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running; 5902 sgs->load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running;
@@ -5825,10 +5904,10 @@ static inline void update_sg_lb_stats(struct lb_env *env,
5825 sgs->group_weight = group->group_weight; 5904 sgs->group_weight = group->group_weight;
5826 5905
5827 sgs->group_imb = sg_imbalanced(group); 5906 sgs->group_imb = sg_imbalanced(group);
5828 sgs->group_capacity = sg_capacity(env, group); 5907 sgs->group_capacity_factor = sg_capacity_factor(env, group);
5829 5908
5830 if (sgs->group_capacity > sgs->sum_nr_running) 5909 if (sgs->group_capacity_factor > sgs->sum_nr_running)
5831 sgs->group_has_capacity = 1; 5910 sgs->group_has_free_capacity = 1;
5832} 5911}
5833 5912
5834/** 5913/**
@@ -5852,7 +5931,7 @@ static bool update_sd_pick_busiest(struct lb_env *env,
5852 if (sgs->avg_load <= sds->busiest_stat.avg_load) 5931 if (sgs->avg_load <= sds->busiest_stat.avg_load)
5853 return false; 5932 return false;
5854 5933
5855 if (sgs->sum_nr_running > sgs->group_capacity) 5934 if (sgs->sum_nr_running > sgs->group_capacity_factor)
5856 return true; 5935 return true;
5857 5936
5858 if (sgs->group_imb) 5937 if (sgs->group_imb)
@@ -5932,8 +6011,8 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd
5932 sgs = &sds->local_stat; 6011 sgs = &sds->local_stat;
5933 6012
5934 if (env->idle != CPU_NEWLY_IDLE || 6013 if (env->idle != CPU_NEWLY_IDLE ||
5935 time_after_eq(jiffies, sg->sgp->next_update)) 6014 time_after_eq(jiffies, sg->sgc->next_update))
5936 update_group_power(env->sd, env->dst_cpu); 6015 update_group_capacity(env->sd, env->dst_cpu);
5937 } 6016 }
5938 6017
5939 update_sg_lb_stats(env, sg, load_idx, local_group, sgs); 6018 update_sg_lb_stats(env, sg, load_idx, local_group, sgs);
@@ -5943,17 +6022,17 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd
5943 6022
5944 /* 6023 /*
5945 * In case the child domain prefers tasks go to siblings 6024 * In case the child domain prefers tasks go to siblings
5946 * first, lower the sg capacity to one so that we'll try 6025 * first, lower the sg capacity factor to one so that we'll try
5947 * and move all the excess tasks away. We lower the capacity 6026 * and move all the excess tasks away. We lower the capacity
5948 * of a group only if the local group has the capacity to fit 6027 * of a group only if the local group has the capacity to fit
5949 * these excess tasks, i.e. nr_running < group_capacity. The 6028 * these excess tasks, i.e. nr_running < group_capacity_factor. The
5950 * extra check prevents the case where you always pull from the 6029 * extra check prevents the case where you always pull from the
5951 * heaviest group when it is already under-utilized (possible 6030 * heaviest group when it is already under-utilized (possible
5952 * with a large weight task outweighs the tasks on the system). 6031 * with a large weight task outweighs the tasks on the system).
5953 */ 6032 */
5954 if (prefer_sibling && sds->local && 6033 if (prefer_sibling && sds->local &&
5955 sds->local_stat.group_has_capacity) 6034 sds->local_stat.group_has_free_capacity)
5956 sgs->group_capacity = min(sgs->group_capacity, 1U); 6035 sgs->group_capacity_factor = min(sgs->group_capacity_factor, 1U);
5957 6036
5958 if (update_sd_pick_busiest(env, sds, sg, sgs)) { 6037 if (update_sd_pick_busiest(env, sds, sg, sgs)) {
5959 sds->busiest = sg; 6038 sds->busiest = sg;
@@ -5963,7 +6042,7 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd
5963next_group: 6042next_group:
5964 /* Now, start updating sd_lb_stats */ 6043 /* Now, start updating sd_lb_stats */
5965 sds->total_load += sgs->group_load; 6044 sds->total_load += sgs->group_load;
5966 sds->total_pwr += sgs->group_power; 6045 sds->total_capacity += sgs->group_capacity;
5967 6046
5968 sg = sg->next; 6047 sg = sg->next;
5969 } while (sg != env->sd->groups); 6048 } while (sg != env->sd->groups);
@@ -6010,8 +6089,8 @@ static int check_asym_packing(struct lb_env *env, struct sd_lb_stats *sds)
6010 return 0; 6089 return 0;
6011 6090
6012 env->imbalance = DIV_ROUND_CLOSEST( 6091 env->imbalance = DIV_ROUND_CLOSEST(
6013 sds->busiest_stat.avg_load * sds->busiest_stat.group_power, 6092 sds->busiest_stat.avg_load * sds->busiest_stat.group_capacity,
6014 SCHED_POWER_SCALE); 6093 SCHED_CAPACITY_SCALE);
6015 6094
6016 return 1; 6095 return 1;
6017} 6096}
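For SD_ASYM_PACKING the imbalance is simply the busiest group's average load converted back from capacity-scaled units. A worked example with invented values; the rounding macro is spelled out so the arithmetic is visible.

    #define SCHED_CAPACITY_SCALE    1024UL
    #define DIV_ROUND_CLOSEST(x, d) (((x) + ((d) / 2)) / (d))

    static unsigned long asym_imbalance_example(void)
    {
        unsigned long avg_load = 1536;          /* busiest_stat.avg_load       */
        unsigned long group_capacity = 2048;    /* busiest_stat.group_capacity */

        /* 1536 * 2048 / 1024 == 3072 units of load to move */
        return DIV_ROUND_CLOSEST(avg_load * group_capacity,
                                 SCHED_CAPACITY_SCALE);
    }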
@@ -6026,7 +6105,7 @@ static int check_asym_packing(struct lb_env *env, struct sd_lb_stats *sds)
6026static inline 6105static inline
6027void fix_small_imbalance(struct lb_env *env, struct sd_lb_stats *sds) 6106void fix_small_imbalance(struct lb_env *env, struct sd_lb_stats *sds)
6028{ 6107{
6029 unsigned long tmp, pwr_now = 0, pwr_move = 0; 6108 unsigned long tmp, capa_now = 0, capa_move = 0;
6030 unsigned int imbn = 2; 6109 unsigned int imbn = 2;
6031 unsigned long scaled_busy_load_per_task; 6110 unsigned long scaled_busy_load_per_task;
6032 struct sg_lb_stats *local, *busiest; 6111 struct sg_lb_stats *local, *busiest;
@@ -6040,8 +6119,8 @@ void fix_small_imbalance(struct lb_env *env, struct sd_lb_stats *sds)
6040 imbn = 1; 6119 imbn = 1;
6041 6120
6042 scaled_busy_load_per_task = 6121 scaled_busy_load_per_task =
6043 (busiest->load_per_task * SCHED_POWER_SCALE) / 6122 (busiest->load_per_task * SCHED_CAPACITY_SCALE) /
6044 busiest->group_power; 6123 busiest->group_capacity;
6045 6124
6046 if (busiest->avg_load + scaled_busy_load_per_task >= 6125 if (busiest->avg_load + scaled_busy_load_per_task >=
6047 local->avg_load + (scaled_busy_load_per_task * imbn)) { 6126 local->avg_load + (scaled_busy_load_per_task * imbn)) {
@@ -6051,38 +6130,38 @@ void fix_small_imbalance(struct lb_env *env, struct sd_lb_stats *sds)
6051 6130
6052 /* 6131 /*
6053 * OK, we don't have enough imbalance to justify moving tasks, 6132 * OK, we don't have enough imbalance to justify moving tasks,
6054 * however we may be able to increase total CPU power used by 6133 * however we may be able to increase total CPU capacity used by
6055 * moving them. 6134 * moving them.
6056 */ 6135 */
6057 6136
6058 pwr_now += busiest->group_power * 6137 capa_now += busiest->group_capacity *
6059 min(busiest->load_per_task, busiest->avg_load); 6138 min(busiest->load_per_task, busiest->avg_load);
6060 pwr_now += local->group_power * 6139 capa_now += local->group_capacity *
6061 min(local->load_per_task, local->avg_load); 6140 min(local->load_per_task, local->avg_load);
6062 pwr_now /= SCHED_POWER_SCALE; 6141 capa_now /= SCHED_CAPACITY_SCALE;
6063 6142
6064 /* Amount of load we'd subtract */ 6143 /* Amount of load we'd subtract */
6065 if (busiest->avg_load > scaled_busy_load_per_task) { 6144 if (busiest->avg_load > scaled_busy_load_per_task) {
6066 pwr_move += busiest->group_power * 6145 capa_move += busiest->group_capacity *
6067 min(busiest->load_per_task, 6146 min(busiest->load_per_task,
6068 busiest->avg_load - scaled_busy_load_per_task); 6147 busiest->avg_load - scaled_busy_load_per_task);
6069 } 6148 }
6070 6149
6071 /* Amount of load we'd add */ 6150 /* Amount of load we'd add */
6072 if (busiest->avg_load * busiest->group_power < 6151 if (busiest->avg_load * busiest->group_capacity <
6073 busiest->load_per_task * SCHED_POWER_SCALE) { 6152 busiest->load_per_task * SCHED_CAPACITY_SCALE) {
6074 tmp = (busiest->avg_load * busiest->group_power) / 6153 tmp = (busiest->avg_load * busiest->group_capacity) /
6075 local->group_power; 6154 local->group_capacity;
6076 } else { 6155 } else {
6077 tmp = (busiest->load_per_task * SCHED_POWER_SCALE) / 6156 tmp = (busiest->load_per_task * SCHED_CAPACITY_SCALE) /
6078 local->group_power; 6157 local->group_capacity;
6079 } 6158 }
6080 pwr_move += local->group_power * 6159 capa_move += local->group_capacity *
6081 min(local->load_per_task, local->avg_load + tmp); 6160 min(local->load_per_task, local->avg_load + tmp);
6082 pwr_move /= SCHED_POWER_SCALE; 6161 capa_move /= SCHED_CAPACITY_SCALE;
6083 6162
6084 /* Move if we gain throughput */ 6163 /* Move if we gain throughput */
6085 if (pwr_move > pwr_now) 6164 if (capa_move > capa_now)
6086 env->imbalance = busiest->load_per_task; 6165 env->imbalance = busiest->load_per_task;
6087} 6166}
6088 6167
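What the capa_now/capa_move comparison is really asking is whether moving one task raises the amount of capacity doing useful work. Below is a deliberately simplified back-of-the-envelope version of that question, with invented numbers and the scaled_busy_load_per_task corrections the real function applies left out.

    /* One busy CPU (capacity 1024) running two tasks of weight 512,
     * one idle CPU of equal capacity. */
    static int worth_moving_one_task(void)
    {
        unsigned long scale = 1024, busiest_cap = 1024, local_cap = 1024;
        unsigned long load_per_task = 512;

        unsigned long capa_now = (busiest_cap * load_per_task  /* min(512, avg 1024) */
                                + local_cap * 0)               /* min(512, avg 0)    */
                                 / scale;                      /* = 512  */

        unsigned long capa_move = (busiest_cap * load_per_task /* min(512, 512) */
                                 + local_cap * load_per_task)  /* min(512, 512) */
                                  / scale;                     /* = 1024 */

        return capa_move > capa_now;   /* yes: set env->imbalance = load_per_task */
    }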
@@ -6112,7 +6191,7 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
6112 /* 6191 /*
6113 * In the presence of smp nice balancing, certain scenarios can have 6192 * In the presence of smp nice balancing, certain scenarios can have
6114 * max load less than avg load(as we skip the groups at or below 6193 * max load less than avg load(as we skip the groups at or below
6115 * its cpu_power, while calculating max_load..) 6194 * its cpu_capacity, while calculating max_load..)
6116 */ 6195 */
6117 if (busiest->avg_load <= sds->avg_load || 6196 if (busiest->avg_load <= sds->avg_load ||
6118 local->avg_load >= sds->avg_load) { 6197 local->avg_load >= sds->avg_load) {
@@ -6127,10 +6206,10 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
6127 * have to drop below capacity to reach cpu-load equilibrium. 6206 * have to drop below capacity to reach cpu-load equilibrium.
6128 */ 6207 */
6129 load_above_capacity = 6208 load_above_capacity =
6130 (busiest->sum_nr_running - busiest->group_capacity); 6209 (busiest->sum_nr_running - busiest->group_capacity_factor);
6131 6210
6132 load_above_capacity *= (SCHED_LOAD_SCALE * SCHED_POWER_SCALE); 6211 load_above_capacity *= (SCHED_LOAD_SCALE * SCHED_CAPACITY_SCALE);
6133 load_above_capacity /= busiest->group_power; 6212 load_above_capacity /= busiest->group_capacity;
6134 } 6213 }
6135 6214
6136 /* 6215 /*
@@ -6145,9 +6224,9 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
6145 6224
6146 /* How much load to actually move to equalise the imbalance */ 6225 /* How much load to actually move to equalise the imbalance */
6147 env->imbalance = min( 6226 env->imbalance = min(
6148 max_pull * busiest->group_power, 6227 max_pull * busiest->group_capacity,
6149 (sds->avg_load - local->avg_load) * local->group_power 6228 (sds->avg_load - local->avg_load) * local->group_capacity
6150 ) / SCHED_POWER_SCALE; 6229 ) / SCHED_CAPACITY_SCALE;
6151 6230
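Both operands of the min() above are load-times-capacity products, so the final division by SCHED_CAPACITY_SCALE brings the imbalance back to plain load units. A small sketch with invented numbers:

    #define SCHED_CAPACITY_SCALE 1024UL

    static unsigned long imbalance_example(void)
    {
        unsigned long max_pull = 256;           /* load the busiest group can shed */
        unsigned long busiest_capacity = 2048, local_capacity = 1024;
        unsigned long sds_avg_load = 900, local_avg_load = 500;

        unsigned long a = max_pull * busiest_capacity;                       /* 524288 */
        unsigned long b = (sds_avg_load - local_avg_load) * local_capacity;  /* 409600 */

        return (a < b ? a : b) / SCHED_CAPACITY_SCALE;                       /* 400    */
    }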
6152 /* 6231 /*
6153 * if *imbalance is less than the average load per runnable task 6232 * if *imbalance is less than the average load per runnable task
@@ -6201,7 +6280,8 @@ static struct sched_group *find_busiest_group(struct lb_env *env)
6201 if (!sds.busiest || busiest->sum_nr_running == 0) 6280 if (!sds.busiest || busiest->sum_nr_running == 0)
6202 goto out_balanced; 6281 goto out_balanced;
6203 6282
6204 sds.avg_load = (SCHED_POWER_SCALE * sds.total_load) / sds.total_pwr; 6283 sds.avg_load = (SCHED_CAPACITY_SCALE * sds.total_load)
6284 / sds.total_capacity;
6205 6285
6206 /* 6286 /*
6207 * If the busiest group is imbalanced the below checks don't 6287 * If the busiest group is imbalanced the below checks don't
@@ -6212,8 +6292,8 @@ static struct sched_group *find_busiest_group(struct lb_env *env)
6212 goto force_balance; 6292 goto force_balance;
6213 6293
6214 /* SD_BALANCE_NEWIDLE trumps SMP nice when underutilized */ 6294 /* SD_BALANCE_NEWIDLE trumps SMP nice when underutilized */
6215 if (env->idle == CPU_NEWLY_IDLE && local->group_has_capacity && 6295 if (env->idle == CPU_NEWLY_IDLE && local->group_has_free_capacity &&
6216 !busiest->group_has_capacity) 6296 !busiest->group_has_free_capacity)
6217 goto force_balance; 6297 goto force_balance;
6218 6298
6219 /* 6299 /*
@@ -6267,11 +6347,11 @@ static struct rq *find_busiest_queue(struct lb_env *env,
6267 struct sched_group *group) 6347 struct sched_group *group)
6268{ 6348{
6269 struct rq *busiest = NULL, *rq; 6349 struct rq *busiest = NULL, *rq;
6270 unsigned long busiest_load = 0, busiest_power = 1; 6350 unsigned long busiest_load = 0, busiest_capacity = 1;
6271 int i; 6351 int i;
6272 6352
6273 for_each_cpu_and(i, sched_group_cpus(group), env->cpus) { 6353 for_each_cpu_and(i, sched_group_cpus(group), env->cpus) {
6274 unsigned long power, capacity, wl; 6354 unsigned long capacity, capacity_factor, wl;
6275 enum fbq_type rt; 6355 enum fbq_type rt;
6276 6356
6277 rq = cpu_rq(i); 6357 rq = cpu_rq(i);
@@ -6299,34 +6379,34 @@ static struct rq *find_busiest_queue(struct lb_env *env,
6299 if (rt > env->fbq_type) 6379 if (rt > env->fbq_type)
6300 continue; 6380 continue;
6301 6381
6302 power = power_of(i); 6382 capacity = capacity_of(i);
6303 capacity = DIV_ROUND_CLOSEST(power, SCHED_POWER_SCALE); 6383 capacity_factor = DIV_ROUND_CLOSEST(capacity, SCHED_CAPACITY_SCALE);
6304 if (!capacity) 6384 if (!capacity_factor)
6305 capacity = fix_small_capacity(env->sd, group); 6385 capacity_factor = fix_small_capacity(env->sd, group);
6306 6386
6307 wl = weighted_cpuload(i); 6387 wl = weighted_cpuload(i);
6308 6388
6309 /* 6389 /*
6310 * When comparing with imbalance, use weighted_cpuload() 6390 * When comparing with imbalance, use weighted_cpuload()
6311 * which is not scaled with the cpu power. 6391 * which is not scaled with the cpu capacity.
6312 */ 6392 */
6313 if (capacity && rq->nr_running == 1 && wl > env->imbalance) 6393 if (capacity_factor && rq->nr_running == 1 && wl > env->imbalance)
6314 continue; 6394 continue;
6315 6395
6316 /* 6396 /*
6317 * For the load comparisons with the other cpu's, consider 6397 * For the load comparisons with the other cpu's, consider
6318 * the weighted_cpuload() scaled with the cpu power, so that 6398 * the weighted_cpuload() scaled with the cpu capacity, so
6319 * the load can be moved away from the cpu that is potentially 6399 * that the load can be moved away from the cpu that is
6320 * running at a lower capacity. 6400 * potentially running at a lower capacity.
6321 * 6401 *
6322 * Thus we're looking for max(wl_i / power_i), crosswise 6402 * Thus we're looking for max(wl_i / capacity_i), crosswise
6323 * multiplication to rid ourselves of the division works out 6403 * multiplication to rid ourselves of the division works out
6324 * to: wl_i * power_j > wl_j * power_i; where j is our 6404 * to: wl_i * capacity_j > wl_j * capacity_i; where j is
6325 * previous maximum. 6405 * our previous maximum.
6326 */ 6406 */
6327 if (wl * busiest_power > busiest_load * power) { 6407 if (wl * busiest_capacity > busiest_load * capacity) {
6328 busiest_load = wl; 6408 busiest_load = wl;
6329 busiest_power = power; 6409 busiest_capacity = capacity;
6330 busiest = rq; 6410 busiest = rq;
6331 } 6411 }
6332 } 6412 }
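The comment above is the usual trick for comparing two ratios without dividing: wl_i/capacity_i > wl_j/capacity_j is rewritten as wl_i*capacity_j > wl_j*capacity_i. A minimal illustration, with the types simplified:

    struct rq_candidate { unsigned long wl, capacity; };

    /* true if a has the larger load-to-capacity ratio, division-free */
    static int busier_than(struct rq_candidate a, struct rq_candidate b)
    {
        return a.wl * b.capacity > b.wl * a.capacity;
    }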
@@ -6534,7 +6614,7 @@ more_balance:
6534 * We failed to reach balance because of affinity. 6614 * We failed to reach balance because of affinity.
6535 */ 6615 */
6536 if (sd_parent) { 6616 if (sd_parent) {
6537 int *group_imbalance = &sd_parent->groups->sgp->imbalance; 6617 int *group_imbalance = &sd_parent->groups->sgc->imbalance;
6538 6618
6539 if ((env.flags & LBF_SOME_PINNED) && env.imbalance > 0) { 6619 if ((env.flags & LBF_SOME_PINNED) && env.imbalance > 0) {
6540 *group_imbalance = 1; 6620 *group_imbalance = 1;
@@ -6640,27 +6720,62 @@ out:
6640 return ld_moved; 6720 return ld_moved;
6641} 6721}
6642 6722
6723static inline unsigned long
6724get_sd_balance_interval(struct sched_domain *sd, int cpu_busy)
6725{
6726 unsigned long interval = sd->balance_interval;
6727
6728 if (cpu_busy)
6729 interval *= sd->busy_factor;
6730
6731 /* scale ms to jiffies */
6732 interval = msecs_to_jiffies(interval);
6733 interval = clamp(interval, 1UL, max_load_balance_interval);
6734
6735 return interval;
6736}
6737
6738static inline void
6739update_next_balance(struct sched_domain *sd, int cpu_busy, unsigned long *next_balance)
6740{
6741 unsigned long interval, next;
6742
6743 interval = get_sd_balance_interval(sd, cpu_busy);
6744 next = sd->last_balance + interval;
6745
6746 if (time_after(*next_balance, next))
6747 *next_balance = next;
6748}
6749
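These two helpers centralize interval arithmetic that rebalance_domains() used to open-code further down: scale balance_interval by busy_factor when the CPU is busy, convert milliseconds to jiffies, clamp, and pull next_balance earlier when this domain is due sooner. A standalone model of that arithmetic; the HZ value and the clamp ceiling are invented stand-ins for msecs_to_jiffies() and max_load_balance_interval.

    #include <stdio.h>

    static unsigned long clamp_ul(unsigned long v, unsigned long lo, unsigned long hi)
    {
        return v < lo ? lo : (v > hi ? hi : v);
    }

    int main(void)
    {
        unsigned long hz = 250, max_interval = 250;    /* illustrative only */
        unsigned long balance_interval_ms = 64, busy_factor = 32;
        int cpu_busy = 1;

        unsigned long ms = balance_interval_ms * (cpu_busy ? busy_factor : 1);
        unsigned long interval = clamp_ul(ms * hz / 1000, 1, max_interval);

        printf("interval = %lu jiffies\n", interval);  /* 2048ms -> 512, clamped to 250 */
        return 0;
    }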
6643/* 6750/*
6644 * idle_balance is called by schedule() if this_cpu is about to become 6751 * idle_balance is called by schedule() if this_cpu is about to become
6645 * idle. Attempts to pull tasks from other CPUs. 6752 * idle. Attempts to pull tasks from other CPUs.
6646 */ 6753 */
6647static int idle_balance(struct rq *this_rq) 6754static int idle_balance(struct rq *this_rq)
6648{ 6755{
6756 unsigned long next_balance = jiffies + HZ;
6757 int this_cpu = this_rq->cpu;
6649 struct sched_domain *sd; 6758 struct sched_domain *sd;
6650 int pulled_task = 0; 6759 int pulled_task = 0;
6651 unsigned long next_balance = jiffies + HZ;
6652 u64 curr_cost = 0; 6760 u64 curr_cost = 0;
6653 int this_cpu = this_rq->cpu;
6654 6761
6655 idle_enter_fair(this_rq); 6762 idle_enter_fair(this_rq);
6763
6656 /* 6764 /*
6657 * We must set idle_stamp _before_ calling idle_balance(), such that we 6765 * We must set idle_stamp _before_ calling idle_balance(), such that we
6658 * measure the duration of idle_balance() as idle time. 6766 * measure the duration of idle_balance() as idle time.
6659 */ 6767 */
6660 this_rq->idle_stamp = rq_clock(this_rq); 6768 this_rq->idle_stamp = rq_clock(this_rq);
6661 6769
6662 if (this_rq->avg_idle < sysctl_sched_migration_cost) 6770 if (this_rq->avg_idle < sysctl_sched_migration_cost) {
6771 rcu_read_lock();
6772 sd = rcu_dereference_check_sched_domain(this_rq->sd);
6773 if (sd)
6774 update_next_balance(sd, 0, &next_balance);
6775 rcu_read_unlock();
6776
6663 goto out; 6777 goto out;
6778 }
6664 6779
6665 /* 6780 /*
6666 * Drop the rq->lock, but keep IRQ/preempt disabled. 6781 * Drop the rq->lock, but keep IRQ/preempt disabled.
@@ -6670,20 +6785,20 @@ static int idle_balance(struct rq *this_rq)
6670 update_blocked_averages(this_cpu); 6785 update_blocked_averages(this_cpu);
6671 rcu_read_lock(); 6786 rcu_read_lock();
6672 for_each_domain(this_cpu, sd) { 6787 for_each_domain(this_cpu, sd) {
6673 unsigned long interval;
6674 int continue_balancing = 1; 6788 int continue_balancing = 1;
6675 u64 t0, domain_cost; 6789 u64 t0, domain_cost;
6676 6790
6677 if (!(sd->flags & SD_LOAD_BALANCE)) 6791 if (!(sd->flags & SD_LOAD_BALANCE))
6678 continue; 6792 continue;
6679 6793
6680 if (this_rq->avg_idle < curr_cost + sd->max_newidle_lb_cost) 6794 if (this_rq->avg_idle < curr_cost + sd->max_newidle_lb_cost) {
6795 update_next_balance(sd, 0, &next_balance);
6681 break; 6796 break;
6797 }
6682 6798
6683 if (sd->flags & SD_BALANCE_NEWIDLE) { 6799 if (sd->flags & SD_BALANCE_NEWIDLE) {
6684 t0 = sched_clock_cpu(this_cpu); 6800 t0 = sched_clock_cpu(this_cpu);
6685 6801
6686 /* If we've pulled tasks over stop searching: */
6687 pulled_task = load_balance(this_cpu, this_rq, 6802 pulled_task = load_balance(this_cpu, this_rq,
6688 sd, CPU_NEWLY_IDLE, 6803 sd, CPU_NEWLY_IDLE,
6689 &continue_balancing); 6804 &continue_balancing);
@@ -6695,42 +6810,37 @@ static int idle_balance(struct rq *this_rq)
6695 curr_cost += domain_cost; 6810 curr_cost += domain_cost;
6696 } 6811 }
6697 6812
6698 interval = msecs_to_jiffies(sd->balance_interval); 6813 update_next_balance(sd, 0, &next_balance);
6699 if (time_after(next_balance, sd->last_balance + interval)) 6814
6700 next_balance = sd->last_balance + interval; 6815 /*
6701 if (pulled_task) 6816 * Stop searching for tasks to pull if there are
6817 * now runnable tasks on this rq.
6818 */
6819 if (pulled_task || this_rq->nr_running > 0)
6702 break; 6820 break;
6703 } 6821 }
6704 rcu_read_unlock(); 6822 rcu_read_unlock();
6705 6823
6706 raw_spin_lock(&this_rq->lock); 6824 raw_spin_lock(&this_rq->lock);
6707 6825
6826 if (curr_cost > this_rq->max_idle_balance_cost)
6827 this_rq->max_idle_balance_cost = curr_cost;
6828
6708 /* 6829 /*
6709 * While browsing the domains, we released the rq lock. 6830 * While browsing the domains, we released the rq lock, a task could
6710 * A task could have be enqueued in the meantime 6831 * have been enqueued in the meantime. Since we're not going idle,
6832 * pretend we pulled a task.
6711 */ 6833 */
6712 if (this_rq->cfs.h_nr_running && !pulled_task) { 6834 if (this_rq->cfs.h_nr_running && !pulled_task)
6713 pulled_task = 1; 6835 pulled_task = 1;
6714 goto out;
6715 }
6716 6836
6717 if (pulled_task || time_after(jiffies, this_rq->next_balance)) { 6837out:
6718 /* 6838 /* Move the next balance forward */
6719 * We are going idle. next_balance may be set based on 6839 if (time_after(this_rq->next_balance, next_balance))
6720 * a busy processor. So reset next_balance.
6721 */
6722 this_rq->next_balance = next_balance; 6840 this_rq->next_balance = next_balance;
6723 }
6724 6841
6725 if (curr_cost > this_rq->max_idle_balance_cost)
6726 this_rq->max_idle_balance_cost = curr_cost;
6727
6728out:
6729 /* Is there a task of a high priority class? */ 6842 /* Is there a task of a high priority class? */
6730 if (this_rq->nr_running != this_rq->cfs.h_nr_running && 6843 if (this_rq->nr_running != this_rq->cfs.h_nr_running)
6731 ((this_rq->stop && this_rq->stop->on_rq) ||
6732 this_rq->dl.dl_nr_running ||
6733 (this_rq->rt.rt_nr_running && !rt_rq_throttled(&this_rq->rt))))
6734 pulled_task = -1; 6844 pulled_task = -1;
6735 6845
6736 if (pulled_task) { 6846 if (pulled_task) {
@@ -6891,7 +7001,7 @@ static inline void set_cpu_sd_state_busy(void)
6891 goto unlock; 7001 goto unlock;
6892 sd->nohz_idle = 0; 7002 sd->nohz_idle = 0;
6893 7003
6894 atomic_inc(&sd->groups->sgp->nr_busy_cpus); 7004 atomic_inc(&sd->groups->sgc->nr_busy_cpus);
6895unlock: 7005unlock:
6896 rcu_read_unlock(); 7006 rcu_read_unlock();
6897} 7007}
@@ -6908,7 +7018,7 @@ void set_cpu_sd_state_idle(void)
6908 goto unlock; 7018 goto unlock;
6909 sd->nohz_idle = 1; 7019 sd->nohz_idle = 1;
6910 7020
6911 atomic_dec(&sd->groups->sgp->nr_busy_cpus); 7021 atomic_dec(&sd->groups->sgc->nr_busy_cpus);
6912unlock: 7022unlock:
6913 rcu_read_unlock(); 7023 rcu_read_unlock();
6914} 7024}
@@ -7011,16 +7121,9 @@ static void rebalance_domains(struct rq *rq, enum cpu_idle_type idle)
7011 break; 7121 break;
7012 } 7122 }
7013 7123
7014 interval = sd->balance_interval; 7124 interval = get_sd_balance_interval(sd, idle != CPU_IDLE);
7015 if (idle != CPU_IDLE)
7016 interval *= sd->busy_factor;
7017
7018 /* scale ms to jiffies */
7019 interval = msecs_to_jiffies(interval);
7020 interval = clamp(interval, 1UL, max_load_balance_interval);
7021 7125
7022 need_serialize = sd->flags & SD_SERIALIZE; 7126 need_serialize = sd->flags & SD_SERIALIZE;
7023
7024 if (need_serialize) { 7127 if (need_serialize) {
7025 if (!spin_trylock(&balancing)) 7128 if (!spin_trylock(&balancing))
7026 goto out; 7129 goto out;
@@ -7036,6 +7139,7 @@ static void rebalance_domains(struct rq *rq, enum cpu_idle_type idle)
7036 idle = idle_cpu(cpu) ? CPU_IDLE : CPU_NOT_IDLE; 7139 idle = idle_cpu(cpu) ? CPU_IDLE : CPU_NOT_IDLE;
7037 } 7140 }
7038 sd->last_balance = jiffies; 7141 sd->last_balance = jiffies;
7142 interval = get_sd_balance_interval(sd, idle != CPU_IDLE);
7039 } 7143 }
7040 if (need_serialize) 7144 if (need_serialize)
7041 spin_unlock(&balancing); 7145 spin_unlock(&balancing);
@@ -7093,12 +7197,17 @@ static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
7093 7197
7094 rq = cpu_rq(balance_cpu); 7198 rq = cpu_rq(balance_cpu);
7095 7199
7096 raw_spin_lock_irq(&rq->lock); 7200 /*
7097 update_rq_clock(rq); 7201 * If time for next balance is due,
7098 update_idle_cpu_load(rq); 7202 * do the balance.
7099 raw_spin_unlock_irq(&rq->lock); 7203 */
7100 7204 if (time_after_eq(jiffies, rq->next_balance)) {
7101 rebalance_domains(rq, CPU_IDLE); 7205 raw_spin_lock_irq(&rq->lock);
7206 update_rq_clock(rq);
7207 update_idle_cpu_load(rq);
7208 raw_spin_unlock_irq(&rq->lock);
7209 rebalance_domains(rq, CPU_IDLE);
7210 }
7102 7211
7103 if (time_after(this_rq->next_balance, rq->next_balance)) 7212 if (time_after(this_rq->next_balance, rq->next_balance))
7104 this_rq->next_balance = rq->next_balance; 7213 this_rq->next_balance = rq->next_balance;
@@ -7113,7 +7222,7 @@ end:
7113 * of an idle cpu is the system. 7222 * of an idle cpu is the system.
7114 * - This rq has more than one task. 7223 * - This rq has more than one task.
7115 * - At any scheduler domain level, this cpu's scheduler group has multiple 7224 * - At any scheduler domain level, this cpu's scheduler group has multiple
7116 * busy cpu's exceeding the group's power. 7225 * busy cpu's exceeding the group's capacity.
7117 * - For SD_ASYM_PACKING, if the lower numbered cpu's in the scheduler 7226 * - For SD_ASYM_PACKING, if the lower numbered cpu's in the scheduler
7118 * domain span are idle. 7227 * domain span are idle.
7119 */ 7228 */
@@ -7121,7 +7230,7 @@ static inline int nohz_kick_needed(struct rq *rq)
7121{ 7230{
7122 unsigned long now = jiffies; 7231 unsigned long now = jiffies;
7123 struct sched_domain *sd; 7232 struct sched_domain *sd;
7124 struct sched_group_power *sgp; 7233 struct sched_group_capacity *sgc;
7125 int nr_busy, cpu = rq->cpu; 7234 int nr_busy, cpu = rq->cpu;
7126 7235
7127 if (unlikely(rq->idle_balance)) 7236 if (unlikely(rq->idle_balance))
@@ -7151,8 +7260,8 @@ static inline int nohz_kick_needed(struct rq *rq)
7151 sd = rcu_dereference(per_cpu(sd_busy, cpu)); 7260 sd = rcu_dereference(per_cpu(sd_busy, cpu));
7152 7261
7153 if (sd) { 7262 if (sd) {
7154 sgp = sd->groups->sgp; 7263 sgc = sd->groups->sgc;
7155 nr_busy = atomic_read(&sgp->nr_busy_cpus); 7264 nr_busy = atomic_read(&sgc->nr_busy_cpus);
7156 7265
7157 if (nr_busy > 1) 7266 if (nr_busy > 1)
7158 goto need_kick_unlock; 7267 goto need_kick_unlock;
diff --git a/kernel/sched/features.h b/kernel/sched/features.h
index 5716929a2e3a..90284d117fe6 100644
--- a/kernel/sched/features.h
+++ b/kernel/sched/features.h
@@ -37,18 +37,18 @@ SCHED_FEAT(CACHE_HOT_BUDDY, true)
37SCHED_FEAT(WAKEUP_PREEMPTION, true) 37SCHED_FEAT(WAKEUP_PREEMPTION, true)
38 38
39/* 39/*
40 * Use arch dependent cpu power functions 40 * Use arch dependent cpu capacity functions
41 */ 41 */
42SCHED_FEAT(ARCH_POWER, true) 42SCHED_FEAT(ARCH_CAPACITY, true)
43 43
44SCHED_FEAT(HRTICK, false) 44SCHED_FEAT(HRTICK, false)
45SCHED_FEAT(DOUBLE_TICK, false) 45SCHED_FEAT(DOUBLE_TICK, false)
46SCHED_FEAT(LB_BIAS, true) 46SCHED_FEAT(LB_BIAS, true)
47 47
48/* 48/*
49 * Decrement CPU power based on time not spent running tasks 49 * Decrement CPU capacity based on time not spent running tasks
50 */ 50 */
51SCHED_FEAT(NONTASK_POWER, true) 51SCHED_FEAT(NONTASK_CAPACITY, true)
52 52
53/* 53/*
54 * Queue remote wakeups on the target CPU and process them 54 * Queue remote wakeups on the target CPU and process them
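The renamed bits are consumed the same way as before, through sched_feat(); a hedged sketch of the call-site pattern, where both helper names are placeholders rather than real kernel symbols:

    if (sched_feat(ARCH_CAPACITY))
        use_arch_cpu_capacity_hook();      /* placeholder: arch-provided scaling */
    else
        use_default_capacity_scaling();    /* placeholder: generic scaling       */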
diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c
index 8f4390a079c7..cf009fb0bc25 100644
--- a/kernel/sched/idle.c
+++ b/kernel/sched/idle.c
@@ -12,6 +12,8 @@
12 12
13#include <trace/events/power.h> 13#include <trace/events/power.h>
14 14
15#include "sched.h"
16
15static int __read_mostly cpu_idle_force_poll; 17static int __read_mostly cpu_idle_force_poll;
16 18
17void cpu_idle_poll_ctrl(bool enable) 19void cpu_idle_poll_ctrl(bool enable)
@@ -67,24 +69,25 @@ void __weak arch_cpu_idle(void)
67 * cpuidle_idle_call - the main idle function 69 * cpuidle_idle_call - the main idle function
68 * 70 *
69 * NOTE: no locks or semaphores should be used here 71 * NOTE: no locks or semaphores should be used here
70 * return non-zero on failure 72 *
73 * On archs that support TIF_POLLING_NRFLAG, is called with polling
74 * set, and it returns with polling set. If it ever stops polling, it
75 * must clear the polling bit.
71 */ 76 */
72static int cpuidle_idle_call(void) 77static void cpuidle_idle_call(void)
73{ 78{
74 struct cpuidle_device *dev = __this_cpu_read(cpuidle_devices); 79 struct cpuidle_device *dev = __this_cpu_read(cpuidle_devices);
75 struct cpuidle_driver *drv = cpuidle_get_cpu_driver(dev); 80 struct cpuidle_driver *drv = cpuidle_get_cpu_driver(dev);
76 int next_state, entered_state, ret; 81 int next_state, entered_state;
77 bool broadcast; 82 bool broadcast;
78 83
79 /* 84 /*
80 * Check if the idle task must be rescheduled. If it is the 85 * Check if the idle task must be rescheduled. If it is the
81 * case, exit the function after re-enabling the local irq and 86 * case, exit the function after re-enabling the local irq.
82 * set again the polling flag
83 */ 87 */
84 if (current_clr_polling_and_test()) { 88 if (need_resched()) {
85 local_irq_enable(); 89 local_irq_enable();
86 __current_set_polling(); 90 return;
87 return 0;
88 } 91 }
89 92
90 /* 93 /*
@@ -101,104 +104,99 @@ static int cpuidle_idle_call(void)
101 rcu_idle_enter(); 104 rcu_idle_enter();
102 105
103 /* 106 /*
104 * Check if the cpuidle framework is ready, otherwise fallback 107 * Ask the cpuidle framework to choose a convenient idle state.
105 * to the default arch specific idle method 108 * Fall back to the default arch idle method on errors.
106 */ 109 */
107 ret = cpuidle_enabled(drv, dev); 110 next_state = cpuidle_select(drv, dev);
108 111 if (next_state < 0) {
109 if (!ret) { 112use_default:
110 /* 113 /*
111 * Ask the governor to choose an idle state it thinks 114 * We can't use the cpuidle framework, let's use the default
112 * it is convenient to go to. There is *always* a 115 * idle routine.
113 * convenient idle state
114 */ 116 */
115 next_state = cpuidle_select(drv, dev); 117 if (current_clr_polling_and_test())
116
117 /*
118 * The idle task must be scheduled, it is pointless to
119 * go to idle, just update no idle residency and get
120 * out of this function
121 */
122 if (current_clr_polling_and_test()) {
123 dev->last_residency = 0;
124 entered_state = next_state;
125 local_irq_enable(); 118 local_irq_enable();
126 } else { 119 else
127 broadcast = !!(drv->states[next_state].flags & 120 arch_cpu_idle();
128 CPUIDLE_FLAG_TIMER_STOP); 121
129 122 goto exit_idle;
130 if (broadcast)
131 /*
132 * Tell the time framework to switch
133 * to a broadcast timer because our
134 * local timer will be shutdown. If a
135 * local timer is used from another
136 * cpu as a broadcast timer, this call
137 * may fail if it is not available
138 */
139 ret = clockevents_notify(
140 CLOCK_EVT_NOTIFY_BROADCAST_ENTER,
141 &dev->cpu);
142
143 if (!ret) {
144 trace_cpu_idle_rcuidle(next_state, dev->cpu);
145
146 /*
147 * Enter the idle state previously
148 * returned by the governor
149 * decision. This function will block
150 * until an interrupt occurs and will
151 * take care of re-enabling the local
152 * interrupts
153 */
154 entered_state = cpuidle_enter(drv, dev,
155 next_state);
156
157 trace_cpu_idle_rcuidle(PWR_EVENT_EXIT,
158 dev->cpu);
159
160 if (broadcast)
161 clockevents_notify(
162 CLOCK_EVT_NOTIFY_BROADCAST_EXIT,
163 &dev->cpu);
164
165 /*
166 * Give the governor an opportunity to reflect on the
167 * outcome
168 */
169 cpuidle_reflect(dev, entered_state);
170 }
171 }
172 } 123 }
173 124
125
126 /*
127 * The idle task must be scheduled, it is pointless to
128 * go to idle, just update no idle residency and get
129 * out of this function
130 */
131 if (current_clr_polling_and_test()) {
132 dev->last_residency = 0;
133 entered_state = next_state;
134 local_irq_enable();
135 goto exit_idle;
136 }
137
138 broadcast = !!(drv->states[next_state].flags & CPUIDLE_FLAG_TIMER_STOP);
139
174 /* 140 /*
175 * We can't use the cpuidle framework, let's use the default 141 * Tell the time framework to switch to a broadcast timer
176 * idle routine 142 * because our local timer will be shutdown. If a local timer
143 * is used from another cpu as a broadcast timer, this call may
144 * fail if it is not available
177 */ 145 */
178 if (ret) 146 if (broadcast &&
179 arch_cpu_idle(); 147 clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_ENTER, &dev->cpu))
148 goto use_default;
180 149
150 trace_cpu_idle_rcuidle(next_state, dev->cpu);
151
152 /*
153 * Enter the idle state previously returned by the governor decision.
154 * This function will block until an interrupt occurs and will take
155 * care of re-enabling the local interrupts
156 */
157 entered_state = cpuidle_enter(drv, dev, next_state);
158
159 trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, dev->cpu);
160
161 if (broadcast)
162 clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_EXIT, &dev->cpu);
163
164 /*
165 * Give the governor an opportunity to reflect on the outcome
166 */
167 cpuidle_reflect(dev, entered_state);
168
169exit_idle:
181 __current_set_polling(); 170 __current_set_polling();
182 171
183 /* 172 /*
184 * It is up to the idle functions to enable back the local 173 * It is up to the idle functions to reenable local interrupts
185 * interrupt
186 */ 174 */
187 if (WARN_ON_ONCE(irqs_disabled())) 175 if (WARN_ON_ONCE(irqs_disabled()))
188 local_irq_enable(); 176 local_irq_enable();
189 177
190 rcu_idle_exit(); 178 rcu_idle_exit();
191 start_critical_timings(); 179 start_critical_timings();
192
193 return 0;
194} 180}
195 181
196/* 182/*
197 * Generic idle loop implementation 183 * Generic idle loop implementation
184 *
185 * Called with polling cleared.
198 */ 186 */
199static void cpu_idle_loop(void) 187static void cpu_idle_loop(void)
200{ 188{
201 while (1) { 189 while (1) {
190 /*
191 * If the arch has a polling bit, we maintain an invariant:
192 *
193 * Our polling bit is clear if we're not scheduled (i.e. if
194 * rq->curr != rq->idle). This means that, if rq->idle has
195 * the polling bit set, then setting need_resched is
196 * guaranteed to cause the cpu to reschedule.
197 */
198
199 __current_set_polling();
202 tick_nohz_idle_enter(); 200 tick_nohz_idle_enter();
203 201
204 while (!need_resched()) { 202 while (!need_resched()) {
@@ -238,6 +236,17 @@ static void cpu_idle_loop(void)
238 */ 236 */
239 preempt_set_need_resched(); 237 preempt_set_need_resched();
240 tick_nohz_idle_exit(); 238 tick_nohz_idle_exit();
239 __current_clr_polling();
240
241 /*
242 * We promise to call sched_ttwu_pending and reschedule
243 * if need_resched is set while polling is set. That
244 * means that clearing polling needs to be visible
245 * before doing these things.
246 */
247 smp_mb__after_atomic();
248
249 sched_ttwu_pending();
241 schedule_preempt_disabled(); 250 schedule_preempt_disabled();
242 } 251 }
243} 252}
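The sequence added to the loop tail encodes an ordering requirement: the cleared polling bit must be globally visible before this CPU scans for remotely queued wakeups, otherwise a waker that still observed polling could skip its IPI while the idle CPU misses the freshly queued task. A sketch of the two sides of that protocol; the idle side is taken from the hunk above, the waker side is a paraphrase rather than a copy of core.c.

    /* idle CPU, tail of cpu_idle_loop() */
    __current_clr_polling();
    smp_mb__after_atomic();       /* publish the clear before the check below   */
    sched_ttwu_pending();         /* drain wakeups queued while we were polling */
    schedule_preempt_disabled();

    /* waking CPU (paraphrase): queue the task on the remote wake list, then
     * either rely on the still-set polling bit plus set_tsk_need_resched(),
     * or send the reschedule IPI if polling is already clear. */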
@@ -259,7 +268,6 @@ void cpu_startup_entry(enum cpuhp_state state)
259 */ 268 */
260 boot_init_stack_canary(); 269 boot_init_stack_canary();
261#endif 270#endif
262 __current_set_polling();
263 arch_cpu_idle_prepare(); 271 arch_cpu_idle_prepare();
264 cpu_idle_loop(); 272 cpu_idle_loop();
265} 273}
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index bd2267ad404f..a49083192c64 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -79,6 +79,8 @@ void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq)
79 rt_rq->overloaded = 0; 79 rt_rq->overloaded = 0;
80 plist_head_init(&rt_rq->pushable_tasks); 80 plist_head_init(&rt_rq->pushable_tasks);
81#endif 81#endif
82 /* We start in dequeued state, because no RT tasks are queued */
83 rt_rq->rt_queued = 0;
82 84
83 rt_rq->rt_time = 0; 85 rt_rq->rt_time = 0;
84 rt_rq->rt_throttled = 0; 86 rt_rq->rt_throttled = 0;
@@ -112,6 +114,13 @@ static inline struct rt_rq *rt_rq_of_se(struct sched_rt_entity *rt_se)
112 return rt_se->rt_rq; 114 return rt_se->rt_rq;
113} 115}
114 116
117static inline struct rq *rq_of_rt_se(struct sched_rt_entity *rt_se)
118{
119 struct rt_rq *rt_rq = rt_se->rt_rq;
120
121 return rt_rq->rq;
122}
123
115void free_rt_sched_group(struct task_group *tg) 124void free_rt_sched_group(struct task_group *tg)
116{ 125{
117 int i; 126 int i;
@@ -211,10 +220,16 @@ static inline struct rq *rq_of_rt_rq(struct rt_rq *rt_rq)
211 return container_of(rt_rq, struct rq, rt); 220 return container_of(rt_rq, struct rq, rt);
212} 221}
213 222
214static inline struct rt_rq *rt_rq_of_se(struct sched_rt_entity *rt_se) 223static inline struct rq *rq_of_rt_se(struct sched_rt_entity *rt_se)
215{ 224{
216 struct task_struct *p = rt_task_of(rt_se); 225 struct task_struct *p = rt_task_of(rt_se);
217 struct rq *rq = task_rq(p); 226
227 return task_rq(p);
228}
229
230static inline struct rt_rq *rt_rq_of_se(struct sched_rt_entity *rt_se)
231{
232 struct rq *rq = rq_of_rt_se(rt_se);
218 233
219 return &rq->rt; 234 return &rq->rt;
220} 235}
@@ -391,6 +406,9 @@ static inline void set_post_schedule(struct rq *rq)
391} 406}
392#endif /* CONFIG_SMP */ 407#endif /* CONFIG_SMP */
393 408
409static void enqueue_top_rt_rq(struct rt_rq *rt_rq);
410static void dequeue_top_rt_rq(struct rt_rq *rt_rq);
411
394static inline int on_rt_rq(struct sched_rt_entity *rt_se) 412static inline int on_rt_rq(struct sched_rt_entity *rt_se)
395{ 413{
396 return !list_empty(&rt_se->run_list); 414 return !list_empty(&rt_se->run_list);
@@ -452,8 +470,11 @@ static void sched_rt_rq_enqueue(struct rt_rq *rt_rq)
452 rt_se = rt_rq->tg->rt_se[cpu]; 470 rt_se = rt_rq->tg->rt_se[cpu];
453 471
454 if (rt_rq->rt_nr_running) { 472 if (rt_rq->rt_nr_running) {
455 if (rt_se && !on_rt_rq(rt_se)) 473 if (!rt_se)
474 enqueue_top_rt_rq(rt_rq);
475 else if (!on_rt_rq(rt_se))
456 enqueue_rt_entity(rt_se, false); 476 enqueue_rt_entity(rt_se, false);
477
457 if (rt_rq->highest_prio.curr < curr->prio) 478 if (rt_rq->highest_prio.curr < curr->prio)
458 resched_task(curr); 479 resched_task(curr);
459 } 480 }
@@ -466,10 +487,17 @@ static void sched_rt_rq_dequeue(struct rt_rq *rt_rq)
466 487
467 rt_se = rt_rq->tg->rt_se[cpu]; 488 rt_se = rt_rq->tg->rt_se[cpu];
468 489
469 if (rt_se && on_rt_rq(rt_se)) 490 if (!rt_se)
491 dequeue_top_rt_rq(rt_rq);
492 else if (on_rt_rq(rt_se))
470 dequeue_rt_entity(rt_se); 493 dequeue_rt_entity(rt_se);
471} 494}
472 495
496static inline int rt_rq_throttled(struct rt_rq *rt_rq)
497{
498 return rt_rq->rt_throttled && !rt_rq->rt_nr_boosted;
499}
500
473static int rt_se_boosted(struct sched_rt_entity *rt_se) 501static int rt_se_boosted(struct sched_rt_entity *rt_se)
474{ 502{
475 struct rt_rq *rt_rq = group_rt_rq(rt_se); 503 struct rt_rq *rt_rq = group_rt_rq(rt_se);
@@ -532,12 +560,23 @@ static inline struct rt_rq *group_rt_rq(struct sched_rt_entity *rt_se)
532 560
533static inline void sched_rt_rq_enqueue(struct rt_rq *rt_rq) 561static inline void sched_rt_rq_enqueue(struct rt_rq *rt_rq)
534{ 562{
535 if (rt_rq->rt_nr_running) 563 struct rq *rq = rq_of_rt_rq(rt_rq);
536 resched_task(rq_of_rt_rq(rt_rq)->curr); 564
565 if (!rt_rq->rt_nr_running)
566 return;
567
568 enqueue_top_rt_rq(rt_rq);
569 resched_task(rq->curr);
537} 570}
538 571
539static inline void sched_rt_rq_dequeue(struct rt_rq *rt_rq) 572static inline void sched_rt_rq_dequeue(struct rt_rq *rt_rq)
540{ 573{
574 dequeue_top_rt_rq(rt_rq);
575}
576
577static inline int rt_rq_throttled(struct rt_rq *rt_rq)
578{
579 return rt_rq->rt_throttled;
541} 580}
542 581
543static inline const struct cpumask *sched_rt_period_mask(void) 582static inline const struct cpumask *sched_rt_period_mask(void)
@@ -851,14 +890,8 @@ static int sched_rt_runtime_exceeded(struct rt_rq *rt_rq)
851 * but accrue some time due to boosting. 890 * but accrue some time due to boosting.
852 */ 891 */
853 if (likely(rt_b->rt_runtime)) { 892 if (likely(rt_b->rt_runtime)) {
854 static bool once = false;
855
856 rt_rq->rt_throttled = 1; 893 rt_rq->rt_throttled = 1;
857 894 printk_deferred_once("sched: RT throttling activated\n");
858 if (!once) {
859 once = true;
860 printk_sched("sched: RT throttling activated\n");
861 }
862 } else { 895 } else {
863 /* 896 /*
864 * In case we did anyway, make it go away, 897 * In case we did anyway, make it go away,
@@ -885,7 +918,6 @@ static void update_curr_rt(struct rq *rq)
885{ 918{
886 struct task_struct *curr = rq->curr; 919 struct task_struct *curr = rq->curr;
887 struct sched_rt_entity *rt_se = &curr->rt; 920 struct sched_rt_entity *rt_se = &curr->rt;
888 struct rt_rq *rt_rq = rt_rq_of_se(rt_se);
889 u64 delta_exec; 921 u64 delta_exec;
890 922
891 if (curr->sched_class != &rt_sched_class) 923 if (curr->sched_class != &rt_sched_class)
@@ -910,7 +942,7 @@ static void update_curr_rt(struct rq *rq)
910 return; 942 return;
911 943
912 for_each_sched_rt_entity(rt_se) { 944 for_each_sched_rt_entity(rt_se) {
913 rt_rq = rt_rq_of_se(rt_se); 945 struct rt_rq *rt_rq = rt_rq_of_se(rt_se);
914 946
915 if (sched_rt_runtime(rt_rq) != RUNTIME_INF) { 947 if (sched_rt_runtime(rt_rq) != RUNTIME_INF) {
916 raw_spin_lock(&rt_rq->rt_runtime_lock); 948 raw_spin_lock(&rt_rq->rt_runtime_lock);
@@ -922,6 +954,38 @@ static void update_curr_rt(struct rq *rq)
922 } 954 }
923} 955}
924 956
957static void
958dequeue_top_rt_rq(struct rt_rq *rt_rq)
959{
960 struct rq *rq = rq_of_rt_rq(rt_rq);
961
962 BUG_ON(&rq->rt != rt_rq);
963
964 if (!rt_rq->rt_queued)
965 return;
966
967 BUG_ON(!rq->nr_running);
968
969 sub_nr_running(rq, rt_rq->rt_nr_running);
970 rt_rq->rt_queued = 0;
971}
972
973static void
974enqueue_top_rt_rq(struct rt_rq *rt_rq)
975{
976 struct rq *rq = rq_of_rt_rq(rt_rq);
977
978 BUG_ON(&rq->rt != rt_rq);
979
980 if (rt_rq->rt_queued)
981 return;
982 if (rt_rq_throttled(rt_rq) || !rt_rq->rt_nr_running)
983 return;
984
985 add_nr_running(rq, rt_rq->rt_nr_running);
986 rt_rq->rt_queued = 1;
987}
988
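With rt_queued in place, RT tasks reach rq->nr_running only through this helper pair, and they do so in bulk via add_nr_running()/sub_nr_running() rather than one increment per task. A toy model of the gating, with all structures simplified stand-ins for the real ones:

    struct toy_rt_rq { unsigned int rt_nr_running; int rt_queued, rt_throttled; };
    struct toy_rq    { unsigned int nr_running; struct toy_rt_rq rt; };

    static void toy_enqueue_top_rt_rq(struct toy_rq *rq)
    {
        struct toy_rt_rq *rt_rq = &rq->rt;

        if (rt_rq->rt_queued || rt_rq->rt_throttled || !rt_rq->rt_nr_running)
            return;
        rq->nr_running += rt_rq->rt_nr_running;   /* add_nr_running() above */
        rt_rq->rt_queued = 1;
    }

    static void toy_dequeue_top_rt_rq(struct toy_rq *rq)
    {
        struct toy_rt_rq *rt_rq = &rq->rt;

        if (!rt_rq->rt_queued)
            return;
        rq->nr_running -= rt_rq->rt_nr_running;   /* sub_nr_running() above */
        rt_rq->rt_queued = 0;
    }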
925#if defined CONFIG_SMP 989#if defined CONFIG_SMP
926 990
927static void 991static void
@@ -1045,12 +1109,23 @@ void dec_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) {}
1045#endif /* CONFIG_RT_GROUP_SCHED */ 1109#endif /* CONFIG_RT_GROUP_SCHED */
1046 1110
1047static inline 1111static inline
1112unsigned int rt_se_nr_running(struct sched_rt_entity *rt_se)
1113{
1114 struct rt_rq *group_rq = group_rt_rq(rt_se);
1115
1116 if (group_rq)
1117 return group_rq->rt_nr_running;
1118 else
1119 return 1;
1120}
1121
1122static inline
1048void inc_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) 1123void inc_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
1049{ 1124{
1050 int prio = rt_se_prio(rt_se); 1125 int prio = rt_se_prio(rt_se);
1051 1126
1052 WARN_ON(!rt_prio(prio)); 1127 WARN_ON(!rt_prio(prio));
1053 rt_rq->rt_nr_running++; 1128 rt_rq->rt_nr_running += rt_se_nr_running(rt_se);
1054 1129
1055 inc_rt_prio(rt_rq, prio); 1130 inc_rt_prio(rt_rq, prio);
1056 inc_rt_migration(rt_se, rt_rq); 1131 inc_rt_migration(rt_se, rt_rq);
@@ -1062,7 +1137,7 @@ void dec_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
1062{ 1137{
1063 WARN_ON(!rt_prio(rt_se_prio(rt_se))); 1138 WARN_ON(!rt_prio(rt_se_prio(rt_se)));
1064 WARN_ON(!rt_rq->rt_nr_running); 1139 WARN_ON(!rt_rq->rt_nr_running);
1065 rt_rq->rt_nr_running--; 1140 rt_rq->rt_nr_running -= rt_se_nr_running(rt_se);
1066 1141
1067 dec_rt_prio(rt_rq, rt_se_prio(rt_se)); 1142 dec_rt_prio(rt_rq, rt_se_prio(rt_se));
1068 dec_rt_migration(rt_se, rt_rq); 1143 dec_rt_migration(rt_se, rt_rq);
@@ -1119,6 +1194,8 @@ static void dequeue_rt_stack(struct sched_rt_entity *rt_se)
1119 back = rt_se; 1194 back = rt_se;
1120 } 1195 }
1121 1196
1197 dequeue_top_rt_rq(rt_rq_of_se(back));
1198
1122 for (rt_se = back; rt_se; rt_se = rt_se->back) { 1199 for (rt_se = back; rt_se; rt_se = rt_se->back) {
1123 if (on_rt_rq(rt_se)) 1200 if (on_rt_rq(rt_se))
1124 __dequeue_rt_entity(rt_se); 1201 __dequeue_rt_entity(rt_se);
@@ -1127,13 +1204,18 @@ static void dequeue_rt_stack(struct sched_rt_entity *rt_se)
1127 1204
1128static void enqueue_rt_entity(struct sched_rt_entity *rt_se, bool head) 1205static void enqueue_rt_entity(struct sched_rt_entity *rt_se, bool head)
1129{ 1206{
1207 struct rq *rq = rq_of_rt_se(rt_se);
1208
1130 dequeue_rt_stack(rt_se); 1209 dequeue_rt_stack(rt_se);
1131 for_each_sched_rt_entity(rt_se) 1210 for_each_sched_rt_entity(rt_se)
1132 __enqueue_rt_entity(rt_se, head); 1211 __enqueue_rt_entity(rt_se, head);
1212 enqueue_top_rt_rq(&rq->rt);
1133} 1213}
1134 1214
1135static void dequeue_rt_entity(struct sched_rt_entity *rt_se) 1215static void dequeue_rt_entity(struct sched_rt_entity *rt_se)
1136{ 1216{
1217 struct rq *rq = rq_of_rt_se(rt_se);
1218
1137 dequeue_rt_stack(rt_se); 1219 dequeue_rt_stack(rt_se);
1138 1220
1139 for_each_sched_rt_entity(rt_se) { 1221 for_each_sched_rt_entity(rt_se) {
@@ -1142,6 +1224,7 @@ static void dequeue_rt_entity(struct sched_rt_entity *rt_se)
1142 if (rt_rq && rt_rq->rt_nr_running) 1224 if (rt_rq && rt_rq->rt_nr_running)
1143 __enqueue_rt_entity(rt_se, false); 1225 __enqueue_rt_entity(rt_se, false);
1144 } 1226 }
1227 enqueue_top_rt_rq(&rq->rt);
1145} 1228}
1146 1229
1147/* 1230/*
@@ -1159,8 +1242,6 @@ enqueue_task_rt(struct rq *rq, struct task_struct *p, int flags)
1159 1242
1160 if (!task_current(rq, p) && p->nr_cpus_allowed > 1) 1243 if (!task_current(rq, p) && p->nr_cpus_allowed > 1)
1161 enqueue_pushable_task(rq, p); 1244 enqueue_pushable_task(rq, p);
1162
1163 inc_nr_running(rq);
1164} 1245}
1165 1246
1166static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int flags) 1247static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int flags)
@@ -1171,8 +1252,6 @@ static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int flags)
1171 dequeue_rt_entity(rt_se); 1252 dequeue_rt_entity(rt_se);
1172 1253
1173 dequeue_pushable_task(rq, p); 1254 dequeue_pushable_task(rq, p);
1174
1175 dec_nr_running(rq);
1176} 1255}
1177 1256
1178/* 1257/*
@@ -1377,10 +1456,7 @@ pick_next_task_rt(struct rq *rq, struct task_struct *prev)
1377 if (prev->sched_class == &rt_sched_class) 1456 if (prev->sched_class == &rt_sched_class)
1378 update_curr_rt(rq); 1457 update_curr_rt(rq);
1379 1458
1380 if (!rt_rq->rt_nr_running) 1459 if (!rt_rq->rt_queued)
1381 return NULL;
1382
1383 if (rt_rq_throttled(rt_rq))
1384 return NULL; 1460 return NULL;
1385 1461
1386 put_prev_task(rq, prev); 1462 put_prev_task(rq, prev);
@@ -1892,9 +1968,9 @@ static void switched_to_rt(struct rq *rq, struct task_struct *p)
1892 */ 1968 */
1893 if (p->on_rq && rq->curr != p) { 1969 if (p->on_rq && rq->curr != p) {
1894#ifdef CONFIG_SMP 1970#ifdef CONFIG_SMP
1895 if (rq->rt.overloaded && push_rt_task(rq) && 1971 if (p->nr_cpus_allowed > 1 && rq->rt.overloaded &&
1896 /* Don't resched if we changed runqueues */ 1972 /* Don't resched if we changed runqueues */
1897 rq != task_rq(p)) 1973 push_rt_task(rq) && rq != task_rq(p))
1898 check_resched = 0; 1974 check_resched = 0;
1899#endif /* CONFIG_SMP */ 1975#endif /* CONFIG_SMP */
1900 if (check_resched && p->prio < rq->curr->prio) 1976 if (check_resched && p->prio < rq->curr->prio)
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 456e492a3dca..31cc02ebc54e 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -278,7 +278,7 @@ extern void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b);
278extern int sched_group_set_shares(struct task_group *tg, unsigned long shares); 278extern int sched_group_set_shares(struct task_group *tg, unsigned long shares);
279 279
280extern void __refill_cfs_bandwidth_runtime(struct cfs_bandwidth *cfs_b); 280extern void __refill_cfs_bandwidth_runtime(struct cfs_bandwidth *cfs_b);
281extern void __start_cfs_bandwidth(struct cfs_bandwidth *cfs_b); 281extern void __start_cfs_bandwidth(struct cfs_bandwidth *cfs_b, bool force);
282extern void unthrottle_cfs_rq(struct cfs_rq *cfs_rq); 282extern void unthrottle_cfs_rq(struct cfs_rq *cfs_rq);
283 283
284extern void free_rt_sched_group(struct task_group *tg); 284extern void free_rt_sched_group(struct task_group *tg);
@@ -409,6 +409,8 @@ struct rt_rq {
409 int overloaded; 409 int overloaded;
410 struct plist_head pushable_tasks; 410 struct plist_head pushable_tasks;
411#endif 411#endif
412 int rt_queued;
413
412 int rt_throttled; 414 int rt_throttled;
413 u64 rt_time; 415 u64 rt_time;
414 u64 rt_runtime; 416 u64 rt_runtime;
@@ -423,18 +425,6 @@ struct rt_rq {
423#endif 425#endif
424}; 426};
425 427
426#ifdef CONFIG_RT_GROUP_SCHED
427static inline int rt_rq_throttled(struct rt_rq *rt_rq)
428{
429 return rt_rq->rt_throttled && !rt_rq->rt_nr_boosted;
430}
431#else
432static inline int rt_rq_throttled(struct rt_rq *rt_rq)
433{
434 return rt_rq->rt_throttled;
435}
436#endif
437
438/* Deadline class' related fields in a runqueue */ 428/* Deadline class' related fields in a runqueue */
439struct dl_rq { 429struct dl_rq {
440 /* runqueue is an rbtree, ordered by deadline */ 430 /* runqueue is an rbtree, ordered by deadline */
@@ -577,7 +567,7 @@ struct rq {
577 struct root_domain *rd; 567 struct root_domain *rd;
578 struct sched_domain *sd; 568 struct sched_domain *sd;
579 569
580 unsigned long cpu_power; 570 unsigned long cpu_capacity;
581 571
582 unsigned char idle_balance; 572 unsigned char idle_balance;
583 /* For active balancing */ 573 /* For active balancing */
@@ -680,6 +670,8 @@ extern int migrate_swap(struct task_struct *, struct task_struct *);
680 670
681#ifdef CONFIG_SMP 671#ifdef CONFIG_SMP
682 672
673extern void sched_ttwu_pending(void);
674
683#define rcu_dereference_check_sched_domain(p) \ 675#define rcu_dereference_check_sched_domain(p) \
684 rcu_dereference_check((p), \ 676 rcu_dereference_check((p), \
685 lockdep_is_held(&sched_domains_mutex)) 677 lockdep_is_held(&sched_domains_mutex))
@@ -738,15 +730,15 @@ DECLARE_PER_CPU(struct sched_domain *, sd_numa);
738DECLARE_PER_CPU(struct sched_domain *, sd_busy); 730DECLARE_PER_CPU(struct sched_domain *, sd_busy);
739DECLARE_PER_CPU(struct sched_domain *, sd_asym); 731DECLARE_PER_CPU(struct sched_domain *, sd_asym);
740 732
741struct sched_group_power { 733struct sched_group_capacity {
742 atomic_t ref; 734 atomic_t ref;
743 /* 735 /*
744 * CPU power of this group, SCHED_LOAD_SCALE being max power for a 736 * CPU capacity of this group, SCHED_LOAD_SCALE being max capacity
745 * single CPU. 737 * for a single CPU.
746 */ 738 */
747 unsigned int power, power_orig; 739 unsigned int capacity, capacity_orig;
748 unsigned long next_update; 740 unsigned long next_update;
749 int imbalance; /* XXX unrelated to power but shared group state */ 741 int imbalance; /* XXX unrelated to capacity but shared group state */
750 /* 742 /*
751 * Number of busy cpus in this group. 743 * Number of busy cpus in this group.
752 */ 744 */
@@ -760,7 +752,7 @@ struct sched_group {
760 atomic_t ref; 752 atomic_t ref;
761 753
762 unsigned int group_weight; 754 unsigned int group_weight;
763 struct sched_group_power *sgp; 755 struct sched_group_capacity *sgc;
764 756
765 /* 757 /*
766 * The CPUs this group covers. 758 * The CPUs this group covers.
@@ -783,7 +775,7 @@ static inline struct cpumask *sched_group_cpus(struct sched_group *sg)
783 */ 775 */
784static inline struct cpumask *sched_group_mask(struct sched_group *sg) 776static inline struct cpumask *sched_group_mask(struct sched_group *sg)
785{ 777{
786 return to_cpumask(sg->sgp->cpumask); 778 return to_cpumask(sg->sgc->cpumask);
787} 779}
788 780
789/** 781/**
@@ -797,6 +789,10 @@ static inline unsigned int group_first_cpu(struct sched_group *group)
797 789
798extern int group_balance_cpu(struct sched_group *sg); 790extern int group_balance_cpu(struct sched_group *sg);
799 791
792#else
793
794static inline void sched_ttwu_pending(void) { }
795
800#endif /* CONFIG_SMP */ 796#endif /* CONFIG_SMP */
801 797
802#include "stats.h" 798#include "stats.h"
@@ -1177,7 +1173,7 @@ extern const struct sched_class idle_sched_class;
1177 1173
1178#ifdef CONFIG_SMP 1174#ifdef CONFIG_SMP
1179 1175
1180extern void update_group_power(struct sched_domain *sd, int cpu); 1176extern void update_group_capacity(struct sched_domain *sd, int cpu);
1181 1177
1182extern void trigger_load_balance(struct rq *rq); 1178extern void trigger_load_balance(struct rq *rq);
1183 1179
@@ -1216,12 +1212,14 @@ extern void update_idle_cpu_load(struct rq *this_rq);
1216 1212
1217extern void init_task_runnable_average(struct task_struct *p); 1213extern void init_task_runnable_average(struct task_struct *p);
1218 1214
1219static inline void inc_nr_running(struct rq *rq) 1215static inline void add_nr_running(struct rq *rq, unsigned count)
1220{ 1216{
1221 rq->nr_running++; 1217 unsigned prev_nr = rq->nr_running;
1218
1219 rq->nr_running = prev_nr + count;
1222 1220
1223#ifdef CONFIG_NO_HZ_FULL 1221#ifdef CONFIG_NO_HZ_FULL
1224 if (rq->nr_running == 2) { 1222 if (prev_nr < 2 && rq->nr_running >= 2) {
1225 if (tick_nohz_full_cpu(rq->cpu)) { 1223 if (tick_nohz_full_cpu(rq->cpu)) {
1226 /* Order rq->nr_running write against the IPI */ 1224 /* Order rq->nr_running write against the IPI */
1227 smp_wmb(); 1225 smp_wmb();
@@ -1231,9 +1229,9 @@ static inline void inc_nr_running(struct rq *rq)
1231#endif 1229#endif
1232} 1230}
1233 1231
1234static inline void dec_nr_running(struct rq *rq) 1232static inline void sub_nr_running(struct rq *rq, unsigned count)
1235{ 1233{
1236 rq->nr_running--; 1234 rq->nr_running -= count;
1237} 1235}
1238 1236
1239static inline void rq_last_tick_reset(struct rq *rq) 1237static inline void rq_last_tick_reset(struct rq *rq)
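Because callers such as enqueue_top_rt_rq() can now add several tasks in one call, the NO_HZ_FULL kick must fire on crossing the 1-to-2 boundary rather than on nr_running landing exactly on 2, which is what the prev_nr test buys. A tiny illustration:

    static int nohz_kick_fires(void)
    {
        unsigned prev_nr = 1, count = 3;                 /* bulk add of 3 RT tasks  */
        unsigned nr_running = prev_nr + count;           /* 4                       */

        int old_test = (nr_running == 2);                /* 0: kick would be missed */
        int new_test = (prev_nr < 2 && nr_running >= 2); /* 1: kick fires           */

        return new_test && !old_test;
    }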
diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c
index d6ce65dde541..bfe0edadbfbb 100644
--- a/kernel/sched/stop_task.c
+++ b/kernel/sched/stop_task.c
@@ -41,13 +41,13 @@ pick_next_task_stop(struct rq *rq, struct task_struct *prev)
41static void 41static void
42enqueue_task_stop(struct rq *rq, struct task_struct *p, int flags) 42enqueue_task_stop(struct rq *rq, struct task_struct *p, int flags)
43{ 43{
44 inc_nr_running(rq); 44 add_nr_running(rq, 1);
45} 45}
46 46
47static void 47static void
48dequeue_task_stop(struct rq *rq, struct task_struct *p, int flags) 48dequeue_task_stop(struct rq *rq, struct task_struct *p, int flags)
49{ 49{
50 dec_nr_running(rq); 50 sub_nr_running(rq, 1);
51} 51}
52 52
53static void yield_task_stop(struct rq *rq) 53static void yield_task_stop(struct rq *rq)
diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c
index 7d50f794e248..0ffa20ae657b 100644
--- a/kernel/sched/wait.c
+++ b/kernel/sched/wait.c
@@ -394,7 +394,7 @@ EXPORT_SYMBOL(__wake_up_bit);
394 * 394 *
395 * In order for this to function properly, as it uses waitqueue_active() 395 * In order for this to function properly, as it uses waitqueue_active()
396 * internally, some kind of memory barrier must be done prior to calling 396 * internally, some kind of memory barrier must be done prior to calling
397 * this. Typically, this will be smp_mb__after_clear_bit(), but in some 397 * this. Typically, this will be smp_mb__after_atomic(), but in some
398 * cases where bitflags are manipulated non-atomically under a lock, one 398 * cases where bitflags are manipulated non-atomically under a lock, one
399 * may need to use a less regular barrier, such fs/inode.c's smp_mb(), 399 * may need to use a less regular barrier, such fs/inode.c's smp_mb(),
400 * because spin_unlock() does not guarantee a memory barrier. 400 * because spin_unlock() does not guarantee a memory barrier.
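The signal.c hunk further down applies exactly this advice by adding an smp_mb() to task_clear_jobctl_trapping() before its wake_up_bit() call. The canonical waker-side sequence looks like the sketch below, where JOB_BIT and word are placeholders:

    clear_bit(JOB_BIT, &word);    /* the state change the waiter is watching    */
    smp_mb__after_atomic();       /* order it before waitqueue_active() inside  */
    wake_up_bit(&word, JOB_BIT);  /* __wake_up_bit(), so the wakeup isn't lost  */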
diff --git a/kernel/seccomp.c b/kernel/seccomp.c
index b35c21503a36..301bbc24739c 100644
--- a/kernel/seccomp.c
+++ b/kernel/seccomp.c
@@ -39,7 +39,7 @@
39 * is only needed for handling filters shared across tasks. 39 * is only needed for handling filters shared across tasks.
40 * @prev: points to a previously installed, or inherited, filter 40 * @prev: points to a previously installed, or inherited, filter
41 * @len: the number of instructions in the program 41 * @len: the number of instructions in the program
42 * @insns: the BPF program instructions to evaluate 42 * @insnsi: the BPF program instructions to evaluate
43 * 43 *
44 * seccomp_filter objects are organized in a tree linked via the @prev 44 * seccomp_filter objects are organized in a tree linked via the @prev
45 * pointer. For any task, it appears to be a singly-linked list starting 45 * pointer. For any task, it appears to be a singly-linked list starting
@@ -54,8 +54,7 @@
54struct seccomp_filter { 54struct seccomp_filter {
55 atomic_t usage; 55 atomic_t usage;
56 struct seccomp_filter *prev; 56 struct seccomp_filter *prev;
57 unsigned short len; /* Instruction count */ 57 struct sk_filter *prog;
58 struct sock_filter_int insnsi[];
59}; 58};
60 59
61/* Limit any path through the tree to 256KB worth of instructions. */ 60/* Limit any path through the tree to 256KB worth of instructions. */
@@ -104,60 +103,59 @@ static int seccomp_check_filter(struct sock_filter *filter, unsigned int flen)
104 u32 k = ftest->k; 103 u32 k = ftest->k;
105 104
106 switch (code) { 105 switch (code) {
107 case BPF_S_LD_W_ABS: 106 case BPF_LD | BPF_W | BPF_ABS:
108 ftest->code = BPF_LDX | BPF_W | BPF_ABS; 107 ftest->code = BPF_LDX | BPF_W | BPF_ABS;
109 /* 32-bit aligned and not out of bounds. */ 108 /* 32-bit aligned and not out of bounds. */
110 if (k >= sizeof(struct seccomp_data) || k & 3) 109 if (k >= sizeof(struct seccomp_data) || k & 3)
111 return -EINVAL; 110 return -EINVAL;
112 continue; 111 continue;
113 case BPF_S_LD_W_LEN: 112 case BPF_LD | BPF_W | BPF_LEN:
114 ftest->code = BPF_LD | BPF_IMM; 113 ftest->code = BPF_LD | BPF_IMM;
115 ftest->k = sizeof(struct seccomp_data); 114 ftest->k = sizeof(struct seccomp_data);
116 continue; 115 continue;
117 case BPF_S_LDX_W_LEN: 116 case BPF_LDX | BPF_W | BPF_LEN:
118 ftest->code = BPF_LDX | BPF_IMM; 117 ftest->code = BPF_LDX | BPF_IMM;
119 ftest->k = sizeof(struct seccomp_data); 118 ftest->k = sizeof(struct seccomp_data);
120 continue; 119 continue;
121 /* Explicitly include allowed calls. */ 120 /* Explicitly include allowed calls. */
122 case BPF_S_RET_K: 121 case BPF_RET | BPF_K:
123 case BPF_S_RET_A: 122 case BPF_RET | BPF_A:
124 case BPF_S_ALU_ADD_K: 123 case BPF_ALU | BPF_ADD | BPF_K:
125 case BPF_S_ALU_ADD_X: 124 case BPF_ALU | BPF_ADD | BPF_X:
126 case BPF_S_ALU_SUB_K: 125 case BPF_ALU | BPF_SUB | BPF_K:
127 case BPF_S_ALU_SUB_X: 126 case BPF_ALU | BPF_SUB | BPF_X:
128 case BPF_S_ALU_MUL_K: 127 case BPF_ALU | BPF_MUL | BPF_K:
129 case BPF_S_ALU_MUL_X: 128 case BPF_ALU | BPF_MUL | BPF_X:
130 case BPF_S_ALU_DIV_X: 129 case BPF_ALU | BPF_DIV | BPF_K:
131 case BPF_S_ALU_AND_K: 130 case BPF_ALU | BPF_DIV | BPF_X:
132 case BPF_S_ALU_AND_X: 131 case BPF_ALU | BPF_AND | BPF_K:
133 case BPF_S_ALU_OR_K: 132 case BPF_ALU | BPF_AND | BPF_X:
134 case BPF_S_ALU_OR_X: 133 case BPF_ALU | BPF_OR | BPF_K:
135 case BPF_S_ALU_XOR_K: 134 case BPF_ALU | BPF_OR | BPF_X:
136 case BPF_S_ALU_XOR_X: 135 case BPF_ALU | BPF_XOR | BPF_K:
137 case BPF_S_ALU_LSH_K: 136 case BPF_ALU | BPF_XOR | BPF_X:
138 case BPF_S_ALU_LSH_X: 137 case BPF_ALU | BPF_LSH | BPF_K:
139 case BPF_S_ALU_RSH_K: 138 case BPF_ALU | BPF_LSH | BPF_X:
140 case BPF_S_ALU_RSH_X: 139 case BPF_ALU | BPF_RSH | BPF_K:
141 case BPF_S_ALU_NEG: 140 case BPF_ALU | BPF_RSH | BPF_X:
142 case BPF_S_LD_IMM: 141 case BPF_ALU | BPF_NEG:
143 case BPF_S_LDX_IMM: 142 case BPF_LD | BPF_IMM:
144 case BPF_S_MISC_TAX: 143 case BPF_LDX | BPF_IMM:
145 case BPF_S_MISC_TXA: 144 case BPF_MISC | BPF_TAX:
146 case BPF_S_ALU_DIV_K: 145 case BPF_MISC | BPF_TXA:
147 case BPF_S_LD_MEM: 146 case BPF_LD | BPF_MEM:
148 case BPF_S_LDX_MEM: 147 case BPF_LDX | BPF_MEM:
149 case BPF_S_ST: 148 case BPF_ST:
150 case BPF_S_STX: 149 case BPF_STX:
151 case BPF_S_JMP_JA: 150 case BPF_JMP | BPF_JA:
152 case BPF_S_JMP_JEQ_K: 151 case BPF_JMP | BPF_JEQ | BPF_K:
153 case BPF_S_JMP_JEQ_X: 152 case BPF_JMP | BPF_JEQ | BPF_X:
154 case BPF_S_JMP_JGE_K: 153 case BPF_JMP | BPF_JGE | BPF_K:
155 case BPF_S_JMP_JGE_X: 154 case BPF_JMP | BPF_JGE | BPF_X:
156 case BPF_S_JMP_JGT_K: 155 case BPF_JMP | BPF_JGT | BPF_K:
157 case BPF_S_JMP_JGT_X: 156 case BPF_JMP | BPF_JGT | BPF_X:
158 case BPF_S_JMP_JSET_K: 157 case BPF_JMP | BPF_JSET | BPF_K:
159 case BPF_S_JMP_JSET_X: 158 case BPF_JMP | BPF_JSET | BPF_X:
160 sk_decode_filter(ftest, ftest);
161 continue; 159 continue;
162 default: 160 default:
163 return -EINVAL; 161 return -EINVAL;
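The rewritten switch drops the kernel-internal BPF_S_* enum in favour of the raw classic-BPF encoding, where an opcode is the bitwise OR of an instruction class, a size or operation, and a mode or source field. One of the allowed opcodes decomposed, using the uapi macros from <linux/filter.h>:

    unsigned short code = BPF_LD | BPF_W | BPF_ABS;    /* == 0x20 */

    /* BPF_CLASS(code) == BPF_LD, BPF_SIZE(code) == BPF_W, BPF_MODE(code) == BPF_ABS.
     * seccomp_check_filter() rewrites only this case to BPF_LDX | BPF_W | BPF_ABS,
     * which the converter is expected to turn into a read of struct seccomp_data;
     * the bounds check against sizeof(struct seccomp_data) above exists for that. */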
@@ -189,7 +187,8 @@ static u32 seccomp_run_filters(int syscall)
189 * value always takes priority (ignoring the DATA). 187 * value always takes priority (ignoring the DATA).
190 */ 188 */
191 for (f = current->seccomp.filter; f; f = f->prev) { 189 for (f = current->seccomp.filter; f; f = f->prev) {
192 u32 cur_ret = sk_run_filter_int_seccomp(&sd, f->insnsi); 190 u32 cur_ret = SK_RUN_FILTER(f->prog, (void *)&sd);
191
193 if ((cur_ret & SECCOMP_RET_ACTION) < (ret & SECCOMP_RET_ACTION)) 192 if ((cur_ret & SECCOMP_RET_ACTION) < (ret & SECCOMP_RET_ACTION))
194 ret = cur_ret; 193 ret = cur_ret;
195 } 194 }
@@ -215,12 +214,12 @@ static long seccomp_attach_filter(struct sock_fprog *fprog)
215 return -EINVAL; 214 return -EINVAL;
216 215
217 for (filter = current->seccomp.filter; filter; filter = filter->prev) 216 for (filter = current->seccomp.filter; filter; filter = filter->prev)
218 total_insns += filter->len + 4; /* include a 4 instr penalty */ 217 total_insns += filter->prog->len + 4; /* include a 4 instr penalty */
219 if (total_insns > MAX_INSNS_PER_PATH) 218 if (total_insns > MAX_INSNS_PER_PATH)
220 return -ENOMEM; 219 return -ENOMEM;
221 220
222 /* 221 /*
223 * Installing a seccomp filter requires that the task have 222 * Installing a seccomp filter requires that the task has
224 * CAP_SYS_ADMIN in its namespace or be running with no_new_privs. 223 * CAP_SYS_ADMIN in its namespace or be running with no_new_privs.
225 * This avoids scenarios where unprivileged tasks can affect the 224 * This avoids scenarios where unprivileged tasks can affect the
226 * behavior of privileged children. 225 * behavior of privileged children.
@@ -256,19 +255,25 @@ static long seccomp_attach_filter(struct sock_fprog *fprog)
256 255
257 /* Allocate a new seccomp_filter */ 256 /* Allocate a new seccomp_filter */
258 ret = -ENOMEM; 257 ret = -ENOMEM;
259 filter = kzalloc(sizeof(struct seccomp_filter) + 258 filter = kzalloc(sizeof(struct seccomp_filter),
260 sizeof(struct sock_filter_int) * new_len,
261 GFP_KERNEL|__GFP_NOWARN); 259 GFP_KERNEL|__GFP_NOWARN);
262 if (!filter) 260 if (!filter)
263 goto free_prog; 261 goto free_prog;
264 262
265 ret = sk_convert_filter(fp, fprog->len, filter->insnsi, &new_len); 263 filter->prog = kzalloc(sk_filter_size(new_len),
266 if (ret) 264 GFP_KERNEL|__GFP_NOWARN);
265 if (!filter->prog)
267 goto free_filter; 266 goto free_filter;
267
268 ret = sk_convert_filter(fp, fprog->len, filter->prog->insnsi, &new_len);
269 if (ret)
270 goto free_filter_prog;
268 kfree(fp); 271 kfree(fp);
269 272
270 atomic_set(&filter->usage, 1); 273 atomic_set(&filter->usage, 1);
271 filter->len = new_len; 274 filter->prog->len = new_len;
275
276 sk_filter_select_runtime(filter->prog);
272 277
273 /* 278 /*
274 * If there is an existing filter, make it the prev and don't drop its 279 * If there is an existing filter, make it the prev and don't drop its
@@ -278,6 +283,8 @@ static long seccomp_attach_filter(struct sock_fprog *fprog)
278 current->seccomp.filter = filter; 283 current->seccomp.filter = filter;
279 return 0; 284 return 0;
280 285
286free_filter_prog:
287 kfree(filter->prog);
281free_filter: 288free_filter:
282 kfree(filter); 289 kfree(filter);
283free_prog: 290free_prog:
@@ -330,6 +337,7 @@ void put_seccomp_filter(struct task_struct *tsk)
330 while (orig && atomic_dec_and_test(&orig->usage)) { 337 while (orig && atomic_dec_and_test(&orig->usage)) {
331 struct seccomp_filter *freeme = orig; 338 struct seccomp_filter *freeme = orig;
332 orig = orig->prev; 339 orig = orig->prev;
340 sk_filter_free(freeme->prog);
333 kfree(freeme); 341 kfree(freeme);
334 } 342 }
335} 343}
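
The seccomp.c hunks above switch attached filters to the internal BPF representation (sk_convert_filter() plus SK_RUN_FILTER()) while leaving the classic-BPF user ABI untouched. For orientation, a minimal userspace sketch of that ABI, assuming nothing beyond prctl(2) and the uapi headers; it installs a trivial allow-everything filter and is not part of this patch:

/*
 * Illustrative sketch only: attach an "allow all" seccomp filter.
 * The program is classic BPF; after this patch the kernel converts
 * it internally with sk_convert_filter() as shown above.
 */
#include <stdio.h>
#include <stddef.h>
#include <sys/prctl.h>
#include <linux/filter.h>
#include <linux/seccomp.h>

int main(void)
{
	struct sock_filter insns[] = {
		/* Return SECCOMP_RET_ALLOW for every syscall. */
		BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_ALLOW),
	};
	struct sock_fprog prog = {
		.len = sizeof(insns) / sizeof(insns[0]),
		.filter = insns,
	};

	/* Required so an unprivileged task may attach a filter. */
	if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0))
		perror("PR_SET_NO_NEW_PRIVS");
	if (prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog))
		perror("PR_SET_SECCOMP");
	return 0;
}
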
diff --git a/kernel/signal.c b/kernel/signal.c
index 6ea13c09ae56..a4077e90f19f 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -277,6 +277,7 @@ void task_clear_jobctl_trapping(struct task_struct *task)
277{ 277{
278 if (unlikely(task->jobctl & JOBCTL_TRAPPING)) { 278 if (unlikely(task->jobctl & JOBCTL_TRAPPING)) {
279 task->jobctl &= ~JOBCTL_TRAPPING; 279 task->jobctl &= ~JOBCTL_TRAPPING;
280 smp_mb(); /* advised by wake_up_bit() */
280 wake_up_bit(&task->jobctl, JOBCTL_TRAPPING_BIT); 281 wake_up_bit(&task->jobctl, JOBCTL_TRAPPING_BIT);
281 } 282 }
282} 283}
@@ -705,11 +706,8 @@ void signal_wake_up_state(struct task_struct *t, unsigned int state)
705 * Returns 1 if any signals were found. 706 * Returns 1 if any signals were found.
706 * 707 *
707 * All callers must be holding the siglock. 708 * All callers must be holding the siglock.
708 *
709 * This version takes a sigset mask and looks at all signals,
710 * not just those in the first mask word.
711 */ 709 */
712static int rm_from_queue_full(sigset_t *mask, struct sigpending *s) 710static int flush_sigqueue_mask(sigset_t *mask, struct sigpending *s)
713{ 711{
714 struct sigqueue *q, *n; 712 struct sigqueue *q, *n;
715 sigset_t m; 713 sigset_t m;
@@ -727,29 +725,6 @@ static int rm_from_queue_full(sigset_t *mask, struct sigpending *s)
727 } 725 }
728 return 1; 726 return 1;
729} 727}
730/*
731 * Remove signals in mask from the pending set and queue.
732 * Returns 1 if any signals were found.
733 *
734 * All callers must be holding the siglock.
735 */
736static int rm_from_queue(unsigned long mask, struct sigpending *s)
737{
738 struct sigqueue *q, *n;
739
740 if (!sigtestsetmask(&s->signal, mask))
741 return 0;
742
743 sigdelsetmask(&s->signal, mask);
744 list_for_each_entry_safe(q, n, &s->list, list) {
745 if (q->info.si_signo < SIGRTMIN &&
746 (mask & sigmask(q->info.si_signo))) {
747 list_del_init(&q->list);
748 __sigqueue_free(q);
749 }
750 }
751 return 1;
752}
753 728
754static inline int is_si_special(const struct siginfo *info) 729static inline int is_si_special(const struct siginfo *info)
755{ 730{
@@ -861,6 +836,7 @@ static bool prepare_signal(int sig, struct task_struct *p, bool force)
861{ 836{
862 struct signal_struct *signal = p->signal; 837 struct signal_struct *signal = p->signal;
863 struct task_struct *t; 838 struct task_struct *t;
839 sigset_t flush;
864 840
865 if (signal->flags & (SIGNAL_GROUP_EXIT | SIGNAL_GROUP_COREDUMP)) { 841 if (signal->flags & (SIGNAL_GROUP_EXIT | SIGNAL_GROUP_COREDUMP)) {
866 if (signal->flags & SIGNAL_GROUP_COREDUMP) 842 if (signal->flags & SIGNAL_GROUP_COREDUMP)
@@ -872,26 +848,25 @@ static bool prepare_signal(int sig, struct task_struct *p, bool force)
872 /* 848 /*
873 * This is a stop signal. Remove SIGCONT from all queues. 849 * This is a stop signal. Remove SIGCONT from all queues.
874 */ 850 */
875 rm_from_queue(sigmask(SIGCONT), &signal->shared_pending); 851 siginitset(&flush, sigmask(SIGCONT));
876 t = p; 852 flush_sigqueue_mask(&flush, &signal->shared_pending);
877 do { 853 for_each_thread(p, t)
878 rm_from_queue(sigmask(SIGCONT), &t->pending); 854 flush_sigqueue_mask(&flush, &t->pending);
879 } while_each_thread(p, t);
880 } else if (sig == SIGCONT) { 855 } else if (sig == SIGCONT) {
881 unsigned int why; 856 unsigned int why;
882 /* 857 /*
883 * Remove all stop signals from all queues, wake all threads. 858 * Remove all stop signals from all queues, wake all threads.
884 */ 859 */
885 rm_from_queue(SIG_KERNEL_STOP_MASK, &signal->shared_pending); 860 siginitset(&flush, SIG_KERNEL_STOP_MASK);
886 t = p; 861 flush_sigqueue_mask(&flush, &signal->shared_pending);
887 do { 862 for_each_thread(p, t) {
863 flush_sigqueue_mask(&flush, &t->pending);
888 task_clear_jobctl_pending(t, JOBCTL_STOP_PENDING); 864 task_clear_jobctl_pending(t, JOBCTL_STOP_PENDING);
889 rm_from_queue(SIG_KERNEL_STOP_MASK, &t->pending);
890 if (likely(!(t->ptrace & PT_SEIZED))) 865 if (likely(!(t->ptrace & PT_SEIZED)))
891 wake_up_state(t, __TASK_STOPPED); 866 wake_up_state(t, __TASK_STOPPED);
892 else 867 else
893 ptrace_trap_notify(t); 868 ptrace_trap_notify(t);
894 } while_each_thread(p, t); 869 }
895 870
896 /* 871 /*
897 * Notify the parent with CLD_CONTINUED if we were stopped. 872 * Notify the parent with CLD_CONTINUED if we were stopped.
@@ -2854,7 +2829,7 @@ int do_sigtimedwait(const sigset_t *which, siginfo_t *info,
2854 2829
2855 spin_lock_irq(&tsk->sighand->siglock); 2830 spin_lock_irq(&tsk->sighand->siglock);
2856 __set_task_blocked(tsk, &tsk->real_blocked); 2831 __set_task_blocked(tsk, &tsk->real_blocked);
2857 siginitset(&tsk->real_blocked, 0); 2832 sigemptyset(&tsk->real_blocked);
2858 sig = dequeue_signal(tsk, &mask, info); 2833 sig = dequeue_signal(tsk, &mask, info);
2859 } 2834 }
2860 spin_unlock_irq(&tsk->sighand->siglock); 2835 spin_unlock_irq(&tsk->sighand->siglock);
@@ -3091,18 +3066,39 @@ COMPAT_SYSCALL_DEFINE4(rt_tgsigqueueinfo,
3091} 3066}
3092#endif 3067#endif
3093 3068
3069/*
3070 * For kthreads only, must not be used if cloned with CLONE_SIGHAND
3071 */
3072void kernel_sigaction(int sig, __sighandler_t action)
3073{
3074 spin_lock_irq(&current->sighand->siglock);
3075 current->sighand->action[sig - 1].sa.sa_handler = action;
3076 if (action == SIG_IGN) {
3077 sigset_t mask;
3078
3079 sigemptyset(&mask);
3080 sigaddset(&mask, sig);
3081
3082 flush_sigqueue_mask(&mask, &current->signal->shared_pending);
3083 flush_sigqueue_mask(&mask, &current->pending);
3084 recalc_sigpending();
3085 }
3086 spin_unlock_irq(&current->sighand->siglock);
3087}
3088EXPORT_SYMBOL(kernel_sigaction);
3089
3094int do_sigaction(int sig, struct k_sigaction *act, struct k_sigaction *oact) 3090int do_sigaction(int sig, struct k_sigaction *act, struct k_sigaction *oact)
3095{ 3091{
3096 struct task_struct *t = current; 3092 struct task_struct *p = current, *t;
3097 struct k_sigaction *k; 3093 struct k_sigaction *k;
3098 sigset_t mask; 3094 sigset_t mask;
3099 3095
3100 if (!valid_signal(sig) || sig < 1 || (act && sig_kernel_only(sig))) 3096 if (!valid_signal(sig) || sig < 1 || (act && sig_kernel_only(sig)))
3101 return -EINVAL; 3097 return -EINVAL;
3102 3098
3103 k = &t->sighand->action[sig-1]; 3099 k = &p->sighand->action[sig-1];
3104 3100
3105 spin_lock_irq(&current->sighand->siglock); 3101 spin_lock_irq(&p->sighand->siglock);
3106 if (oact) 3102 if (oact)
3107 *oact = *k; 3103 *oact = *k;
3108 3104
@@ -3121,21 +3117,20 @@ int do_sigaction(int sig, struct k_sigaction *act, struct k_sigaction *oact)
3121 * (for example, SIGCHLD), shall cause the pending signal to 3117 * (for example, SIGCHLD), shall cause the pending signal to
3122 * be discarded, whether or not it is blocked" 3118 * be discarded, whether or not it is blocked"
3123 */ 3119 */
3124 if (sig_handler_ignored(sig_handler(t, sig), sig)) { 3120 if (sig_handler_ignored(sig_handler(p, sig), sig)) {
3125 sigemptyset(&mask); 3121 sigemptyset(&mask);
3126 sigaddset(&mask, sig); 3122 sigaddset(&mask, sig);
3127 rm_from_queue_full(&mask, &t->signal->shared_pending); 3123 flush_sigqueue_mask(&mask, &p->signal->shared_pending);
3128 do { 3124 for_each_thread(p, t)
3129 rm_from_queue_full(&mask, &t->pending); 3125 flush_sigqueue_mask(&mask, &t->pending);
3130 } while_each_thread(current, t);
3131 } 3126 }
3132 } 3127 }
3133 3128
3134 spin_unlock_irq(&current->sighand->siglock); 3129 spin_unlock_irq(&p->sighand->siglock);
3135 return 0; 3130 return 0;
3136} 3131}
3137 3132
3138static int 3133static int
3139do_sigaltstack (const stack_t __user *uss, stack_t __user *uoss, unsigned long sp) 3134do_sigaltstack (const stack_t __user *uss, stack_t __user *uoss, unsigned long sp)
3140{ 3135{
3141 stack_t oss; 3136 stack_t oss;
@@ -3496,7 +3491,7 @@ COMPAT_SYSCALL_DEFINE3(sigaction, int, sig,
3496} 3491}
3497#endif 3492#endif
3498 3493
3499#ifdef __ARCH_WANT_SYS_SGETMASK 3494#ifdef CONFIG_SGETMASK_SYSCALL
3500 3495
3501/* 3496/*
3502 * For backwards compatibility. Functionality superseded by sigprocmask. 3497 * For backwards compatibility. Functionality superseded by sigprocmask.
@@ -3517,7 +3512,7 @@ SYSCALL_DEFINE1(ssetmask, int, newmask)
3517 3512
3518 return old; 3513 return old;
3519} 3514}
3520#endif /* __ARCH_WANT_SGETMASK */ 3515#endif /* CONFIG_SGETMASK_SYSCALL */
3521 3516
3522#ifdef __ARCH_WANT_SYS_SIGNAL 3517#ifdef __ARCH_WANT_SYS_SIGNAL
3523/* 3518/*
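
The new kernel_sigaction() helper above is meant for kernel threads that own their signal state. A hedged sketch of how a module's kthread might use it to ignore SIGHUP; demo_thread_fn is a hypothetical name, and the declaration of kernel_sigaction() is assumed to come from <linux/signal.h> in the same series:

/*
 * Sketch, not part of this patch: a kthread ignoring SIGHUP via the
 * helper added above.  Per its comment this is only valid for kthreads,
 * which do not share a sighand with userspace (no CLONE_SIGHAND).
 */
#include <linux/kthread.h>
#include <linux/signal.h>
#include <linux/delay.h>

static int demo_thread_fn(void *unused)
{
	/* Flushes any queued SIGHUP and ignores future ones. */
	kernel_sigaction(SIGHUP, SIG_IGN);

	while (!kthread_should_stop())
		msleep_interruptible(1000);
	return 0;
}
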
diff --git a/kernel/smp.c b/kernel/smp.c
index 06d574e42c72..80c33f8de14f 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -29,6 +29,8 @@ static DEFINE_PER_CPU_SHARED_ALIGNED(struct call_function_data, cfd_data);
29 29
30static DEFINE_PER_CPU_SHARED_ALIGNED(struct llist_head, call_single_queue); 30static DEFINE_PER_CPU_SHARED_ALIGNED(struct llist_head, call_single_queue);
31 31
32static void flush_smp_call_function_queue(bool warn_cpu_offline);
33
32static int 34static int
33hotplug_cfd(struct notifier_block *nfb, unsigned long action, void *hcpu) 35hotplug_cfd(struct notifier_block *nfb, unsigned long action, void *hcpu)
34{ 36{
@@ -51,12 +53,27 @@ hotplug_cfd(struct notifier_block *nfb, unsigned long action, void *hcpu)
51#ifdef CONFIG_HOTPLUG_CPU 53#ifdef CONFIG_HOTPLUG_CPU
52 case CPU_UP_CANCELED: 54 case CPU_UP_CANCELED:
53 case CPU_UP_CANCELED_FROZEN: 55 case CPU_UP_CANCELED_FROZEN:
56 /* Fall-through to the CPU_DEAD[_FROZEN] case. */
54 57
55 case CPU_DEAD: 58 case CPU_DEAD:
56 case CPU_DEAD_FROZEN: 59 case CPU_DEAD_FROZEN:
57 free_cpumask_var(cfd->cpumask); 60 free_cpumask_var(cfd->cpumask);
58 free_percpu(cfd->csd); 61 free_percpu(cfd->csd);
59 break; 62 break;
63
64 case CPU_DYING:
65 case CPU_DYING_FROZEN:
66 /*
67 * The IPIs for the smp-call-function callbacks queued by other
68 * CPUs might arrive late, either due to hardware latencies or
69 * because this CPU disabled interrupts (inside stop-machine)
70 * before the IPIs were sent. So flush out any pending callbacks
71 * explicitly (without waiting for the IPIs to arrive), to
72 * ensure that the outgoing CPU doesn't go offline with work
73 * still pending.
74 */
75 flush_smp_call_function_queue(false);
76 break;
60#endif 77#endif
61 }; 78 };
62 79
@@ -177,23 +194,59 @@ static int generic_exec_single(int cpu, struct call_single_data *csd,
177 return 0; 194 return 0;
178} 195}
179 196
180/* 197/**
181 * Invoked by arch to handle an IPI for call function single. Must be 198 * generic_smp_call_function_single_interrupt - Execute SMP IPI callbacks
182 * called from the arch with interrupts disabled. 199 *
200 * Invoked by arch to handle an IPI for call function single.
201 * Must be called with interrupts disabled.
183 */ 202 */
184void generic_smp_call_function_single_interrupt(void) 203void generic_smp_call_function_single_interrupt(void)
185{ 204{
205 flush_smp_call_function_queue(true);
206}
207
208/**
209 * flush_smp_call_function_queue - Flush pending smp-call-function callbacks
210 *
211 * @warn_cpu_offline: If set to 'true', warn if callbacks were queued on an
212 * offline CPU. Skip this check if set to 'false'.
213 *
214 * Flush any pending smp-call-function callbacks queued on this CPU. This is
215 * invoked by the generic IPI handler, as well as by a CPU about to go offline,
216 * to ensure that all pending IPI callbacks are run before it goes completely
217 * offline.
218 *
219 * Loop through the call_single_queue and run all the queued callbacks.
220 * Must be called with interrupts disabled.
221 */
222static void flush_smp_call_function_queue(bool warn_cpu_offline)
223{
224 struct llist_head *head;
186 struct llist_node *entry; 225 struct llist_node *entry;
187 struct call_single_data *csd, *csd_next; 226 struct call_single_data *csd, *csd_next;
227 static bool warned;
188 228
189 /* 229 WARN_ON(!irqs_disabled());
190 * Shouldn't receive this interrupt on a cpu that is not yet online.
191 */
192 WARN_ON_ONCE(!cpu_online(smp_processor_id()));
193 230
194 entry = llist_del_all(&__get_cpu_var(call_single_queue)); 231 head = &__get_cpu_var(call_single_queue);
232 entry = llist_del_all(head);
195 entry = llist_reverse_order(entry); 233 entry = llist_reverse_order(entry);
196 234
235 /* There shouldn't be any pending callbacks on an offline CPU. */
236 if (unlikely(warn_cpu_offline && !cpu_online(smp_processor_id()) &&
237 !warned && !llist_empty(head))) {
238 warned = true;
239 WARN(1, "IPI on offline CPU %d\n", smp_processor_id());
240
241 /*
242 * We don't have to use the _safe() variant here
243 * because we are not invoking the IPI handlers yet.
244 */
245 llist_for_each_entry(csd, entry, llist)
246 pr_warn("IPI callback %pS sent to offline CPU\n",
247 csd->func);
248 }
249
197 llist_for_each_entry_safe(csd, csd_next, entry, llist) { 250 llist_for_each_entry_safe(csd, csd_next, entry, llist) {
198 csd->func(csd->info); 251 csd->func(csd->info);
199 csd_unlock(csd); 252 csd_unlock(csd);
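
The smp.c hunks split the IPI path so flush_smp_call_function_queue() can also be run by a CPU on its way offline (CPU_DYING), draining callbacks whose IPIs may never arrive. Those callbacks are queued by the ordinary cross-call API; a short sketch of that producer side, using hypothetical demo_* names:

/*
 * Sketch, not part of this patch: queue work on another CPU.  The
 * callback lands on that CPU's call_single_queue and is executed by
 * flush_smp_call_function_queue(), either from the IPI handler or,
 * after this change, while the target CPU is dying.
 */
#include <linux/smp.h>
#include <linux/printk.h>

static void demo_remote_fn(void *info)
{
	/* Runs on the target CPU with interrupts disabled. */
	pr_info("demo callback on CPU %d\n", smp_processor_id());
}

static void demo_kick_cpu(int cpu)
{
	/* wait=1: block until the callback has finished on @cpu. */
	smp_call_function_single(cpu, demo_remote_fn, NULL, 1);
}
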
diff --git a/kernel/softirq.c b/kernel/softirq.c
index 92f24f5e8d52..5918d227730f 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -232,7 +232,6 @@ asmlinkage __visible void __do_softirq(void)
232 bool in_hardirq; 232 bool in_hardirq;
233 __u32 pending; 233 __u32 pending;
234 int softirq_bit; 234 int softirq_bit;
235 int cpu;
236 235
237 /* 236 /*
238 * Mask out PF_MEMALLOC s current task context is borrowed for the 237 * Mask out PF_MEMALLOC s current task context is borrowed for the
@@ -247,7 +246,6 @@ asmlinkage __visible void __do_softirq(void)
247 __local_bh_disable_ip(_RET_IP_, SOFTIRQ_OFFSET); 246 __local_bh_disable_ip(_RET_IP_, SOFTIRQ_OFFSET);
248 in_hardirq = lockdep_softirq_start(); 247 in_hardirq = lockdep_softirq_start();
249 248
250 cpu = smp_processor_id();
251restart: 249restart:
252 /* Reset the pending bitmask before enabling irqs */ 250 /* Reset the pending bitmask before enabling irqs */
253 set_softirq_pending(0); 251 set_softirq_pending(0);
@@ -276,11 +274,11 @@ restart:
276 prev_count, preempt_count()); 274 prev_count, preempt_count());
277 preempt_count_set(prev_count); 275 preempt_count_set(prev_count);
278 } 276 }
279 rcu_bh_qs(cpu);
280 h++; 277 h++;
281 pending >>= softirq_bit; 278 pending >>= softirq_bit;
282 } 279 }
283 280
281 rcu_bh_qs(smp_processor_id());
284 local_irq_disable(); 282 local_irq_disable();
285 283
286 pending = local_softirq_pending(); 284 pending = local_softirq_pending();
diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c
index 01fbae5b97b7..695f0c6cd169 100644
--- a/kernel/stop_machine.c
+++ b/kernel/stop_machine.c
@@ -307,6 +307,7 @@ int stop_two_cpus(unsigned int cpu1, unsigned int cpu2, cpu_stop_fn_t fn, void *
307 * @cpu: cpu to stop 307 * @cpu: cpu to stop
308 * @fn: function to execute 308 * @fn: function to execute
309 * @arg: argument to @fn 309 * @arg: argument to @fn
310 * @work_buf: pointer to cpu_stop_work structure
310 * 311 *
311 * Similar to stop_one_cpu() but doesn't wait for completion. The 312 * Similar to stop_one_cpu() but doesn't wait for completion. The
312 * caller is responsible for ensuring @work_buf is currently unused 313 * caller is responsible for ensuring @work_buf is currently unused
diff --git a/kernel/sys.c b/kernel/sys.c
index fba0f29401ea..66a751ebf9d9 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -250,7 +250,7 @@ SYSCALL_DEFINE2(getpriority, int, which, int, who)
250 else 250 else
251 p = current; 251 p = current;
252 if (p) { 252 if (p) {
253 niceval = 20 - task_nice(p); 253 niceval = nice_to_rlimit(task_nice(p));
254 if (niceval > retval) 254 if (niceval > retval)
255 retval = niceval; 255 retval = niceval;
256 } 256 }
@@ -261,7 +261,7 @@ SYSCALL_DEFINE2(getpriority, int, which, int, who)
261 else 261 else
262 pgrp = task_pgrp(current); 262 pgrp = task_pgrp(current);
263 do_each_pid_thread(pgrp, PIDTYPE_PGID, p) { 263 do_each_pid_thread(pgrp, PIDTYPE_PGID, p) {
264 niceval = 20 - task_nice(p); 264 niceval = nice_to_rlimit(task_nice(p));
265 if (niceval > retval) 265 if (niceval > retval)
266 retval = niceval; 266 retval = niceval;
267 } while_each_pid_thread(pgrp, PIDTYPE_PGID, p); 267 } while_each_pid_thread(pgrp, PIDTYPE_PGID, p);
@@ -277,7 +277,7 @@ SYSCALL_DEFINE2(getpriority, int, which, int, who)
277 277
278 do_each_thread(g, p) { 278 do_each_thread(g, p) {
279 if (uid_eq(task_uid(p), uid)) { 279 if (uid_eq(task_uid(p), uid)) {
280 niceval = 20 - task_nice(p); 280 niceval = nice_to_rlimit(task_nice(p));
281 if (niceval > retval) 281 if (niceval > retval)
282 retval = niceval; 282 retval = niceval;
283 } 283 }
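
The three sys.c hunks above are a pure cleanup: nice_to_rlimit() maps a nice value in [-20, 19] onto the RLIMIT_NICE-style range [1, 40], which is what the open-coded "20 - task_nice(p)" produced. A small userspace check of that equivalence; the local nice_to_rlimit() below mirrors the kernel helper and is an assumption, not a copy of it:

/*
 * Sketch: verify that nice_to_rlimit(nice) == 20 - nice over the whole
 * nice range.  MAX_NICE is 19 in the kernel.
 */
#include <assert.h>

#define MAX_NICE 19

static long nice_to_rlimit(long nice)
{
	return MAX_NICE - nice + 1;	/* assumed shape of the kernel helper */
}

int main(void)
{
	long nice;

	for (nice = -20; nice <= MAX_NICE; nice++)
		assert(nice_to_rlimit(nice) == 20 - nice);
	return 0;
}
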
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index bc8d1b74a6b9..36441b51b5df 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -135,6 +135,8 @@ cond_syscall(sys_setresgid16);
135cond_syscall(sys_setresuid16); 135cond_syscall(sys_setresuid16);
136cond_syscall(sys_setreuid16); 136cond_syscall(sys_setreuid16);
137cond_syscall(sys_setuid16); 137cond_syscall(sys_setuid16);
138cond_syscall(sys_sgetmask);
139cond_syscall(sys_ssetmask);
138cond_syscall(sys_vm86old); 140cond_syscall(sys_vm86old);
139cond_syscall(sys_vm86); 141cond_syscall(sys_vm86);
140cond_syscall(sys_ipc); 142cond_syscall(sys_ipc);
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 74f5b580fe34..75b22e22a72c 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -136,7 +136,6 @@ static unsigned long dirty_bytes_min = 2 * PAGE_SIZE;
136/* this is needed for the proc_dointvec_minmax for [fs_]overflow UID and GID */ 136/* this is needed for the proc_dointvec_minmax for [fs_]overflow UID and GID */
137static int maxolduid = 65535; 137static int maxolduid = 65535;
138static int minolduid; 138static int minolduid;
139static int min_percpu_pagelist_fract = 8;
140 139
141static int ngroups_max = NGROUPS_MAX; 140static int ngroups_max = NGROUPS_MAX;
142static const int cap_last_cap = CAP_LAST_CAP; 141static const int cap_last_cap = CAP_LAST_CAP;
@@ -152,10 +151,6 @@ static unsigned long hung_task_timeout_max = (LONG_MAX/HZ);
152#ifdef CONFIG_SPARC 151#ifdef CONFIG_SPARC
153#endif 152#endif
154 153
155#ifdef CONFIG_SPARC64
156extern int sysctl_tsb_ratio;
157#endif
158
159#ifdef __hppa__ 154#ifdef __hppa__
160extern int pwrsw_enabled; 155extern int pwrsw_enabled;
161#endif 156#endif
@@ -173,6 +168,13 @@ extern int no_unaligned_warning;
173#endif 168#endif
174 169
175#ifdef CONFIG_PROC_SYSCTL 170#ifdef CONFIG_PROC_SYSCTL
171
172#define SYSCTL_WRITES_LEGACY -1
173#define SYSCTL_WRITES_WARN 0
174#define SYSCTL_WRITES_STRICT 1
175
176static int sysctl_writes_strict = SYSCTL_WRITES_WARN;
177
176static int proc_do_cad_pid(struct ctl_table *table, int write, 178static int proc_do_cad_pid(struct ctl_table *table, int write,
177 void __user *buffer, size_t *lenp, loff_t *ppos); 179 void __user *buffer, size_t *lenp, loff_t *ppos);
178static int proc_taint(struct ctl_table *table, int write, 180static int proc_taint(struct ctl_table *table, int write,
@@ -195,7 +197,7 @@ static int proc_dostring_coredump(struct ctl_table *table, int write,
195/* Note: sysrq code uses it's own private copy */ 197/* Note: sysrq code uses it's own private copy */
196static int __sysrq_enabled = CONFIG_MAGIC_SYSRQ_DEFAULT_ENABLE; 198static int __sysrq_enabled = CONFIG_MAGIC_SYSRQ_DEFAULT_ENABLE;
197 199
198static int sysrq_sysctl_handler(ctl_table *table, int write, 200static int sysrq_sysctl_handler(struct ctl_table *table, int write,
199 void __user *buffer, size_t *lenp, 201 void __user *buffer, size_t *lenp,
200 loff_t *ppos) 202 loff_t *ppos)
201{ 203{
@@ -495,6 +497,15 @@ static struct ctl_table kern_table[] = {
495 .mode = 0644, 497 .mode = 0644,
496 .proc_handler = proc_taint, 498 .proc_handler = proc_taint,
497 }, 499 },
500 {
501 .procname = "sysctl_writes_strict",
502 .data = &sysctl_writes_strict,
503 .maxlen = sizeof(int),
504 .mode = 0644,
505 .proc_handler = proc_dointvec_minmax,
506 .extra1 = &neg_one,
507 .extra2 = &one,
508 },
498#endif 509#endif
499#ifdef CONFIG_LATENCYTOP 510#ifdef CONFIG_LATENCYTOP
500 { 511 {
@@ -643,7 +654,7 @@ static struct ctl_table kern_table[] = {
643 .extra2 = &one, 654 .extra2 = &one,
644 }, 655 },
645#endif 656#endif
646 657#ifdef CONFIG_UEVENT_HELPER
647 { 658 {
648 .procname = "hotplug", 659 .procname = "hotplug",
649 .data = &uevent_helper, 660 .data = &uevent_helper,
@@ -651,7 +662,7 @@ static struct ctl_table kern_table[] = {
651 .mode = 0644, 662 .mode = 0644,
652 .proc_handler = proc_dostring, 663 .proc_handler = proc_dostring,
653 }, 664 },
654 665#endif
655#ifdef CONFIG_CHR_DEV_SG 666#ifdef CONFIG_CHR_DEV_SG
656 { 667 {
657 .procname = "sg-big-buff", 668 .procname = "sg-big-buff",
@@ -849,6 +860,17 @@ static struct ctl_table kern_table[] = {
849 .extra1 = &zero, 860 .extra1 = &zero,
850 .extra2 = &one, 861 .extra2 = &one,
851 }, 862 },
863#ifdef CONFIG_SMP
864 {
865 .procname = "softlockup_all_cpu_backtrace",
866 .data = &sysctl_softlockup_all_cpu_backtrace,
867 .maxlen = sizeof(int),
868 .mode = 0644,
869 .proc_handler = proc_dointvec_minmax,
870 .extra1 = &zero,
871 .extra2 = &one,
872 },
873#endif /* CONFIG_SMP */
852 { 874 {
853 .procname = "nmi_watchdog", 875 .procname = "nmi_watchdog",
854 .data = &watchdog_user_enabled, 876 .data = &watchdog_user_enabled,
@@ -1305,7 +1327,7 @@ static struct ctl_table vm_table[] = {
1305 .maxlen = sizeof(percpu_pagelist_fraction), 1327 .maxlen = sizeof(percpu_pagelist_fraction),
1306 .mode = 0644, 1328 .mode = 0644,
1307 .proc_handler = percpu_pagelist_fraction_sysctl_handler, 1329 .proc_handler = percpu_pagelist_fraction_sysctl_handler,
1308 .extra1 = &min_percpu_pagelist_fract, 1330 .extra1 = &zero,
1309 }, 1331 },
1310#ifdef CONFIG_MMU 1332#ifdef CONFIG_MMU
1311 { 1333 {
@@ -1418,8 +1440,13 @@ static struct ctl_table vm_table[] = {
1418 (defined(CONFIG_SUPERH) && defined(CONFIG_VSYSCALL)) 1440 (defined(CONFIG_SUPERH) && defined(CONFIG_VSYSCALL))
1419 { 1441 {
1420 .procname = "vdso_enabled", 1442 .procname = "vdso_enabled",
1443#ifdef CONFIG_X86_32
1444 .data = &vdso32_enabled,
1445 .maxlen = sizeof(vdso32_enabled),
1446#else
1421 .data = &vdso_enabled, 1447 .data = &vdso_enabled,
1422 .maxlen = sizeof(vdso_enabled), 1448 .maxlen = sizeof(vdso_enabled),
1449#endif
1423 .mode = 0644, 1450 .mode = 0644,
1424 .proc_handler = proc_dointvec, 1451 .proc_handler = proc_dointvec,
1425 .extra1 = &zero, 1452 .extra1 = &zero,
@@ -1698,8 +1725,8 @@ int __init sysctl_init(void)
1698 1725
1699#ifdef CONFIG_PROC_SYSCTL 1726#ifdef CONFIG_PROC_SYSCTL
1700 1727
1701static int _proc_do_string(void* data, int maxlen, int write, 1728static int _proc_do_string(char *data, int maxlen, int write,
1702 void __user *buffer, 1729 char __user *buffer,
1703 size_t *lenp, loff_t *ppos) 1730 size_t *lenp, loff_t *ppos)
1704{ 1731{
1705 size_t len; 1732 size_t len;
@@ -1712,21 +1739,30 @@ static int _proc_do_string(void* data, int maxlen, int write,
1712 } 1739 }
1713 1740
1714 if (write) { 1741 if (write) {
1715 len = 0; 1742 if (sysctl_writes_strict == SYSCTL_WRITES_STRICT) {
1743 /* Only continue writes not past the end of buffer. */
1744 len = strlen(data);
1745 if (len > maxlen - 1)
1746 len = maxlen - 1;
1747
1748 if (*ppos > len)
1749 return 0;
1750 len = *ppos;
1751 } else {
1752 /* Start writing from beginning of buffer. */
1753 len = 0;
1754 }
1755
1756 *ppos += *lenp;
1716 p = buffer; 1757 p = buffer;
1717 while (len < *lenp) { 1758 while ((p - buffer) < *lenp && len < maxlen - 1) {
1718 if (get_user(c, p++)) 1759 if (get_user(c, p++))
1719 return -EFAULT; 1760 return -EFAULT;
1720 if (c == 0 || c == '\n') 1761 if (c == 0 || c == '\n')
1721 break; 1762 break;
1722 len++; 1763 data[len++] = c;
1723 } 1764 }
1724 if (len >= maxlen) 1765 data[len] = 0;
1725 len = maxlen-1;
1726 if(copy_from_user(data, buffer, len))
1727 return -EFAULT;
1728 ((char *) data)[len] = 0;
1729 *ppos += *lenp;
1730 } else { 1766 } else {
1731 len = strlen(data); 1767 len = strlen(data);
1732 if (len > maxlen) 1768 if (len > maxlen)
@@ -1743,10 +1779,10 @@ static int _proc_do_string(void* data, int maxlen, int write,
1743 if (len > *lenp) 1779 if (len > *lenp)
1744 len = *lenp; 1780 len = *lenp;
1745 if (len) 1781 if (len)
1746 if(copy_to_user(buffer, data, len)) 1782 if (copy_to_user(buffer, data, len))
1747 return -EFAULT; 1783 return -EFAULT;
1748 if (len < *lenp) { 1784 if (len < *lenp) {
1749 if(put_user('\n', ((char __user *) buffer) + len)) 1785 if (put_user('\n', buffer + len))
1750 return -EFAULT; 1786 return -EFAULT;
1751 len++; 1787 len++;
1752 } 1788 }
@@ -1756,6 +1792,14 @@ static int _proc_do_string(void* data, int maxlen, int write,
1756 return 0; 1792 return 0;
1757} 1793}
1758 1794
1795static void warn_sysctl_write(struct ctl_table *table)
1796{
1797 pr_warn_once("%s wrote to %s when file position was not 0!\n"
1798 "This will not be supported in the future. To silence this\n"
1799 "warning, set kernel.sysctl_writes_strict = -1\n",
1800 current->comm, table->procname);
1801}
1802
1759/** 1803/**
1760 * proc_dostring - read a string sysctl 1804 * proc_dostring - read a string sysctl
1761 * @table: the sysctl table 1805 * @table: the sysctl table
@@ -1776,8 +1820,11 @@ static int _proc_do_string(void* data, int maxlen, int write,
1776int proc_dostring(struct ctl_table *table, int write, 1820int proc_dostring(struct ctl_table *table, int write,
1777 void __user *buffer, size_t *lenp, loff_t *ppos) 1821 void __user *buffer, size_t *lenp, loff_t *ppos)
1778{ 1822{
1779 return _proc_do_string(table->data, table->maxlen, write, 1823 if (write && *ppos && sysctl_writes_strict == SYSCTL_WRITES_WARN)
1780 buffer, lenp, ppos); 1824 warn_sysctl_write(table);
1825
1826 return _proc_do_string((char *)(table->data), table->maxlen, write,
1827 (char __user *)buffer, lenp, ppos);
1781} 1828}
1782 1829
1783static size_t proc_skip_spaces(char **buf) 1830static size_t proc_skip_spaces(char **buf)
@@ -1951,6 +1998,18 @@ static int __do_proc_dointvec(void *tbl_data, struct ctl_table *table,
1951 conv = do_proc_dointvec_conv; 1998 conv = do_proc_dointvec_conv;
1952 1999
1953 if (write) { 2000 if (write) {
2001 if (*ppos) {
2002 switch (sysctl_writes_strict) {
2003 case SYSCTL_WRITES_STRICT:
2004 goto out;
2005 case SYSCTL_WRITES_WARN:
2006 warn_sysctl_write(table);
2007 break;
2008 default:
2009 break;
2010 }
2011 }
2012
1954 if (left > PAGE_SIZE - 1) 2013 if (left > PAGE_SIZE - 1)
1955 left = PAGE_SIZE - 1; 2014 left = PAGE_SIZE - 1;
1956 page = __get_free_page(GFP_TEMPORARY); 2015 page = __get_free_page(GFP_TEMPORARY);
@@ -2008,6 +2067,7 @@ free:
2008 return err ? : -EINVAL; 2067 return err ? : -EINVAL;
2009 } 2068 }
2010 *lenp -= left; 2069 *lenp -= left;
2070out:
2011 *ppos += *lenp; 2071 *ppos += *lenp;
2012 return err; 2072 return err;
2013} 2073}
@@ -2200,6 +2260,18 @@ static int __do_proc_doulongvec_minmax(void *data, struct ctl_table *table, int
2200 left = *lenp; 2260 left = *lenp;
2201 2261
2202 if (write) { 2262 if (write) {
2263 if (*ppos) {
2264 switch (sysctl_writes_strict) {
2265 case SYSCTL_WRITES_STRICT:
2266 goto out;
2267 case SYSCTL_WRITES_WARN:
2268 warn_sysctl_write(table);
2269 break;
2270 default:
2271 break;
2272 }
2273 }
2274
2203 if (left > PAGE_SIZE - 1) 2275 if (left > PAGE_SIZE - 1)
2204 left = PAGE_SIZE - 1; 2276 left = PAGE_SIZE - 1;
2205 page = __get_free_page(GFP_TEMPORARY); 2277 page = __get_free_page(GFP_TEMPORARY);
@@ -2255,6 +2327,7 @@ free:
2255 return err ? : -EINVAL; 2327 return err ? : -EINVAL;
2256 } 2328 }
2257 *lenp -= left; 2329 *lenp -= left;
2330out:
2258 *ppos += *lenp; 2331 *ppos += *lenp;
2259 return err; 2332 return err;
2260} 2333}
@@ -2501,11 +2574,11 @@ int proc_do_large_bitmap(struct ctl_table *table, int write,
2501 bool first = 1; 2574 bool first = 1;
2502 size_t left = *lenp; 2575 size_t left = *lenp;
2503 unsigned long bitmap_len = table->maxlen; 2576 unsigned long bitmap_len = table->maxlen;
2504 unsigned long *bitmap = (unsigned long *) table->data; 2577 unsigned long *bitmap = *(unsigned long **) table->data;
2505 unsigned long *tmp_bitmap = NULL; 2578 unsigned long *tmp_bitmap = NULL;
2506 char tr_a[] = { '-', ',', '\n' }, tr_b[] = { ',', '\n', 0 }, c; 2579 char tr_a[] = { '-', ',', '\n' }, tr_b[] = { ',', '\n', 0 }, c;
2507 2580
2508 if (!bitmap_len || !left || (*ppos && !write)) { 2581 if (!bitmap || !bitmap_len || !left || (*ppos && !write)) {
2509 *lenp = 0; 2582 *lenp = 0;
2510 return 0; 2583 return 0;
2511 } 2584 }
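
The sysctl.c changes above add kernel.sysctl_writes_strict (-1 legacy, 0 warn, 1 strict) and teach _proc_do_string() and the intvec/ulongvec handlers to respect the file position instead of silently restarting every write at offset 0. A userspace sketch of the case whose behavior changes; /proc/sys/kernel/hostname merely stands in for any string sysctl, and root is required to write it:

/*
 * Sketch, not part of this patch: two writes on one fd without
 * rewinding.  In legacy mode the second write restarts at offset 0;
 * in warn mode it also triggers the new "file position was not 0"
 * warning; in strict mode the string is continued at the offset.
 */
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	int fd = open("/proc/sys/kernel/hostname", O_WRONLY);

	if (fd < 0) {
		perror("open");
		return 1;
	}
	if (write(fd, "foo", 3) != 3)		/* file position is now 3 */
		perror("first write");
	if (write(fd, "bar", 3) != 3)		/* offset 3: mode-dependent result */
		perror("second write");
	close(fd);
	return 0;
}
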
diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c
index 88c9c65a430d..fe75444ae7ec 100644
--- a/kernel/time/alarmtimer.c
+++ b/kernel/time/alarmtimer.c
@@ -585,9 +585,14 @@ static int alarm_timer_set(struct k_itimer *timr, int flags,
585 struct itimerspec *new_setting, 585 struct itimerspec *new_setting,
586 struct itimerspec *old_setting) 586 struct itimerspec *old_setting)
587{ 587{
588 ktime_t exp;
589
588 if (!rtcdev) 590 if (!rtcdev)
589 return -ENOTSUPP; 591 return -ENOTSUPP;
590 592
593 if (flags & ~TIMER_ABSTIME)
594 return -EINVAL;
595
591 if (old_setting) 596 if (old_setting)
592 alarm_timer_get(timr, old_setting); 597 alarm_timer_get(timr, old_setting);
593 598
@@ -597,8 +602,16 @@ static int alarm_timer_set(struct k_itimer *timr, int flags,
597 602
598 /* start the timer */ 603 /* start the timer */
599 timr->it.alarm.interval = timespec_to_ktime(new_setting->it_interval); 604 timr->it.alarm.interval = timespec_to_ktime(new_setting->it_interval);
600 alarm_start(&timr->it.alarm.alarmtimer, 605 exp = timespec_to_ktime(new_setting->it_value);
601 timespec_to_ktime(new_setting->it_value)); 606 /* Convert (if necessary) to absolute time */
607 if (flags != TIMER_ABSTIME) {
608 ktime_t now;
609
610 now = alarm_bases[timr->it.alarm.alarmtimer.type].gettime();
611 exp = ktime_add(now, exp);
612 }
613
614 alarm_start(&timr->it.alarm.alarmtimer, exp);
602 return 0; 615 return 0;
603} 616}
604 617
@@ -730,6 +743,9 @@ static int alarm_timer_nsleep(const clockid_t which_clock, int flags,
730 if (!alarmtimer_get_rtcdev()) 743 if (!alarmtimer_get_rtcdev())
731 return -ENOTSUPP; 744 return -ENOTSUPP;
732 745
746 if (flags & ~TIMER_ABSTIME)
747 return -EINVAL;
748
733 if (!capable(CAP_WAKE_ALARM)) 749 if (!capable(CAP_WAKE_ALARM))
734 return -EPERM; 750 return -EPERM;
735 751
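
The alarmtimer hunks reject unknown flags with -EINVAL and convert a relative it_value to an absolute expiry before calling alarm_start(), which previously treated every value as absolute. From userspace this is the posix-timers interface; a sketch arming a relative alarm timer, assuming CAP_WAKE_ALARM and an RTC-backed CLOCK_REALTIME_ALARM:

/*
 * Sketch, not part of this patch: arm a CLOCK_REALTIME_ALARM timer with
 * a relative 5 second expiry (flags == 0).  After this patch the kernel
 * converts that to absolute time itself; flags other than 0 or
 * TIMER_ABSTIME now fail with EINVAL.  Link with -lrt on older glibc.
 */
#include <stdio.h>
#include <string.h>
#include <signal.h>
#include <time.h>
#include <unistd.h>

#ifndef CLOCK_REALTIME_ALARM
#define CLOCK_REALTIME_ALARM 8		/* from linux/time.h */
#endif

int main(void)
{
	timer_t t;
	struct itimerspec its;

	if (timer_create(CLOCK_REALTIME_ALARM, NULL, &t)) {
		perror("timer_create");	/* needs CAP_WAKE_ALARM */
		return 1;
	}

	memset(&its, 0, sizeof(its));
	its.it_value.tv_sec = 5;		/* relative expiry */
	if (timer_settime(t, 0, &its, NULL))
		perror("timer_settime");

	pause();				/* default delivery is SIGALRM */
	return 0;
}
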
diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c
index 419a52cecd20..33db43a39515 100644
--- a/kernel/time/ntp.c
+++ b/kernel/time/ntp.c
@@ -165,21 +165,21 @@ static inline void pps_set_freq(s64 freq)
165 165
166static inline int is_error_status(int status) 166static inline int is_error_status(int status)
167{ 167{
168 return (time_status & (STA_UNSYNC|STA_CLOCKERR)) 168 return (status & (STA_UNSYNC|STA_CLOCKERR))
169 /* PPS signal lost when either PPS time or 169 /* PPS signal lost when either PPS time or
170 * PPS frequency synchronization requested 170 * PPS frequency synchronization requested
171 */ 171 */
172 || ((time_status & (STA_PPSFREQ|STA_PPSTIME)) 172 || ((status & (STA_PPSFREQ|STA_PPSTIME))
173 && !(time_status & STA_PPSSIGNAL)) 173 && !(status & STA_PPSSIGNAL))
174 /* PPS jitter exceeded when 174 /* PPS jitter exceeded when
175 * PPS time synchronization requested */ 175 * PPS time synchronization requested */
176 || ((time_status & (STA_PPSTIME|STA_PPSJITTER)) 176 || ((status & (STA_PPSTIME|STA_PPSJITTER))
177 == (STA_PPSTIME|STA_PPSJITTER)) 177 == (STA_PPSTIME|STA_PPSJITTER))
178 /* PPS wander exceeded or calibration error when 178 /* PPS wander exceeded or calibration error when
179 * PPS frequency synchronization requested 179 * PPS frequency synchronization requested
180 */ 180 */
181 || ((time_status & STA_PPSFREQ) 181 || ((status & STA_PPSFREQ)
182 && (time_status & (STA_PPSWANDER|STA_PPSERROR))); 182 && (status & (STA_PPSWANDER|STA_PPSERROR)));
183} 183}
184 184
185static inline void pps_fill_timex(struct timex *txc) 185static inline void pps_fill_timex(struct timex *txc)
@@ -786,8 +786,9 @@ static long hardpps_update_freq(struct pps_normtime freq_norm)
786 time_status |= STA_PPSERROR; 786 time_status |= STA_PPSERROR;
787 pps_errcnt++; 787 pps_errcnt++;
788 pps_dec_freq_interval(); 788 pps_dec_freq_interval();
789 pr_err("hardpps: PPSERROR: interval too long - %ld s\n", 789 printk_deferred(KERN_ERR
790 freq_norm.sec); 790 "hardpps: PPSERROR: interval too long - %ld s\n",
791 freq_norm.sec);
791 return 0; 792 return 0;
792 } 793 }
793 794
@@ -800,7 +801,8 @@ static long hardpps_update_freq(struct pps_normtime freq_norm)
800 delta = shift_right(ftemp - pps_freq, NTP_SCALE_SHIFT); 801 delta = shift_right(ftemp - pps_freq, NTP_SCALE_SHIFT);
801 pps_freq = ftemp; 802 pps_freq = ftemp;
802 if (delta > PPS_MAXWANDER || delta < -PPS_MAXWANDER) { 803 if (delta > PPS_MAXWANDER || delta < -PPS_MAXWANDER) {
803 pr_warning("hardpps: PPSWANDER: change=%ld\n", delta); 804 printk_deferred(KERN_WARNING
805 "hardpps: PPSWANDER: change=%ld\n", delta);
804 time_status |= STA_PPSWANDER; 806 time_status |= STA_PPSWANDER;
805 pps_stbcnt++; 807 pps_stbcnt++;
806 pps_dec_freq_interval(); 808 pps_dec_freq_interval();
@@ -844,8 +846,9 @@ static void hardpps_update_phase(long error)
844 * the time offset is updated. 846 * the time offset is updated.
845 */ 847 */
846 if (jitter > (pps_jitter << PPS_POPCORN)) { 848 if (jitter > (pps_jitter << PPS_POPCORN)) {
847 pr_warning("hardpps: PPSJITTER: jitter=%ld, limit=%ld\n", 849 printk_deferred(KERN_WARNING
848 jitter, (pps_jitter << PPS_POPCORN)); 850 "hardpps: PPSJITTER: jitter=%ld, limit=%ld\n",
851 jitter, (pps_jitter << PPS_POPCORN));
849 time_status |= STA_PPSJITTER; 852 time_status |= STA_PPSJITTER;
850 pps_jitcnt++; 853 pps_jitcnt++;
851 } else if (time_status & STA_PPSTIME) { 854 } else if (time_status & STA_PPSTIME) {
@@ -902,7 +905,7 @@ void __hardpps(const struct timespec *phase_ts, const struct timespec *raw_ts)
902 time_status |= STA_PPSJITTER; 905 time_status |= STA_PPSJITTER;
903 /* restart the frequency calibration interval */ 906 /* restart the frequency calibration interval */
904 pps_fbase = *raw_ts; 907 pps_fbase = *raw_ts;
905 pr_err("hardpps: PPSJITTER: bad pulse\n"); 908 printk_deferred(KERN_ERR "hardpps: PPSJITTER: bad pulse\n");
906 return; 909 return;
907 } 910 }
908 911
@@ -923,7 +926,10 @@ void __hardpps(const struct timespec *phase_ts, const struct timespec *raw_ts)
923 926
924static int __init ntp_tick_adj_setup(char *str) 927static int __init ntp_tick_adj_setup(char *str)
925{ 928{
926 ntp_tick_adj = simple_strtol(str, NULL, 0); 929 int rc = kstrtol(str, 0, (long *)&ntp_tick_adj);
930
931 if (rc)
932 return rc;
927 ntp_tick_adj <<= NTP_SCALE_SHIFT; 933 ntp_tick_adj <<= NTP_SCALE_SHIFT;
928 934
929 return 1; 935 return 1;
diff --git a/kernel/time/sched_clock.c b/kernel/time/sched_clock.c
index 4d23dc4d8139..445106d2c729 100644
--- a/kernel/time/sched_clock.c
+++ b/kernel/time/sched_clock.c
@@ -49,13 +49,6 @@ static u64 notrace jiffy_sched_clock_read(void)
49 return (u64)(jiffies - INITIAL_JIFFIES); 49 return (u64)(jiffies - INITIAL_JIFFIES);
50} 50}
51 51
52static u32 __read_mostly (*read_sched_clock_32)(void);
53
54static u64 notrace read_sched_clock_32_wrapper(void)
55{
56 return read_sched_clock_32();
57}
58
59static u64 __read_mostly (*read_sched_clock)(void) = jiffy_sched_clock_read; 52static u64 __read_mostly (*read_sched_clock)(void) = jiffy_sched_clock_read;
60 53
61static inline u64 notrace cyc_to_ns(u64 cyc, u32 mult, u32 shift) 54static inline u64 notrace cyc_to_ns(u64 cyc, u32 mult, u32 shift)
@@ -176,12 +169,6 @@ void __init sched_clock_register(u64 (*read)(void), int bits,
176 pr_debug("Registered %pF as sched_clock source\n", read); 169 pr_debug("Registered %pF as sched_clock source\n", read);
177} 170}
178 171
179void __init setup_sched_clock(u32 (*read)(void), int bits, unsigned long rate)
180{
181 read_sched_clock_32 = read;
182 sched_clock_register(read_sched_clock_32_wrapper, bits, rate);
183}
184
185void __init sched_clock_postinit(void) 172void __init sched_clock_postinit(void)
186{ 173{
187 /* 174 /*
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index f7df8ea21707..32d8d6aaedb8 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -852,8 +852,9 @@ static void __timekeeping_inject_sleeptime(struct timekeeper *tk,
852 struct timespec *delta) 852 struct timespec *delta)
853{ 853{
854 if (!timespec_valid_strict(delta)) { 854 if (!timespec_valid_strict(delta)) {
855 printk(KERN_WARNING "__timekeeping_inject_sleeptime: Invalid " 855 printk_deferred(KERN_WARNING
856 "sleep delta value!\n"); 856 "__timekeeping_inject_sleeptime: Invalid "
857 "sleep delta value!\n");
857 return; 858 return;
858 } 859 }
859 tk_xtime_add(tk, delta); 860 tk_xtime_add(tk, delta);
@@ -1157,7 +1158,7 @@ static void timekeeping_adjust(struct timekeeper *tk, s64 offset)
1157 1158
1158 if (unlikely(tk->clock->maxadj && 1159 if (unlikely(tk->clock->maxadj &&
1159 (tk->mult + adj > tk->clock->mult + tk->clock->maxadj))) { 1160 (tk->mult + adj > tk->clock->mult + tk->clock->maxadj))) {
1160 printk_once(KERN_WARNING 1161 printk_deferred_once(KERN_WARNING
1161 "Adjusting %s more than 11%% (%ld vs %ld)\n", 1162 "Adjusting %s more than 11%% (%ld vs %ld)\n",
1162 tk->clock->name, (long)tk->mult + adj, 1163 tk->clock->name, (long)tk->mult + adj,
1163 (long)tk->clock->mult + tk->clock->maxadj); 1164 (long)tk->clock->mult + tk->clock->maxadj);
diff --git a/kernel/torture.c b/kernel/torture.c
index acc9afc2f26e..40bb511cca48 100644
--- a/kernel/torture.c
+++ b/kernel/torture.c
@@ -335,13 +335,8 @@ static void torture_shuffle_tasks(void)
335 shuffle_idle_cpu = cpumask_next(shuffle_idle_cpu, shuffle_tmp_mask); 335 shuffle_idle_cpu = cpumask_next(shuffle_idle_cpu, shuffle_tmp_mask);
336 if (shuffle_idle_cpu >= nr_cpu_ids) 336 if (shuffle_idle_cpu >= nr_cpu_ids)
337 shuffle_idle_cpu = -1; 337 shuffle_idle_cpu = -1;
338 if (shuffle_idle_cpu != -1) { 338 else
339 cpumask_clear_cpu(shuffle_idle_cpu, shuffle_tmp_mask); 339 cpumask_clear_cpu(shuffle_idle_cpu, shuffle_tmp_mask);
340 if (cpumask_empty(shuffle_tmp_mask)) {
341 put_online_cpus();
342 return;
343 }
344 }
345 340
346 mutex_lock(&shuffle_task_mutex); 341 mutex_lock(&shuffle_task_mutex);
347 list_for_each_entry(stp, &shuffle_task_list, st_l) 342 list_for_each_entry(stp, &shuffle_task_list, st_l)
@@ -533,7 +528,11 @@ void stutter_wait(const char *title)
533 while (ACCESS_ONCE(stutter_pause_test) || 528 while (ACCESS_ONCE(stutter_pause_test) ||
534 (torture_runnable && !ACCESS_ONCE(*torture_runnable))) { 529 (torture_runnable && !ACCESS_ONCE(*torture_runnable))) {
535 if (stutter_pause_test) 530 if (stutter_pause_test)
536 schedule_timeout_interruptible(1); 531 if (ACCESS_ONCE(stutter_pause_test) == 1)
532 schedule_timeout_interruptible(1);
533 else
534 while (ACCESS_ONCE(stutter_pause_test))
535 cond_resched();
537 else 536 else
538 schedule_timeout_interruptible(round_jiffies_relative(HZ)); 537 schedule_timeout_interruptible(round_jiffies_relative(HZ));
539 torture_shutdown_absorb(title); 538 torture_shutdown_absorb(title);
@@ -550,7 +549,11 @@ static int torture_stutter(void *arg)
550 VERBOSE_TOROUT_STRING("torture_stutter task started"); 549 VERBOSE_TOROUT_STRING("torture_stutter task started");
551 do { 550 do {
552 if (!torture_must_stop()) { 551 if (!torture_must_stop()) {
553 schedule_timeout_interruptible(stutter); 552 if (stutter > 1) {
553 schedule_timeout_interruptible(stutter - 1);
554 ACCESS_ONCE(stutter_pause_test) = 2;
555 }
556 schedule_timeout_interruptible(1);
554 ACCESS_ONCE(stutter_pause_test) = 1; 557 ACCESS_ONCE(stutter_pause_test) = 1;
555 } 558 }
556 if (!torture_must_stop()) 559 if (!torture_must_stop())
@@ -596,21 +599,27 @@ static void torture_stutter_cleanup(void)
596 * The runnable parameter points to a flag that controls whether or not 599 * The runnable parameter points to a flag that controls whether or not
597 * the test is currently runnable. If there is no such flag, pass in NULL. 600 * the test is currently runnable. If there is no such flag, pass in NULL.
598 */ 601 */
599void __init torture_init_begin(char *ttype, bool v, int *runnable) 602bool torture_init_begin(char *ttype, bool v, int *runnable)
600{ 603{
601 mutex_lock(&fullstop_mutex); 604 mutex_lock(&fullstop_mutex);
605 if (torture_type != NULL) {
606 pr_alert("torture_init_begin: refusing %s init: %s running",
607 ttype, torture_type);
608 mutex_unlock(&fullstop_mutex);
609 return false;
610 }
602 torture_type = ttype; 611 torture_type = ttype;
603 verbose = v; 612 verbose = v;
604 torture_runnable = runnable; 613 torture_runnable = runnable;
605 fullstop = FULLSTOP_DONTSTOP; 614 fullstop = FULLSTOP_DONTSTOP;
606 615 return true;
607} 616}
608EXPORT_SYMBOL_GPL(torture_init_begin); 617EXPORT_SYMBOL_GPL(torture_init_begin);
609 618
610/* 619/*
611 * Tell the torture module that initialization is complete. 620 * Tell the torture module that initialization is complete.
612 */ 621 */
613void __init torture_init_end(void) 622void torture_init_end(void)
614{ 623{
615 mutex_unlock(&fullstop_mutex); 624 mutex_unlock(&fullstop_mutex);
616 register_reboot_notifier(&torture_shutdown_nb); 625 register_reboot_notifier(&torture_shutdown_nb);
@@ -642,6 +651,9 @@ bool torture_cleanup(void)
642 torture_shuffle_cleanup(); 651 torture_shuffle_cleanup();
643 torture_stutter_cleanup(); 652 torture_stutter_cleanup();
644 torture_onoff_cleanup(); 653 torture_onoff_cleanup();
654 mutex_lock(&fullstop_mutex);
655 torture_type = NULL;
656 mutex_unlock(&fullstop_mutex);
645 return false; 657 return false;
646} 658}
647EXPORT_SYMBOL_GPL(torture_cleanup); 659EXPORT_SYMBOL_GPL(torture_cleanup);
@@ -674,8 +686,10 @@ EXPORT_SYMBOL_GPL(torture_must_stop_irq);
674 */ 686 */
675void torture_kthread_stopping(char *title) 687void torture_kthread_stopping(char *title)
676{ 688{
677 if (verbose) 689 char buf[128];
678 VERBOSE_TOROUT_STRING(title); 690
691 snprintf(buf, sizeof(buf), "Stopping %s", title);
692 VERBOSE_TOROUT_STRING(buf);
679 while (!kthread_should_stop()) { 693 while (!kthread_should_stop()) {
680 torture_shutdown_absorb(title); 694 torture_shutdown_absorb(title);
681 schedule_timeout_uninterruptible(1); 695 schedule_timeout_uninterruptible(1);
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index 8639819f6cef..d4409356f40d 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -535,6 +535,36 @@ config MMIOTRACE_TEST
535 535
536 Say N, unless you absolutely know what you are doing. 536 Say N, unless you absolutely know what you are doing.
537 537
538config TRACEPOINT_BENCHMARK
539 bool "Add tracepoint that benchmarks tracepoints"
540 help
541 This option creates the tracepoint "benchmark:benchmark_event".
542 When the tracepoint is enabled, it kicks off a kernel thread that
543 goes into an infinite loop (calling cond_sched() to let other tasks
544 run), and calls the tracepoint. Each iteration will record the time
545 it took to write to the tracepoint and the next iteration that
546 data will be passed to the tracepoint itself. That is, the tracepoint
547 will report the time it took to do the previous tracepoint.
548 The string written to the tracepoint is a static string of 128 bytes
549 to keep the time the same. The initial string is simply a write of
550 "START". The second string records the cold cache time of the first
551 write which is not added to the rest of the calculations.
552
553 As it is a tight loop, it benchmarks as hot cache. That's fine because
554 we care most about hot paths that are probably in cache already.
555
556 An example of the output:
557
558 START
559 first=3672 [COLD CACHED]
560 last=632 first=3672 max=632 min=632 avg=316 std=446 std^2=199712
561 last=278 first=3672 max=632 min=278 avg=303 std=316 std^2=100337
562 last=277 first=3672 max=632 min=277 avg=296 std=258 std^2=67064
563 last=273 first=3672 max=632 min=273 avg=292 std=224 std^2=50411
564 last=273 first=3672 max=632 min=273 avg=288 std=200 std^2=40389
565 last=281 first=3672 max=632 min=273 avg=287 std=183 std^2=33666
566
567
538config RING_BUFFER_BENCHMARK 568config RING_BUFFER_BENCHMARK
539 tristate "Ring buffer benchmark stress tester" 569 tristate "Ring buffer benchmark stress tester"
540 depends on RING_BUFFER 570 depends on RING_BUFFER
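
The TRACEPOINT_BENCHMARK help text above reports last/first/max/min/avg/std/std^2 per iteration, keeping the cold-cache first sample out of the running figures. A userspace analog of that bookkeeping, hedged: this is not the kernel's trace_benchmark code, it just times clock_gettime() and maintains the same kind of running statistics (build with -lm):

/*
 * Sketch: time an operation repeatedly and keep last/first/max/min/avg
 * plus a variance derived from a running sum of squares, mirroring the
 * fields the benchmark tracepoint reports.
 */
#include <math.h>
#include <stdio.h>
#include <time.h>

static long long now_ns(void)
{
	struct timespec ts;

	clock_gettime(CLOCK_MONOTONIC, &ts);
	return ts.tv_sec * 1000000000LL + ts.tv_nsec;
}

int main(void)
{
	long long first = 0, last = 0, max = 0, min = -1;
	long long total = 0, total_sq = 0;
	int i, n = 0;

	for (i = 0; i < 1000; i++) {
		struct timespec scratch;
		long long t0, delta;

		t0 = now_ns();
		clock_gettime(CLOCK_MONOTONIC, &scratch);	/* the measured call */
		delta = now_ns() - t0;

		if (!first) {
			first = delta;	/* cold-cache sample, reported separately */
			continue;
		}
		last = delta;
		n++;
		total += delta;
		total_sq += delta * delta;
		if (delta > max)
			max = delta;
		if (min < 0 || delta < min)
			min = delta;
	}

	if (n) {
		long long avg = total / n;
		long long var = total_sq / n - avg * avg;

		printf("last=%lld first=%lld max=%lld min=%lld avg=%lld std^2=%lld std=%.0f\n",
		       last, first, max, min, avg, var, sqrt((double)var));
	}
	return 0;
}
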
diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile
index 1378e84fbe39..2611613f14f1 100644
--- a/kernel/trace/Makefile
+++ b/kernel/trace/Makefile
@@ -17,6 +17,7 @@ ifdef CONFIG_TRACING_BRANCHES
17KBUILD_CFLAGS += -DDISABLE_BRANCH_PROFILING 17KBUILD_CFLAGS += -DDISABLE_BRANCH_PROFILING
18endif 18endif
19 19
20CFLAGS_trace_benchmark.o := -I$(src)
20CFLAGS_trace_events_filter.o := -I$(src) 21CFLAGS_trace_events_filter.o := -I$(src)
21 22
22obj-$(CONFIG_TRACE_CLOCK) += trace_clock.o 23obj-$(CONFIG_TRACE_CLOCK) += trace_clock.o
@@ -62,4 +63,6 @@ endif
62obj-$(CONFIG_PROBE_EVENTS) += trace_probe.o 63obj-$(CONFIG_PROBE_EVENTS) += trace_probe.o
63obj-$(CONFIG_UPROBE_EVENT) += trace_uprobe.o 64obj-$(CONFIG_UPROBE_EVENT) += trace_uprobe.o
64 65
66obj-$(CONFIG_TRACEPOINT_BENCHMARK) += trace_benchmark.o
67
65libftrace-y := ftrace.o 68libftrace-y := ftrace.o
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 4a54a25afa2f..ac9d1dad630b 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -62,7 +62,7 @@
62#define FTRACE_HASH_DEFAULT_BITS 10 62#define FTRACE_HASH_DEFAULT_BITS 10
63#define FTRACE_HASH_MAX_BITS 12 63#define FTRACE_HASH_MAX_BITS 12
64 64
65#define FL_GLOBAL_CONTROL_MASK (FTRACE_OPS_FL_GLOBAL | FTRACE_OPS_FL_CONTROL) 65#define FL_GLOBAL_CONTROL_MASK (FTRACE_OPS_FL_CONTROL)
66 66
67#ifdef CONFIG_DYNAMIC_FTRACE 67#ifdef CONFIG_DYNAMIC_FTRACE
68#define INIT_REGEX_LOCK(opsname) \ 68#define INIT_REGEX_LOCK(opsname) \
@@ -103,7 +103,6 @@ static int ftrace_disabled __read_mostly;
103 103
104static DEFINE_MUTEX(ftrace_lock); 104static DEFINE_MUTEX(ftrace_lock);
105 105
106static struct ftrace_ops *ftrace_global_list __read_mostly = &ftrace_list_end;
107static struct ftrace_ops *ftrace_control_list __read_mostly = &ftrace_list_end; 106static struct ftrace_ops *ftrace_control_list __read_mostly = &ftrace_list_end;
108static struct ftrace_ops *ftrace_ops_list __read_mostly = &ftrace_list_end; 107static struct ftrace_ops *ftrace_ops_list __read_mostly = &ftrace_list_end;
109ftrace_func_t ftrace_trace_function __read_mostly = ftrace_stub; 108ftrace_func_t ftrace_trace_function __read_mostly = ftrace_stub;
@@ -171,23 +170,6 @@ int ftrace_nr_registered_ops(void)
171 return cnt; 170 return cnt;
172} 171}
173 172
174static void
175ftrace_global_list_func(unsigned long ip, unsigned long parent_ip,
176 struct ftrace_ops *op, struct pt_regs *regs)
177{
178 int bit;
179
180 bit = trace_test_and_set_recursion(TRACE_GLOBAL_START, TRACE_GLOBAL_MAX);
181 if (bit < 0)
182 return;
183
184 do_for_each_ftrace_op(op, ftrace_global_list) {
185 op->func(ip, parent_ip, op, regs);
186 } while_for_each_ftrace_op(op);
187
188 trace_clear_recursion(bit);
189}
190
191static void ftrace_pid_func(unsigned long ip, unsigned long parent_ip, 173static void ftrace_pid_func(unsigned long ip, unsigned long parent_ip,
192 struct ftrace_ops *op, struct pt_regs *regs) 174 struct ftrace_ops *op, struct pt_regs *regs)
193{ 175{
@@ -237,43 +219,6 @@ static int control_ops_alloc(struct ftrace_ops *ops)
237 return 0; 219 return 0;
238} 220}
239 221
240static void update_global_ops(void)
241{
242 ftrace_func_t func = ftrace_global_list_func;
243 void *private = NULL;
244
245 /* The list has its own recursion protection. */
246 global_ops.flags |= FTRACE_OPS_FL_RECURSION_SAFE;
247
248 /*
249 * If there's only one function registered, then call that
250 * function directly. Otherwise, we need to iterate over the
251 * registered callers.
252 */
253 if (ftrace_global_list == &ftrace_list_end ||
254 ftrace_global_list->next == &ftrace_list_end) {
255 func = ftrace_global_list->func;
256 private = ftrace_global_list->private;
257 /*
258 * As we are calling the function directly.
259 * If it does not have recursion protection,
260 * the function_trace_op needs to be updated
261 * accordingly.
262 */
263 if (!(ftrace_global_list->flags & FTRACE_OPS_FL_RECURSION_SAFE))
264 global_ops.flags &= ~FTRACE_OPS_FL_RECURSION_SAFE;
265 }
266
267 /* If we filter on pids, update to use the pid function */
268 if (!list_empty(&ftrace_pids)) {
269 set_ftrace_pid_function(func);
270 func = ftrace_pid_func;
271 }
272
273 global_ops.func = func;
274 global_ops.private = private;
275}
276
277static void ftrace_sync(struct work_struct *work) 222static void ftrace_sync(struct work_struct *work)
278{ 223{
279 /* 224 /*
@@ -301,8 +246,6 @@ static void update_ftrace_function(void)
301{ 246{
302 ftrace_func_t func; 247 ftrace_func_t func;
303 248
304 update_global_ops();
305
306 /* 249 /*
307 * If we are at the end of the list and this ops is 250 * If we are at the end of the list and this ops is
308 * recursion safe and not dynamic and the arch supports passing ops, 251 * recursion safe and not dynamic and the arch supports passing ops,
@@ -314,10 +257,7 @@ static void update_ftrace_function(void)
314 (ftrace_ops_list->flags & FTRACE_OPS_FL_RECURSION_SAFE) && 257 (ftrace_ops_list->flags & FTRACE_OPS_FL_RECURSION_SAFE) &&
315 !FTRACE_FORCE_LIST_FUNC)) { 258 !FTRACE_FORCE_LIST_FUNC)) {
316 /* Set the ftrace_ops that the arch callback uses */ 259 /* Set the ftrace_ops that the arch callback uses */
317 if (ftrace_ops_list == &global_ops) 260 set_function_trace_op = ftrace_ops_list;
318 set_function_trace_op = ftrace_global_list;
319 else
320 set_function_trace_op = ftrace_ops_list;
321 func = ftrace_ops_list->func; 261 func = ftrace_ops_list->func;
322 } else { 262 } else {
323 /* Just use the default ftrace_ops */ 263 /* Just use the default ftrace_ops */
@@ -325,12 +265,12 @@ static void update_ftrace_function(void)
325 func = ftrace_ops_list_func; 265 func = ftrace_ops_list_func;
326 } 266 }
327 267
268 update_function_graph_func();
269
328 /* If there's no change, then do nothing more here */ 270 /* If there's no change, then do nothing more here */
329 if (ftrace_trace_function == func) 271 if (ftrace_trace_function == func)
330 return; 272 return;
331 273
332 update_function_graph_func();
333
334 /* 274 /*
335 * If we are using the list function, it doesn't care 275 * If we are using the list function, it doesn't care
336 * about the function_trace_ops. 276 * about the function_trace_ops.
@@ -373,6 +313,11 @@ static void update_ftrace_function(void)
373 ftrace_trace_function = func; 313 ftrace_trace_function = func;
374} 314}
375 315
316int using_ftrace_ops_list_func(void)
317{
318 return ftrace_trace_function == ftrace_ops_list_func;
319}
320
376static void add_ftrace_ops(struct ftrace_ops **list, struct ftrace_ops *ops) 321static void add_ftrace_ops(struct ftrace_ops **list, struct ftrace_ops *ops)
377{ 322{
378 ops->next = *list; 323 ops->next = *list;
@@ -434,16 +379,9 @@ static int __register_ftrace_function(struct ftrace_ops *ops)
434 if (ops->flags & FTRACE_OPS_FL_DELETED) 379 if (ops->flags & FTRACE_OPS_FL_DELETED)
435 return -EINVAL; 380 return -EINVAL;
436 381
437 if (FTRACE_WARN_ON(ops == &global_ops))
438 return -EINVAL;
439
440 if (WARN_ON(ops->flags & FTRACE_OPS_FL_ENABLED)) 382 if (WARN_ON(ops->flags & FTRACE_OPS_FL_ENABLED))
441 return -EBUSY; 383 return -EBUSY;
442 384
443 /* We don't support both control and global flags set. */
444 if ((ops->flags & FL_GLOBAL_CONTROL_MASK) == FL_GLOBAL_CONTROL_MASK)
445 return -EINVAL;
446
447#ifndef CONFIG_DYNAMIC_FTRACE_WITH_REGS 385#ifndef CONFIG_DYNAMIC_FTRACE_WITH_REGS
448 /* 386 /*
449 * If the ftrace_ops specifies SAVE_REGS, then it only can be used 387 * If the ftrace_ops specifies SAVE_REGS, then it only can be used
@@ -461,10 +399,7 @@ static int __register_ftrace_function(struct ftrace_ops *ops)
461 if (!core_kernel_data((unsigned long)ops)) 399 if (!core_kernel_data((unsigned long)ops))
462 ops->flags |= FTRACE_OPS_FL_DYNAMIC; 400 ops->flags |= FTRACE_OPS_FL_DYNAMIC;
463 401
464 if (ops->flags & FTRACE_OPS_FL_GLOBAL) { 402 if (ops->flags & FTRACE_OPS_FL_CONTROL) {
465 add_ftrace_list_ops(&ftrace_global_list, &global_ops, ops);
466 ops->flags |= FTRACE_OPS_FL_ENABLED;
467 } else if (ops->flags & FTRACE_OPS_FL_CONTROL) {
468 if (control_ops_alloc(ops)) 403 if (control_ops_alloc(ops))
469 return -ENOMEM; 404 return -ENOMEM;
470 add_ftrace_list_ops(&ftrace_control_list, &control_ops, ops); 405 add_ftrace_list_ops(&ftrace_control_list, &control_ops, ops);
@@ -484,15 +419,7 @@ static int __unregister_ftrace_function(struct ftrace_ops *ops)
484 if (WARN_ON(!(ops->flags & FTRACE_OPS_FL_ENABLED))) 419 if (WARN_ON(!(ops->flags & FTRACE_OPS_FL_ENABLED)))
485 return -EBUSY; 420 return -EBUSY;
486 421
487 if (FTRACE_WARN_ON(ops == &global_ops)) 422 if (ops->flags & FTRACE_OPS_FL_CONTROL) {
488 return -EINVAL;
489
490 if (ops->flags & FTRACE_OPS_FL_GLOBAL) {
491 ret = remove_ftrace_list_ops(&ftrace_global_list,
492 &global_ops, ops);
493 if (!ret)
494 ops->flags &= ~FTRACE_OPS_FL_ENABLED;
495 } else if (ops->flags & FTRACE_OPS_FL_CONTROL) {
496 ret = remove_ftrace_list_ops(&ftrace_control_list, 423 ret = remove_ftrace_list_ops(&ftrace_control_list,
497 &control_ops, ops); 424 &control_ops, ops);
498 } else 425 } else
@@ -895,7 +822,7 @@ function_profile_call(unsigned long ip, unsigned long parent_ip,
895 822
896 local_irq_save(flags); 823 local_irq_save(flags);
897 824
898 stat = &__get_cpu_var(ftrace_profile_stats); 825 stat = this_cpu_ptr(&ftrace_profile_stats);
899 if (!stat->hash || !ftrace_profile_enabled) 826 if (!stat->hash || !ftrace_profile_enabled)
900 goto out; 827 goto out;
901 828
@@ -926,7 +853,7 @@ static void profile_graph_return(struct ftrace_graph_ret *trace)
926 unsigned long flags; 853 unsigned long flags;
927 854
928 local_irq_save(flags); 855 local_irq_save(flags);
929 stat = &__get_cpu_var(ftrace_profile_stats); 856 stat = this_cpu_ptr(&ftrace_profile_stats);
930 if (!stat->hash || !ftrace_profile_enabled) 857 if (!stat->hash || !ftrace_profile_enabled)
931 goto out; 858 goto out;
932 859
@@ -1178,7 +1105,7 @@ struct ftrace_page {
1178static struct ftrace_page *ftrace_pages_start; 1105static struct ftrace_page *ftrace_pages_start;
1179static struct ftrace_page *ftrace_pages; 1106static struct ftrace_page *ftrace_pages;
1180 1107
1181static bool ftrace_hash_empty(struct ftrace_hash *hash) 1108static bool __always_inline ftrace_hash_empty(struct ftrace_hash *hash)
1182{ 1109{
1183 return !hash || !hash->count; 1110 return !hash || !hash->count;
1184} 1111}
@@ -1625,7 +1552,14 @@ static void __ftrace_hash_rec_update(struct ftrace_ops *ops,
1625 in_other_hash = !!ftrace_lookup_ip(other_hash, rec->ip); 1552 in_other_hash = !!ftrace_lookup_ip(other_hash, rec->ip);
1626 1553
1627 /* 1554 /*
1555 * If filter_hash is set, we want to match all functions
1556 * that are in the hash but not in the other hash.
1628 * 1557 *
1558 * If filter_hash is not set, then we are decrementing.
1559 * That means we match anything that is in the hash
1560 * and also in the other_hash. That is, we need to turn
1561 * off functions in the other hash because they are disabled
1562 * by this hash.
1629 */ 1563 */
1630 if (filter_hash && in_hash && !in_other_hash) 1564 if (filter_hash && in_hash && !in_other_hash)
1631 match = 1; 1565 match = 1;
@@ -1767,19 +1701,15 @@ static int ftrace_check_record(struct dyn_ftrace *rec, int enable, int update)
1767 /* 1701 /*
1768 * If this record is being updated from a nop, then 1702 * If this record is being updated from a nop, then
1769 * return UPDATE_MAKE_CALL. 1703 * return UPDATE_MAKE_CALL.
1770 * Otherwise, if the EN flag is set, then return
1771 * UPDATE_MODIFY_CALL_REGS to tell the caller to convert
1772 * from the non-save regs, to a save regs function.
1773 * Otherwise, 1704 * Otherwise,
1774 * return UPDATE_MODIFY_CALL to tell the caller to convert 1705 * return UPDATE_MODIFY_CALL to tell the caller to convert
1775 * from the save regs, to a non-save regs function. 1706 * from the save regs, to a non-save regs function or
1707 * vice versa.
1776 */ 1708 */
1777 if (flag & FTRACE_FL_ENABLED) 1709 if (flag & FTRACE_FL_ENABLED)
1778 return FTRACE_UPDATE_MAKE_CALL; 1710 return FTRACE_UPDATE_MAKE_CALL;
1779 else if (rec->flags & FTRACE_FL_REGS_EN) 1711
1780 return FTRACE_UPDATE_MODIFY_CALL_REGS; 1712 return FTRACE_UPDATE_MODIFY_CALL;
1781 else
1782 return FTRACE_UPDATE_MODIFY_CALL;
1783 } 1713 }
1784 1714
1785 if (update) { 1715 if (update) {
@@ -1821,6 +1751,42 @@ int ftrace_test_record(struct dyn_ftrace *rec, int enable)
1821 return ftrace_check_record(rec, enable, 0); 1751 return ftrace_check_record(rec, enable, 0);
1822} 1752}
1823 1753
1754/**
1755 * ftrace_get_addr_new - Get the call address to set to
1756 * @rec: The ftrace record descriptor
1757 *
1758 * If the record has the FTRACE_FL_REGS set, that means that it
1759 * wants to convert to a callback that saves all regs. If FTRACE_FL_REGS
1760 * is not set, then it wants to convert to the normal callback.
1761 *
1762 * Returns the address of the trampoline to set to
1763 */
1764unsigned long ftrace_get_addr_new(struct dyn_ftrace *rec)
1765{
1766 if (rec->flags & FTRACE_FL_REGS)
1767 return (unsigned long)FTRACE_REGS_ADDR;
1768 else
1769 return (unsigned long)FTRACE_ADDR;
1770}
1771
1772/**
1773 * ftrace_get_addr_curr - Get the call address that is already there
1774 * @rec: The ftrace record descriptor
1775 *
1776 * The FTRACE_FL_REGS_EN is set when the record already points to
1777 * a function that saves all the regs. Basically the '_EN' version
1778 * represents the current state of the function.
1779 *
1780 * Returns the address of the trampoline that is currently being called
1781 */
1782unsigned long ftrace_get_addr_curr(struct dyn_ftrace *rec)
1783{
1784 if (rec->flags & FTRACE_FL_REGS_EN)
1785 return (unsigned long)FTRACE_REGS_ADDR;
1786 else
1787 return (unsigned long)FTRACE_ADDR;
1788}
1789
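To make the two helpers' roles concrete, here is a minimal sketch (not part of the patch; needs_regs_switch() is a hypothetical name) of how the "wanted" and "current" trampolines relate:

/*
 * Illustration only: a record has to switch trampolines when the
 * address it should call (FTRACE_FL_REGS) differs from the address
 * it is currently calling (FTRACE_FL_REGS_EN).
 */
static bool needs_regs_switch(struct dyn_ftrace *rec)
{
        return ftrace_get_addr_new(rec) != ftrace_get_addr_curr(rec);
}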
1824static int 1790static int
1825__ftrace_replace_code(struct dyn_ftrace *rec, int enable) 1791__ftrace_replace_code(struct dyn_ftrace *rec, int enable)
1826{ 1792{
@@ -1828,12 +1794,12 @@ __ftrace_replace_code(struct dyn_ftrace *rec, int enable)
1828 unsigned long ftrace_addr; 1794 unsigned long ftrace_addr;
1829 int ret; 1795 int ret;
1830 1796
1831 ret = ftrace_update_record(rec, enable); 1797 ftrace_addr = ftrace_get_addr_new(rec);
1832 1798
1833 if (rec->flags & FTRACE_FL_REGS) 1799 /* This needs to be done before we call ftrace_update_record */
1834 ftrace_addr = (unsigned long)FTRACE_REGS_ADDR; 1800 ftrace_old_addr = ftrace_get_addr_curr(rec);
1835 else 1801
1836 ftrace_addr = (unsigned long)FTRACE_ADDR; 1802 ret = ftrace_update_record(rec, enable);
1837 1803
1838 switch (ret) { 1804 switch (ret) {
1839 case FTRACE_UPDATE_IGNORE: 1805 case FTRACE_UPDATE_IGNORE:
@@ -1845,13 +1811,7 @@ __ftrace_replace_code(struct dyn_ftrace *rec, int enable)
1845 case FTRACE_UPDATE_MAKE_NOP: 1811 case FTRACE_UPDATE_MAKE_NOP:
1846 return ftrace_make_nop(NULL, rec, ftrace_addr); 1812 return ftrace_make_nop(NULL, rec, ftrace_addr);
1847 1813
1848 case FTRACE_UPDATE_MODIFY_CALL_REGS:
1849 case FTRACE_UPDATE_MODIFY_CALL: 1814 case FTRACE_UPDATE_MODIFY_CALL:
1850 if (rec->flags & FTRACE_FL_REGS)
1851 ftrace_old_addr = (unsigned long)FTRACE_ADDR;
1852 else
1853 ftrace_old_addr = (unsigned long)FTRACE_REGS_ADDR;
1854
1855 return ftrace_modify_call(rec, ftrace_old_addr, ftrace_addr); 1815 return ftrace_modify_call(rec, ftrace_old_addr, ftrace_addr);
1856 } 1816 }
1857 1817
@@ -2115,7 +2075,6 @@ static void ftrace_startup_enable(int command)
2115 2075
2116static int ftrace_startup(struct ftrace_ops *ops, int command) 2076static int ftrace_startup(struct ftrace_ops *ops, int command)
2117{ 2077{
2118 bool hash_enable = true;
2119 int ret; 2078 int ret;
2120 2079
2121 if (unlikely(ftrace_disabled)) 2080 if (unlikely(ftrace_disabled))
@@ -2128,18 +2087,9 @@ static int ftrace_startup(struct ftrace_ops *ops, int command)
2128 ftrace_start_up++; 2087 ftrace_start_up++;
2129 command |= FTRACE_UPDATE_CALLS; 2088 command |= FTRACE_UPDATE_CALLS;
2130 2089
2131 /* ops marked global share the filter hashes */
2132 if (ops->flags & FTRACE_OPS_FL_GLOBAL) {
2133 ops = &global_ops;
2134 /* Don't update hash if global is already set */
2135 if (global_start_up)
2136 hash_enable = false;
2137 global_start_up++;
2138 }
2139
2140 ops->flags |= FTRACE_OPS_FL_ENABLED; 2090 ops->flags |= FTRACE_OPS_FL_ENABLED;
2141 if (hash_enable) 2091
2142 ftrace_hash_rec_enable(ops, 1); 2092 ftrace_hash_rec_enable(ops, 1);
2143 2093
2144 ftrace_startup_enable(command); 2094 ftrace_startup_enable(command);
2145 2095
@@ -2148,7 +2098,6 @@ static int ftrace_startup(struct ftrace_ops *ops, int command)
2148 2098
2149static int ftrace_shutdown(struct ftrace_ops *ops, int command) 2099static int ftrace_shutdown(struct ftrace_ops *ops, int command)
2150{ 2100{
2151 bool hash_disable = true;
2152 int ret; 2101 int ret;
2153 2102
2154 if (unlikely(ftrace_disabled)) 2103 if (unlikely(ftrace_disabled))
@@ -2166,21 +2115,9 @@ static int ftrace_shutdown(struct ftrace_ops *ops, int command)
2166 */ 2115 */
2167 WARN_ON_ONCE(ftrace_start_up < 0); 2116 WARN_ON_ONCE(ftrace_start_up < 0);
2168 2117
2169 if (ops->flags & FTRACE_OPS_FL_GLOBAL) { 2118 ftrace_hash_rec_disable(ops, 1);
2170 ops = &global_ops;
2171 global_start_up--;
2172 WARN_ON_ONCE(global_start_up < 0);
2173 /* Don't update hash if global still has users */
2174 if (global_start_up) {
2175 WARN_ON_ONCE(!ftrace_start_up);
2176 hash_disable = false;
2177 }
2178 }
2179
2180 if (hash_disable)
2181 ftrace_hash_rec_disable(ops, 1);
2182 2119
2183 if (ops != &global_ops || !global_start_up) 2120 if (!global_start_up)
2184 ops->flags &= ~FTRACE_OPS_FL_ENABLED; 2121 ops->flags &= ~FTRACE_OPS_FL_ENABLED;
2185 2122
2186 command |= FTRACE_UPDATE_CALLS; 2123 command |= FTRACE_UPDATE_CALLS;
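With the FTRACE_OPS_FL_GLOBAL special case gone, every ops goes through the same startup/shutdown accounting. A hedged registration sketch, with my_callback and my_ops as illustrative names only:

static void my_callback(unsigned long ip, unsigned long parent_ip,
                        struct ftrace_ops *op, struct pt_regs *regs)
{
        /* runs for every function matched by my_ops' own filter hash */
}

static struct ftrace_ops my_ops = {
        .func   = my_callback,
        .flags  = FTRACE_OPS_FL_RECURSION_SAFE,
};

/*
 * register_ftrace_function(&my_ops) reaches ftrace_startup(), which now
 * unconditionally enables this ops' hash records;
 * unregister_ftrace_function(&my_ops) reaches ftrace_shutdown(), which
 * disables them symmetrically.
 */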
@@ -3524,10 +3461,6 @@ ftrace_set_hash(struct ftrace_ops *ops, unsigned char *buf, int len,
3524 struct ftrace_hash *hash; 3461 struct ftrace_hash *hash;
3525 int ret; 3462 int ret;
3526 3463
3527 /* All global ops uses the global ops filters */
3528 if (ops->flags & FTRACE_OPS_FL_GLOBAL)
3529 ops = &global_ops;
3530
3531 if (unlikely(ftrace_disabled)) 3464 if (unlikely(ftrace_disabled))
3532 return -ENODEV; 3465 return -ENODEV;
3533 3466
@@ -3639,8 +3572,7 @@ int ftrace_set_notrace(struct ftrace_ops *ops, unsigned char *buf,
3639} 3572}
3640EXPORT_SYMBOL_GPL(ftrace_set_notrace); 3573EXPORT_SYMBOL_GPL(ftrace_set_notrace);
3641/** 3574/**
3642 * ftrace_set_filter - set a function to filter on in ftrace 3575 * ftrace_set_global_filter - set a function to filter on with global tracers
3643 * @ops - the ops to set the filter with
3644 * @buf - the string that holds the function filter text. 3576 * @buf - the string that holds the function filter text.
3645 * @len - the length of the string. 3577 * @len - the length of the string.
3646 * @reset - non zero to reset all filters before applying this filter. 3578 * @reset - non zero to reset all filters before applying this filter.
@@ -3655,8 +3587,7 @@ void ftrace_set_global_filter(unsigned char *buf, int len, int reset)
3655EXPORT_SYMBOL_GPL(ftrace_set_global_filter); 3587EXPORT_SYMBOL_GPL(ftrace_set_global_filter);
3656 3588
3657/** 3589/**
3658 * ftrace_set_notrace - set a function to not trace in ftrace 3590 * ftrace_set_global_notrace - set a function to not trace with global tracers
3659 * @ops - the ops to set the notrace filter with
3660 * @buf - the string that holds the function notrace text. 3591 * @buf - the string that holds the function notrace text.
3661 * @len - the length of the string. 3592 * @len - the length of the string.
3662 * @reset - non zero to reset all filters before applying this filter. 3593 * @reset - non zero to reset all filters before applying this filter.
@@ -4443,6 +4374,34 @@ ftrace_ops_test(struct ftrace_ops *ops, unsigned long ip, void *regs)
4443 4374
4444#endif /* CONFIG_DYNAMIC_FTRACE */ 4375#endif /* CONFIG_DYNAMIC_FTRACE */
4445 4376
4377__init void ftrace_init_global_array_ops(struct trace_array *tr)
4378{
4379 tr->ops = &global_ops;
4380 tr->ops->private = tr;
4381}
4382
4383void ftrace_init_array_ops(struct trace_array *tr, ftrace_func_t func)
4384{
4385 /* If we filter on pids, update to use the pid function */
4386 if (tr->flags & TRACE_ARRAY_FL_GLOBAL) {
4387 if (WARN_ON(tr->ops->func != ftrace_stub))
4388 printk("ftrace ops had %pS for function\n",
4389 tr->ops->func);
4390 /* Only the top level instance does pid tracing */
4391 if (!list_empty(&ftrace_pids)) {
4392 set_ftrace_pid_function(func);
4393 func = ftrace_pid_func;
4394 }
4395 }
4396 tr->ops->func = func;
4397 tr->ops->private = tr;
4398}
4399
4400void ftrace_reset_array_ops(struct trace_array *tr)
4401{
4402 tr->ops->func = ftrace_stub;
4403}
4404
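A hedged sketch of how a per-instance tracer is expected to wire these helpers into its init/reset callbacks (illustrative names, not the verbatim function-tracer code):

static void my_func_call(unsigned long ip, unsigned long parent_ip,
                         struct ftrace_ops *op, struct pt_regs *regs)
{
        /* ftrace_init_array_ops() stored the trace_array in op->private */
        struct trace_array *tr = op->private;

        trace_printk("instance %p traced %ps\n", tr, (void *)ip);
}

static int my_tracer_init(struct trace_array *tr)
{
        ftrace_init_array_ops(tr, my_func_call);
        return 0;
}

static void my_tracer_reset(struct trace_array *tr)
{
        ftrace_reset_array_ops(tr);
}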
4446static void 4405static void
4447ftrace_ops_control_func(unsigned long ip, unsigned long parent_ip, 4406ftrace_ops_control_func(unsigned long ip, unsigned long parent_ip,
4448 struct ftrace_ops *op, struct pt_regs *regs) 4407 struct ftrace_ops *op, struct pt_regs *regs)
@@ -4501,9 +4460,16 @@ __ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip,
4501 */ 4460 */
4502 preempt_disable_notrace(); 4461 preempt_disable_notrace();
4503 do_for_each_ftrace_op(op, ftrace_ops_list) { 4462 do_for_each_ftrace_op(op, ftrace_ops_list) {
4504 if (ftrace_ops_test(op, ip, regs)) 4463 if (ftrace_ops_test(op, ip, regs)) {
4464 if (WARN_ON(!op->func)) {
4465 function_trace_stop = 1;
4466 printk("op=%p %pS\n", op, op);
4467 goto out;
4468 }
4505 op->func(ip, parent_ip, op, regs); 4469 op->func(ip, parent_ip, op, regs);
4470 }
4506 } while_for_each_ftrace_op(op); 4471 } while_for_each_ftrace_op(op);
4472out:
4507 preempt_enable_notrace(); 4473 preempt_enable_notrace();
4508 trace_clear_recursion(bit); 4474 trace_clear_recursion(bit);
4509} 4475}
@@ -4908,7 +4874,6 @@ ftrace_enable_sysctl(struct ctl_table *table, int write,
4908#ifdef CONFIG_FUNCTION_GRAPH_TRACER 4874#ifdef CONFIG_FUNCTION_GRAPH_TRACER
4909 4875
4910static int ftrace_graph_active; 4876static int ftrace_graph_active;
4911static struct notifier_block ftrace_suspend_notifier;
4912 4877
4913int ftrace_graph_entry_stub(struct ftrace_graph_ent *trace) 4878int ftrace_graph_entry_stub(struct ftrace_graph_ent *trace)
4914{ 4879{
@@ -5054,13 +5019,6 @@ ftrace_suspend_notifier_call(struct notifier_block *bl, unsigned long state,
5054 return NOTIFY_DONE; 5019 return NOTIFY_DONE;
5055} 5020}
5056 5021
5057/* Just a place holder for function graph */
5058static struct ftrace_ops fgraph_ops __read_mostly = {
5059 .func = ftrace_stub,
5060 .flags = FTRACE_OPS_FL_STUB | FTRACE_OPS_FL_GLOBAL |
5061 FTRACE_OPS_FL_RECURSION_SAFE,
5062};
5063
5064static int ftrace_graph_entry_test(struct ftrace_graph_ent *trace) 5022static int ftrace_graph_entry_test(struct ftrace_graph_ent *trace)
5065{ 5023{
5066 if (!ftrace_ops_test(&global_ops, trace->func, NULL)) 5024 if (!ftrace_ops_test(&global_ops, trace->func, NULL))
@@ -5085,6 +5043,10 @@ static void update_function_graph_func(void)
5085 ftrace_graph_entry = ftrace_graph_entry_test; 5043 ftrace_graph_entry = ftrace_graph_entry_test;
5086} 5044}
5087 5045
5046static struct notifier_block ftrace_suspend_notifier = {
5047 .notifier_call = ftrace_suspend_notifier_call,
5048};
5049
5088int register_ftrace_graph(trace_func_graph_ret_t retfunc, 5050int register_ftrace_graph(trace_func_graph_ret_t retfunc,
5089 trace_func_graph_ent_t entryfunc) 5051 trace_func_graph_ent_t entryfunc)
5090{ 5052{
@@ -5098,7 +5060,6 @@ int register_ftrace_graph(trace_func_graph_ret_t retfunc,
5098 goto out; 5060 goto out;
5099 } 5061 }
5100 5062
5101 ftrace_suspend_notifier.notifier_call = ftrace_suspend_notifier_call;
5102 register_pm_notifier(&ftrace_suspend_notifier); 5063 register_pm_notifier(&ftrace_suspend_notifier);
5103 5064
5104 ftrace_graph_active++; 5065 ftrace_graph_active++;
@@ -5120,7 +5081,10 @@ int register_ftrace_graph(trace_func_graph_ret_t retfunc,
5120 ftrace_graph_entry = ftrace_graph_entry_test; 5081 ftrace_graph_entry = ftrace_graph_entry_test;
5121 update_function_graph_func(); 5082 update_function_graph_func();
5122 5083
5123 ret = ftrace_startup(&fgraph_ops, FTRACE_START_FUNC_RET); 5084 /* Function graph doesn't use the .func field of global_ops */
5085 global_ops.flags |= FTRACE_OPS_FL_STUB;
5086
5087 ret = ftrace_startup(&global_ops, FTRACE_START_FUNC_RET);
5124 5088
5125out: 5089out:
5126 mutex_unlock(&ftrace_lock); 5090 mutex_unlock(&ftrace_lock);
@@ -5138,7 +5102,8 @@ void unregister_ftrace_graph(void)
5138 ftrace_graph_return = (trace_func_graph_ret_t)ftrace_stub; 5102 ftrace_graph_return = (trace_func_graph_ret_t)ftrace_stub;
5139 ftrace_graph_entry = ftrace_graph_entry_stub; 5103 ftrace_graph_entry = ftrace_graph_entry_stub;
5140 __ftrace_graph_entry = ftrace_graph_entry_stub; 5104 __ftrace_graph_entry = ftrace_graph_entry_stub;
5141 ftrace_shutdown(&fgraph_ops, FTRACE_STOP_FUNC_RET); 5105 ftrace_shutdown(&global_ops, FTRACE_STOP_FUNC_RET);
5106 global_ops.flags &= ~FTRACE_OPS_FL_STUB;
5142 unregister_pm_notifier(&ftrace_suspend_notifier); 5107 unregister_pm_notifier(&ftrace_suspend_notifier);
5143 unregister_trace_sched_switch(ftrace_graph_probe_sched_switch, NULL); 5108 unregister_trace_sched_switch(ftrace_graph_probe_sched_switch, NULL);
5144 5109
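A hedged usage sketch of the graph-tracer entry points whose internals change above (my_entry and my_return are illustrative names):

static int my_entry(struct ftrace_graph_ent *trace)
{
        return 1;       /* nonzero: record this function's return too */
}

static void my_return(struct ftrace_graph_ret *trace)
{
        /* exit hook; timing is in trace->calltime and trace->rettime */
}

/*
 * register_ftrace_graph(my_return, my_entry) now piggybacks on
 * global_ops (temporarily flagged FTRACE_OPS_FL_STUB) instead of the
 * removed fgraph_ops; unregister_ftrace_graph() clears the flag again.
 */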
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index c634868c2921..ff7027199a9a 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -543,7 +543,7 @@ static void rb_wake_up_waiters(struct irq_work *work)
543 * as data is added to any of the @buffer's cpu buffers. Otherwise 543 * as data is added to any of the @buffer's cpu buffers. Otherwise
544 * it will wait for data to be added to a specific cpu buffer. 544 * it will wait for data to be added to a specific cpu buffer.
545 */ 545 */
546void ring_buffer_wait(struct ring_buffer *buffer, int cpu) 546int ring_buffer_wait(struct ring_buffer *buffer, int cpu)
547{ 547{
548 struct ring_buffer_per_cpu *cpu_buffer; 548 struct ring_buffer_per_cpu *cpu_buffer;
549 DEFINE_WAIT(wait); 549 DEFINE_WAIT(wait);
@@ -557,6 +557,8 @@ void ring_buffer_wait(struct ring_buffer *buffer, int cpu)
557 if (cpu == RING_BUFFER_ALL_CPUS) 557 if (cpu == RING_BUFFER_ALL_CPUS)
558 work = &buffer->irq_work; 558 work = &buffer->irq_work;
559 else { 559 else {
560 if (!cpumask_test_cpu(cpu, buffer->cpumask))
561 return -ENODEV;
560 cpu_buffer = buffer->buffers[cpu]; 562 cpu_buffer = buffer->buffers[cpu];
561 work = &cpu_buffer->irq_work; 563 work = &cpu_buffer->irq_work;
562 } 564 }
@@ -591,6 +593,7 @@ void ring_buffer_wait(struct ring_buffer *buffer, int cpu)
591 schedule(); 593 schedule();
592 594
593 finish_wait(&work->waiters, &wait); 595 finish_wait(&work->waiters, &wait);
596 return 0;
594} 597}
595 598
596/** 599/**
@@ -613,10 +616,6 @@ int ring_buffer_poll_wait(struct ring_buffer *buffer, int cpu,
613 struct ring_buffer_per_cpu *cpu_buffer; 616 struct ring_buffer_per_cpu *cpu_buffer;
614 struct rb_irq_work *work; 617 struct rb_irq_work *work;
615 618
616 if ((cpu == RING_BUFFER_ALL_CPUS && !ring_buffer_empty(buffer)) ||
617 (cpu != RING_BUFFER_ALL_CPUS && !ring_buffer_empty_cpu(buffer, cpu)))
618 return POLLIN | POLLRDNORM;
619
620 if (cpu == RING_BUFFER_ALL_CPUS) 619 if (cpu == RING_BUFFER_ALL_CPUS)
621 work = &buffer->irq_work; 620 work = &buffer->irq_work;
622 else { 621 else {
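Because ring_buffer_wait() now returns an int, callers must check for -ENODEV on invalid CPUs. A minimal caller sketch (wait_for_cpu_data() is a hypothetical helper):

static int wait_for_cpu_data(struct ring_buffer *buffer, int cpu)
{
        int ret;

        /* -ENODEV means @cpu is not in the buffer's cpumask */
        ret = ring_buffer_wait(buffer, cpu);
        if (ret)
                return ret;

        /* woken up: data should be available on @cpu */
        return 0;
}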
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 737b0efa1a62..bda9621638cc 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -275,7 +275,7 @@ int call_filter_check_discard(struct ftrace_event_call *call, void *rec,
275} 275}
276EXPORT_SYMBOL_GPL(call_filter_check_discard); 276EXPORT_SYMBOL_GPL(call_filter_check_discard);
277 277
278cycle_t buffer_ftrace_now(struct trace_buffer *buf, int cpu) 278static cycle_t buffer_ftrace_now(struct trace_buffer *buf, int cpu)
279{ 279{
280 u64 ts; 280 u64 ts;
281 281
@@ -466,6 +466,12 @@ int __trace_puts(unsigned long ip, const char *str, int size)
466 struct print_entry *entry; 466 struct print_entry *entry;
467 unsigned long irq_flags; 467 unsigned long irq_flags;
468 int alloc; 468 int alloc;
469 int pc;
470
471 if (!(trace_flags & TRACE_ITER_PRINTK))
472 return 0;
473
474 pc = preempt_count();
469 475
470 if (unlikely(tracing_selftest_running || tracing_disabled)) 476 if (unlikely(tracing_selftest_running || tracing_disabled))
471 return 0; 477 return 0;
@@ -475,7 +481,7 @@ int __trace_puts(unsigned long ip, const char *str, int size)
475 local_save_flags(irq_flags); 481 local_save_flags(irq_flags);
476 buffer = global_trace.trace_buffer.buffer; 482 buffer = global_trace.trace_buffer.buffer;
477 event = trace_buffer_lock_reserve(buffer, TRACE_PRINT, alloc, 483 event = trace_buffer_lock_reserve(buffer, TRACE_PRINT, alloc,
478 irq_flags, preempt_count()); 484 irq_flags, pc);
479 if (!event) 485 if (!event)
480 return 0; 486 return 0;
481 487
@@ -492,6 +498,7 @@ int __trace_puts(unsigned long ip, const char *str, int size)
492 entry->buf[size] = '\0'; 498 entry->buf[size] = '\0';
493 499
494 __buffer_unlock_commit(buffer, event); 500 __buffer_unlock_commit(buffer, event);
501 ftrace_trace_stack(buffer, irq_flags, 4, pc);
495 502
496 return size; 503 return size;
497} 504}
@@ -509,6 +516,12 @@ int __trace_bputs(unsigned long ip, const char *str)
509 struct bputs_entry *entry; 516 struct bputs_entry *entry;
510 unsigned long irq_flags; 517 unsigned long irq_flags;
511 int size = sizeof(struct bputs_entry); 518 int size = sizeof(struct bputs_entry);
519 int pc;
520
521 if (!(trace_flags & TRACE_ITER_PRINTK))
522 return 0;
523
524 pc = preempt_count();
512 525
513 if (unlikely(tracing_selftest_running || tracing_disabled)) 526 if (unlikely(tracing_selftest_running || tracing_disabled))
514 return 0; 527 return 0;
@@ -516,7 +529,7 @@ int __trace_bputs(unsigned long ip, const char *str)
516 local_save_flags(irq_flags); 529 local_save_flags(irq_flags);
517 buffer = global_trace.trace_buffer.buffer; 530 buffer = global_trace.trace_buffer.buffer;
518 event = trace_buffer_lock_reserve(buffer, TRACE_BPUTS, size, 531 event = trace_buffer_lock_reserve(buffer, TRACE_BPUTS, size,
519 irq_flags, preempt_count()); 532 irq_flags, pc);
520 if (!event) 533 if (!event)
521 return 0; 534 return 0;
522 535
@@ -525,6 +538,7 @@ int __trace_bputs(unsigned long ip, const char *str)
525 entry->str = str; 538 entry->str = str;
526 539
527 __buffer_unlock_commit(buffer, event); 540 __buffer_unlock_commit(buffer, event);
541 ftrace_trace_stack(buffer, irq_flags, 4, pc);
528 542
529 return 1; 543 return 1;
530} 544}
@@ -599,7 +613,7 @@ static int alloc_snapshot(struct trace_array *tr)
599 return 0; 613 return 0;
600} 614}
601 615
602void free_snapshot(struct trace_array *tr) 616static void free_snapshot(struct trace_array *tr)
603{ 617{
604 /* 618 /*
605 * We don't free the ring buffer. instead, resize it because 619 * We don't free the ring buffer. instead, resize it because
@@ -963,27 +977,9 @@ static ssize_t trace_seq_to_buffer(struct trace_seq *s, void *buf, size_t cnt)
963 return cnt; 977 return cnt;
964} 978}
965 979
966/*
967 * ftrace_max_lock is used to protect the swapping of buffers
968 * when taking a max snapshot. The buffers themselves are
969 * protected by per_cpu spinlocks. But the action of the swap
970 * needs its own lock.
971 *
972 * This is defined as a arch_spinlock_t in order to help
973 * with performance when lockdep debugging is enabled.
974 *
975 * It is also used in other places outside the update_max_tr
976 * so it needs to be defined outside of the
977 * CONFIG_TRACER_MAX_TRACE.
978 */
979static arch_spinlock_t ftrace_max_lock =
980 (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED;
981
982unsigned long __read_mostly tracing_thresh; 980unsigned long __read_mostly tracing_thresh;
983 981
984#ifdef CONFIG_TRACER_MAX_TRACE 982#ifdef CONFIG_TRACER_MAX_TRACE
985unsigned long __read_mostly tracing_max_latency;
986
987/* 983/*
988 * Copy the new maximum trace into the separate maximum-trace 984 * Copy the new maximum trace into the separate maximum-trace
989 * structure. (this way the maximum trace is permanently saved, 985 * structure. (this way the maximum trace is permanently saved,
@@ -1000,7 +996,7 @@ __update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu)
1000 max_buf->cpu = cpu; 996 max_buf->cpu = cpu;
1001 max_buf->time_start = data->preempt_timestamp; 997 max_buf->time_start = data->preempt_timestamp;
1002 998
1003 max_data->saved_latency = tracing_max_latency; 999 max_data->saved_latency = tr->max_latency;
1004 max_data->critical_start = data->critical_start; 1000 max_data->critical_start = data->critical_start;
1005 max_data->critical_end = data->critical_end; 1001 max_data->critical_end = data->critical_end;
1006 1002
@@ -1048,14 +1044,14 @@ update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu)
1048 return; 1044 return;
1049 } 1045 }
1050 1046
1051 arch_spin_lock(&ftrace_max_lock); 1047 arch_spin_lock(&tr->max_lock);
1052 1048
1053 buf = tr->trace_buffer.buffer; 1049 buf = tr->trace_buffer.buffer;
1054 tr->trace_buffer.buffer = tr->max_buffer.buffer; 1050 tr->trace_buffer.buffer = tr->max_buffer.buffer;
1055 tr->max_buffer.buffer = buf; 1051 tr->max_buffer.buffer = buf;
1056 1052
1057 __update_max_tr(tr, tsk, cpu); 1053 __update_max_tr(tr, tsk, cpu);
1058 arch_spin_unlock(&ftrace_max_lock); 1054 arch_spin_unlock(&tr->max_lock);
1059} 1055}
1060 1056
1061/** 1057/**
@@ -1081,7 +1077,7 @@ update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu)
1081 return; 1077 return;
1082 } 1078 }
1083 1079
1084 arch_spin_lock(&ftrace_max_lock); 1080 arch_spin_lock(&tr->max_lock);
1085 1081
1086 ret = ring_buffer_swap_cpu(tr->max_buffer.buffer, tr->trace_buffer.buffer, cpu); 1082 ret = ring_buffer_swap_cpu(tr->max_buffer.buffer, tr->trace_buffer.buffer, cpu);
1087 1083
@@ -1099,17 +1095,17 @@ update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu)
1099 WARN_ON_ONCE(ret && ret != -EAGAIN && ret != -EBUSY); 1095 WARN_ON_ONCE(ret && ret != -EAGAIN && ret != -EBUSY);
1100 1096
1101 __update_max_tr(tr, tsk, cpu); 1097 __update_max_tr(tr, tsk, cpu);
1102 arch_spin_unlock(&ftrace_max_lock); 1098 arch_spin_unlock(&tr->max_lock);
1103} 1099}
1104#endif /* CONFIG_TRACER_MAX_TRACE */ 1100#endif /* CONFIG_TRACER_MAX_TRACE */
1105 1101
1106static void default_wait_pipe(struct trace_iterator *iter) 1102static int wait_on_pipe(struct trace_iterator *iter)
1107{ 1103{
1108 /* Iterators are static, they should be filled or empty */ 1104 /* Iterators are static, they should be filled or empty */
1109 if (trace_buffer_iter(iter, iter->cpu_file)) 1105 if (trace_buffer_iter(iter, iter->cpu_file))
1110 return; 1106 return 0;
1111 1107
1112 ring_buffer_wait(iter->trace_buffer->buffer, iter->cpu_file); 1108 return ring_buffer_wait(iter->trace_buffer->buffer, iter->cpu_file);
1113} 1109}
1114 1110
1115#ifdef CONFIG_FTRACE_STARTUP_TEST 1111#ifdef CONFIG_FTRACE_STARTUP_TEST
@@ -1220,8 +1216,6 @@ int register_tracer(struct tracer *type)
1220 else 1216 else
1221 if (!type->flags->opts) 1217 if (!type->flags->opts)
1222 type->flags->opts = dummy_tracer_opt; 1218 type->flags->opts = dummy_tracer_opt;
1223 if (!type->wait_pipe)
1224 type->wait_pipe = default_wait_pipe;
1225 1219
1226 ret = run_tracer_selftest(type); 1220 ret = run_tracer_selftest(type);
1227 if (ret < 0) 1221 if (ret < 0)
@@ -1305,22 +1299,71 @@ void tracing_reset_all_online_cpus(void)
1305 } 1299 }
1306} 1300}
1307 1301
1308#define SAVED_CMDLINES 128 1302#define SAVED_CMDLINES_DEFAULT 128
1309#define NO_CMDLINE_MAP UINT_MAX 1303#define NO_CMDLINE_MAP UINT_MAX
1310static unsigned map_pid_to_cmdline[PID_MAX_DEFAULT+1];
1311static unsigned map_cmdline_to_pid[SAVED_CMDLINES];
1312static char saved_cmdlines[SAVED_CMDLINES][TASK_COMM_LEN];
1313static int cmdline_idx;
1314static arch_spinlock_t trace_cmdline_lock = __ARCH_SPIN_LOCK_UNLOCKED; 1304static arch_spinlock_t trace_cmdline_lock = __ARCH_SPIN_LOCK_UNLOCKED;
1305struct saved_cmdlines_buffer {
1306 unsigned map_pid_to_cmdline[PID_MAX_DEFAULT+1];
1307 unsigned *map_cmdline_to_pid;
1308 unsigned cmdline_num;
1309 int cmdline_idx;
1310 char *saved_cmdlines;
1311};
1312static struct saved_cmdlines_buffer *savedcmd;
1315 1313
1316/* temporary disable recording */ 1314/* temporary disable recording */
1317static atomic_t trace_record_cmdline_disabled __read_mostly; 1315static atomic_t trace_record_cmdline_disabled __read_mostly;
1318 1316
1319static void trace_init_cmdlines(void) 1317static inline char *get_saved_cmdlines(int idx)
1320{ 1318{
1321 memset(&map_pid_to_cmdline, NO_CMDLINE_MAP, sizeof(map_pid_to_cmdline)); 1319 return &savedcmd->saved_cmdlines[idx * TASK_COMM_LEN];
1322 memset(&map_cmdline_to_pid, NO_CMDLINE_MAP, sizeof(map_cmdline_to_pid)); 1320}
1323 cmdline_idx = 0; 1321
1322static inline void set_cmdline(int idx, const char *cmdline)
1323{
1324 memcpy(get_saved_cmdlines(idx), cmdline, TASK_COMM_LEN);
1325}
1326
1327static int allocate_cmdlines_buffer(unsigned int val,
1328 struct saved_cmdlines_buffer *s)
1329{
1330 s->map_cmdline_to_pid = kmalloc(val * sizeof(*s->map_cmdline_to_pid),
1331 GFP_KERNEL);
1332 if (!s->map_cmdline_to_pid)
1333 return -ENOMEM;
1334
1335 s->saved_cmdlines = kmalloc(val * TASK_COMM_LEN, GFP_KERNEL);
1336 if (!s->saved_cmdlines) {
1337 kfree(s->map_cmdline_to_pid);
1338 return -ENOMEM;
1339 }
1340
1341 s->cmdline_idx = 0;
1342 s->cmdline_num = val;
1343 memset(&s->map_pid_to_cmdline, NO_CMDLINE_MAP,
1344 sizeof(s->map_pid_to_cmdline));
1345 memset(s->map_cmdline_to_pid, NO_CMDLINE_MAP,
1346 val * sizeof(*s->map_cmdline_to_pid));
1347
1348 return 0;
1349}
1350
1351static int trace_create_savedcmd(void)
1352{
1353 int ret;
1354
1355 savedcmd = kmalloc(sizeof(*savedcmd), GFP_KERNEL);
1356 if (!savedcmd)
1357 return -ENOMEM;
1358
1359 ret = allocate_cmdlines_buffer(SAVED_CMDLINES_DEFAULT, savedcmd);
1360 if (ret < 0) {
1361 kfree(savedcmd);
1362 savedcmd = NULL;
1363 return -ENOMEM;
1364 }
1365
1366 return 0;
1324} 1367}
1325 1368
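The flat layout behind get_saved_cmdlines() keeps every comm in a single allocation of TASK_COMM_LEN bytes per slot. A hedged debugging sketch (dump_saved_cmdlines() is a hypothetical helper) that walks it directly:

static void dump_saved_cmdlines(struct saved_cmdlines_buffer *s)
{
        unsigned int idx;

        for (idx = 0; idx < s->cmdline_num; idx++) {
                if (s->map_cmdline_to_pid[idx] == NO_CMDLINE_MAP)
                        continue;
                /* slot idx starts at offset idx * TASK_COMM_LEN */
                pr_info("slot %u: pid %u comm %s\n", idx,
                        s->map_cmdline_to_pid[idx],
                        &s->saved_cmdlines[idx * TASK_COMM_LEN]);
        }
}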
1326int is_tracing_stopped(void) 1369int is_tracing_stopped(void)
@@ -1353,7 +1396,7 @@ void tracing_start(void)
1353 } 1396 }
1354 1397
1355 /* Prevent the buffers from switching */ 1398 /* Prevent the buffers from switching */
1356 arch_spin_lock(&ftrace_max_lock); 1399 arch_spin_lock(&global_trace.max_lock);
1357 1400
1358 buffer = global_trace.trace_buffer.buffer; 1401 buffer = global_trace.trace_buffer.buffer;
1359 if (buffer) 1402 if (buffer)
@@ -1365,9 +1408,8 @@ void tracing_start(void)
1365 ring_buffer_record_enable(buffer); 1408 ring_buffer_record_enable(buffer);
1366#endif 1409#endif
1367 1410
1368 arch_spin_unlock(&ftrace_max_lock); 1411 arch_spin_unlock(&global_trace.max_lock);
1369 1412
1370 ftrace_start();
1371 out: 1413 out:
1372 raw_spin_unlock_irqrestore(&global_trace.start_lock, flags); 1414 raw_spin_unlock_irqrestore(&global_trace.start_lock, flags);
1373} 1415}
@@ -1414,13 +1456,12 @@ void tracing_stop(void)
1414 struct ring_buffer *buffer; 1456 struct ring_buffer *buffer;
1415 unsigned long flags; 1457 unsigned long flags;
1416 1458
1417 ftrace_stop();
1418 raw_spin_lock_irqsave(&global_trace.start_lock, flags); 1459 raw_spin_lock_irqsave(&global_trace.start_lock, flags);
1419 if (global_trace.stop_count++) 1460 if (global_trace.stop_count++)
1420 goto out; 1461 goto out;
1421 1462
1422 /* Prevent the buffers from switching */ 1463 /* Prevent the buffers from switching */
1423 arch_spin_lock(&ftrace_max_lock); 1464 arch_spin_lock(&global_trace.max_lock);
1424 1465
1425 buffer = global_trace.trace_buffer.buffer; 1466 buffer = global_trace.trace_buffer.buffer;
1426 if (buffer) 1467 if (buffer)
@@ -1432,7 +1473,7 @@ void tracing_stop(void)
1432 ring_buffer_record_disable(buffer); 1473 ring_buffer_record_disable(buffer);
1433#endif 1474#endif
1434 1475
1435 arch_spin_unlock(&ftrace_max_lock); 1476 arch_spin_unlock(&global_trace.max_lock);
1436 1477
1437 out: 1478 out:
1438 raw_spin_unlock_irqrestore(&global_trace.start_lock, flags); 1479 raw_spin_unlock_irqrestore(&global_trace.start_lock, flags);
@@ -1461,12 +1502,12 @@ static void tracing_stop_tr(struct trace_array *tr)
1461 1502
1462void trace_stop_cmdline_recording(void); 1503void trace_stop_cmdline_recording(void);
1463 1504
1464static void trace_save_cmdline(struct task_struct *tsk) 1505static int trace_save_cmdline(struct task_struct *tsk)
1465{ 1506{
1466 unsigned pid, idx; 1507 unsigned pid, idx;
1467 1508
1468 if (!tsk->pid || unlikely(tsk->pid > PID_MAX_DEFAULT)) 1509 if (!tsk->pid || unlikely(tsk->pid > PID_MAX_DEFAULT))
1469 return; 1510 return 0;
1470 1511
1471 /* 1512 /*
1472 * It's not the end of the world if we don't get 1513 * It's not the end of the world if we don't get
@@ -1475,11 +1516,11 @@ static void trace_save_cmdline(struct task_struct *tsk)
1475 * so if we miss here, then better luck next time. 1516 * so if we miss here, then better luck next time.
1476 */ 1517 */
1477 if (!arch_spin_trylock(&trace_cmdline_lock)) 1518 if (!arch_spin_trylock(&trace_cmdline_lock))
1478 return; 1519 return 0;
1479 1520
1480 idx = map_pid_to_cmdline[tsk->pid]; 1521 idx = savedcmd->map_pid_to_cmdline[tsk->pid];
1481 if (idx == NO_CMDLINE_MAP) { 1522 if (idx == NO_CMDLINE_MAP) {
1482 idx = (cmdline_idx + 1) % SAVED_CMDLINES; 1523 idx = (savedcmd->cmdline_idx + 1) % savedcmd->cmdline_num;
1483 1524
1484 /* 1525 /*
1485 * Check whether the cmdline buffer at idx has a pid 1526 * Check whether the cmdline buffer at idx has a pid
@@ -1487,22 +1528,24 @@ static void trace_save_cmdline(struct task_struct *tsk)
1487 * need to clear the map_pid_to_cmdline. Otherwise we 1528 * need to clear the map_pid_to_cmdline. Otherwise we
1488 * would read the new comm for the old pid. 1529 * would read the new comm for the old pid.
1489 */ 1530 */
1490 pid = map_cmdline_to_pid[idx]; 1531 pid = savedcmd->map_cmdline_to_pid[idx];
1491 if (pid != NO_CMDLINE_MAP) 1532 if (pid != NO_CMDLINE_MAP)
1492 map_pid_to_cmdline[pid] = NO_CMDLINE_MAP; 1533 savedcmd->map_pid_to_cmdline[pid] = NO_CMDLINE_MAP;
1493 1534
1494 map_cmdline_to_pid[idx] = tsk->pid; 1535 savedcmd->map_cmdline_to_pid[idx] = tsk->pid;
1495 map_pid_to_cmdline[tsk->pid] = idx; 1536 savedcmd->map_pid_to_cmdline[tsk->pid] = idx;
1496 1537
1497 cmdline_idx = idx; 1538 savedcmd->cmdline_idx = idx;
1498 } 1539 }
1499 1540
1500 memcpy(&saved_cmdlines[idx], tsk->comm, TASK_COMM_LEN); 1541 set_cmdline(idx, tsk->comm);
1501 1542
1502 arch_spin_unlock(&trace_cmdline_lock); 1543 arch_spin_unlock(&trace_cmdline_lock);
1544
1545 return 1;
1503} 1546}
1504 1547
1505void trace_find_cmdline(int pid, char comm[]) 1548static void __trace_find_cmdline(int pid, char comm[])
1506{ 1549{
1507 unsigned map; 1550 unsigned map;
1508 1551
@@ -1521,13 +1564,19 @@ void trace_find_cmdline(int pid, char comm[])
1521 return; 1564 return;
1522 } 1565 }
1523 1566
1524 preempt_disable(); 1567 map = savedcmd->map_pid_to_cmdline[pid];
1525 arch_spin_lock(&trace_cmdline_lock);
1526 map = map_pid_to_cmdline[pid];
1527 if (map != NO_CMDLINE_MAP) 1568 if (map != NO_CMDLINE_MAP)
1528 strcpy(comm, saved_cmdlines[map]); 1569 strcpy(comm, get_saved_cmdlines(map));
1529 else 1570 else
1530 strcpy(comm, "<...>"); 1571 strcpy(comm, "<...>");
1572}
1573
1574void trace_find_cmdline(int pid, char comm[])
1575{
1576 preempt_disable();
1577 arch_spin_lock(&trace_cmdline_lock);
1578
1579 __trace_find_cmdline(pid, comm);
1531 1580
1532 arch_spin_unlock(&trace_cmdline_lock); 1581 arch_spin_unlock(&trace_cmdline_lock);
1533 preempt_enable(); 1582 preempt_enable();
@@ -1541,9 +1590,8 @@ void tracing_record_cmdline(struct task_struct *tsk)
1541 if (!__this_cpu_read(trace_cmdline_save)) 1590 if (!__this_cpu_read(trace_cmdline_save))
1542 return; 1591 return;
1543 1592
1544 __this_cpu_write(trace_cmdline_save, false); 1593 if (trace_save_cmdline(tsk))
1545 1594 __this_cpu_write(trace_cmdline_save, false);
1546 trace_save_cmdline(tsk);
1547} 1595}
1548 1596
1549void 1597void
@@ -1746,7 +1794,7 @@ static void __ftrace_trace_stack(struct ring_buffer *buffer,
1746 */ 1794 */
1747 barrier(); 1795 barrier();
1748 if (use_stack == 1) { 1796 if (use_stack == 1) {
1749 trace.entries = &__get_cpu_var(ftrace_stack).calls[0]; 1797 trace.entries = this_cpu_ptr(ftrace_stack.calls);
1750 trace.max_entries = FTRACE_STACK_MAX_ENTRIES; 1798 trace.max_entries = FTRACE_STACK_MAX_ENTRIES;
1751 1799
1752 if (regs) 1800 if (regs)
@@ -1995,7 +2043,21 @@ void trace_printk_init_buffers(void)
1995 if (alloc_percpu_trace_buffer()) 2043 if (alloc_percpu_trace_buffer())
1996 return; 2044 return;
1997 2045
1998 pr_info("ftrace: Allocated trace_printk buffers\n"); 2046 /* trace_printk() is for debug use only. Don't use it in production. */
2047
2048 pr_warning("\n**********************************************************\n");
2049 pr_warning("** NOTICE NOTICE NOTICE NOTICE NOTICE NOTICE NOTICE **\n");
2050 pr_warning("** **\n");
2051 pr_warning("** trace_printk() being used. Allocating extra memory. **\n");
2052 pr_warning("** **\n");
2053 pr_warning("** This means that this is a DEBUG kernel and it is **\n");
2054 pr_warning("** unsafe for produciton use. **\n");
2055 pr_warning("** **\n");
2056 pr_warning("** If you see this message and you are not debugging **\n");
2057 pr_warning("** the kernel, report this immediately to your vendor! **\n");
2058 pr_warning("** **\n");
2059 pr_warning("** NOTICE NOTICE NOTICE NOTICE NOTICE NOTICE NOTICE **\n");
2060 pr_warning("**********************************************************\n");
1999 2061
2000 /* Expand the buffers to set size */ 2062 /* Expand the buffers to set size */
2001 tracing_update_buffers(); 2063 tracing_update_buffers();
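For context, this banner is triggered by any compiled-in trace_printk() call. A minimal, hedged example of such a debug-only call (my_debug_hook() is an illustrative name):

static void my_debug_hook(void)
{
        /* written to the ftrace ring buffer, not the console */
        trace_printk("hit debug hook at %lu jiffies\n", jiffies);
}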
@@ -3333,7 +3395,7 @@ tracing_cpumask_write(struct file *filp, const char __user *ubuf,
3333 mutex_lock(&tracing_cpumask_update_lock); 3395 mutex_lock(&tracing_cpumask_update_lock);
3334 3396
3335 local_irq_disable(); 3397 local_irq_disable();
3336 arch_spin_lock(&ftrace_max_lock); 3398 arch_spin_lock(&tr->max_lock);
3337 for_each_tracing_cpu(cpu) { 3399 for_each_tracing_cpu(cpu) {
3338 /* 3400 /*
3339 * Increase/decrease the disabled counter if we are 3401 * Increase/decrease the disabled counter if we are
@@ -3350,7 +3412,7 @@ tracing_cpumask_write(struct file *filp, const char __user *ubuf,
3350 ring_buffer_record_enable_cpu(tr->trace_buffer.buffer, cpu); 3412 ring_buffer_record_enable_cpu(tr->trace_buffer.buffer, cpu);
3351 } 3413 }
3352 } 3414 }
3353 arch_spin_unlock(&ftrace_max_lock); 3415 arch_spin_unlock(&tr->max_lock);
3354 local_irq_enable(); 3416 local_irq_enable();
3355 3417
3356 cpumask_copy(tr->tracing_cpumask, tracing_cpumask_new); 3418 cpumask_copy(tr->tracing_cpumask, tracing_cpumask_new);
@@ -3592,6 +3654,7 @@ static const char readme_msg[] =
3592 " trace_options\t\t- Set format or modify how tracing happens\n" 3654 " trace_options\t\t- Set format or modify how tracing happens\n"
3593 "\t\t\t Disable an option by adding a suffix 'no' to the\n" 3655 "\t\t\t Disable an option by adding a suffix 'no' to the\n"
3594 "\t\t\t option name\n" 3656 "\t\t\t option name\n"
3657 " saved_cmdlines_size\t- echo command number in here to store comm-pid list\n"
3595#ifdef CONFIG_DYNAMIC_FTRACE 3658#ifdef CONFIG_DYNAMIC_FTRACE
3596 "\n available_filter_functions - list of functions that can be filtered on\n" 3659 "\n available_filter_functions - list of functions that can be filtered on\n"
3597 " set_ftrace_filter\t- echo function name in here to only trace these\n" 3660 " set_ftrace_filter\t- echo function name in here to only trace these\n"
@@ -3705,55 +3768,153 @@ static const struct file_operations tracing_readme_fops = {
3705 .llseek = generic_file_llseek, 3768 .llseek = generic_file_llseek,
3706}; 3769};
3707 3770
3771static void *saved_cmdlines_next(struct seq_file *m, void *v, loff_t *pos)
3772{
3773 unsigned int *ptr = v;
3774
3775 if (*pos || m->count)
3776 ptr++;
3777
3778 (*pos)++;
3779
3780 for (; ptr < &savedcmd->map_cmdline_to_pid[savedcmd->cmdline_num];
3781 ptr++) {
3782 if (*ptr == -1 || *ptr == NO_CMDLINE_MAP)
3783 continue;
3784
3785 return ptr;
3786 }
3787
3788 return NULL;
3789}
3790
3791static void *saved_cmdlines_start(struct seq_file *m, loff_t *pos)
3792{
3793 void *v;
3794 loff_t l = 0;
3795
3796 preempt_disable();
3797 arch_spin_lock(&trace_cmdline_lock);
3798
3799 v = &savedcmd->map_cmdline_to_pid[0];
3800 while (l <= *pos) {
3801 v = saved_cmdlines_next(m, v, &l);
3802 if (!v)
3803 return NULL;
3804 }
3805
3806 return v;
3807}
3808
3809static void saved_cmdlines_stop(struct seq_file *m, void *v)
3810{
3811 arch_spin_unlock(&trace_cmdline_lock);
3812 preempt_enable();
3813}
3814
3815static int saved_cmdlines_show(struct seq_file *m, void *v)
3816{
3817 char buf[TASK_COMM_LEN];
3818 unsigned int *pid = v;
3819
3820 __trace_find_cmdline(*pid, buf);
3821 seq_printf(m, "%d %s\n", *pid, buf);
3822 return 0;
3823}
3824
3825static const struct seq_operations tracing_saved_cmdlines_seq_ops = {
3826 .start = saved_cmdlines_start,
3827 .next = saved_cmdlines_next,
3828 .stop = saved_cmdlines_stop,
3829 .show = saved_cmdlines_show,
3830};
3831
3832static int tracing_saved_cmdlines_open(struct inode *inode, struct file *filp)
3833{
3834 if (tracing_disabled)
3835 return -ENODEV;
3836
3837 return seq_open(filp, &tracing_saved_cmdlines_seq_ops);
3838}
3839
3840static const struct file_operations tracing_saved_cmdlines_fops = {
3841 .open = tracing_saved_cmdlines_open,
3842 .read = seq_read,
3843 .llseek = seq_lseek,
3844 .release = seq_release,
3845};
3846
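From userspace the new seq_file interface reads like any other tracing file, one "<pid> <comm>" pair per line. A hedged example reader, assuming the usual debugfs mount point:

#include <stdio.h>

int main(void)
{
        char line[256];
        FILE *f = fopen("/sys/kernel/debug/tracing/saved_cmdlines", "r");

        if (!f)
                return 1;
        while (fgets(line, sizeof(line), f))
                fputs(line, stdout);    /* e.g. "1234 bash" */
        fclose(f);
        return 0;
}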
3708static ssize_t 3847static ssize_t
3709tracing_saved_cmdlines_read(struct file *file, char __user *ubuf, 3848tracing_saved_cmdlines_size_read(struct file *filp, char __user *ubuf,
3710 size_t cnt, loff_t *ppos) 3849 size_t cnt, loff_t *ppos)
3711{ 3850{
3712 char *buf_comm; 3851 char buf[64];
3713 char *file_buf; 3852 int r;
3714 char *buf; 3853
3715 int len = 0; 3854 arch_spin_lock(&trace_cmdline_lock);
3716 int pid; 3855 r = scnprintf(buf, sizeof(buf), "%u\n", savedcmd->cmdline_num);
3717 int i; 3856 arch_spin_unlock(&trace_cmdline_lock);
3718 3857
3719 file_buf = kmalloc(SAVED_CMDLINES*(16+TASK_COMM_LEN), GFP_KERNEL); 3858 return simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
3720 if (!file_buf) 3859}
3860
3861static void free_saved_cmdlines_buffer(struct saved_cmdlines_buffer *s)
3862{
3863 kfree(s->saved_cmdlines);
3864 kfree(s->map_cmdline_to_pid);
3865 kfree(s);
3866}
3867
3868static int tracing_resize_saved_cmdlines(unsigned int val)
3869{
3870 struct saved_cmdlines_buffer *s, *savedcmd_temp;
3871
3872 s = kmalloc(sizeof(*s), GFP_KERNEL);
3873 if (!s)
3721 return -ENOMEM; 3874 return -ENOMEM;
3722 3875
3723 buf_comm = kmalloc(TASK_COMM_LEN, GFP_KERNEL); 3876 if (allocate_cmdlines_buffer(val, s) < 0) {
3724 if (!buf_comm) { 3877 kfree(s);
3725 kfree(file_buf);
3726 return -ENOMEM; 3878 return -ENOMEM;
3727 } 3879 }
3728 3880
3729 buf = file_buf; 3881 arch_spin_lock(&trace_cmdline_lock);
3882 savedcmd_temp = savedcmd;
3883 savedcmd = s;
3884 arch_spin_unlock(&trace_cmdline_lock);
3885 free_saved_cmdlines_buffer(savedcmd_temp);
3730 3886
3731 for (i = 0; i < SAVED_CMDLINES; i++) { 3887 return 0;
3732 int r; 3888}
3733 3889
3734 pid = map_cmdline_to_pid[i]; 3890static ssize_t
3735 if (pid == -1 || pid == NO_CMDLINE_MAP) 3891tracing_saved_cmdlines_size_write(struct file *filp, const char __user *ubuf,
3736 continue; 3892 size_t cnt, loff_t *ppos)
3893{
3894 unsigned long val;
3895 int ret;
3737 3896
3738 trace_find_cmdline(pid, buf_comm); 3897 ret = kstrtoul_from_user(ubuf, cnt, 10, &val);
3739 r = sprintf(buf, "%d %s\n", pid, buf_comm); 3898 if (ret)
3740 buf += r; 3899 return ret;
3741 len += r;
3742 }
3743 3900
3744 len = simple_read_from_buffer(ubuf, cnt, ppos, 3901 /* must have at least 1 entry and no more than PID_MAX_DEFAULT */
3745 file_buf, len); 3902 if (!val || val > PID_MAX_DEFAULT)
3903 return -EINVAL;
3746 3904
3747 kfree(file_buf); 3905 ret = tracing_resize_saved_cmdlines((unsigned int)val);
3748 kfree(buf_comm); 3906 if (ret < 0)
3907 return ret;
3749 3908
3750 return len; 3909 *ppos += cnt;
3910
3911 return cnt;
3751} 3912}
3752 3913
3753static const struct file_operations tracing_saved_cmdlines_fops = { 3914static const struct file_operations tracing_saved_cmdlines_size_fops = {
3754 .open = tracing_open_generic, 3915 .open = tracing_open_generic,
3755 .read = tracing_saved_cmdlines_read, 3916 .read = tracing_saved_cmdlines_size_read,
3756 .llseek = generic_file_llseek, 3917 .write = tracing_saved_cmdlines_size_write,
3757}; 3918};
3758 3919
3759static ssize_t 3920static ssize_t
@@ -4225,29 +4386,11 @@ tracing_poll_pipe(struct file *filp, poll_table *poll_table)
4225 return trace_poll(iter, filp, poll_table); 4386 return trace_poll(iter, filp, poll_table);
4226} 4387}
4227 4388
4228/*
4229 * This is a make-shift waitqueue.
4230 * A tracer might use this callback on some rare cases:
4231 *
4232 * 1) the current tracer might hold the runqueue lock when it wakes up
4233 * a reader, hence a deadlock (sched, function, and function graph tracers)
4234 * 2) the function tracers, trace all functions, we don't want
4235 * the overhead of calling wake_up and friends
4236 * (and tracing them too)
4237 *
4238 * Anyway, this is really very primitive wakeup.
4239 */
4240void poll_wait_pipe(struct trace_iterator *iter)
4241{
4242 set_current_state(TASK_INTERRUPTIBLE);
4243 /* sleep for 100 msecs, and try again. */
4244 schedule_timeout(HZ / 10);
4245}
4246
4247/* Must be called with trace_types_lock mutex held. */ 4389/* Must be called with trace_types_lock mutex held. */
4248static int tracing_wait_pipe(struct file *filp) 4390static int tracing_wait_pipe(struct file *filp)
4249{ 4391{
4250 struct trace_iterator *iter = filp->private_data; 4392 struct trace_iterator *iter = filp->private_data;
4393 int ret;
4251 4394
4252 while (trace_empty(iter)) { 4395 while (trace_empty(iter)) {
4253 4396
@@ -4255,15 +4398,6 @@ static int tracing_wait_pipe(struct file *filp)
4255 return -EAGAIN; 4398 return -EAGAIN;
4256 } 4399 }
4257 4400
4258 mutex_unlock(&iter->mutex);
4259
4260 iter->trace->wait_pipe(iter);
4261
4262 mutex_lock(&iter->mutex);
4263
4264 if (signal_pending(current))
4265 return -EINTR;
4266
4267 /* 4401 /*
4268 * We block until we read something and tracing is disabled. 4402 * We block until we read something and tracing is disabled.
4269 * We still block if tracing is disabled, but we have never 4403 * We still block if tracing is disabled, but we have never
@@ -4275,6 +4409,18 @@ static int tracing_wait_pipe(struct file *filp)
4275 */ 4409 */
4276 if (!tracing_is_on() && iter->pos) 4410 if (!tracing_is_on() && iter->pos)
4277 break; 4411 break;
4412
4413 mutex_unlock(&iter->mutex);
4414
4415 ret = wait_on_pipe(iter);
4416
4417 mutex_lock(&iter->mutex);
4418
4419 if (ret)
4420 return ret;
4421
4422 if (signal_pending(current))
4423 return -EINTR;
4278 } 4424 }
4279 4425
4280 return 1; 4426 return 1;
@@ -5197,8 +5343,12 @@ tracing_buffers_read(struct file *filp, char __user *ubuf,
5197 goto out_unlock; 5343 goto out_unlock;
5198 } 5344 }
5199 mutex_unlock(&trace_types_lock); 5345 mutex_unlock(&trace_types_lock);
5200 iter->trace->wait_pipe(iter); 5346 ret = wait_on_pipe(iter);
5201 mutex_lock(&trace_types_lock); 5347 mutex_lock(&trace_types_lock);
5348 if (ret) {
5349 size = ret;
5350 goto out_unlock;
5351 }
5202 if (signal_pending(current)) { 5352 if (signal_pending(current)) {
5203 size = -EINTR; 5353 size = -EINTR;
5204 goto out_unlock; 5354 goto out_unlock;
@@ -5408,8 +5558,10 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
5408 goto out; 5558 goto out;
5409 } 5559 }
5410 mutex_unlock(&trace_types_lock); 5560 mutex_unlock(&trace_types_lock);
5411 iter->trace->wait_pipe(iter); 5561 ret = wait_on_pipe(iter);
5412 mutex_lock(&trace_types_lock); 5562 mutex_lock(&trace_types_lock);
5563 if (ret)
5564 goto out;
5413 if (signal_pending(current)) { 5565 if (signal_pending(current)) {
5414 ret = -EINTR; 5566 ret = -EINTR;
5415 goto out; 5567 goto out;
@@ -6102,6 +6254,28 @@ static int allocate_trace_buffers(struct trace_array *tr, int size)
6102 return 0; 6254 return 0;
6103} 6255}
6104 6256
6257static void free_trace_buffer(struct trace_buffer *buf)
6258{
6259 if (buf->buffer) {
6260 ring_buffer_free(buf->buffer);
6261 buf->buffer = NULL;
6262 free_percpu(buf->data);
6263 buf->data = NULL;
6264 }
6265}
6266
6267static void free_trace_buffers(struct trace_array *tr)
6268{
6269 if (!tr)
6270 return;
6271
6272 free_trace_buffer(&tr->trace_buffer);
6273
6274#ifdef CONFIG_TRACER_MAX_TRACE
6275 free_trace_buffer(&tr->max_buffer);
6276#endif
6277}
6278
6105static int new_instance_create(const char *name) 6279static int new_instance_create(const char *name)
6106{ 6280{
6107 struct trace_array *tr; 6281 struct trace_array *tr;
@@ -6131,6 +6305,8 @@ static int new_instance_create(const char *name)
6131 6305
6132 raw_spin_lock_init(&tr->start_lock); 6306 raw_spin_lock_init(&tr->start_lock);
6133 6307
6308 tr->max_lock = (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED;
6309
6134 tr->current_trace = &nop_trace; 6310 tr->current_trace = &nop_trace;
6135 6311
6136 INIT_LIST_HEAD(&tr->systems); 6312 INIT_LIST_HEAD(&tr->systems);
@@ -6158,8 +6334,7 @@ static int new_instance_create(const char *name)
6158 return 0; 6334 return 0;
6159 6335
6160 out_free_tr: 6336 out_free_tr:
6161 if (tr->trace_buffer.buffer) 6337 free_trace_buffers(tr);
6162 ring_buffer_free(tr->trace_buffer.buffer);
6163 free_cpumask_var(tr->tracing_cpumask); 6338 free_cpumask_var(tr->tracing_cpumask);
6164 kfree(tr->name); 6339 kfree(tr->name);
6165 kfree(tr); 6340 kfree(tr);
@@ -6199,8 +6374,7 @@ static int instance_delete(const char *name)
6199 event_trace_del_tracer(tr); 6374 event_trace_del_tracer(tr);
6200 ftrace_destroy_function_files(tr); 6375 ftrace_destroy_function_files(tr);
6201 debugfs_remove_recursive(tr->dir); 6376 debugfs_remove_recursive(tr->dir);
6202 free_percpu(tr->trace_buffer.data); 6377 free_trace_buffers(tr);
6203 ring_buffer_free(tr->trace_buffer.buffer);
6204 6378
6205 kfree(tr->name); 6379 kfree(tr->name);
6206 kfree(tr); 6380 kfree(tr);
@@ -6328,6 +6502,11 @@ init_tracer_debugfs(struct trace_array *tr, struct dentry *d_tracer)
6328 trace_create_file("tracing_on", 0644, d_tracer, 6502 trace_create_file("tracing_on", 0644, d_tracer,
6329 tr, &rb_simple_fops); 6503 tr, &rb_simple_fops);
6330 6504
6505#ifdef CONFIG_TRACER_MAX_TRACE
6506 trace_create_file("tracing_max_latency", 0644, d_tracer,
6507 &tr->max_latency, &tracing_max_lat_fops);
6508#endif
6509
6331 if (ftrace_create_function_files(tr, d_tracer)) 6510 if (ftrace_create_function_files(tr, d_tracer))
6332 WARN(1, "Could not allocate function filter files"); 6511 WARN(1, "Could not allocate function filter files");
6333 6512
@@ -6353,11 +6532,6 @@ static __init int tracer_init_debugfs(void)
6353 6532
6354 init_tracer_debugfs(&global_trace, d_tracer); 6533 init_tracer_debugfs(&global_trace, d_tracer);
6355 6534
6356#ifdef CONFIG_TRACER_MAX_TRACE
6357 trace_create_file("tracing_max_latency", 0644, d_tracer,
6358 &tracing_max_latency, &tracing_max_lat_fops);
6359#endif
6360
6361 trace_create_file("tracing_thresh", 0644, d_tracer, 6535 trace_create_file("tracing_thresh", 0644, d_tracer,
6362 &tracing_thresh, &tracing_max_lat_fops); 6536 &tracing_thresh, &tracing_max_lat_fops);
6363 6537
@@ -6367,6 +6541,9 @@ static __init int tracer_init_debugfs(void)
6367 trace_create_file("saved_cmdlines", 0444, d_tracer, 6541 trace_create_file("saved_cmdlines", 0444, d_tracer,
6368 NULL, &tracing_saved_cmdlines_fops); 6542 NULL, &tracing_saved_cmdlines_fops);
6369 6543
6544 trace_create_file("saved_cmdlines_size", 0644, d_tracer,
6545 NULL, &tracing_saved_cmdlines_size_fops);
6546
6370#ifdef CONFIG_DYNAMIC_FTRACE 6547#ifdef CONFIG_DYNAMIC_FTRACE
6371 trace_create_file("dyn_ftrace_total_info", 0444, d_tracer, 6548 trace_create_file("dyn_ftrace_total_info", 0444, d_tracer,
6372 &ftrace_update_tot_cnt, &tracing_dyn_info_fops); 6549 &ftrace_update_tot_cnt, &tracing_dyn_info_fops);
@@ -6603,18 +6780,19 @@ __init static int tracer_alloc_buffers(void)
6603 if (!temp_buffer) 6780 if (!temp_buffer)
6604 goto out_free_cpumask; 6781 goto out_free_cpumask;
6605 6782
6783 if (trace_create_savedcmd() < 0)
6784 goto out_free_temp_buffer;
6785
6606 /* TODO: make the number of buffers hot pluggable with CPUS */ 6786 /* TODO: make the number of buffers hot pluggable with CPUS */
6607 if (allocate_trace_buffers(&global_trace, ring_buf_size) < 0) { 6787 if (allocate_trace_buffers(&global_trace, ring_buf_size) < 0) {
6608 printk(KERN_ERR "tracer: failed to allocate ring buffer!\n"); 6788 printk(KERN_ERR "tracer: failed to allocate ring buffer!\n");
6609 WARN_ON(1); 6789 WARN_ON(1);
6610 goto out_free_temp_buffer; 6790 goto out_free_savedcmd;
6611 } 6791 }
6612 6792
6613 if (global_trace.buffer_disabled) 6793 if (global_trace.buffer_disabled)
6614 tracing_off(); 6794 tracing_off();
6615 6795
6616 trace_init_cmdlines();
6617
6618 if (trace_boot_clock) { 6796 if (trace_boot_clock) {
6619 ret = tracing_set_clock(&global_trace, trace_boot_clock); 6797 ret = tracing_set_clock(&global_trace, trace_boot_clock);
6620 if (ret < 0) 6798 if (ret < 0)
@@ -6629,6 +6807,10 @@ __init static int tracer_alloc_buffers(void)
6629 */ 6807 */
6630 global_trace.current_trace = &nop_trace; 6808 global_trace.current_trace = &nop_trace;
6631 6809
6810 global_trace.max_lock = (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED;
6811
6812 ftrace_init_global_array_ops(&global_trace);
6813
6632 register_tracer(&nop_trace); 6814 register_tracer(&nop_trace);
6633 6815
6634 /* All seems OK, enable tracing */ 6816 /* All seems OK, enable tracing */
@@ -6656,13 +6838,11 @@ __init static int tracer_alloc_buffers(void)
6656 6838
6657 return 0; 6839 return 0;
6658 6840
6841out_free_savedcmd:
6842 free_saved_cmdlines_buffer(savedcmd);
6659out_free_temp_buffer: 6843out_free_temp_buffer:
6660 ring_buffer_free(temp_buffer); 6844 ring_buffer_free(temp_buffer);
6661out_free_cpumask: 6845out_free_cpumask:
6662 free_percpu(global_trace.trace_buffer.data);
6663#ifdef CONFIG_TRACER_MAX_TRACE
6664 free_percpu(global_trace.max_buffer.data);
6665#endif
6666 free_cpumask_var(global_trace.tracing_cpumask); 6846 free_cpumask_var(global_trace.tracing_cpumask);
6667out_free_buffer_mask: 6847out_free_buffer_mask:
6668 free_cpumask_var(tracing_buffer_mask); 6848 free_cpumask_var(tracing_buffer_mask);
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 2e29d7ba5a52..9258f5a815db 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -190,7 +190,22 @@ struct trace_array {
190 */ 190 */
191 struct trace_buffer max_buffer; 191 struct trace_buffer max_buffer;
192 bool allocated_snapshot; 192 bool allocated_snapshot;
193 unsigned long max_latency;
193#endif 194#endif
195 /*
196 * max_lock is used to protect the swapping of buffers
197 * when taking a max snapshot. The buffers themselves are
198 * protected by per_cpu spinlocks. But the action of the swap
199 * needs its own lock.
200 *
201 * This is defined as a arch_spinlock_t in order to help
202 * with performance when lockdep debugging is enabled.
203 *
204 * It is also used in other places outside the update_max_tr
205 * so it needs to be defined outside of the
206 * CONFIG_TRACER_MAX_TRACE.
207 */
208 arch_spinlock_t max_lock;
194 int buffer_disabled; 209 int buffer_disabled;
195#ifdef CONFIG_FTRACE_SYSCALLS 210#ifdef CONFIG_FTRACE_SYSCALLS
196 int sys_refcount_enter; 211 int sys_refcount_enter;
@@ -237,6 +252,9 @@ static inline struct trace_array *top_trace_array(void)
237{ 252{
238 struct trace_array *tr; 253 struct trace_array *tr;
239 254
255 if (list_empty(&ftrace_trace_arrays))
256 return NULL;
257
240 tr = list_entry(ftrace_trace_arrays.prev, 258 tr = list_entry(ftrace_trace_arrays.prev,
241 typeof(*tr), list); 259 typeof(*tr), list);
242 WARN_ON(!(tr->flags & TRACE_ARRAY_FL_GLOBAL)); 260 WARN_ON(!(tr->flags & TRACE_ARRAY_FL_GLOBAL));
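Callers of top_trace_array() must now tolerate a NULL return when no trace arrays exist yet. A hedged caller sketch (my_event_init() is an illustrative name):

static int my_event_init(void)
{
        struct trace_array *tr = top_trace_array();

        if (!tr)        /* list still empty during early boot or teardown */
                return -ENODEV;

        return 0;
}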
@@ -323,7 +341,6 @@ struct tracer_flags {
323 * @stop: called when tracing is paused (echo 0 > tracing_enabled) 341 * @stop: called when tracing is paused (echo 0 > tracing_enabled)
324 * @open: called when the trace file is opened 342 * @open: called when the trace file is opened
325 * @pipe_open: called when the trace_pipe file is opened 343 * @pipe_open: called when the trace_pipe file is opened
326 * @wait_pipe: override how the user waits for traces on trace_pipe
327 * @close: called when the trace file is released 344 * @close: called when the trace file is released
328 * @pipe_close: called when the trace_pipe file is released 345 * @pipe_close: called when the trace_pipe file is released
329 * @read: override the default read callback on trace_pipe 346 * @read: override the default read callback on trace_pipe
@@ -342,7 +359,6 @@ struct tracer {
342 void (*stop)(struct trace_array *tr); 359 void (*stop)(struct trace_array *tr);
343 void (*open)(struct trace_iterator *iter); 360 void (*open)(struct trace_iterator *iter);
344 void (*pipe_open)(struct trace_iterator *iter); 361 void (*pipe_open)(struct trace_iterator *iter);
345 void (*wait_pipe)(struct trace_iterator *iter);
346 void (*close)(struct trace_iterator *iter); 362 void (*close)(struct trace_iterator *iter);
347 void (*pipe_close)(struct trace_iterator *iter); 363 void (*pipe_close)(struct trace_iterator *iter);
348 ssize_t (*read)(struct trace_iterator *iter, 364 ssize_t (*read)(struct trace_iterator *iter,
@@ -416,13 +432,7 @@ enum {
416 TRACE_FTRACE_IRQ_BIT, 432 TRACE_FTRACE_IRQ_BIT,
417 TRACE_FTRACE_SIRQ_BIT, 433 TRACE_FTRACE_SIRQ_BIT,
418 434
419 /* GLOBAL_BITs must be greater than FTRACE_BITs */ 435 /* INTERNAL_BITs must be greater than FTRACE_BITs */
420 TRACE_GLOBAL_BIT,
421 TRACE_GLOBAL_NMI_BIT,
422 TRACE_GLOBAL_IRQ_BIT,
423 TRACE_GLOBAL_SIRQ_BIT,
424
425 /* INTERNAL_BITs must be greater than GLOBAL_BITs */
426 TRACE_INTERNAL_BIT, 436 TRACE_INTERNAL_BIT,
427 TRACE_INTERNAL_NMI_BIT, 437 TRACE_INTERNAL_NMI_BIT,
428 TRACE_INTERNAL_IRQ_BIT, 438 TRACE_INTERNAL_IRQ_BIT,
@@ -449,9 +459,6 @@ enum {
449#define TRACE_FTRACE_START TRACE_FTRACE_BIT 459#define TRACE_FTRACE_START TRACE_FTRACE_BIT
450#define TRACE_FTRACE_MAX ((1 << (TRACE_FTRACE_START + TRACE_CONTEXT_BITS)) - 1) 460#define TRACE_FTRACE_MAX ((1 << (TRACE_FTRACE_START + TRACE_CONTEXT_BITS)) - 1)
451 461
452#define TRACE_GLOBAL_START TRACE_GLOBAL_BIT
453#define TRACE_GLOBAL_MAX ((1 << (TRACE_GLOBAL_START + TRACE_CONTEXT_BITS)) - 1)
454
455#define TRACE_LIST_START TRACE_INTERNAL_BIT 462#define TRACE_LIST_START TRACE_INTERNAL_BIT
456#define TRACE_LIST_MAX ((1 << (TRACE_LIST_START + TRACE_CONTEXT_BITS)) - 1) 463#define TRACE_LIST_MAX ((1 << (TRACE_LIST_START + TRACE_CONTEXT_BITS)) - 1)
457 464
@@ -560,8 +567,6 @@ void trace_init_global_iter(struct trace_iterator *iter);
560 567
561void tracing_iter_reset(struct trace_iterator *iter, int cpu); 568void tracing_iter_reset(struct trace_iterator *iter, int cpu);
562 569
563void poll_wait_pipe(struct trace_iterator *iter);
564
565void tracing_sched_switch_trace(struct trace_array *tr, 570void tracing_sched_switch_trace(struct trace_array *tr,
566 struct task_struct *prev, 571 struct task_struct *prev,
567 struct task_struct *next, 572 struct task_struct *next,
@@ -608,8 +613,6 @@ extern unsigned long nsecs_to_usecs(unsigned long nsecs);
608extern unsigned long tracing_thresh; 613extern unsigned long tracing_thresh;
609 614
610#ifdef CONFIG_TRACER_MAX_TRACE 615#ifdef CONFIG_TRACER_MAX_TRACE
611extern unsigned long tracing_max_latency;
612
613void update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu); 616void update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu);
614void update_max_tr_single(struct trace_array *tr, 617void update_max_tr_single(struct trace_array *tr,
615 struct task_struct *tsk, int cpu); 618 struct task_struct *tsk, int cpu);
@@ -724,6 +727,8 @@ extern unsigned long trace_flags;
724#define TRACE_GRAPH_PRINT_PROC 0x8 727#define TRACE_GRAPH_PRINT_PROC 0x8
725#define TRACE_GRAPH_PRINT_DURATION 0x10 728#define TRACE_GRAPH_PRINT_DURATION 0x10
726#define TRACE_GRAPH_PRINT_ABS_TIME 0x20 729#define TRACE_GRAPH_PRINT_ABS_TIME 0x20
730#define TRACE_GRAPH_PRINT_IRQS 0x40
731#define TRACE_GRAPH_PRINT_TAIL 0x80
727#define TRACE_GRAPH_PRINT_FILL_SHIFT 28 732#define TRACE_GRAPH_PRINT_FILL_SHIFT 28
728#define TRACE_GRAPH_PRINT_FILL_MASK (0x3 << TRACE_GRAPH_PRINT_FILL_SHIFT) 733#define TRACE_GRAPH_PRINT_FILL_MASK (0x3 << TRACE_GRAPH_PRINT_FILL_SHIFT)
729 734
@@ -823,6 +828,10 @@ extern int ftrace_is_dead(void);
823int ftrace_create_function_files(struct trace_array *tr, 828int ftrace_create_function_files(struct trace_array *tr,
824 struct dentry *parent); 829 struct dentry *parent);
825void ftrace_destroy_function_files(struct trace_array *tr); 830void ftrace_destroy_function_files(struct trace_array *tr);
831void ftrace_init_global_array_ops(struct trace_array *tr);
832void ftrace_init_array_ops(struct trace_array *tr, ftrace_func_t func);
833void ftrace_reset_array_ops(struct trace_array *tr);
834int using_ftrace_ops_list_func(void);
826#else 835#else
827static inline int ftrace_trace_task(struct task_struct *task) 836static inline int ftrace_trace_task(struct task_struct *task)
828{ 837{
@@ -836,6 +845,11 @@ ftrace_create_function_files(struct trace_array *tr,
836 return 0; 845 return 0;
837} 846}
838static inline void ftrace_destroy_function_files(struct trace_array *tr) { } 847static inline void ftrace_destroy_function_files(struct trace_array *tr) { }
848static inline __init void
849ftrace_init_global_array_ops(struct trace_array *tr) { }
850static inline void ftrace_reset_array_ops(struct trace_array *tr) { }
851/* ftrace_func_t type is not defined, use macro instead of static inline */
852#define ftrace_init_array_ops(tr, func) do { } while (0)
839#endif /* CONFIG_FUNCTION_TRACER */ 853#endif /* CONFIG_FUNCTION_TRACER */
840 854
841#if defined(CONFIG_FUNCTION_TRACER) && defined(CONFIG_DYNAMIC_FTRACE) 855#if defined(CONFIG_FUNCTION_TRACER) && defined(CONFIG_DYNAMIC_FTRACE)
diff --git a/kernel/trace/trace_benchmark.c b/kernel/trace/trace_benchmark.c
new file mode 100644
index 000000000000..40a14cbcf8e0
--- /dev/null
+++ b/kernel/trace/trace_benchmark.c
@@ -0,0 +1,198 @@
1#include <linux/delay.h>
2#include <linux/module.h>
3#include <linux/kthread.h>
4#include <linux/trace_clock.h>
5
6#define CREATE_TRACE_POINTS
7#include "trace_benchmark.h"
8
9static struct task_struct *bm_event_thread;
10
11static char bm_str[BENCHMARK_EVENT_STRLEN] = "START";
12
13static u64 bm_total;
14static u64 bm_totalsq;
15static u64 bm_last;
16static u64 bm_max;
17static u64 bm_min;
18static u64 bm_first;
19static u64 bm_cnt;
20static u64 bm_stddev;
21static unsigned int bm_avg;
22static unsigned int bm_std;
23
24/*
25 * This gets called in a loop recording the time it took to write
26 * the tracepoint. What it writes is the time statistics of the last
27 * tracepoint write. As there is nothing to write the first time
28 * it simply writes "START". As the first write is cold cache and
29 * the rest is hot, we save off that time in bm_first and it is
30 * reported as "first", which is shown in the second write to the
31 * tracepoint. The "first" field is writen within the statics from
32 * then on but never changes.
33 */
34static void trace_do_benchmark(void)
35{
36 u64 start;
37 u64 stop;
38 u64 delta;
39 u64 stddev;
40 u64 seed;
41 u64 last_seed;
42 unsigned int avg;
43 unsigned int std = 0;
44
45 /* Only run if the tracepoint is actually active */
46 if (!trace_benchmark_event_enabled())
47 return;
48
49 local_irq_disable();
50 start = trace_clock_local();
51 trace_benchmark_event(bm_str);
52 stop = trace_clock_local();
53 local_irq_enable();
54
55 bm_cnt++;
56
57 delta = stop - start;
58
59 /*
60 * The first read is cold cached, keep it separate from the
61 * other calculations.
62 */
63 if (bm_cnt == 1) {
64 bm_first = delta;
65 scnprintf(bm_str, BENCHMARK_EVENT_STRLEN,
66 "first=%llu [COLD CACHED]", bm_first);
67 return;
68 }
69
70 bm_last = delta;
71
72 if (delta > bm_max)
73 bm_max = delta;
74 if (!bm_min || delta < bm_min)
75 bm_min = delta;
76
77 /*
78 * When bm_cnt is greater than UINT_MAX, it breaks the statistics
79 * accounting. Freeze the statistics when that happens.
80 * We should have enough data for the avg and stddev anyway.
81 */
82 if (bm_cnt > UINT_MAX) {
83 scnprintf(bm_str, BENCHMARK_EVENT_STRLEN,
84 "last=%llu first=%llu max=%llu min=%llu ** avg=%u std=%d std^2=%lld",
85 bm_last, bm_first, bm_max, bm_min, bm_avg, bm_std, bm_stddev);
86 return;
87 }
88
89 bm_total += delta;
90 bm_totalsq += delta * delta;
91
92
93 if (bm_cnt > 1) {
94 /*
95 * Apply Welford's method to calculate standard deviation:
96 * s^2 = 1 / (n * (n-1)) * (n * \Sum (x_i)^2 - (\Sum x_i)^2)
97 */
98 stddev = (u64)bm_cnt * bm_totalsq - bm_total * bm_total;
99 do_div(stddev, (u32)bm_cnt);
100 do_div(stddev, (u32)bm_cnt - 1);
101 } else
102 stddev = 0;
103
104 delta = bm_total;
105 do_div(delta, bm_cnt);
106 avg = delta;
107
108 if (stddev > 0) {
109 int i = 0;
110 /*
111 * stddev is the square of standard deviation but
112 * we want the actual number. Use the average
113 * as our seed to find the std.
114 *
115 * The next try is:
116 * x = (x + N/x) / 2
117 *
118 * Where N is the squared number to find the square
119 * root of.
120 */
121 seed = avg;
122 do {
123 last_seed = seed;
124 seed = stddev;
125 if (!last_seed)
126 break;
127 do_div(seed, last_seed);
128 seed += last_seed;
129 do_div(seed, 2);
130 } while (i++ < 10 && last_seed != seed);
131
132 std = seed;
133 }
134
135 scnprintf(bm_str, BENCHMARK_EVENT_STRLEN,
136 "last=%llu first=%llu max=%llu min=%llu avg=%u std=%d std^2=%lld",
137 bm_last, bm_first, bm_max, bm_min, avg, std, stddev);
138
139 bm_std = std;
140 bm_avg = avg;
141 bm_stddev = stddev;
142}
143
144static int benchmark_event_kthread(void *arg)
145{
146 /* sleep a bit to make sure the tracepoint gets activated */
147 msleep(100);
148
149 while (!kthread_should_stop()) {
150
151 trace_do_benchmark();
152
153 /*
154 * We don't go to sleep, but let others
155 * run as well.
156 */
157 cond_resched();
158 }
159
160 return 0;
161}
162
163/*
164 * When the benchmark tracepoint is enabled, it calls this
165 * function and the thread that calls the tracepoint is created.
166 */
167void trace_benchmark_reg(void)
168{
169 bm_event_thread = kthread_run(benchmark_event_kthread,
170 NULL, "event_benchmark");
171 WARN_ON(!bm_event_thread);
172}
173
174/*
175 * When the benchmark tracepoint is disabled, it calls this
176 * function and the thread that calls the tracepoint is deleted
177 * and all the numbers are reset.
178 */
179void trace_benchmark_unreg(void)
180{
181 if (!bm_event_thread)
182 return;
183
184 kthread_stop(bm_event_thread);
185
186 strcpy(bm_str, "START");
187 bm_total = 0;
188 bm_totalsq = 0;
189 bm_last = 0;
190 bm_max = 0;
191 bm_min = 0;
192 bm_cnt = 0;
193 /* These don't need to be reset but reset them anyway */
194 bm_first = 0;
195 bm_std = 0;
196 bm_avg = 0;
197 bm_stddev = 0;
198}
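
The arithmetic in trace_do_benchmark() above boils down to two pieces: the sample variance computed from the running sums (bm_total, bm_totalsq) and an integer square root refined by the averaging iteration x = (x + N/x) / 2. A minimal user-space sketch of the same math, with the kernel's do_div() calls replaced by plain division and hypothetical delta values standing in for measured tracepoint writes:

#include <stdio.h>
#include <stdint.h>

/* Sample variance from running sums: s^2 = (n*sum(x^2) - (sum x)^2) / (n*(n-1)) */
static uint64_t variance(uint64_t n, uint64_t total, uint64_t totalsq)
{
	if (n < 2)
		return 0;
	return (n * totalsq - total * total) / (n * (n - 1));
}

/* Integer square root via the averaging iteration x = (x + N/x) / 2,
 * seeded with the average just as trace_do_benchmark() does. */
static uint64_t isqrt(uint64_t N, uint64_t seed)
{
	uint64_t last;
	int i = 0;

	if (!N || !seed)
		return 0;
	do {
		last = seed;
		seed = (N / last + last) / 2;
	} while (i++ < 10 && last != seed);

	return seed;
}

int main(void)
{
	/* Hypothetical per-write deltas in ns (illustrative only) */
	uint64_t deltas[] = { 120, 130, 125, 140, 128 };
	uint64_t n = 0, total = 0, totalsq = 0;
	unsigned int i;

	for (i = 0; i < sizeof(deltas) / sizeof(deltas[0]); i++) {
		n++;
		total += deltas[i];
		totalsq += deltas[i] * deltas[i];
	}

	uint64_t var = variance(n, total, totalsq);
	uint64_t avg = total / n;

	printf("avg=%llu std^2=%llu std=%llu\n",
	       (unsigned long long)avg, (unsigned long long)var,
	       (unsigned long long)isqrt(var, avg));
	return 0;
}

For the sample deltas this prints avg=128 std^2=54 std=7, the same quantities the scnprintf() format string in the kernel code reports.
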
diff --git a/kernel/trace/trace_benchmark.h b/kernel/trace/trace_benchmark.h
new file mode 100644
index 000000000000..3c1df1df4e29
--- /dev/null
+++ b/kernel/trace/trace_benchmark.h
@@ -0,0 +1,41 @@
1#undef TRACE_SYSTEM
2#define TRACE_SYSTEM benchmark
3
4#if !defined(_TRACE_BENCHMARK_H) || defined(TRACE_HEADER_MULTI_READ)
5#define _TRACE_BENCHMARK_H
6
7#include <linux/tracepoint.h>
8
9extern void trace_benchmark_reg(void);
10extern void trace_benchmark_unreg(void);
11
12#define BENCHMARK_EVENT_STRLEN 128
13
14TRACE_EVENT_FN(benchmark_event,
15
16 TP_PROTO(const char *str),
17
18 TP_ARGS(str),
19
20 TP_STRUCT__entry(
21 __array( char, str, BENCHMARK_EVENT_STRLEN )
22 ),
23
24 TP_fast_assign(
25 memcpy(__entry->str, str, BENCHMARK_EVENT_STRLEN);
26 ),
27
28 TP_printk("%s", __entry->str),
29
30 trace_benchmark_reg, trace_benchmark_unreg
31);
32
33#endif /* _TRACE_BENCHMARK_H */
34
35#undef TRACE_INCLUDE_FILE
36#undef TRACE_INCLUDE_PATH
37#define TRACE_INCLUDE_PATH .
38#define TRACE_INCLUDE_FILE trace_benchmark
39
40/* This part must be outside protection */
41#include <trace/define_trace.h>
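
Because the event is declared with TRACE_EVENT_FN, the reg/unreg callbacks fire when the event is switched on or off from user space, which is what starts and stops the benchmark kthread. A hedged user-space sketch of toggling it through tracefs; the path assumes the usual debugfs mount at /sys/kernel/debug and is not part of the patch itself:

#include <stdio.h>

/* Enable (1) or disable (0) the benchmark_event tracepoint. Writing 1 ends
 * up in trace_benchmark_reg() and spawns the event_benchmark kthread;
 * writing 0 ends up in trace_benchmark_unreg() and stops it. */
static int set_benchmark_event(int on)
{
	const char *path =
		"/sys/kernel/debug/tracing/events/benchmark/benchmark_event/enable";
	FILE *f = fopen(path, "w");

	if (!f) {
		perror("fopen");
		return -1;
	}
	fprintf(f, "%d\n", on);
	return fclose(f);
}

int main(void)
{
	return set_benchmark_event(1) ? 1 : 0;
}
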
diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c
index c894614de14d..5d12bb407b44 100644
--- a/kernel/trace/trace_event_perf.c
+++ b/kernel/trace/trace_event_perf.c
@@ -248,8 +248,8 @@ void perf_trace_del(struct perf_event *p_event, int flags)
248 tp_event->class->reg(tp_event, TRACE_REG_PERF_DEL, p_event); 248 tp_event->class->reg(tp_event, TRACE_REG_PERF_DEL, p_event);
249} 249}
250 250
251__kprobes void *perf_trace_buf_prepare(int size, unsigned short type, 251void *perf_trace_buf_prepare(int size, unsigned short type,
252 struct pt_regs *regs, int *rctxp) 252 struct pt_regs *regs, int *rctxp)
253{ 253{
254 struct trace_entry *entry; 254 struct trace_entry *entry;
255 unsigned long flags; 255 unsigned long flags;
@@ -281,6 +281,7 @@ __kprobes void *perf_trace_buf_prepare(int size, unsigned short type,
281 return raw_data; 281 return raw_data;
282} 282}
283EXPORT_SYMBOL_GPL(perf_trace_buf_prepare); 283EXPORT_SYMBOL_GPL(perf_trace_buf_prepare);
284NOKPROBE_SYMBOL(perf_trace_buf_prepare);
284 285
285#ifdef CONFIG_FUNCTION_TRACER 286#ifdef CONFIG_FUNCTION_TRACER
286static void 287static void
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 3ddfd8f62c05..2de53628689f 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -470,6 +470,7 @@ static void remove_event_file_dir(struct ftrace_event_file *file)
470 470
471 list_del(&file->list); 471 list_del(&file->list);
472 remove_subsystem(file->system); 472 remove_subsystem(file->system);
473 free_event_filter(file->filter);
473 kmem_cache_free(file_cachep, file); 474 kmem_cache_free(file_cachep, file);
474} 475}
475 476
@@ -574,6 +575,9 @@ int trace_set_clr_event(const char *system, const char *event, int set)
574{ 575{
575 struct trace_array *tr = top_trace_array(); 576 struct trace_array *tr = top_trace_array();
576 577
578 if (!tr)
579 return -ENODEV;
580
577 return __ftrace_set_clr_event(tr, NULL, system, event, set); 581 return __ftrace_set_clr_event(tr, NULL, system, event, set);
578} 582}
579EXPORT_SYMBOL_GPL(trace_set_clr_event); 583EXPORT_SYMBOL_GPL(trace_set_clr_event);
@@ -2065,6 +2069,9 @@ event_enable_func(struct ftrace_hash *hash,
2065 bool enable; 2069 bool enable;
2066 int ret; 2070 int ret;
2067 2071
2072 if (!tr)
2073 return -ENODEV;
2074
2068 /* hash funcs only work with set_ftrace_filter */ 2075 /* hash funcs only work with set_ftrace_filter */
2069 if (!enabled || !param) 2076 if (!enabled || !param)
2070 return -EINVAL; 2077 return -EINVAL;
@@ -2396,6 +2403,9 @@ static __init int event_trace_enable(void)
2396 char *token; 2403 char *token;
2397 int ret; 2404 int ret;
2398 2405
2406 if (!tr)
2407 return -ENODEV;
2408
2399 for_each_event(iter, __start_ftrace_events, __stop_ftrace_events) { 2409 for_each_event(iter, __start_ftrace_events, __stop_ftrace_events) {
2400 2410
2401 call = *iter; 2411 call = *iter;
@@ -2442,6 +2452,8 @@ static __init int event_trace_init(void)
2442 int ret; 2452 int ret;
2443 2453
2444 tr = top_trace_array(); 2454 tr = top_trace_array();
2455 if (!tr)
2456 return -ENODEV;
2445 2457
2446 d_tracer = tracing_init_dentry(); 2458 d_tracer = tracing_init_dentry();
2447 if (!d_tracer) 2459 if (!d_tracer)
@@ -2535,6 +2547,8 @@ static __init void event_trace_self_tests(void)
2535 int ret; 2547 int ret;
2536 2548
2537 tr = top_trace_array(); 2549 tr = top_trace_array();
2550 if (!tr)
2551 return;
2538 2552
2539 pr_info("Running tests on trace events:\n"); 2553 pr_info("Running tests on trace events:\n");
2540 2554
diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c
index ffd56351b521..57f0ec962d2c 100644
--- a/kernel/trace/trace_functions.c
+++ b/kernel/trace/trace_functions.c
@@ -26,8 +26,6 @@ function_trace_call(unsigned long ip, unsigned long parent_ip,
26static void 26static void
27function_stack_trace_call(unsigned long ip, unsigned long parent_ip, 27function_stack_trace_call(unsigned long ip, unsigned long parent_ip,
28 struct ftrace_ops *op, struct pt_regs *pt_regs); 28 struct ftrace_ops *op, struct pt_regs *pt_regs);
29static struct ftrace_ops trace_ops;
30static struct ftrace_ops trace_stack_ops;
31static struct tracer_flags func_flags; 29static struct tracer_flags func_flags;
32 30
33/* Our option */ 31/* Our option */
@@ -83,28 +81,24 @@ void ftrace_destroy_function_files(struct trace_array *tr)
83 81
84static int function_trace_init(struct trace_array *tr) 82static int function_trace_init(struct trace_array *tr)
85{ 83{
86 struct ftrace_ops *ops; 84 ftrace_func_t func;
87
88 if (tr->flags & TRACE_ARRAY_FL_GLOBAL) {
89 /* There's only one global tr */
90 if (!trace_ops.private) {
91 trace_ops.private = tr;
92 trace_stack_ops.private = tr;
93 }
94 85
95 if (func_flags.val & TRACE_FUNC_OPT_STACK) 86 /*
96 ops = &trace_stack_ops; 87 * Instance trace_arrays get their ops allocated
97 else 88 * at instance creation. Unless it failed
98 ops = &trace_ops; 89 * the allocation.
99 tr->ops = ops; 90 */
100 } else if (!tr->ops) { 91 if (!tr->ops)
101 /*
102 * Instance trace_arrays get their ops allocated
103 * at instance creation. Unless it failed
104 * the allocation.
105 */
106 return -ENOMEM; 92 return -ENOMEM;
107 } 93
94 /* Currently only the global instance can do stack tracing */
95 if (tr->flags & TRACE_ARRAY_FL_GLOBAL &&
96 func_flags.val & TRACE_FUNC_OPT_STACK)
97 func = function_stack_trace_call;
98 else
99 func = function_trace_call;
100
101 ftrace_init_array_ops(tr, func);
108 102
109 tr->trace_buffer.cpu = get_cpu(); 103 tr->trace_buffer.cpu = get_cpu();
110 put_cpu(); 104 put_cpu();
@@ -118,6 +112,7 @@ static void function_trace_reset(struct trace_array *tr)
118{ 112{
119 tracing_stop_function_trace(tr); 113 tracing_stop_function_trace(tr);
120 tracing_stop_cmdline_record(); 114 tracing_stop_cmdline_record();
115 ftrace_reset_array_ops(tr);
121} 116}
122 117
123static void function_trace_start(struct trace_array *tr) 118static void function_trace_start(struct trace_array *tr)
@@ -199,18 +194,6 @@ function_stack_trace_call(unsigned long ip, unsigned long parent_ip,
199 local_irq_restore(flags); 194 local_irq_restore(flags);
200} 195}
201 196
202static struct ftrace_ops trace_ops __read_mostly =
203{
204 .func = function_trace_call,
205 .flags = FTRACE_OPS_FL_GLOBAL | FTRACE_OPS_FL_RECURSION_SAFE,
206};
207
208static struct ftrace_ops trace_stack_ops __read_mostly =
209{
210 .func = function_stack_trace_call,
211 .flags = FTRACE_OPS_FL_GLOBAL | FTRACE_OPS_FL_RECURSION_SAFE,
212};
213
214static struct tracer_opt func_opts[] = { 197static struct tracer_opt func_opts[] = {
215#ifdef CONFIG_STACKTRACE 198#ifdef CONFIG_STACKTRACE
216 { TRACER_OPT(func_stack_trace, TRACE_FUNC_OPT_STACK) }, 199 { TRACER_OPT(func_stack_trace, TRACE_FUNC_OPT_STACK) },
@@ -248,10 +231,10 @@ func_set_flag(struct trace_array *tr, u32 old_flags, u32 bit, int set)
248 unregister_ftrace_function(tr->ops); 231 unregister_ftrace_function(tr->ops);
249 232
250 if (set) { 233 if (set) {
251 tr->ops = &trace_stack_ops; 234 tr->ops->func = function_stack_trace_call;
252 register_ftrace_function(tr->ops); 235 register_ftrace_function(tr->ops);
253 } else { 236 } else {
254 tr->ops = &trace_ops; 237 tr->ops->func = function_trace_call;
255 register_ftrace_function(tr->ops); 238 register_ftrace_function(tr->ops);
256 } 239 }
257 240
@@ -269,7 +252,6 @@ static struct tracer function_trace __tracer_data =
269 .init = function_trace_init, 252 .init = function_trace_init,
270 .reset = function_trace_reset, 253 .reset = function_trace_reset,
271 .start = function_trace_start, 254 .start = function_trace_start,
272 .wait_pipe = poll_wait_pipe,
273 .flags = &func_flags, 255 .flags = &func_flags,
274 .set_flag = func_set_flag, 256 .set_flag = func_set_flag,
275 .allow_instances = true, 257 .allow_instances = true,
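
The trace_functions.c conversion shows the intended pattern for per-instance tracers: instead of a file-local static ftrace_ops carrying FTRACE_OPS_FL_GLOBAL, the tracer installs its callback on the instance's own tr->ops with ftrace_init_array_ops() in ->init and tears it down with ftrace_reset_array_ops() in ->reset. A hedged sketch of that shape for a hypothetical tracer (callback and function names are illustrative; it also assumes tr->ops->private points back at the trace_array, which is how the function tracer's callback finds its instance):

/* Sketch only -- not compilable outside the tracing core. */
static void my_trace_call(unsigned long ip, unsigned long parent_ip,
			  struct ftrace_ops *op, struct pt_regs *pt_regs)
{
	struct trace_array *tr = op->private;	/* assumed back pointer to the instance */

	(void)tr;	/* a real tracer would log ip/parent_ip into tr's buffer */
}

static int my_tracer_init(struct trace_array *tr)
{
	if (!tr->ops)				/* instance allocation failed */
		return -ENOMEM;

	ftrace_init_array_ops(tr, my_trace_call);
	return register_ftrace_function(tr->ops);
}

static void my_tracer_reset(struct trace_array *tr)
{
	unregister_ftrace_function(tr->ops);
	ftrace_reset_array_ops(tr);
}
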
diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c
index deff11200261..4de3e57f723c 100644
--- a/kernel/trace/trace_functions_graph.c
+++ b/kernel/trace/trace_functions_graph.c
@@ -38,15 +38,6 @@ struct fgraph_data {
38 38
39#define TRACE_GRAPH_INDENT 2 39#define TRACE_GRAPH_INDENT 2
40 40
41/* Flag options */
42#define TRACE_GRAPH_PRINT_OVERRUN 0x1
43#define TRACE_GRAPH_PRINT_CPU 0x2
44#define TRACE_GRAPH_PRINT_OVERHEAD 0x4
45#define TRACE_GRAPH_PRINT_PROC 0x8
46#define TRACE_GRAPH_PRINT_DURATION 0x10
47#define TRACE_GRAPH_PRINT_ABS_TIME 0x20
48#define TRACE_GRAPH_PRINT_IRQS 0x40
49
50static unsigned int max_depth; 41static unsigned int max_depth;
51 42
52static struct tracer_opt trace_opts[] = { 43static struct tracer_opt trace_opts[] = {
@@ -64,11 +55,13 @@ static struct tracer_opt trace_opts[] = {
64 { TRACER_OPT(funcgraph-abstime, TRACE_GRAPH_PRINT_ABS_TIME) }, 55 { TRACER_OPT(funcgraph-abstime, TRACE_GRAPH_PRINT_ABS_TIME) },
65 /* Display interrupts */ 56 /* Display interrupts */
66 { TRACER_OPT(funcgraph-irqs, TRACE_GRAPH_PRINT_IRQS) }, 57 { TRACER_OPT(funcgraph-irqs, TRACE_GRAPH_PRINT_IRQS) },
58 /* Display function name after trailing } */
59 { TRACER_OPT(funcgraph-tail, TRACE_GRAPH_PRINT_TAIL) },
67 { } /* Empty entry */ 60 { } /* Empty entry */
68}; 61};
69 62
70static struct tracer_flags tracer_flags = { 63static struct tracer_flags tracer_flags = {
71 /* Don't display overruns and proc by default */ 64 /* Don't display overruns, proc, or tail by default */
72 .val = TRACE_GRAPH_PRINT_CPU | TRACE_GRAPH_PRINT_OVERHEAD | 65 .val = TRACE_GRAPH_PRINT_CPU | TRACE_GRAPH_PRINT_OVERHEAD |
73 TRACE_GRAPH_PRINT_DURATION | TRACE_GRAPH_PRINT_IRQS, 66 TRACE_GRAPH_PRINT_DURATION | TRACE_GRAPH_PRINT_IRQS,
74 .opts = trace_opts 67 .opts = trace_opts
@@ -1176,9 +1169,10 @@ print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s,
1176 * If the return function does not have a matching entry, 1169 * If the return function does not have a matching entry,
1177 * then the entry was lost. Instead of just printing 1170 * then the entry was lost. Instead of just printing
1178 * the '}' and letting the user guess what function this 1171 * the '}' and letting the user guess what function this
1179 * belongs to, write out the function name. 1172 * belongs to, write out the function name. Always do
1173 * that if the funcgraph-tail option is enabled.
1180 */ 1174 */
1181 if (func_match) { 1175 if (func_match && !(flags & TRACE_GRAPH_PRINT_TAIL)) {
1182 ret = trace_seq_puts(s, "}\n"); 1176 ret = trace_seq_puts(s, "}\n");
1183 if (!ret) 1177 if (!ret)
1184 return TRACE_TYPE_PARTIAL_LINE; 1178 return TRACE_TYPE_PARTIAL_LINE;
@@ -1505,7 +1499,6 @@ static struct tracer graph_trace __tracer_data = {
1505 .pipe_open = graph_trace_open, 1499 .pipe_open = graph_trace_open,
1506 .close = graph_trace_close, 1500 .close = graph_trace_close,
1507 .pipe_close = graph_trace_close, 1501 .pipe_close = graph_trace_close,
1508 .wait_pipe = poll_wait_pipe,
1509 .init = graph_trace_init, 1502 .init = graph_trace_init,
1510 .reset = graph_trace_reset, 1503 .reset = graph_trace_reset,
1511 .print_line = print_graph_function, 1504 .print_line = print_graph_function,
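
The new funcgraph-tail option is toggled through the tracer's options while function_graph is the current tracer (echo funcgraph-tail into trace_options to enable, nofuncgraph-tail to disable). With it off, a nested return prints a bare closing brace; with it on, the brace is annotated with the function name, roughly as below (timings and function name are illustrative and the format is approximated, not taken from the patch):

 1)   2.074 us    |  }
 1)   2.074 us    |  } /* kmem_cache_free */
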
diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c
index 8ff02cbb892f..9bb104f748d0 100644
--- a/kernel/trace/trace_irqsoff.c
+++ b/kernel/trace/trace_irqsoff.c
@@ -151,12 +151,6 @@ irqsoff_tracer_call(unsigned long ip, unsigned long parent_ip,
151 151
152 atomic_dec(&data->disabled); 152 atomic_dec(&data->disabled);
153} 153}
154
155static struct ftrace_ops trace_ops __read_mostly =
156{
157 .func = irqsoff_tracer_call,
158 .flags = FTRACE_OPS_FL_GLOBAL | FTRACE_OPS_FL_RECURSION_SAFE,
159};
160#endif /* CONFIG_FUNCTION_TRACER */ 154#endif /* CONFIG_FUNCTION_TRACER */
161 155
162#ifdef CONFIG_FUNCTION_GRAPH_TRACER 156#ifdef CONFIG_FUNCTION_GRAPH_TRACER
@@ -176,7 +170,7 @@ irqsoff_set_flag(struct trace_array *tr, u32 old_flags, u32 bit, int set)
176 for_each_possible_cpu(cpu) 170 for_each_possible_cpu(cpu)
177 per_cpu(tracing_cpu, cpu) = 0; 171 per_cpu(tracing_cpu, cpu) = 0;
178 172
179 tracing_max_latency = 0; 173 tr->max_latency = 0;
180 tracing_reset_online_cpus(&irqsoff_trace->trace_buffer); 174 tracing_reset_online_cpus(&irqsoff_trace->trace_buffer);
181 175
182 return start_irqsoff_tracer(irqsoff_trace, set); 176 return start_irqsoff_tracer(irqsoff_trace, set);
@@ -303,13 +297,13 @@ static void irqsoff_print_header(struct seq_file *s)
303/* 297/*
304 * Should this new latency be reported/recorded? 298 * Should this new latency be reported/recorded?
305 */ 299 */
306static int report_latency(cycle_t delta) 300static int report_latency(struct trace_array *tr, cycle_t delta)
307{ 301{
308 if (tracing_thresh) { 302 if (tracing_thresh) {
309 if (delta < tracing_thresh) 303 if (delta < tracing_thresh)
310 return 0; 304 return 0;
311 } else { 305 } else {
312 if (delta <= tracing_max_latency) 306 if (delta <= tr->max_latency)
313 return 0; 307 return 0;
314 } 308 }
315 return 1; 309 return 1;
@@ -333,13 +327,13 @@ check_critical_timing(struct trace_array *tr,
333 327
334 pc = preempt_count(); 328 pc = preempt_count();
335 329
336 if (!report_latency(delta)) 330 if (!report_latency(tr, delta))
337 goto out; 331 goto out;
338 332
339 raw_spin_lock_irqsave(&max_trace_lock, flags); 333 raw_spin_lock_irqsave(&max_trace_lock, flags);
340 334
341 /* check if we are still the max latency */ 335 /* check if we are still the max latency */
342 if (!report_latency(delta)) 336 if (!report_latency(tr, delta))
343 goto out_unlock; 337 goto out_unlock;
344 338
345 __trace_function(tr, CALLER_ADDR0, parent_ip, flags, pc); 339 __trace_function(tr, CALLER_ADDR0, parent_ip, flags, pc);
@@ -352,7 +346,7 @@ check_critical_timing(struct trace_array *tr,
352 data->critical_end = parent_ip; 346 data->critical_end = parent_ip;
353 347
354 if (likely(!is_tracing_stopped())) { 348 if (likely(!is_tracing_stopped())) {
355 tracing_max_latency = delta; 349 tr->max_latency = delta;
356 update_max_tr_single(tr, current, cpu); 350 update_max_tr_single(tr, current, cpu);
357 } 351 }
358 352
@@ -531,7 +525,7 @@ void trace_preempt_off(unsigned long a0, unsigned long a1)
531} 525}
532#endif /* CONFIG_PREEMPT_TRACER */ 526#endif /* CONFIG_PREEMPT_TRACER */
533 527
534static int register_irqsoff_function(int graph, int set) 528static int register_irqsoff_function(struct trace_array *tr, int graph, int set)
535{ 529{
536 int ret; 530 int ret;
537 531
@@ -543,7 +537,7 @@ static int register_irqsoff_function(int graph, int set)
543 ret = register_ftrace_graph(&irqsoff_graph_return, 537 ret = register_ftrace_graph(&irqsoff_graph_return,
544 &irqsoff_graph_entry); 538 &irqsoff_graph_entry);
545 else 539 else
546 ret = register_ftrace_function(&trace_ops); 540 ret = register_ftrace_function(tr->ops);
547 541
548 if (!ret) 542 if (!ret)
549 function_enabled = true; 543 function_enabled = true;
@@ -551,7 +545,7 @@ static int register_irqsoff_function(int graph, int set)
551 return ret; 545 return ret;
552} 546}
553 547
554static void unregister_irqsoff_function(int graph) 548static void unregister_irqsoff_function(struct trace_array *tr, int graph)
555{ 549{
556 if (!function_enabled) 550 if (!function_enabled)
557 return; 551 return;
@@ -559,17 +553,17 @@ static void unregister_irqsoff_function(int graph)
559 if (graph) 553 if (graph)
560 unregister_ftrace_graph(); 554 unregister_ftrace_graph();
561 else 555 else
562 unregister_ftrace_function(&trace_ops); 556 unregister_ftrace_function(tr->ops);
563 557
564 function_enabled = false; 558 function_enabled = false;
565} 559}
566 560
567static void irqsoff_function_set(int set) 561static void irqsoff_function_set(struct trace_array *tr, int set)
568{ 562{
569 if (set) 563 if (set)
570 register_irqsoff_function(is_graph(), 1); 564 register_irqsoff_function(tr, is_graph(), 1);
571 else 565 else
572 unregister_irqsoff_function(is_graph()); 566 unregister_irqsoff_function(tr, is_graph());
573} 567}
574 568
575static int irqsoff_flag_changed(struct trace_array *tr, u32 mask, int set) 569static int irqsoff_flag_changed(struct trace_array *tr, u32 mask, int set)
@@ -577,7 +571,7 @@ static int irqsoff_flag_changed(struct trace_array *tr, u32 mask, int set)
577 struct tracer *tracer = tr->current_trace; 571 struct tracer *tracer = tr->current_trace;
578 572
579 if (mask & TRACE_ITER_FUNCTION) 573 if (mask & TRACE_ITER_FUNCTION)
580 irqsoff_function_set(set); 574 irqsoff_function_set(tr, set);
581 575
582 return trace_keep_overwrite(tracer, mask, set); 576 return trace_keep_overwrite(tracer, mask, set);
583} 577}
@@ -586,7 +580,7 @@ static int start_irqsoff_tracer(struct trace_array *tr, int graph)
586{ 580{
587 int ret; 581 int ret;
588 582
589 ret = register_irqsoff_function(graph, 0); 583 ret = register_irqsoff_function(tr, graph, 0);
590 584
591 if (!ret && tracing_is_enabled()) 585 if (!ret && tracing_is_enabled())
592 tracer_enabled = 1; 586 tracer_enabled = 1;
@@ -600,25 +594,37 @@ static void stop_irqsoff_tracer(struct trace_array *tr, int graph)
600{ 594{
601 tracer_enabled = 0; 595 tracer_enabled = 0;
602 596
603 unregister_irqsoff_function(graph); 597 unregister_irqsoff_function(tr, graph);
604} 598}
605 599
606static void __irqsoff_tracer_init(struct trace_array *tr) 600static bool irqsoff_busy;
601
602static int __irqsoff_tracer_init(struct trace_array *tr)
607{ 603{
604 if (irqsoff_busy)
605 return -EBUSY;
606
608 save_flags = trace_flags; 607 save_flags = trace_flags;
609 608
610 /* non overwrite screws up the latency tracers */ 609 /* non overwrite screws up the latency tracers */
611 set_tracer_flag(tr, TRACE_ITER_OVERWRITE, 1); 610 set_tracer_flag(tr, TRACE_ITER_OVERWRITE, 1);
612 set_tracer_flag(tr, TRACE_ITER_LATENCY_FMT, 1); 611 set_tracer_flag(tr, TRACE_ITER_LATENCY_FMT, 1);
613 612
614 tracing_max_latency = 0; 613 tr->max_latency = 0;
615 irqsoff_trace = tr; 614 irqsoff_trace = tr;
616 /* make sure that the tracer is visible */ 615 /* make sure that the tracer is visible */
617 smp_wmb(); 616 smp_wmb();
618 tracing_reset_online_cpus(&tr->trace_buffer); 617 tracing_reset_online_cpus(&tr->trace_buffer);
619 618
620 if (start_irqsoff_tracer(tr, is_graph())) 619 ftrace_init_array_ops(tr, irqsoff_tracer_call);
620
621 /* Only toplevel instance supports graph tracing */
622 if (start_irqsoff_tracer(tr, (tr->flags & TRACE_ARRAY_FL_GLOBAL &&
623 is_graph())))
621 printk(KERN_ERR "failed to start irqsoff tracer\n"); 624 printk(KERN_ERR "failed to start irqsoff tracer\n");
625
626 irqsoff_busy = true;
627 return 0;
622} 628}
623 629
624static void irqsoff_tracer_reset(struct trace_array *tr) 630static void irqsoff_tracer_reset(struct trace_array *tr)
@@ -630,6 +636,9 @@ static void irqsoff_tracer_reset(struct trace_array *tr)
630 636
631 set_tracer_flag(tr, TRACE_ITER_LATENCY_FMT, lat_flag); 637 set_tracer_flag(tr, TRACE_ITER_LATENCY_FMT, lat_flag);
632 set_tracer_flag(tr, TRACE_ITER_OVERWRITE, overwrite_flag); 638 set_tracer_flag(tr, TRACE_ITER_OVERWRITE, overwrite_flag);
639 ftrace_reset_array_ops(tr);
640
641 irqsoff_busy = false;
633} 642}
634 643
635static void irqsoff_tracer_start(struct trace_array *tr) 644static void irqsoff_tracer_start(struct trace_array *tr)
@@ -647,8 +656,7 @@ static int irqsoff_tracer_init(struct trace_array *tr)
647{ 656{
648 trace_type = TRACER_IRQS_OFF; 657 trace_type = TRACER_IRQS_OFF;
649 658
650 __irqsoff_tracer_init(tr); 659 return __irqsoff_tracer_init(tr);
651 return 0;
652} 660}
653static struct tracer irqsoff_tracer __read_mostly = 661static struct tracer irqsoff_tracer __read_mostly =
654{ 662{
@@ -668,6 +676,7 @@ static struct tracer irqsoff_tracer __read_mostly =
668#endif 676#endif
669 .open = irqsoff_trace_open, 677 .open = irqsoff_trace_open,
670 .close = irqsoff_trace_close, 678 .close = irqsoff_trace_close,
679 .allow_instances = true,
671 .use_max_tr = true, 680 .use_max_tr = true,
672}; 681};
673# define register_irqsoff(trace) register_tracer(&trace) 682# define register_irqsoff(trace) register_tracer(&trace)
@@ -680,8 +689,7 @@ static int preemptoff_tracer_init(struct trace_array *tr)
680{ 689{
681 trace_type = TRACER_PREEMPT_OFF; 690 trace_type = TRACER_PREEMPT_OFF;
682 691
683 __irqsoff_tracer_init(tr); 692 return __irqsoff_tracer_init(tr);
684 return 0;
685} 693}
686 694
687static struct tracer preemptoff_tracer __read_mostly = 695static struct tracer preemptoff_tracer __read_mostly =
@@ -702,6 +710,7 @@ static struct tracer preemptoff_tracer __read_mostly =
702#endif 710#endif
703 .open = irqsoff_trace_open, 711 .open = irqsoff_trace_open,
704 .close = irqsoff_trace_close, 712 .close = irqsoff_trace_close,
713 .allow_instances = true,
705 .use_max_tr = true, 714 .use_max_tr = true,
706}; 715};
707# define register_preemptoff(trace) register_tracer(&trace) 716# define register_preemptoff(trace) register_tracer(&trace)
@@ -716,8 +725,7 @@ static int preemptirqsoff_tracer_init(struct trace_array *tr)
716{ 725{
717 trace_type = TRACER_IRQS_OFF | TRACER_PREEMPT_OFF; 726 trace_type = TRACER_IRQS_OFF | TRACER_PREEMPT_OFF;
718 727
719 __irqsoff_tracer_init(tr); 728 return __irqsoff_tracer_init(tr);
720 return 0;
721} 729}
722 730
723static struct tracer preemptirqsoff_tracer __read_mostly = 731static struct tracer preemptirqsoff_tracer __read_mostly =
@@ -738,6 +746,7 @@ static struct tracer preemptirqsoff_tracer __read_mostly =
738#endif 746#endif
739 .open = irqsoff_trace_open, 747 .open = irqsoff_trace_open,
740 .close = irqsoff_trace_close, 748 .close = irqsoff_trace_close,
749 .allow_instances = true,
741 .use_max_tr = true, 750 .use_max_tr = true,
742}; 751};
743 752
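
With .allow_instances set and the latency bookkeeping moved into tr->max_latency, the irqsoff family can now run inside a tracing instance, and the irqsoff_busy flag ensures only one instance uses it at a time (a second init attempt gets -EBUSY from __irqsoff_tracer_init()). A hedged user-space sketch of driving that; the paths assume the usual debugfs mount and are not part of the patch:

#include <stdio.h>
#include <sys/types.h>
#include <sys/stat.h>

/* Create a tracing instance and select the irqsoff tracer in it. A second
 * instance trying the same tracer should see the write fail with EBUSY. */
int main(void)
{
	FILE *f;

	mkdir("/sys/kernel/debug/tracing/instances/lat", 0755);

	f = fopen("/sys/kernel/debug/tracing/instances/lat/current_tracer", "w");
	if (!f) {
		perror("fopen");
		return 1;
	}
	fputs("irqsoff\n", f);

	return fclose(f) ? 1 : 0;
}
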
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index 903ae28962be..282f6e4e5539 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -40,27 +40,27 @@ struct trace_kprobe {
40 (sizeof(struct probe_arg) * (n))) 40 (sizeof(struct probe_arg) * (n)))
41 41
42 42
43static __kprobes bool trace_kprobe_is_return(struct trace_kprobe *tk) 43static nokprobe_inline bool trace_kprobe_is_return(struct trace_kprobe *tk)
44{ 44{
45 return tk->rp.handler != NULL; 45 return tk->rp.handler != NULL;
46} 46}
47 47
48static __kprobes const char *trace_kprobe_symbol(struct trace_kprobe *tk) 48static nokprobe_inline const char *trace_kprobe_symbol(struct trace_kprobe *tk)
49{ 49{
50 return tk->symbol ? tk->symbol : "unknown"; 50 return tk->symbol ? tk->symbol : "unknown";
51} 51}
52 52
53static __kprobes unsigned long trace_kprobe_offset(struct trace_kprobe *tk) 53static nokprobe_inline unsigned long trace_kprobe_offset(struct trace_kprobe *tk)
54{ 54{
55 return tk->rp.kp.offset; 55 return tk->rp.kp.offset;
56} 56}
57 57
58static __kprobes bool trace_kprobe_has_gone(struct trace_kprobe *tk) 58static nokprobe_inline bool trace_kprobe_has_gone(struct trace_kprobe *tk)
59{ 59{
60 return !!(kprobe_gone(&tk->rp.kp)); 60 return !!(kprobe_gone(&tk->rp.kp));
61} 61}
62 62
63static __kprobes bool trace_kprobe_within_module(struct trace_kprobe *tk, 63static nokprobe_inline bool trace_kprobe_within_module(struct trace_kprobe *tk,
64 struct module *mod) 64 struct module *mod)
65{ 65{
66 int len = strlen(mod->name); 66 int len = strlen(mod->name);
@@ -68,7 +68,7 @@ static __kprobes bool trace_kprobe_within_module(struct trace_kprobe *tk,
68 return strncmp(mod->name, name, len) == 0 && name[len] == ':'; 68 return strncmp(mod->name, name, len) == 0 && name[len] == ':';
69} 69}
70 70
71static __kprobes bool trace_kprobe_is_on_module(struct trace_kprobe *tk) 71static nokprobe_inline bool trace_kprobe_is_on_module(struct trace_kprobe *tk)
72{ 72{
73 return !!strchr(trace_kprobe_symbol(tk), ':'); 73 return !!strchr(trace_kprobe_symbol(tk), ':');
74} 74}
@@ -132,19 +132,21 @@ struct symbol_cache *alloc_symbol_cache(const char *sym, long offset)
132 * Kprobes-specific fetch functions 132 * Kprobes-specific fetch functions
133 */ 133 */
134#define DEFINE_FETCH_stack(type) \ 134#define DEFINE_FETCH_stack(type) \
135static __kprobes void FETCH_FUNC_NAME(stack, type)(struct pt_regs *regs,\ 135static void FETCH_FUNC_NAME(stack, type)(struct pt_regs *regs, \
136 void *offset, void *dest) \ 136 void *offset, void *dest) \
137{ \ 137{ \
138 *(type *)dest = (type)regs_get_kernel_stack_nth(regs, \ 138 *(type *)dest = (type)regs_get_kernel_stack_nth(regs, \
139 (unsigned int)((unsigned long)offset)); \ 139 (unsigned int)((unsigned long)offset)); \
140} 140} \
141NOKPROBE_SYMBOL(FETCH_FUNC_NAME(stack, type));
142
141DEFINE_BASIC_FETCH_FUNCS(stack) 143DEFINE_BASIC_FETCH_FUNCS(stack)
142/* No string on the stack entry */ 144/* No string on the stack entry */
143#define fetch_stack_string NULL 145#define fetch_stack_string NULL
144#define fetch_stack_string_size NULL 146#define fetch_stack_string_size NULL
145 147
146#define DEFINE_FETCH_memory(type) \ 148#define DEFINE_FETCH_memory(type) \
147static __kprobes void FETCH_FUNC_NAME(memory, type)(struct pt_regs *regs,\ 149static void FETCH_FUNC_NAME(memory, type)(struct pt_regs *regs, \
148 void *addr, void *dest) \ 150 void *addr, void *dest) \
149{ \ 151{ \
150 type retval; \ 152 type retval; \
@@ -152,14 +154,16 @@ static __kprobes void FETCH_FUNC_NAME(memory, type)(struct pt_regs *regs,\
152 *(type *)dest = 0; \ 154 *(type *)dest = 0; \
153 else \ 155 else \
154 *(type *)dest = retval; \ 156 *(type *)dest = retval; \
155} 157} \
158NOKPROBE_SYMBOL(FETCH_FUNC_NAME(memory, type));
159
156DEFINE_BASIC_FETCH_FUNCS(memory) 160DEFINE_BASIC_FETCH_FUNCS(memory)
157/* 161/*
158 * Fetch a null-terminated string. Caller MUST set *(u32 *)dest with max 162 * Fetch a null-terminated string. Caller MUST set *(u32 *)dest with max
159 * length and relative data location. 163 * length and relative data location.
160 */ 164 */
161static __kprobes void FETCH_FUNC_NAME(memory, string)(struct pt_regs *regs, 165static void FETCH_FUNC_NAME(memory, string)(struct pt_regs *regs,
162 void *addr, void *dest) 166 void *addr, void *dest)
163{ 167{
164 long ret; 168 long ret;
165 int maxlen = get_rloc_len(*(u32 *)dest); 169 int maxlen = get_rloc_len(*(u32 *)dest);
@@ -193,10 +197,11 @@ static __kprobes void FETCH_FUNC_NAME(memory, string)(struct pt_regs *regs,
193 get_rloc_offs(*(u32 *)dest)); 197 get_rloc_offs(*(u32 *)dest));
194 } 198 }
195} 199}
200NOKPROBE_SYMBOL(FETCH_FUNC_NAME(memory, string));
196 201
197/* Return the length of string -- including null terminal byte */ 202/* Return the length of string -- including null terminal byte */
198static __kprobes void FETCH_FUNC_NAME(memory, string_size)(struct pt_regs *regs, 203static void FETCH_FUNC_NAME(memory, string_size)(struct pt_regs *regs,
199 void *addr, void *dest) 204 void *addr, void *dest)
200{ 205{
201 mm_segment_t old_fs; 206 mm_segment_t old_fs;
202 int ret, len = 0; 207 int ret, len = 0;
@@ -219,17 +224,19 @@ static __kprobes void FETCH_FUNC_NAME(memory, string_size)(struct pt_regs *regs,
219 else 224 else
220 *(u32 *)dest = len; 225 *(u32 *)dest = len;
221} 226}
227NOKPROBE_SYMBOL(FETCH_FUNC_NAME(memory, string_size));
222 228
223#define DEFINE_FETCH_symbol(type) \ 229#define DEFINE_FETCH_symbol(type) \
224__kprobes void FETCH_FUNC_NAME(symbol, type)(struct pt_regs *regs, \ 230void FETCH_FUNC_NAME(symbol, type)(struct pt_regs *regs, void *data, void *dest)\
225 void *data, void *dest) \
226{ \ 231{ \
227 struct symbol_cache *sc = data; \ 232 struct symbol_cache *sc = data; \
228 if (sc->addr) \ 233 if (sc->addr) \
229 fetch_memory_##type(regs, (void *)sc->addr, dest); \ 234 fetch_memory_##type(regs, (void *)sc->addr, dest); \
230 else \ 235 else \
231 *(type *)dest = 0; \ 236 *(type *)dest = 0; \
232} 237} \
238NOKPROBE_SYMBOL(FETCH_FUNC_NAME(symbol, type));
239
233DEFINE_BASIC_FETCH_FUNCS(symbol) 240DEFINE_BASIC_FETCH_FUNCS(symbol)
234DEFINE_FETCH_symbol(string) 241DEFINE_FETCH_symbol(string)
235DEFINE_FETCH_symbol(string_size) 242DEFINE_FETCH_symbol(string_size)
@@ -907,7 +914,7 @@ static const struct file_operations kprobe_profile_ops = {
907}; 914};
908 915
909/* Kprobe handler */ 916/* Kprobe handler */
910static __kprobes void 917static nokprobe_inline void
911__kprobe_trace_func(struct trace_kprobe *tk, struct pt_regs *regs, 918__kprobe_trace_func(struct trace_kprobe *tk, struct pt_regs *regs,
912 struct ftrace_event_file *ftrace_file) 919 struct ftrace_event_file *ftrace_file)
913{ 920{
@@ -943,7 +950,7 @@ __kprobe_trace_func(struct trace_kprobe *tk, struct pt_regs *regs,
943 entry, irq_flags, pc, regs); 950 entry, irq_flags, pc, regs);
944} 951}
945 952
946static __kprobes void 953static void
947kprobe_trace_func(struct trace_kprobe *tk, struct pt_regs *regs) 954kprobe_trace_func(struct trace_kprobe *tk, struct pt_regs *regs)
948{ 955{
949 struct event_file_link *link; 956 struct event_file_link *link;
@@ -951,9 +958,10 @@ kprobe_trace_func(struct trace_kprobe *tk, struct pt_regs *regs)
951 list_for_each_entry_rcu(link, &tk->tp.files, list) 958 list_for_each_entry_rcu(link, &tk->tp.files, list)
952 __kprobe_trace_func(tk, regs, link->file); 959 __kprobe_trace_func(tk, regs, link->file);
953} 960}
961NOKPROBE_SYMBOL(kprobe_trace_func);
954 962
955/* Kretprobe handler */ 963/* Kretprobe handler */
956static __kprobes void 964static nokprobe_inline void
957__kretprobe_trace_func(struct trace_kprobe *tk, struct kretprobe_instance *ri, 965__kretprobe_trace_func(struct trace_kprobe *tk, struct kretprobe_instance *ri,
958 struct pt_regs *regs, 966 struct pt_regs *regs,
959 struct ftrace_event_file *ftrace_file) 967 struct ftrace_event_file *ftrace_file)
@@ -991,7 +999,7 @@ __kretprobe_trace_func(struct trace_kprobe *tk, struct kretprobe_instance *ri,
991 entry, irq_flags, pc, regs); 999 entry, irq_flags, pc, regs);
992} 1000}
993 1001
994static __kprobes void 1002static void
995kretprobe_trace_func(struct trace_kprobe *tk, struct kretprobe_instance *ri, 1003kretprobe_trace_func(struct trace_kprobe *tk, struct kretprobe_instance *ri,
996 struct pt_regs *regs) 1004 struct pt_regs *regs)
997{ 1005{
@@ -1000,6 +1008,7 @@ kretprobe_trace_func(struct trace_kprobe *tk, struct kretprobe_instance *ri,
1000 list_for_each_entry_rcu(link, &tk->tp.files, list) 1008 list_for_each_entry_rcu(link, &tk->tp.files, list)
1001 __kretprobe_trace_func(tk, ri, regs, link->file); 1009 __kretprobe_trace_func(tk, ri, regs, link->file);
1002} 1010}
1011NOKPROBE_SYMBOL(kretprobe_trace_func);
1003 1012
1004/* Event entry printers */ 1013/* Event entry printers */
1005static enum print_line_t 1014static enum print_line_t
@@ -1131,7 +1140,7 @@ static int kretprobe_event_define_fields(struct ftrace_event_call *event_call)
1131#ifdef CONFIG_PERF_EVENTS 1140#ifdef CONFIG_PERF_EVENTS
1132 1141
1133/* Kprobe profile handler */ 1142/* Kprobe profile handler */
1134static __kprobes void 1143static void
1135kprobe_perf_func(struct trace_kprobe *tk, struct pt_regs *regs) 1144kprobe_perf_func(struct trace_kprobe *tk, struct pt_regs *regs)
1136{ 1145{
1137 struct ftrace_event_call *call = &tk->tp.call; 1146 struct ftrace_event_call *call = &tk->tp.call;
@@ -1158,9 +1167,10 @@ kprobe_perf_func(struct trace_kprobe *tk, struct pt_regs *regs)
1158 store_trace_args(sizeof(*entry), &tk->tp, regs, (u8 *)&entry[1], dsize); 1167 store_trace_args(sizeof(*entry), &tk->tp, regs, (u8 *)&entry[1], dsize);
1159 perf_trace_buf_submit(entry, size, rctx, 0, 1, regs, head, NULL); 1168 perf_trace_buf_submit(entry, size, rctx, 0, 1, regs, head, NULL);
1160} 1169}
1170NOKPROBE_SYMBOL(kprobe_perf_func);
1161 1171
1162/* Kretprobe profile handler */ 1172/* Kretprobe profile handler */
1163static __kprobes void 1173static void
1164kretprobe_perf_func(struct trace_kprobe *tk, struct kretprobe_instance *ri, 1174kretprobe_perf_func(struct trace_kprobe *tk, struct kretprobe_instance *ri,
1165 struct pt_regs *regs) 1175 struct pt_regs *regs)
1166{ 1176{
@@ -1188,6 +1198,7 @@ kretprobe_perf_func(struct trace_kprobe *tk, struct kretprobe_instance *ri,
1188 store_trace_args(sizeof(*entry), &tk->tp, regs, (u8 *)&entry[1], dsize); 1198 store_trace_args(sizeof(*entry), &tk->tp, regs, (u8 *)&entry[1], dsize);
1189 perf_trace_buf_submit(entry, size, rctx, 0, 1, regs, head, NULL); 1199 perf_trace_buf_submit(entry, size, rctx, 0, 1, regs, head, NULL);
1190} 1200}
1201NOKPROBE_SYMBOL(kretprobe_perf_func);
1191#endif /* CONFIG_PERF_EVENTS */ 1202#endif /* CONFIG_PERF_EVENTS */
1192 1203
1193/* 1204/*
@@ -1196,9 +1207,8 @@ kretprobe_perf_func(struct trace_kprobe *tk, struct kretprobe_instance *ri,
1196 * kprobe_trace_self_tests_init() does enable_trace_probe/disable_trace_probe 1207 * kprobe_trace_self_tests_init() does enable_trace_probe/disable_trace_probe
1197 * lockless, but we can't race with this __init function. 1208 * lockless, but we can't race with this __init function.
1198 */ 1209 */
1199static __kprobes 1210static int kprobe_register(struct ftrace_event_call *event,
1200int kprobe_register(struct ftrace_event_call *event, 1211 enum trace_reg type, void *data)
1201 enum trace_reg type, void *data)
1202{ 1212{
1203 struct trace_kprobe *tk = (struct trace_kprobe *)event->data; 1213 struct trace_kprobe *tk = (struct trace_kprobe *)event->data;
1204 struct ftrace_event_file *file = data; 1214 struct ftrace_event_file *file = data;
@@ -1224,8 +1234,7 @@ int kprobe_register(struct ftrace_event_call *event,
1224 return 0; 1234 return 0;
1225} 1235}
1226 1236
1227static __kprobes 1237static int kprobe_dispatcher(struct kprobe *kp, struct pt_regs *regs)
1228int kprobe_dispatcher(struct kprobe *kp, struct pt_regs *regs)
1229{ 1238{
1230 struct trace_kprobe *tk = container_of(kp, struct trace_kprobe, rp.kp); 1239 struct trace_kprobe *tk = container_of(kp, struct trace_kprobe, rp.kp);
1231 1240
@@ -1239,9 +1248,10 @@ int kprobe_dispatcher(struct kprobe *kp, struct pt_regs *regs)
1239#endif 1248#endif
1240	return 0;	/* We don't tweak the kernel, so just return 0 */ 1249	return 0;	/* We don't tweak the kernel, so just return 0 */
1241} 1250}
1251NOKPROBE_SYMBOL(kprobe_dispatcher);
1242 1252
1243static __kprobes 1253static int
1244int kretprobe_dispatcher(struct kretprobe_instance *ri, struct pt_regs *regs) 1254kretprobe_dispatcher(struct kretprobe_instance *ri, struct pt_regs *regs)
1245{ 1255{
1246 struct trace_kprobe *tk = container_of(ri->rp, struct trace_kprobe, rp); 1256 struct trace_kprobe *tk = container_of(ri->rp, struct trace_kprobe, rp);
1247 1257
@@ -1255,6 +1265,7 @@ int kretprobe_dispatcher(struct kretprobe_instance *ri, struct pt_regs *regs)
1255#endif 1265#endif
1256	return 0;	/* We don't tweak the kernel, so just return 0 */ 1266	return 0;	/* We don't tweak the kernel, so just return 0 */
1257} 1267}
1268NOKPROBE_SYMBOL(kretprobe_dispatcher);
1258 1269
1259static struct trace_event_functions kretprobe_funcs = { 1270static struct trace_event_functions kretprobe_funcs = {
1260 .trace = print_kretprobe_event 1271 .trace = print_kretprobe_event
@@ -1377,6 +1388,9 @@ static __init int kprobe_trace_self_tests_init(void)
1377 struct trace_kprobe *tk; 1388 struct trace_kprobe *tk;
1378 struct ftrace_event_file *file; 1389 struct ftrace_event_file *file;
1379 1390
1391 if (tracing_is_disabled())
1392 return -ENODEV;
1393
1380 target = kprobe_trace_selftest_target; 1394 target = kprobe_trace_selftest_target;
1381 1395
1382 pr_info("Testing kprobe tracing: "); 1396 pr_info("Testing kprobe tracing: ");
diff --git a/kernel/trace/trace_nop.c b/kernel/trace/trace_nop.c
index 69a5cc94c01a..fcf0a9e48916 100644
--- a/kernel/trace/trace_nop.c
+++ b/kernel/trace/trace_nop.c
@@ -91,7 +91,6 @@ struct tracer nop_trace __read_mostly =
91 .name = "nop", 91 .name = "nop",
92 .init = nop_trace_init, 92 .init = nop_trace_init,
93 .reset = nop_trace_reset, 93 .reset = nop_trace_reset,
94 .wait_pipe = poll_wait_pipe,
95#ifdef CONFIG_FTRACE_SELFTEST 94#ifdef CONFIG_FTRACE_SELFTEST
96 .selftest = trace_selftest_startup_nop, 95 .selftest = trace_selftest_startup_nop,
97#endif 96#endif
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index a436de18aa99..f3dad80c20b2 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -126,6 +126,34 @@ trace_seq_printf(struct trace_seq *s, const char *fmt, ...)
126EXPORT_SYMBOL_GPL(trace_seq_printf); 126EXPORT_SYMBOL_GPL(trace_seq_printf);
127 127
128/** 128/**
129 * trace_seq_bitmask - put a list of longs as a bitmask print output
130 * @s: trace sequence descriptor
131 * @maskp: points to an array of unsigned longs that represent a bitmask
132 * @nmaskbits: The number of bits that are valid in @maskp
133 *
134 * It returns 0 if the output exceeds the buffer's free
135 * space, 1 otherwise.
136 *
137 * Writes an ASCII representation of the bitmask into @s.
138 */
139int
140trace_seq_bitmask(struct trace_seq *s, const unsigned long *maskp,
141 int nmaskbits)
142{
143 int len = (PAGE_SIZE - 1) - s->len;
144 int ret;
145
146 if (s->full || !len)
147 return 0;
148
149 ret = bitmap_scnprintf(s->buffer, len, maskp, nmaskbits);
150 s->len += ret;
151
152 return 1;
153}
154EXPORT_SYMBOL_GPL(trace_seq_bitmask);
155
156/**
129 * trace_seq_vprintf - sequence printing of trace information 157 * trace_seq_vprintf - sequence printing of trace information
130 * @s: trace sequence descriptor 158 * @s: trace sequence descriptor
131 * @fmt: printf format string 159 * @fmt: printf format string
@@ -399,6 +427,19 @@ EXPORT_SYMBOL(ftrace_print_symbols_seq_u64);
399#endif 427#endif
400 428
401const char * 429const char *
430ftrace_print_bitmask_seq(struct trace_seq *p, void *bitmask_ptr,
431 unsigned int bitmask_size)
432{
433 const char *ret = p->buffer + p->len;
434
435 trace_seq_bitmask(p, bitmask_ptr, bitmask_size * 8);
436 trace_seq_putc(p, 0);
437
438 return ret;
439}
440EXPORT_SYMBOL_GPL(ftrace_print_bitmask_seq);
441
442const char *
402ftrace_print_hex_seq(struct trace_seq *p, const unsigned char *buf, int buf_len) 443ftrace_print_hex_seq(struct trace_seq *p, const unsigned char *buf, int buf_len)
403{ 444{
404 int i; 445 int i;
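
trace_seq_bitmask() and its ftrace_print_bitmask_seq() wrapper give trace output a way to render a bitmask (typically a cpumask) through bitmap_scnprintf() instead of open-coding hex dumps. A hedged kernel-side sketch of calling the lower-level helper directly (not compilable on its own; cpumask_bits() and nr_cpumask_bits are the standard cpumask accessors):

/* Render a cpumask into a trace_seq; bitmap_scnprintf() produces the usual
 * comma-separated hex mask format. */
static void trace_seq_show_cpus(struct trace_seq *s, const struct cpumask *mask)
{
	trace_seq_puts(s, "cpus=");
	trace_seq_bitmask(s, cpumask_bits(mask), nr_cpumask_bits);
	trace_seq_putc(s, '\n');
}
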
diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c
index 8364a421b4df..d4b9fc22cd27 100644
--- a/kernel/trace/trace_probe.c
+++ b/kernel/trace/trace_probe.c
@@ -37,13 +37,13 @@ const char *reserved_field_names[] = {
37 37
38/* Printing in basic type function template */ 38/* Printing in basic type function template */
39#define DEFINE_BASIC_PRINT_TYPE_FUNC(type, fmt) \ 39#define DEFINE_BASIC_PRINT_TYPE_FUNC(type, fmt) \
40__kprobes int PRINT_TYPE_FUNC_NAME(type)(struct trace_seq *s, \ 40int PRINT_TYPE_FUNC_NAME(type)(struct trace_seq *s, const char *name, \
41 const char *name, \ 41 void *data, void *ent) \
42 void *data, void *ent) \
43{ \ 42{ \
44 return trace_seq_printf(s, " %s=" fmt, name, *(type *)data); \ 43 return trace_seq_printf(s, " %s=" fmt, name, *(type *)data); \
45} \ 44} \
46const char PRINT_TYPE_FMT_NAME(type)[] = fmt; 45const char PRINT_TYPE_FMT_NAME(type)[] = fmt; \
46NOKPROBE_SYMBOL(PRINT_TYPE_FUNC_NAME(type));
47 47
48DEFINE_BASIC_PRINT_TYPE_FUNC(u8 , "0x%x") 48DEFINE_BASIC_PRINT_TYPE_FUNC(u8 , "0x%x")
49DEFINE_BASIC_PRINT_TYPE_FUNC(u16, "0x%x") 49DEFINE_BASIC_PRINT_TYPE_FUNC(u16, "0x%x")
@@ -55,9 +55,8 @@ DEFINE_BASIC_PRINT_TYPE_FUNC(s32, "%d")
55DEFINE_BASIC_PRINT_TYPE_FUNC(s64, "%Ld") 55DEFINE_BASIC_PRINT_TYPE_FUNC(s64, "%Ld")
56 56
57/* Print type function for string type */ 57/* Print type function for string type */
58__kprobes int PRINT_TYPE_FUNC_NAME(string)(struct trace_seq *s, 58int PRINT_TYPE_FUNC_NAME(string)(struct trace_seq *s, const char *name,
59 const char *name, 59 void *data, void *ent)
60 void *data, void *ent)
61{ 60{
62 int len = *(u32 *)data >> 16; 61 int len = *(u32 *)data >> 16;
63 62
@@ -67,6 +66,7 @@ __kprobes int PRINT_TYPE_FUNC_NAME(string)(struct trace_seq *s,
67 return trace_seq_printf(s, " %s=\"%s\"", name, 66 return trace_seq_printf(s, " %s=\"%s\"", name,
68 (const char *)get_loc_data(data, ent)); 67 (const char *)get_loc_data(data, ent));
69} 68}
69NOKPROBE_SYMBOL(PRINT_TYPE_FUNC_NAME(string));
70 70
71const char PRINT_TYPE_FMT_NAME(string)[] = "\\\"%s\\\""; 71const char PRINT_TYPE_FMT_NAME(string)[] = "\\\"%s\\\"";
72 72
@@ -81,23 +81,24 @@ const char PRINT_TYPE_FMT_NAME(string)[] = "\\\"%s\\\"";
81 81
82/* Data fetch function templates */ 82/* Data fetch function templates */
83#define DEFINE_FETCH_reg(type) \ 83#define DEFINE_FETCH_reg(type) \
84__kprobes void FETCH_FUNC_NAME(reg, type)(struct pt_regs *regs, \ 84void FETCH_FUNC_NAME(reg, type)(struct pt_regs *regs, void *offset, void *dest) \
85 void *offset, void *dest) \
86{ \ 85{ \
87 *(type *)dest = (type)regs_get_register(regs, \ 86 *(type *)dest = (type)regs_get_register(regs, \
88 (unsigned int)((unsigned long)offset)); \ 87 (unsigned int)((unsigned long)offset)); \
89} 88} \
89NOKPROBE_SYMBOL(FETCH_FUNC_NAME(reg, type));
90DEFINE_BASIC_FETCH_FUNCS(reg) 90DEFINE_BASIC_FETCH_FUNCS(reg)
91/* No string on the register */ 91/* No string on the register */
92#define fetch_reg_string NULL 92#define fetch_reg_string NULL
93#define fetch_reg_string_size NULL 93#define fetch_reg_string_size NULL
94 94
95#define DEFINE_FETCH_retval(type) \ 95#define DEFINE_FETCH_retval(type) \
96__kprobes void FETCH_FUNC_NAME(retval, type)(struct pt_regs *regs, \ 96void FETCH_FUNC_NAME(retval, type)(struct pt_regs *regs, \
97 void *dummy, void *dest) \ 97 void *dummy, void *dest) \
98{ \ 98{ \
99 *(type *)dest = (type)regs_return_value(regs); \ 99 *(type *)dest = (type)regs_return_value(regs); \
100} 100} \
101NOKPROBE_SYMBOL(FETCH_FUNC_NAME(retval, type));
101DEFINE_BASIC_FETCH_FUNCS(retval) 102DEFINE_BASIC_FETCH_FUNCS(retval)
102/* No string on the retval */ 103/* No string on the retval */
103#define fetch_retval_string NULL 104#define fetch_retval_string NULL
@@ -112,8 +113,8 @@ struct deref_fetch_param {
112}; 113};
113 114
114#define DEFINE_FETCH_deref(type) \ 115#define DEFINE_FETCH_deref(type) \
115__kprobes void FETCH_FUNC_NAME(deref, type)(struct pt_regs *regs, \ 116void FETCH_FUNC_NAME(deref, type)(struct pt_regs *regs, \
116 void *data, void *dest) \ 117 void *data, void *dest) \
117{ \ 118{ \
118 struct deref_fetch_param *dprm = data; \ 119 struct deref_fetch_param *dprm = data; \
119 unsigned long addr; \ 120 unsigned long addr; \
@@ -123,12 +124,13 @@ __kprobes void FETCH_FUNC_NAME(deref, type)(struct pt_regs *regs, \
123 dprm->fetch(regs, (void *)addr, dest); \ 124 dprm->fetch(regs, (void *)addr, dest); \
124 } else \ 125 } else \
125 *(type *)dest = 0; \ 126 *(type *)dest = 0; \
126} 127} \
128NOKPROBE_SYMBOL(FETCH_FUNC_NAME(deref, type));
127DEFINE_BASIC_FETCH_FUNCS(deref) 129DEFINE_BASIC_FETCH_FUNCS(deref)
128DEFINE_FETCH_deref(string) 130DEFINE_FETCH_deref(string)
129 131
130__kprobes void FETCH_FUNC_NAME(deref, string_size)(struct pt_regs *regs, 132void FETCH_FUNC_NAME(deref, string_size)(struct pt_regs *regs,
131 void *data, void *dest) 133 void *data, void *dest)
132{ 134{
133 struct deref_fetch_param *dprm = data; 135 struct deref_fetch_param *dprm = data;
134 unsigned long addr; 136 unsigned long addr;
@@ -140,16 +142,18 @@ __kprobes void FETCH_FUNC_NAME(deref, string_size)(struct pt_regs *regs,
140 } else 142 } else
141 *(string_size *)dest = 0; 143 *(string_size *)dest = 0;
142} 144}
145NOKPROBE_SYMBOL(FETCH_FUNC_NAME(deref, string_size));
143 146
144static __kprobes void update_deref_fetch_param(struct deref_fetch_param *data) 147static void update_deref_fetch_param(struct deref_fetch_param *data)
145{ 148{
146 if (CHECK_FETCH_FUNCS(deref, data->orig.fn)) 149 if (CHECK_FETCH_FUNCS(deref, data->orig.fn))
147 update_deref_fetch_param(data->orig.data); 150 update_deref_fetch_param(data->orig.data);
148 else if (CHECK_FETCH_FUNCS(symbol, data->orig.fn)) 151 else if (CHECK_FETCH_FUNCS(symbol, data->orig.fn))
149 update_symbol_cache(data->orig.data); 152 update_symbol_cache(data->orig.data);
150} 153}
154NOKPROBE_SYMBOL(update_deref_fetch_param);
151 155
152static __kprobes void free_deref_fetch_param(struct deref_fetch_param *data) 156static void free_deref_fetch_param(struct deref_fetch_param *data)
153{ 157{
154 if (CHECK_FETCH_FUNCS(deref, data->orig.fn)) 158 if (CHECK_FETCH_FUNCS(deref, data->orig.fn))
155 free_deref_fetch_param(data->orig.data); 159 free_deref_fetch_param(data->orig.data);
@@ -157,6 +161,7 @@ static __kprobes void free_deref_fetch_param(struct deref_fetch_param *data)
157 free_symbol_cache(data->orig.data); 161 free_symbol_cache(data->orig.data);
158 kfree(data); 162 kfree(data);
159} 163}
164NOKPROBE_SYMBOL(free_deref_fetch_param);
160 165
161/* Bitfield fetch function */ 166/* Bitfield fetch function */
162struct bitfield_fetch_param { 167struct bitfield_fetch_param {
@@ -166,8 +171,8 @@ struct bitfield_fetch_param {
166}; 171};
167 172
168#define DEFINE_FETCH_bitfield(type) \ 173#define DEFINE_FETCH_bitfield(type) \
169__kprobes void FETCH_FUNC_NAME(bitfield, type)(struct pt_regs *regs, \ 174void FETCH_FUNC_NAME(bitfield, type)(struct pt_regs *regs, \
170 void *data, void *dest) \ 175 void *data, void *dest) \
171{ \ 176{ \
172 struct bitfield_fetch_param *bprm = data; \ 177 struct bitfield_fetch_param *bprm = data; \
173 type buf = 0; \ 178 type buf = 0; \
@@ -177,13 +182,13 @@ __kprobes void FETCH_FUNC_NAME(bitfield, type)(struct pt_regs *regs, \
177 buf >>= bprm->low_shift; \ 182 buf >>= bprm->low_shift; \
178 } \ 183 } \
179 *(type *)dest = buf; \ 184 *(type *)dest = buf; \
180} 185} \
181 186NOKPROBE_SYMBOL(FETCH_FUNC_NAME(bitfield, type));
182DEFINE_BASIC_FETCH_FUNCS(bitfield) 187DEFINE_BASIC_FETCH_FUNCS(bitfield)
183#define fetch_bitfield_string NULL 188#define fetch_bitfield_string NULL
184#define fetch_bitfield_string_size NULL 189#define fetch_bitfield_string_size NULL
185 190
186static __kprobes void 191static void
187update_bitfield_fetch_param(struct bitfield_fetch_param *data) 192update_bitfield_fetch_param(struct bitfield_fetch_param *data)
188{ 193{
189 /* 194 /*
@@ -196,7 +201,7 @@ update_bitfield_fetch_param(struct bitfield_fetch_param *data)
196 update_symbol_cache(data->orig.data); 201 update_symbol_cache(data->orig.data);
197} 202}
198 203
199static __kprobes void 204static void
200free_bitfield_fetch_param(struct bitfield_fetch_param *data) 205free_bitfield_fetch_param(struct bitfield_fetch_param *data)
201{ 206{
202 /* 207 /*
@@ -255,17 +260,17 @@ fail:
255} 260}
256 261
257/* Special function : only accept unsigned long */ 262/* Special function : only accept unsigned long */
258static __kprobes void fetch_kernel_stack_address(struct pt_regs *regs, 263static void fetch_kernel_stack_address(struct pt_regs *regs, void *dummy, void *dest)
259 void *dummy, void *dest)
260{ 264{
261 *(unsigned long *)dest = kernel_stack_pointer(regs); 265 *(unsigned long *)dest = kernel_stack_pointer(regs);
262} 266}
267NOKPROBE_SYMBOL(fetch_kernel_stack_address);
263 268
264static __kprobes void fetch_user_stack_address(struct pt_regs *regs, 269static void fetch_user_stack_address(struct pt_regs *regs, void *dummy, void *dest)
265 void *dummy, void *dest)
266{ 270{
267 *(unsigned long *)dest = user_stack_pointer(regs); 271 *(unsigned long *)dest = user_stack_pointer(regs);
268} 272}
273NOKPROBE_SYMBOL(fetch_user_stack_address);
269 274
270static fetch_func_t get_fetch_size_function(const struct fetch_type *type, 275static fetch_func_t get_fetch_size_function(const struct fetch_type *type,
271 fetch_func_t orig_fn, 276 fetch_func_t orig_fn,
diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h
index fb1ab5dfbd42..4f815fbce16d 100644
--- a/kernel/trace/trace_probe.h
+++ b/kernel/trace/trace_probe.h
@@ -81,13 +81,13 @@
81 */ 81 */
82#define convert_rloc_to_loc(dl, offs) ((u32)(dl) + (offs)) 82#define convert_rloc_to_loc(dl, offs) ((u32)(dl) + (offs))
83 83
84static inline void *get_rloc_data(u32 *dl) 84static nokprobe_inline void *get_rloc_data(u32 *dl)
85{ 85{
86 return (u8 *)dl + get_rloc_offs(*dl); 86 return (u8 *)dl + get_rloc_offs(*dl);
87} 87}
88 88
89/* For data_loc conversion */ 89/* For data_loc conversion */
90static inline void *get_loc_data(u32 *dl, void *ent) 90static nokprobe_inline void *get_loc_data(u32 *dl, void *ent)
91{ 91{
92 return (u8 *)ent + get_rloc_offs(*dl); 92 return (u8 *)ent + get_rloc_offs(*dl);
93} 93}
@@ -136,9 +136,8 @@ typedef u32 string_size;
136 136
137/* Printing in basic type function template */ 137/* Printing in basic type function template */
138#define DECLARE_BASIC_PRINT_TYPE_FUNC(type) \ 138#define DECLARE_BASIC_PRINT_TYPE_FUNC(type) \
139__kprobes int PRINT_TYPE_FUNC_NAME(type)(struct trace_seq *s, \ 139int PRINT_TYPE_FUNC_NAME(type)(struct trace_seq *s, const char *name, \
140 const char *name, \ 140 void *data, void *ent); \
141 void *data, void *ent); \
142extern const char PRINT_TYPE_FMT_NAME(type)[] 141extern const char PRINT_TYPE_FMT_NAME(type)[]
143 142
144DECLARE_BASIC_PRINT_TYPE_FUNC(u8); 143DECLARE_BASIC_PRINT_TYPE_FUNC(u8);
@@ -303,7 +302,7 @@ static inline bool trace_probe_is_registered(struct trace_probe *tp)
303 return !!(tp->flags & TP_FLAG_REGISTERED); 302 return !!(tp->flags & TP_FLAG_REGISTERED);
304} 303}
305 304
306static inline __kprobes void call_fetch(struct fetch_param *fprm, 305static nokprobe_inline void call_fetch(struct fetch_param *fprm,
307 struct pt_regs *regs, void *dest) 306 struct pt_regs *regs, void *dest)
308{ 307{
309 return fprm->fn(regs, fprm->data, dest); 308 return fprm->fn(regs, fprm->data, dest);
@@ -351,7 +350,7 @@ extern ssize_t traceprobe_probes_write(struct file *file,
351extern int traceprobe_command(const char *buf, int (*createfn)(int, char**)); 350extern int traceprobe_command(const char *buf, int (*createfn)(int, char**));
352 351
353/* Sum up total data length for dynamic arrays (strings) */ 352/* Sum up total data length for dynamic arrays (strings) */
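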
354static inline __kprobes int 353static nokprobe_inline int
355__get_data_size(struct trace_probe *tp, struct pt_regs *regs) 354__get_data_size(struct trace_probe *tp, struct pt_regs *regs)
356{ 355{
357 int i, ret = 0; 356 int i, ret = 0;
@@ -367,7 +366,7 @@ __get_data_size(struct trace_probe *tp, struct pt_regs *regs)
367} 366}
368 367
369/* Store the value of each argument */ 368/* Store the value of each argument */
370static inline __kprobes void 369static nokprobe_inline void
371store_trace_args(int ent_size, struct trace_probe *tp, struct pt_regs *regs, 370store_trace_args(int ent_size, struct trace_probe *tp, struct pt_regs *regs,
372 u8 *data, int maxlen) 371 u8 *data, int maxlen)
373{ 372{
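For the small helpers kept in trace_probe.h, the diff uses nokprobe_inline rather than NOKPROBE_SYMBOL(). A hedged sketch (hypothetical helper; relies on the CONFIG_KPROBES definition in <linux/compiler.h>):

#include <linux/compiler.h>
#include <linux/types.h>

/* nokprobe_inline expands to __always_inline when CONFIG_KPROBES is set,
 * so the body is folded into callers that are themselves blacklisted
 * rather than being tagged with the old __kprobes section attribute. */
static nokprobe_inline void *example_loc_data(u32 *dl, void *ent)
{
        /* illustrative data_loc decoding: offset lives in the low 16 bits */
        return (u8 *)ent + (*dl & 0xffff);
}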
diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c
index e14da5e97a69..19bd8928ce94 100644
--- a/kernel/trace/trace_sched_wakeup.c
+++ b/kernel/trace/trace_sched_wakeup.c
@@ -130,15 +130,9 @@ wakeup_tracer_call(unsigned long ip, unsigned long parent_ip,
130 atomic_dec(&data->disabled); 130 atomic_dec(&data->disabled);
131 preempt_enable_notrace(); 131 preempt_enable_notrace();
132} 132}
133
134static struct ftrace_ops trace_ops __read_mostly =
135{
136 .func = wakeup_tracer_call,
137 .flags = FTRACE_OPS_FL_GLOBAL | FTRACE_OPS_FL_RECURSION_SAFE,
138};
139#endif /* CONFIG_FUNCTION_TRACER */ 133#endif /* CONFIG_FUNCTION_TRACER */
140 134
141static int register_wakeup_function(int graph, int set) 135static int register_wakeup_function(struct trace_array *tr, int graph, int set)
142{ 136{
143 int ret; 137 int ret;
144 138
@@ -150,7 +144,7 @@ static int register_wakeup_function(int graph, int set)
150 ret = register_ftrace_graph(&wakeup_graph_return, 144 ret = register_ftrace_graph(&wakeup_graph_return,
151 &wakeup_graph_entry); 145 &wakeup_graph_entry);
152 else 146 else
153 ret = register_ftrace_function(&trace_ops); 147 ret = register_ftrace_function(tr->ops);
154 148
155 if (!ret) 149 if (!ret)
156 function_enabled = true; 150 function_enabled = true;
@@ -158,7 +152,7 @@ static int register_wakeup_function(int graph, int set)
158 return ret; 152 return ret;
159} 153}
160 154
161static void unregister_wakeup_function(int graph) 155static void unregister_wakeup_function(struct trace_array *tr, int graph)
162{ 156{
163 if (!function_enabled) 157 if (!function_enabled)
164 return; 158 return;
@@ -166,17 +160,17 @@ static void unregister_wakeup_function(int graph)
166 if (graph) 160 if (graph)
167 unregister_ftrace_graph(); 161 unregister_ftrace_graph();
168 else 162 else
169 unregister_ftrace_function(&trace_ops); 163 unregister_ftrace_function(tr->ops);
170 164
171 function_enabled = false; 165 function_enabled = false;
172} 166}
173 167
174static void wakeup_function_set(int set) 168static void wakeup_function_set(struct trace_array *tr, int set)
175{ 169{
176 if (set) 170 if (set)
177 register_wakeup_function(is_graph(), 1); 171 register_wakeup_function(tr, is_graph(), 1);
178 else 172 else
179 unregister_wakeup_function(is_graph()); 173 unregister_wakeup_function(tr, is_graph());
180} 174}
181 175
182static int wakeup_flag_changed(struct trace_array *tr, u32 mask, int set) 176static int wakeup_flag_changed(struct trace_array *tr, u32 mask, int set)
@@ -184,16 +178,16 @@ static int wakeup_flag_changed(struct trace_array *tr, u32 mask, int set)
184 struct tracer *tracer = tr->current_trace; 178 struct tracer *tracer = tr->current_trace;
185 179
186 if (mask & TRACE_ITER_FUNCTION) 180 if (mask & TRACE_ITER_FUNCTION)
187 wakeup_function_set(set); 181 wakeup_function_set(tr, set);
188 182
189 return trace_keep_overwrite(tracer, mask, set); 183 return trace_keep_overwrite(tracer, mask, set);
190} 184}
191 185
192static int start_func_tracer(int graph) 186static int start_func_tracer(struct trace_array *tr, int graph)
193{ 187{
194 int ret; 188 int ret;
195 189
196 ret = register_wakeup_function(graph, 0); 190 ret = register_wakeup_function(tr, graph, 0);
197 191
198 if (!ret && tracing_is_enabled()) 192 if (!ret && tracing_is_enabled())
199 tracer_enabled = 1; 193 tracer_enabled = 1;
@@ -203,11 +197,11 @@ static int start_func_tracer(int graph)
203 return ret; 197 return ret;
204} 198}
205 199
206static void stop_func_tracer(int graph) 200static void stop_func_tracer(struct trace_array *tr, int graph)
207{ 201{
208 tracer_enabled = 0; 202 tracer_enabled = 0;
209 203
210 unregister_wakeup_function(graph); 204 unregister_wakeup_function(tr, graph);
211} 205}
212 206
213#ifdef CONFIG_FUNCTION_GRAPH_TRACER 207#ifdef CONFIG_FUNCTION_GRAPH_TRACER
@@ -221,12 +215,12 @@ wakeup_set_flag(struct trace_array *tr, u32 old_flags, u32 bit, int set)
221 if (!(is_graph() ^ set)) 215 if (!(is_graph() ^ set))
222 return 0; 216 return 0;
223 217
224 stop_func_tracer(!set); 218 stop_func_tracer(tr, !set);
225 219
226 wakeup_reset(wakeup_trace); 220 wakeup_reset(wakeup_trace);
227 tracing_max_latency = 0; 221 tr->max_latency = 0;
228 222
229 return start_func_tracer(set); 223 return start_func_tracer(tr, set);
230} 224}
231 225
232static int wakeup_graph_entry(struct ftrace_graph_ent *trace) 226static int wakeup_graph_entry(struct ftrace_graph_ent *trace)
@@ -350,13 +344,13 @@ static void wakeup_print_header(struct seq_file *s)
350/* 344/*
351 * Should this new latency be reported/recorded? 345 * Should this new latency be reported/recorded?
352 */ 346 */
353static int report_latency(cycle_t delta) 347static int report_latency(struct trace_array *tr, cycle_t delta)
354{ 348{
355 if (tracing_thresh) { 349 if (tracing_thresh) {
356 if (delta < tracing_thresh) 350 if (delta < tracing_thresh)
357 return 0; 351 return 0;
358 } else { 352 } else {
359 if (delta <= tracing_max_latency) 353 if (delta <= tr->max_latency)
360 return 0; 354 return 0;
361 } 355 }
362 return 1; 356 return 1;
@@ -424,11 +418,11 @@ probe_wakeup_sched_switch(void *ignore,
424 T1 = ftrace_now(cpu); 418 T1 = ftrace_now(cpu);
425 delta = T1-T0; 419 delta = T1-T0;
426 420
427 if (!report_latency(delta)) 421 if (!report_latency(wakeup_trace, delta))
428 goto out_unlock; 422 goto out_unlock;
429 423
430 if (likely(!is_tracing_stopped())) { 424 if (likely(!is_tracing_stopped())) {
431 tracing_max_latency = delta; 425 wakeup_trace->max_latency = delta;
432 update_max_tr(wakeup_trace, wakeup_task, wakeup_cpu); 426 update_max_tr(wakeup_trace, wakeup_task, wakeup_cpu);
433 } 427 }
434 428
@@ -587,7 +581,7 @@ static void start_wakeup_tracer(struct trace_array *tr)
587 */ 581 */
588 smp_wmb(); 582 smp_wmb();
589 583
590 if (start_func_tracer(is_graph())) 584 if (start_func_tracer(tr, is_graph()))
591 printk(KERN_ERR "failed to start wakeup tracer\n"); 585 printk(KERN_ERR "failed to start wakeup tracer\n");
592 586
593 return; 587 return;
@@ -600,13 +594,15 @@ fail_deprobe:
600static void stop_wakeup_tracer(struct trace_array *tr) 594static void stop_wakeup_tracer(struct trace_array *tr)
601{ 595{
602 tracer_enabled = 0; 596 tracer_enabled = 0;
603 stop_func_tracer(is_graph()); 597 stop_func_tracer(tr, is_graph());
604 unregister_trace_sched_switch(probe_wakeup_sched_switch, NULL); 598 unregister_trace_sched_switch(probe_wakeup_sched_switch, NULL);
605 unregister_trace_sched_wakeup_new(probe_wakeup, NULL); 599 unregister_trace_sched_wakeup_new(probe_wakeup, NULL);
606 unregister_trace_sched_wakeup(probe_wakeup, NULL); 600 unregister_trace_sched_wakeup(probe_wakeup, NULL);
607 unregister_trace_sched_migrate_task(probe_wakeup_migrate_task, NULL); 601 unregister_trace_sched_migrate_task(probe_wakeup_migrate_task, NULL);
608} 602}
609 603
604static bool wakeup_busy;
605
610static int __wakeup_tracer_init(struct trace_array *tr) 606static int __wakeup_tracer_init(struct trace_array *tr)
611{ 607{
612 save_flags = trace_flags; 608 save_flags = trace_flags;
@@ -615,14 +611,20 @@ static int __wakeup_tracer_init(struct trace_array *tr)
615 set_tracer_flag(tr, TRACE_ITER_OVERWRITE, 1); 611 set_tracer_flag(tr, TRACE_ITER_OVERWRITE, 1);
616 set_tracer_flag(tr, TRACE_ITER_LATENCY_FMT, 1); 612 set_tracer_flag(tr, TRACE_ITER_LATENCY_FMT, 1);
617 613
618 tracing_max_latency = 0; 614 tr->max_latency = 0;
619 wakeup_trace = tr; 615 wakeup_trace = tr;
616 ftrace_init_array_ops(tr, wakeup_tracer_call);
620 start_wakeup_tracer(tr); 617 start_wakeup_tracer(tr);
618
619 wakeup_busy = true;
621 return 0; 620 return 0;
622} 621}
623 622
624static int wakeup_tracer_init(struct trace_array *tr) 623static int wakeup_tracer_init(struct trace_array *tr)
625{ 624{
625 if (wakeup_busy)
626 return -EBUSY;
627
626 wakeup_dl = 0; 628 wakeup_dl = 0;
627 wakeup_rt = 0; 629 wakeup_rt = 0;
628 return __wakeup_tracer_init(tr); 630 return __wakeup_tracer_init(tr);
@@ -630,6 +632,9 @@ static int wakeup_tracer_init(struct trace_array *tr)
630 632
631static int wakeup_rt_tracer_init(struct trace_array *tr) 633static int wakeup_rt_tracer_init(struct trace_array *tr)
632{ 634{
635 if (wakeup_busy)
636 return -EBUSY;
637
633 wakeup_dl = 0; 638 wakeup_dl = 0;
634 wakeup_rt = 1; 639 wakeup_rt = 1;
635 return __wakeup_tracer_init(tr); 640 return __wakeup_tracer_init(tr);
@@ -637,6 +642,9 @@ static int wakeup_rt_tracer_init(struct trace_array *tr)
637 642
638static int wakeup_dl_tracer_init(struct trace_array *tr) 643static int wakeup_dl_tracer_init(struct trace_array *tr)
639{ 644{
645 if (wakeup_busy)
646 return -EBUSY;
647
640 wakeup_dl = 1; 648 wakeup_dl = 1;
641 wakeup_rt = 0; 649 wakeup_rt = 0;
642 return __wakeup_tracer_init(tr); 650 return __wakeup_tracer_init(tr);
@@ -653,6 +661,8 @@ static void wakeup_tracer_reset(struct trace_array *tr)
653 661
654 set_tracer_flag(tr, TRACE_ITER_LATENCY_FMT, lat_flag); 662 set_tracer_flag(tr, TRACE_ITER_LATENCY_FMT, lat_flag);
655 set_tracer_flag(tr, TRACE_ITER_OVERWRITE, overwrite_flag); 663 set_tracer_flag(tr, TRACE_ITER_OVERWRITE, overwrite_flag);
664 ftrace_reset_array_ops(tr);
665 wakeup_busy = false;
656} 666}
657 667
658static void wakeup_tracer_start(struct trace_array *tr) 668static void wakeup_tracer_start(struct trace_array *tr)
@@ -684,6 +694,7 @@ static struct tracer wakeup_tracer __read_mostly =
684#endif 694#endif
685 .open = wakeup_trace_open, 695 .open = wakeup_trace_open,
686 .close = wakeup_trace_close, 696 .close = wakeup_trace_close,
697 .allow_instances = true,
687 .use_max_tr = true, 698 .use_max_tr = true,
688}; 699};
689 700
@@ -694,7 +705,6 @@ static struct tracer wakeup_rt_tracer __read_mostly =
694 .reset = wakeup_tracer_reset, 705 .reset = wakeup_tracer_reset,
695 .start = wakeup_tracer_start, 706 .start = wakeup_tracer_start,
696 .stop = wakeup_tracer_stop, 707 .stop = wakeup_tracer_stop,
697 .wait_pipe = poll_wait_pipe,
698 .print_max = true, 708 .print_max = true,
699 .print_header = wakeup_print_header, 709 .print_header = wakeup_print_header,
700 .print_line = wakeup_print_line, 710 .print_line = wakeup_print_line,
@@ -706,6 +716,7 @@ static struct tracer wakeup_rt_tracer __read_mostly =
706#endif 716#endif
707 .open = wakeup_trace_open, 717 .open = wakeup_trace_open,
708 .close = wakeup_trace_close, 718 .close = wakeup_trace_close,
719 .allow_instances = true,
709 .use_max_tr = true, 720 .use_max_tr = true,
710}; 721};
711 722
@@ -716,7 +727,6 @@ static struct tracer wakeup_dl_tracer __read_mostly =
716 .reset = wakeup_tracer_reset, 727 .reset = wakeup_tracer_reset,
717 .start = wakeup_tracer_start, 728 .start = wakeup_tracer_start,
718 .stop = wakeup_tracer_stop, 729 .stop = wakeup_tracer_stop,
719 .wait_pipe = poll_wait_pipe,
720 .print_max = true, 730 .print_max = true,
721 .print_header = wakeup_print_header, 731 .print_header = wakeup_print_header,
722 .print_line = wakeup_print_line, 732 .print_line = wakeup_print_line,
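The trace_sched_wakeup.c changes above move the tracer from globals (tracing_max_latency and a file-local trace_ops) to per-instance state on struct trace_array (tr->max_latency, tr->ops) and refuse a second instance with -EBUSY. A condensed, hedged sketch of that shape, with hypothetical names and assuming CONFIG_TRACER_MAX_TRACE:

#include <linux/errno.h>
#include <linux/ftrace.h>
#include "trace.h"              /* struct trace_array, ftrace_init_array_ops() */

/* Hypothetical per-function callback; a real tracer records data here. */
static void example_tracer_call(unsigned long ip, unsigned long parent_ip,
                                struct ftrace_ops *op, struct pt_regs *pt_regs)
{
}

/* Only one trace_array instance may own this tracer at a time. */
static bool example_busy;

static int example_tracer_init(struct trace_array *tr)
{
        if (example_busy)
                return -EBUSY;

        tr->max_latency = 0;                    /* per-instance, not global */
        ftrace_init_array_ops(tr, example_tracer_call); /* populates tr->ops */
        example_busy = true;
        return 0;
}

static void example_tracer_reset(struct trace_array *tr)
{
        ftrace_reset_array_ops(tr);             /* detach tr->ops again */
        example_busy = false;
}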
diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c
index e98fca60974f..5ef60499dc8e 100644
--- a/kernel/trace/trace_selftest.c
+++ b/kernel/trace/trace_selftest.c
@@ -65,7 +65,7 @@ static int trace_test_buffer(struct trace_buffer *buf, unsigned long *count)
65 65
66 /* Don't allow flipping of max traces now */ 66 /* Don't allow flipping of max traces now */
67 local_irq_save(flags); 67 local_irq_save(flags);
68 arch_spin_lock(&ftrace_max_lock); 68 arch_spin_lock(&buf->tr->max_lock);
69 69
70 cnt = ring_buffer_entries(buf->buffer); 70 cnt = ring_buffer_entries(buf->buffer);
71 71
@@ -83,7 +83,7 @@ static int trace_test_buffer(struct trace_buffer *buf, unsigned long *count)
83 break; 83 break;
84 } 84 }
85 tracing_on(); 85 tracing_on();
86 arch_spin_unlock(&ftrace_max_lock); 86 arch_spin_unlock(&buf->tr->max_lock);
87 local_irq_restore(flags); 87 local_irq_restore(flags);
88 88
89 if (count) 89 if (count)
@@ -161,11 +161,6 @@ static struct ftrace_ops test_probe3 = {
161 .flags = FTRACE_OPS_FL_RECURSION_SAFE, 161 .flags = FTRACE_OPS_FL_RECURSION_SAFE,
162}; 162};
163 163
164static struct ftrace_ops test_global = {
165 .func = trace_selftest_test_global_func,
166 .flags = FTRACE_OPS_FL_GLOBAL | FTRACE_OPS_FL_RECURSION_SAFE,
167};
168
169static void print_counts(void) 164static void print_counts(void)
170{ 165{
171 printk("(%d %d %d %d %d) ", 166 printk("(%d %d %d %d %d) ",
@@ -185,7 +180,7 @@ static void reset_counts(void)
185 trace_selftest_test_dyn_cnt = 0; 180 trace_selftest_test_dyn_cnt = 0;
186} 181}
187 182
188static int trace_selftest_ops(int cnt) 183static int trace_selftest_ops(struct trace_array *tr, int cnt)
189{ 184{
190 int save_ftrace_enabled = ftrace_enabled; 185 int save_ftrace_enabled = ftrace_enabled;
191 struct ftrace_ops *dyn_ops; 186 struct ftrace_ops *dyn_ops;
@@ -220,7 +215,11 @@ static int trace_selftest_ops(int cnt)
220 register_ftrace_function(&test_probe1); 215 register_ftrace_function(&test_probe1);
221 register_ftrace_function(&test_probe2); 216 register_ftrace_function(&test_probe2);
222 register_ftrace_function(&test_probe3); 217 register_ftrace_function(&test_probe3);
223 register_ftrace_function(&test_global); 218 /* First time we are running with main function */
219 if (cnt > 1) {
220 ftrace_init_array_ops(tr, trace_selftest_test_global_func);
221 register_ftrace_function(tr->ops);
222 }
224 223
225 DYN_FTRACE_TEST_NAME(); 224 DYN_FTRACE_TEST_NAME();
226 225
@@ -232,8 +231,10 @@ static int trace_selftest_ops(int cnt)
232 goto out; 231 goto out;
233 if (trace_selftest_test_probe3_cnt != 1) 232 if (trace_selftest_test_probe3_cnt != 1)
234 goto out; 233 goto out;
235 if (trace_selftest_test_global_cnt == 0) 234 if (cnt > 1) {
236 goto out; 235 if (trace_selftest_test_global_cnt == 0)
236 goto out;
237 }
237 238
238 DYN_FTRACE_TEST_NAME2(); 239 DYN_FTRACE_TEST_NAME2();
239 240
@@ -269,8 +270,10 @@ static int trace_selftest_ops(int cnt)
269 goto out_free; 270 goto out_free;
270 if (trace_selftest_test_probe3_cnt != 3) 271 if (trace_selftest_test_probe3_cnt != 3)
271 goto out_free; 272 goto out_free;
272 if (trace_selftest_test_global_cnt == 0) 273 if (cnt > 1) {
273 goto out; 274 if (trace_selftest_test_global_cnt == 0)
275 goto out;
276 }
274 if (trace_selftest_test_dyn_cnt == 0) 277 if (trace_selftest_test_dyn_cnt == 0)
275 goto out_free; 278 goto out_free;
276 279
@@ -295,7 +298,9 @@ static int trace_selftest_ops(int cnt)
295 unregister_ftrace_function(&test_probe1); 298 unregister_ftrace_function(&test_probe1);
296 unregister_ftrace_function(&test_probe2); 299 unregister_ftrace_function(&test_probe2);
297 unregister_ftrace_function(&test_probe3); 300 unregister_ftrace_function(&test_probe3);
298 unregister_ftrace_function(&test_global); 301 if (cnt > 1)
302 unregister_ftrace_function(tr->ops);
303 ftrace_reset_array_ops(tr);
299 304
300 /* Make sure everything is off */ 305 /* Make sure everything is off */
301 reset_counts(); 306 reset_counts();
@@ -315,9 +320,9 @@ static int trace_selftest_ops(int cnt)
315} 320}
316 321
317/* Test dynamic code modification and ftrace filters */ 322/* Test dynamic code modification and ftrace filters */
318int trace_selftest_startup_dynamic_tracing(struct tracer *trace, 323static int trace_selftest_startup_dynamic_tracing(struct tracer *trace,
319 struct trace_array *tr, 324 struct trace_array *tr,
320 int (*func)(void)) 325 int (*func)(void))
321{ 326{
322 int save_ftrace_enabled = ftrace_enabled; 327 int save_ftrace_enabled = ftrace_enabled;
323 unsigned long count; 328 unsigned long count;
@@ -388,7 +393,7 @@ int trace_selftest_startup_dynamic_tracing(struct tracer *trace,
388 } 393 }
389 394
390 /* Test the ops with global tracing running */ 395 /* Test the ops with global tracing running */
391 ret = trace_selftest_ops(1); 396 ret = trace_selftest_ops(tr, 1);
392 trace->reset(tr); 397 trace->reset(tr);
393 398
394 out: 399 out:
@@ -399,7 +404,7 @@ int trace_selftest_startup_dynamic_tracing(struct tracer *trace,
399 404
400 /* Test the ops with global tracing off */ 405 /* Test the ops with global tracing off */
401 if (!ret) 406 if (!ret)
402 ret = trace_selftest_ops(2); 407 ret = trace_selftest_ops(tr, 2);
403 408
404 return ret; 409 return ret;
405} 410}
@@ -802,7 +807,7 @@ out:
802int 807int
803trace_selftest_startup_irqsoff(struct tracer *trace, struct trace_array *tr) 808trace_selftest_startup_irqsoff(struct tracer *trace, struct trace_array *tr)
804{ 809{
805 unsigned long save_max = tracing_max_latency; 810 unsigned long save_max = tr->max_latency;
806 unsigned long count; 811 unsigned long count;
807 int ret; 812 int ret;
808 813
@@ -814,7 +819,7 @@ trace_selftest_startup_irqsoff(struct tracer *trace, struct trace_array *tr)
814 } 819 }
815 820
816 /* reset the max latency */ 821 /* reset the max latency */
817 tracing_max_latency = 0; 822 tr->max_latency = 0;
818 /* disable interrupts for a bit */ 823 /* disable interrupts for a bit */
819 local_irq_disable(); 824 local_irq_disable();
820 udelay(100); 825 udelay(100);
@@ -841,7 +846,7 @@ trace_selftest_startup_irqsoff(struct tracer *trace, struct trace_array *tr)
841 ret = -1; 846 ret = -1;
842 } 847 }
843 848
844 tracing_max_latency = save_max; 849 tr->max_latency = save_max;
845 850
846 return ret; 851 return ret;
847} 852}
@@ -851,7 +856,7 @@ trace_selftest_startup_irqsoff(struct tracer *trace, struct trace_array *tr)
851int 856int
852trace_selftest_startup_preemptoff(struct tracer *trace, struct trace_array *tr) 857trace_selftest_startup_preemptoff(struct tracer *trace, struct trace_array *tr)
853{ 858{
854 unsigned long save_max = tracing_max_latency; 859 unsigned long save_max = tr->max_latency;
855 unsigned long count; 860 unsigned long count;
856 int ret; 861 int ret;
857 862
@@ -876,7 +881,7 @@ trace_selftest_startup_preemptoff(struct tracer *trace, struct trace_array *tr)
876 } 881 }
877 882
878 /* reset the max latency */ 883 /* reset the max latency */
879 tracing_max_latency = 0; 884 tr->max_latency = 0;
880 /* disable preemption for a bit */ 885 /* disable preemption for a bit */
881 preempt_disable(); 886 preempt_disable();
882 udelay(100); 887 udelay(100);
@@ -903,7 +908,7 @@ trace_selftest_startup_preemptoff(struct tracer *trace, struct trace_array *tr)
903 ret = -1; 908 ret = -1;
904 } 909 }
905 910
906 tracing_max_latency = save_max; 911 tr->max_latency = save_max;
907 912
908 return ret; 913 return ret;
909} 914}
@@ -913,7 +918,7 @@ trace_selftest_startup_preemptoff(struct tracer *trace, struct trace_array *tr)
913int 918int
914trace_selftest_startup_preemptirqsoff(struct tracer *trace, struct trace_array *tr) 919trace_selftest_startup_preemptirqsoff(struct tracer *trace, struct trace_array *tr)
915{ 920{
916 unsigned long save_max = tracing_max_latency; 921 unsigned long save_max = tr->max_latency;
917 unsigned long count; 922 unsigned long count;
918 int ret; 923 int ret;
919 924
@@ -938,7 +943,7 @@ trace_selftest_startup_preemptirqsoff(struct tracer *trace, struct trace_array *
938 } 943 }
939 944
940 /* reset the max latency */ 945 /* reset the max latency */
941 tracing_max_latency = 0; 946 tr->max_latency = 0;
942 947
943 /* disable preemption and interrupts for a bit */ 948 /* disable preemption and interrupts for a bit */
944 preempt_disable(); 949 preempt_disable();
@@ -973,7 +978,7 @@ trace_selftest_startup_preemptirqsoff(struct tracer *trace, struct trace_array *
973 } 978 }
974 979
975 /* do the test by disabling interrupts first this time */ 980 /* do the test by disabling interrupts first this time */
976 tracing_max_latency = 0; 981 tr->max_latency = 0;
977 tracing_start(); 982 tracing_start();
978 trace->start(tr); 983 trace->start(tr);
979 984
@@ -1004,7 +1009,7 @@ out:
1004 tracing_start(); 1009 tracing_start();
1005out_no_start: 1010out_no_start:
1006 trace->reset(tr); 1011 trace->reset(tr);
1007 tracing_max_latency = save_max; 1012 tr->max_latency = save_max;
1008 1013
1009 return ret; 1014 return ret;
1010} 1015}
@@ -1057,7 +1062,7 @@ static int trace_wakeup_test_thread(void *data)
1057int 1062int
1058trace_selftest_startup_wakeup(struct tracer *trace, struct trace_array *tr) 1063trace_selftest_startup_wakeup(struct tracer *trace, struct trace_array *tr)
1059{ 1064{
1060 unsigned long save_max = tracing_max_latency; 1065 unsigned long save_max = tr->max_latency;
1061 struct task_struct *p; 1066 struct task_struct *p;
1062 struct completion is_ready; 1067 struct completion is_ready;
1063 unsigned long count; 1068 unsigned long count;
@@ -1083,7 +1088,7 @@ trace_selftest_startup_wakeup(struct tracer *trace, struct trace_array *tr)
1083 } 1088 }
1084 1089
1085 /* reset the max latency */ 1090 /* reset the max latency */
1086 tracing_max_latency = 0; 1091 tr->max_latency = 0;
1087 1092
1088 while (p->on_rq) { 1093 while (p->on_rq) {
1089 /* 1094 /*
@@ -1113,7 +1118,7 @@ trace_selftest_startup_wakeup(struct tracer *trace, struct trace_array *tr)
1113 trace->reset(tr); 1118 trace->reset(tr);
1114 tracing_start(); 1119 tracing_start();
1115 1120
1116 tracing_max_latency = save_max; 1121 tr->max_latency = save_max;
1117 1122
1118 /* kill the thread */ 1123 /* kill the thread */
1119 kthread_stop(p); 1124 kthread_stop(p);
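The selftest conversions above follow the same per-instance move: each latency test now brackets its run with the array's own max_latency instead of the old global. A hedged sketch of that bracket (hypothetical test body):

#include "trace.h"      /* struct trace_array with per-instance max_latency */

static int example_latency_selftest(struct trace_array *tr)
{
        unsigned long save_max = tr->max_latency;
        int ret = 0;

        tr->max_latency = 0;            /* measure from a clean slate */
        /* ... provoke the latency under test and check the buffers ... */
        tr->max_latency = save_max;     /* leave the instance as it was found */
        return ret;
}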
diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c
index 21b320e5d163..8a4e5cb66a4c 100644
--- a/kernel/trace/trace_stack.c
+++ b/kernel/trace/trace_stack.c
@@ -51,11 +51,33 @@ static DEFINE_MUTEX(stack_sysctl_mutex);
51int stack_tracer_enabled; 51int stack_tracer_enabled;
52static int last_stack_tracer_enabled; 52static int last_stack_tracer_enabled;
53 53
54static inline void print_max_stack(void)
55{
56 long i;
57 int size;
58
59 pr_emerg(" Depth Size Location (%d entries)\n"
60 " ----- ---- --------\n",
61 max_stack_trace.nr_entries - 1);
62
63 for (i = 0; i < max_stack_trace.nr_entries; i++) {
64 if (stack_dump_trace[i] == ULONG_MAX)
65 break;
66 if (i+1 == max_stack_trace.nr_entries ||
67 stack_dump_trace[i+1] == ULONG_MAX)
68 size = stack_dump_index[i];
69 else
70 size = stack_dump_index[i] - stack_dump_index[i+1];
71
72 pr_emerg("%3ld) %8d %5d %pS\n", i, stack_dump_index[i],
73 size, (void *)stack_dump_trace[i]);
74 }
75}
76
54static inline void 77static inline void
55check_stack(unsigned long ip, unsigned long *stack) 78check_stack(unsigned long ip, unsigned long *stack)
56{ 79{
57 unsigned long this_size, flags; 80 unsigned long this_size, flags; unsigned long *p, *top, *start;
58 unsigned long *p, *top, *start;
59 static int tracer_frame; 81 static int tracer_frame;
60 int frame_size = ACCESS_ONCE(tracer_frame); 82 int frame_size = ACCESS_ONCE(tracer_frame);
61 int i; 83 int i;
@@ -85,8 +107,12 @@ check_stack(unsigned long ip, unsigned long *stack)
85 107
86 max_stack_size = this_size; 108 max_stack_size = this_size;
87 109
88 max_stack_trace.nr_entries = 0; 110 max_stack_trace.nr_entries = 0;
89 max_stack_trace.skip = 3; 111
112 if (using_ftrace_ops_list_func())
113 max_stack_trace.skip = 4;
114 else
115 max_stack_trace.skip = 3;
90 116
91 save_stack_trace(&max_stack_trace); 117 save_stack_trace(&max_stack_trace);
92 118
@@ -145,8 +171,12 @@ check_stack(unsigned long ip, unsigned long *stack)
145 i++; 171 i++;
146 } 172 }
147 173
148 BUG_ON(current != &init_task && 174 if ((current != &init_task &&
149 *(end_of_stack(current)) != STACK_END_MAGIC); 175 *(end_of_stack(current)) != STACK_END_MAGIC)) {
176 print_max_stack();
177 BUG();
178 }
179
150 out: 180 out:
151 arch_spin_unlock(&max_stack_lock); 181 arch_spin_unlock(&max_stack_lock);
152 local_irq_restore(flags); 182 local_irq_restore(flags);
diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c
index c082a7441345..3c9b97e6b1f4 100644
--- a/kernel/trace/trace_uprobe.c
+++ b/kernel/trace/trace_uprobe.c
@@ -108,8 +108,8 @@ static unsigned long get_user_stack_nth(struct pt_regs *regs, unsigned int n)
108 * Uprobes-specific fetch functions 108 * Uprobes-specific fetch functions
109 */ 109 */
110#define DEFINE_FETCH_stack(type) \ 110#define DEFINE_FETCH_stack(type) \
111static __kprobes void FETCH_FUNC_NAME(stack, type)(struct pt_regs *regs,\ 111static void FETCH_FUNC_NAME(stack, type)(struct pt_regs *regs, \
112 void *offset, void *dest) \ 112 void *offset, void *dest) \
113{ \ 113{ \
114 *(type *)dest = (type)get_user_stack_nth(regs, \ 114 *(type *)dest = (type)get_user_stack_nth(regs, \
115 ((unsigned long)offset)); \ 115 ((unsigned long)offset)); \
@@ -120,8 +120,8 @@ DEFINE_BASIC_FETCH_FUNCS(stack)
120#define fetch_stack_string_size NULL 120#define fetch_stack_string_size NULL
121 121
122#define DEFINE_FETCH_memory(type) \ 122#define DEFINE_FETCH_memory(type) \
123static __kprobes void FETCH_FUNC_NAME(memory, type)(struct pt_regs *regs,\ 123static void FETCH_FUNC_NAME(memory, type)(struct pt_regs *regs, \
124 void *addr, void *dest) \ 124 void *addr, void *dest) \
125{ \ 125{ \
126 type retval; \ 126 type retval; \
127 void __user *vaddr = (void __force __user *) addr; \ 127 void __user *vaddr = (void __force __user *) addr; \
@@ -136,8 +136,8 @@ DEFINE_BASIC_FETCH_FUNCS(memory)
136 * Fetch a null-terminated string. Caller MUST set *(u32 *)dest with max 136 * Fetch a null-terminated string. Caller MUST set *(u32 *)dest with max
137 * length and relative data location. 137 * length and relative data location.
138 */ 138 */
139static __kprobes void FETCH_FUNC_NAME(memory, string)(struct pt_regs *regs, 139static void FETCH_FUNC_NAME(memory, string)(struct pt_regs *regs,
140 void *addr, void *dest) 140 void *addr, void *dest)
141{ 141{
142 long ret; 142 long ret;
143 u32 rloc = *(u32 *)dest; 143 u32 rloc = *(u32 *)dest;
@@ -158,8 +158,8 @@ static __kprobes void FETCH_FUNC_NAME(memory, string)(struct pt_regs *regs,
158 } 158 }
159} 159}
160 160
161static __kprobes void FETCH_FUNC_NAME(memory, string_size)(struct pt_regs *regs, 161static void FETCH_FUNC_NAME(memory, string_size)(struct pt_regs *regs,
162 void *addr, void *dest) 162 void *addr, void *dest)
163{ 163{
164 int len; 164 int len;
165 void __user *vaddr = (void __force __user *) addr; 165 void __user *vaddr = (void __force __user *) addr;
@@ -184,8 +184,8 @@ static unsigned long translate_user_vaddr(void *file_offset)
184} 184}
185 185
186#define DEFINE_FETCH_file_offset(type) \ 186#define DEFINE_FETCH_file_offset(type) \
187static __kprobes void FETCH_FUNC_NAME(file_offset, type)(struct pt_regs *regs,\ 187static void FETCH_FUNC_NAME(file_offset, type)(struct pt_regs *regs, \
188 void *offset, void *dest) \ 188 void *offset, void *dest)\
189{ \ 189{ \
190 void *vaddr = (void *)translate_user_vaddr(offset); \ 190 void *vaddr = (void *)translate_user_vaddr(offset); \
191 \ 191 \
@@ -893,6 +893,9 @@ probe_event_enable(struct trace_uprobe *tu, struct ftrace_event_file *file,
893 int ret; 893 int ret;
894 894
895 if (file) { 895 if (file) {
896 if (tu->tp.flags & TP_FLAG_PROFILE)
897 return -EINTR;
898
896 link = kmalloc(sizeof(*link), GFP_KERNEL); 899 link = kmalloc(sizeof(*link), GFP_KERNEL);
897 if (!link) 900 if (!link)
898 return -ENOMEM; 901 return -ENOMEM;
@@ -901,29 +904,40 @@ probe_event_enable(struct trace_uprobe *tu, struct ftrace_event_file *file,
901 list_add_tail_rcu(&link->list, &tu->tp.files); 904 list_add_tail_rcu(&link->list, &tu->tp.files);
902 905
903 tu->tp.flags |= TP_FLAG_TRACE; 906 tu->tp.flags |= TP_FLAG_TRACE;
904 } else 907 } else {
905 tu->tp.flags |= TP_FLAG_PROFILE; 908 if (tu->tp.flags & TP_FLAG_TRACE)
909 return -EINTR;
906 910
907 ret = uprobe_buffer_enable(); 911 tu->tp.flags |= TP_FLAG_PROFILE;
908 if (ret < 0) 912 }
909 return ret;
910 913
911 WARN_ON(!uprobe_filter_is_empty(&tu->filter)); 914 WARN_ON(!uprobe_filter_is_empty(&tu->filter));
912 915
913 if (enabled) 916 if (enabled)
914 return 0; 917 return 0;
915 918
919 ret = uprobe_buffer_enable();
920 if (ret)
921 goto err_flags;
922
916 tu->consumer.filter = filter; 923 tu->consumer.filter = filter;
917 ret = uprobe_register(tu->inode, tu->offset, &tu->consumer); 924 ret = uprobe_register(tu->inode, tu->offset, &tu->consumer);
918 if (ret) { 925 if (ret)
919 if (file) { 926 goto err_buffer;
920 list_del(&link->list); 927
921 kfree(link); 928 return 0;
922 tu->tp.flags &= ~TP_FLAG_TRACE;
923 } else
924 tu->tp.flags &= ~TP_FLAG_PROFILE;
925 }
926 929
930 err_buffer:
931 uprobe_buffer_disable();
932
933 err_flags:
934 if (file) {
935 list_del(&link->list);
936 kfree(link);
937 tu->tp.flags &= ~TP_FLAG_TRACE;
938 } else {
939 tu->tp.flags &= ~TP_FLAG_PROFILE;
940 }
927 return ret; 941 return ret;
928} 942}
929 943
@@ -1009,56 +1023,60 @@ uprobe_filter_event(struct trace_uprobe *tu, struct perf_event *event)
1009 return __uprobe_perf_filter(&tu->filter, event->hw.tp_target->mm); 1023 return __uprobe_perf_filter(&tu->filter, event->hw.tp_target->mm);
1010} 1024}
1011 1025
1012static int uprobe_perf_open(struct trace_uprobe *tu, struct perf_event *event) 1026static int uprobe_perf_close(struct trace_uprobe *tu, struct perf_event *event)
1013{ 1027{
1014 bool done; 1028 bool done;
1015 1029
1016 write_lock(&tu->filter.rwlock); 1030 write_lock(&tu->filter.rwlock);
1017 if (event->hw.tp_target) { 1031 if (event->hw.tp_target) {
1018 /* 1032 list_del(&event->hw.tp_list);
1019 * event->parent != NULL means copy_process(), we can avoid
1020 * uprobe_apply(). current->mm must be probed and we can rely
1021 * on dup_mmap() which preserves the already installed bp's.
1022 *
1023 * attr.enable_on_exec means that exec/mmap will install the
1024 * breakpoints we need.
1025 */
1026 done = tu->filter.nr_systemwide || 1033 done = tu->filter.nr_systemwide ||
1027 event->parent || event->attr.enable_on_exec || 1034 (event->hw.tp_target->flags & PF_EXITING) ||
1028 uprobe_filter_event(tu, event); 1035 uprobe_filter_event(tu, event);
1029 list_add(&event->hw.tp_list, &tu->filter.perf_events);
1030 } else { 1036 } else {
1037 tu->filter.nr_systemwide--;
1031 done = tu->filter.nr_systemwide; 1038 done = tu->filter.nr_systemwide;
1032 tu->filter.nr_systemwide++;
1033 } 1039 }
1034 write_unlock(&tu->filter.rwlock); 1040 write_unlock(&tu->filter.rwlock);
1035 1041
1036 if (!done) 1042 if (!done)
1037 uprobe_apply(tu->inode, tu->offset, &tu->consumer, true); 1043 return uprobe_apply(tu->inode, tu->offset, &tu->consumer, false);
1038 1044
1039 return 0; 1045 return 0;
1040} 1046}
1041 1047
1042static int uprobe_perf_close(struct trace_uprobe *tu, struct perf_event *event) 1048static int uprobe_perf_open(struct trace_uprobe *tu, struct perf_event *event)
1043{ 1049{
1044 bool done; 1050 bool done;
1051 int err;
1045 1052
1046 write_lock(&tu->filter.rwlock); 1053 write_lock(&tu->filter.rwlock);
1047 if (event->hw.tp_target) { 1054 if (event->hw.tp_target) {
1048 list_del(&event->hw.tp_list); 1055 /*
1056 * event->parent != NULL means copy_process(), we can avoid
1057 * uprobe_apply(). current->mm must be probed and we can rely
1058 * on dup_mmap() which preserves the already installed bp's.
1059 *
1060 * attr.enable_on_exec means that exec/mmap will install the
1061 * breakpoints we need.
1062 */
1049 done = tu->filter.nr_systemwide || 1063 done = tu->filter.nr_systemwide ||
1050 (event->hw.tp_target->flags & PF_EXITING) || 1064 event->parent || event->attr.enable_on_exec ||
1051 uprobe_filter_event(tu, event); 1065 uprobe_filter_event(tu, event);
1066 list_add(&event->hw.tp_list, &tu->filter.perf_events);
1052 } else { 1067 } else {
1053 tu->filter.nr_systemwide--;
1054 done = tu->filter.nr_systemwide; 1068 done = tu->filter.nr_systemwide;
1069 tu->filter.nr_systemwide++;
1055 } 1070 }
1056 write_unlock(&tu->filter.rwlock); 1071 write_unlock(&tu->filter.rwlock);
1057 1072
1058 if (!done) 1073 err = 0;
1059 uprobe_apply(tu->inode, tu->offset, &tu->consumer, false); 1074 if (!done) {
1060 1075 err = uprobe_apply(tu->inode, tu->offset, &tu->consumer, true);
1061 return 0; 1076 if (err)
1077 uprobe_perf_close(tu, event);
1078 }
1079 return err;
1062} 1080}
1063 1081
1064static bool uprobe_perf_filter(struct uprobe_consumer *uc, 1082static bool uprobe_perf_filter(struct uprobe_consumer *uc,
@@ -1197,12 +1215,6 @@ static int uprobe_dispatcher(struct uprobe_consumer *con, struct pt_regs *regs)
1197 1215
1198 current->utask->vaddr = (unsigned long) &udd; 1216 current->utask->vaddr = (unsigned long) &udd;
1199 1217
1200#ifdef CONFIG_PERF_EVENTS
1201 if ((tu->tp.flags & TP_FLAG_TRACE) == 0 &&
1202 !uprobe_perf_filter(&tu->consumer, 0, current->mm))
1203 return UPROBE_HANDLER_REMOVE;
1204#endif
1205
1206 if (WARN_ON_ONCE(!uprobe_cpu_buffer)) 1218 if (WARN_ON_ONCE(!uprobe_cpu_buffer))
1207 return 0; 1219 return 0;
1208 1220
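The probe_event_enable() rework above makes the trace and perf modes mutually exclusive up front and converts the open-coded failure cleanup into a reverse-order unwind through labels. A hedged, self-contained sketch of that unwind shape, with hypothetical stand-ins for the uprobe pieces:

#include <linux/errno.h>

/* Hypothetical stand-ins for the buffer/registration steps used above. */
static int example_buffer_enable(void)   { return 0; }
static void example_buffer_disable(void) { }
static int example_register(void)        { return 0; }
static void example_clear_flags(void)    { }

static int example_enable(void)
{
        int ret;

        ret = example_buffer_enable();  /* resource 1 */
        if (ret)
                goto err_flags;

        ret = example_register();       /* resource 2 */
        if (ret)
                goto err_buffer;

        return 0;

 err_buffer:
        example_buffer_disable();       /* undo resource 1 */
 err_flags:
        example_clear_flags();          /* undo the earlier flag/link setup */
        return ret;
}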
diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c
index 6620e5837ce2..3490407dc7b7 100644
--- a/kernel/tracepoint.c
+++ b/kernel/tracepoint.c
@@ -239,6 +239,7 @@ static int tracepoint_remove_func(struct tracepoint *tp,
239 * tracepoint_probe_register - Connect a probe to a tracepoint 239 * tracepoint_probe_register - Connect a probe to a tracepoint
240 * @tp: tracepoint 240 * @tp: tracepoint
241 * @probe: probe handler 241 * @probe: probe handler
242 * @data: tracepoint data
242 * 243 *
243 * Returns 0 if ok, error value on error. 244 * Returns 0 if ok, error value on error.
244 * Note: if @tp is within a module, the caller is responsible for 245 * Note: if @tp is within a module, the caller is responsible for
@@ -264,6 +265,7 @@ EXPORT_SYMBOL_GPL(tracepoint_probe_register);
264 * tracepoint_probe_unregister - Disconnect a probe from a tracepoint 265 * tracepoint_probe_unregister - Disconnect a probe from a tracepoint
265 * @tp: tracepoint 266 * @tp: tracepoint
266 * @probe: probe function pointer 267 * @probe: probe function pointer
268 * @data: tracepoint data
267 * 269 *
268 * Returns 0 if ok, error value on error. 270 * Returns 0 if ok, error value on error.
269 */ 271 */
@@ -490,33 +492,29 @@ static int sys_tracepoint_refcount;
490 492
491void syscall_regfunc(void) 493void syscall_regfunc(void)
492{ 494{
493 unsigned long flags; 495 struct task_struct *p, *t;
494 struct task_struct *g, *t;
495 496
496 if (!sys_tracepoint_refcount) { 497 if (!sys_tracepoint_refcount) {
497 read_lock_irqsave(&tasklist_lock, flags); 498 read_lock(&tasklist_lock);
498 do_each_thread(g, t) { 499 for_each_process_thread(p, t) {
499 /* Skip kernel threads. */ 500 set_tsk_thread_flag(t, TIF_SYSCALL_TRACEPOINT);
500 if (t->mm) 501 }
501 set_tsk_thread_flag(t, TIF_SYSCALL_TRACEPOINT); 502 read_unlock(&tasklist_lock);
502 } while_each_thread(g, t);
503 read_unlock_irqrestore(&tasklist_lock, flags);
504 } 503 }
505 sys_tracepoint_refcount++; 504 sys_tracepoint_refcount++;
506} 505}
507 506
508void syscall_unregfunc(void) 507void syscall_unregfunc(void)
509{ 508{
510 unsigned long flags; 509 struct task_struct *p, *t;
511 struct task_struct *g, *t;
512 510
513 sys_tracepoint_refcount--; 511 sys_tracepoint_refcount--;
514 if (!sys_tracepoint_refcount) { 512 if (!sys_tracepoint_refcount) {
515 read_lock_irqsave(&tasklist_lock, flags); 513 read_lock(&tasklist_lock);
516 do_each_thread(g, t) { 514 for_each_process_thread(p, t) {
517 clear_tsk_thread_flag(t, TIF_SYSCALL_TRACEPOINT); 515 clear_tsk_thread_flag(t, TIF_SYSCALL_TRACEPOINT);
518 } while_each_thread(g, t); 516 }
519 read_unlock_irqrestore(&tasklist_lock, flags); 517 read_unlock(&tasklist_lock);
520 } 518 }
521} 519}
522#endif 520#endif
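syscall_regfunc()/syscall_unregfunc() above switch from the do_each_thread()/while_each_thread() pair under read_lock_irqsave() to for_each_process_thread() under a plain read_lock(), and no longer skip kernel threads. A hedged sketch of that iteration style (illustrative flag argument):

#include <linux/sched.h>

static void example_mark_all_threads(int flag)
{
        struct task_struct *p, *t;

        read_lock(&tasklist_lock);
        for_each_process_thread(p, t)
                set_tsk_thread_flag(t, flag);   /* e.g. TIF_SYSCALL_TRACEPOINT */
        read_unlock(&tasklist_lock);
}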
diff --git a/kernel/user.c b/kernel/user.c
index 294fc6a94168..4efa39350e44 100644
--- a/kernel/user.c
+++ b/kernel/user.c
@@ -87,7 +87,6 @@ static DEFINE_SPINLOCK(uidhash_lock);
87struct user_struct root_user = { 87struct user_struct root_user = {
88 .__count = ATOMIC_INIT(1), 88 .__count = ATOMIC_INIT(1),
89 .processes = ATOMIC_INIT(1), 89 .processes = ATOMIC_INIT(1),
90 .files = ATOMIC_INIT(0),
91 .sigpending = ATOMIC_INIT(0), 90 .sigpending = ATOMIC_INIT(0),
92 .locked_shm = 0, 91 .locked_shm = 0,
93 .uid = GLOBAL_ROOT_UID, 92 .uid = GLOBAL_ROOT_UID,
diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c
index bf71b4b2d632..fcc02560fd6b 100644
--- a/kernel/user_namespace.c
+++ b/kernel/user_namespace.c
@@ -286,7 +286,7 @@ EXPORT_SYMBOL(from_kuid_munged);
286/** 286/**
287 * make_kgid - Map a user-namespace gid pair into a kgid. 287 * make_kgid - Map a user-namespace gid pair into a kgid.
288 * @ns: User namespace that the gid is in 288 * @ns: User namespace that the gid is in
289 * @uid: group identifier 289 * @gid: group identifier
290 * 290 *
291 * Maps a user-namespace gid pair into a kernel internal kgid, 291 * Maps a user-namespace gid pair into a kernel internal kgid,
292 * and returns that kgid. 292 * and returns that kgid.
@@ -482,7 +482,8 @@ static int projid_m_show(struct seq_file *seq, void *v)
482 return 0; 482 return 0;
483} 483}
484 484
485static void *m_start(struct seq_file *seq, loff_t *ppos, struct uid_gid_map *map) 485static void *m_start(struct seq_file *seq, loff_t *ppos,
486 struct uid_gid_map *map)
486{ 487{
487 struct uid_gid_extent *extent = NULL; 488 struct uid_gid_extent *extent = NULL;
488 loff_t pos = *ppos; 489 loff_t pos = *ppos;
@@ -546,7 +547,8 @@ struct seq_operations proc_projid_seq_operations = {
546 .show = projid_m_show, 547 .show = projid_m_show,
547}; 548};
548 549
549static bool mappings_overlap(struct uid_gid_map *new_map, struct uid_gid_extent *extent) 550static bool mappings_overlap(struct uid_gid_map *new_map,
551 struct uid_gid_extent *extent)
550{ 552{
551 u32 upper_first, lower_first, upper_last, lower_last; 553 u32 upper_first, lower_first, upper_last, lower_last;
552 unsigned idx; 554 unsigned idx;
@@ -653,7 +655,7 @@ static ssize_t map_write(struct file *file, const char __user *buf,
653 ret = -EINVAL; 655 ret = -EINVAL;
654 pos = kbuf; 656 pos = kbuf;
655 new_map.nr_extents = 0; 657 new_map.nr_extents = 0;
656 for (;pos; pos = next_line) { 658 for (; pos; pos = next_line) {
657 extent = &new_map.extent[new_map.nr_extents]; 659 extent = &new_map.extent[new_map.nr_extents];
658 660
659 /* Find the end of line and ensure I don't look past it */ 661 /* Find the end of line and ensure I don't look past it */
@@ -687,13 +689,16 @@ static ssize_t map_write(struct file *file, const char __user *buf,
687 689
688 /* Verify we have been given valid starting values */ 690 /* Verify we have been given valid starting values */
689 if ((extent->first == (u32) -1) || 691 if ((extent->first == (u32) -1) ||
690 (extent->lower_first == (u32) -1 )) 692 (extent->lower_first == (u32) -1))
691 goto out; 693 goto out;
692 694
693 /* Verify count is not zero and does not cause the extent to wrap */ 695 /* Verify count is not zero and does not cause the
696 * extent to wrap
697 */
694 if ((extent->first + extent->count) <= extent->first) 698 if ((extent->first + extent->count) <= extent->first)
695 goto out; 699 goto out;
696 if ((extent->lower_first + extent->count) <= extent->lower_first) 700 if ((extent->lower_first + extent->count) <=
701 extent->lower_first)
697 goto out; 702 goto out;
698 703
699 /* Do the ranges in extent overlap any previous extents? */ 704 /* Do the ranges in extent overlap any previous extents? */
@@ -751,7 +756,8 @@ out:
751 return ret; 756 return ret;
752} 757}
753 758
754ssize_t proc_uid_map_write(struct file *file, const char __user *buf, size_t size, loff_t *ppos) 759ssize_t proc_uid_map_write(struct file *file, const char __user *buf,
760 size_t size, loff_t *ppos)
755{ 761{
756 struct seq_file *seq = file->private_data; 762 struct seq_file *seq = file->private_data;
757 struct user_namespace *ns = seq->private; 763 struct user_namespace *ns = seq->private;
@@ -767,7 +773,8 @@ ssize_t proc_uid_map_write(struct file *file, const char __user *buf, size_t siz
767 &ns->uid_map, &ns->parent->uid_map); 773 &ns->uid_map, &ns->parent->uid_map);
768} 774}
769 775
770ssize_t proc_gid_map_write(struct file *file, const char __user *buf, size_t size, loff_t *ppos) 776ssize_t proc_gid_map_write(struct file *file, const char __user *buf,
777 size_t size, loff_t *ppos)
771{ 778{
772 struct seq_file *seq = file->private_data; 779 struct seq_file *seq = file->private_data;
773 struct user_namespace *ns = seq->private; 780 struct user_namespace *ns = seq->private;
@@ -783,7 +790,8 @@ ssize_t proc_gid_map_write(struct file *file, const char __user *buf, size_t siz
783 &ns->gid_map, &ns->parent->gid_map); 790 &ns->gid_map, &ns->parent->gid_map);
784} 791}
785 792
786ssize_t proc_projid_map_write(struct file *file, const char __user *buf, size_t size, loff_t *ppos) 793ssize_t proc_projid_map_write(struct file *file, const char __user *buf,
794 size_t size, loff_t *ppos)
787{ 795{
788 struct seq_file *seq = file->private_data; 796 struct seq_file *seq = file->private_data;
789 struct user_namespace *ns = seq->private; 797 struct user_namespace *ns = seq->private;
@@ -800,7 +808,7 @@ ssize_t proc_projid_map_write(struct file *file, const char __user *buf, size_t
800 &ns->projid_map, &ns->parent->projid_map); 808 &ns->projid_map, &ns->parent->projid_map);
801} 809}
802 810
803static bool new_idmap_permitted(const struct file *file, 811static bool new_idmap_permitted(const struct file *file,
804 struct user_namespace *ns, int cap_setid, 812 struct user_namespace *ns, int cap_setid,
805 struct uid_gid_map *new_map) 813 struct uid_gid_map *new_map)
806{ 814{
@@ -811,8 +819,7 @@ static bool new_idmap_permitted(const struct file *file,
811 kuid_t uid = make_kuid(ns->parent, id); 819 kuid_t uid = make_kuid(ns->parent, id);
812 if (uid_eq(uid, file->f_cred->fsuid)) 820 if (uid_eq(uid, file->f_cred->fsuid))
813 return true; 821 return true;
814 } 822 } else if (cap_setid == CAP_SETGID) {
815 else if (cap_setid == CAP_SETGID) {
816 kgid_t gid = make_kgid(ns->parent, id); 823 kgid_t gid = make_kgid(ns->parent, id);
817 if (gid_eq(gid, file->f_cred->fsgid)) 824 if (gid_eq(gid, file->f_cred->fsgid))
818 return true; 825 return true;
diff --git a/kernel/utsname_sysctl.c b/kernel/utsname_sysctl.c
index 4f69f9a5e221..c8eac43267e9 100644
--- a/kernel/utsname_sysctl.c
+++ b/kernel/utsname_sysctl.c
@@ -17,7 +17,7 @@
17 17
18#ifdef CONFIG_PROC_SYSCTL 18#ifdef CONFIG_PROC_SYSCTL
19 19
20static void *get_uts(ctl_table *table, int write) 20static void *get_uts(struct ctl_table *table, int write)
21{ 21{
22 char *which = table->data; 22 char *which = table->data;
23 struct uts_namespace *uts_ns; 23 struct uts_namespace *uts_ns;
@@ -32,7 +32,7 @@ static void *get_uts(ctl_table *table, int write)
32 return which; 32 return which;
33} 33}
34 34
35static void put_uts(ctl_table *table, int write, void *which) 35static void put_uts(struct ctl_table *table, int write, void *which)
36{ 36{
37 if (!write) 37 if (!write)
38 up_read(&uts_sem); 38 up_read(&uts_sem);
@@ -44,14 +44,14 @@ static void put_uts(ctl_table *table, int write, void *which)
44 * Special case of dostring for the UTS structure. This has locks 44 * Special case of dostring for the UTS structure. This has locks
45 * to observe. Should this be in kernel/sys.c ???? 45 * to observe. Should this be in kernel/sys.c ????
46 */ 46 */
47static int proc_do_uts_string(ctl_table *table, int write, 47static int proc_do_uts_string(struct ctl_table *table, int write,
48 void __user *buffer, size_t *lenp, loff_t *ppos) 48 void __user *buffer, size_t *lenp, loff_t *ppos)
49{ 49{
50 struct ctl_table uts_table; 50 struct ctl_table uts_table;
51 int r; 51 int r;
52 memcpy(&uts_table, table, sizeof(uts_table)); 52 memcpy(&uts_table, table, sizeof(uts_table));
53 uts_table.data = get_uts(table, write); 53 uts_table.data = get_uts(table, write);
54 r = proc_dostring(&uts_table,write,buffer,lenp, ppos); 54 r = proc_dostring(&uts_table, write, buffer, lenp, ppos);
55 put_uts(table, write, uts_table.data); 55 put_uts(table, write, uts_table.data);
56 56
57 if (write) 57 if (write)
@@ -135,4 +135,4 @@ static int __init utsname_sysctl_init(void)
135 return 0; 135 return 0;
136} 136}
137 137
138__initcall(utsname_sysctl_init); 138device_initcall(utsname_sysctl_init);
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index 516203e665fc..c3319bd1b040 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -31,6 +31,12 @@
31 31
32int watchdog_user_enabled = 1; 32int watchdog_user_enabled = 1;
33int __read_mostly watchdog_thresh = 10; 33int __read_mostly watchdog_thresh = 10;
34#ifdef CONFIG_SMP
35int __read_mostly sysctl_softlockup_all_cpu_backtrace;
36#else
37#define sysctl_softlockup_all_cpu_backtrace 0
38#endif
39
34static int __read_mostly watchdog_running; 40static int __read_mostly watchdog_running;
35static u64 __read_mostly sample_period; 41static u64 __read_mostly sample_period;
36 42
@@ -47,6 +53,7 @@ static DEFINE_PER_CPU(bool, watchdog_nmi_touch);
47static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts_saved); 53static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts_saved);
48static DEFINE_PER_CPU(struct perf_event *, watchdog_ev); 54static DEFINE_PER_CPU(struct perf_event *, watchdog_ev);
49#endif 55#endif
56static unsigned long soft_lockup_nmi_warn;
50 57
51/* boot commands */ 58/* boot commands */
52/* 59/*
@@ -95,6 +102,15 @@ static int __init nosoftlockup_setup(char *str)
95} 102}
96__setup("nosoftlockup", nosoftlockup_setup); 103__setup("nosoftlockup", nosoftlockup_setup);
97/* */ 104/* */
105#ifdef CONFIG_SMP
106static int __init softlockup_all_cpu_backtrace_setup(char *str)
107{
108 sysctl_softlockup_all_cpu_backtrace =
109 !!simple_strtol(str, NULL, 0);
110 return 1;
111}
112__setup("softlockup_all_cpu_backtrace=", softlockup_all_cpu_backtrace_setup);
113#endif
98 114
99/* 115/*
100 * Hard-lockup warnings should be triggered after just a few seconds. Soft- 116 * Hard-lockup warnings should be triggered after just a few seconds. Soft-
@@ -271,6 +287,7 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
271 unsigned long touch_ts = __this_cpu_read(watchdog_touch_ts); 287 unsigned long touch_ts = __this_cpu_read(watchdog_touch_ts);
272 struct pt_regs *regs = get_irq_regs(); 288 struct pt_regs *regs = get_irq_regs();
273 int duration; 289 int duration;
290 int softlockup_all_cpu_backtrace = sysctl_softlockup_all_cpu_backtrace;
274 291
275 /* kick the hardlockup detector */ 292 /* kick the hardlockup detector */
276 watchdog_interrupt_count(); 293 watchdog_interrupt_count();
@@ -317,6 +334,17 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
317 if (__this_cpu_read(soft_watchdog_warn) == true) 334 if (__this_cpu_read(soft_watchdog_warn) == true)
318 return HRTIMER_RESTART; 335 return HRTIMER_RESTART;
319 336
337 if (softlockup_all_cpu_backtrace) {
338 /* Prevent multiple soft-lockup reports if one cpu is already
339 * engaged in dumping cpu back traces
340 */
341 if (test_and_set_bit(0, &soft_lockup_nmi_warn)) {
342 /* Someone else will report us. Let's give up */
343 __this_cpu_write(soft_watchdog_warn, true);
344 return HRTIMER_RESTART;
345 }
346 }
347
320 printk(KERN_EMERG "BUG: soft lockup - CPU#%d stuck for %us! [%s:%d]\n", 348 printk(KERN_EMERG "BUG: soft lockup - CPU#%d stuck for %us! [%s:%d]\n",
321 smp_processor_id(), duration, 349 smp_processor_id(), duration,
322 current->comm, task_pid_nr(current)); 350 current->comm, task_pid_nr(current));
@@ -327,6 +355,17 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
327 else 355 else
328 dump_stack(); 356 dump_stack();
329 357
358 if (softlockup_all_cpu_backtrace) {
359 /* Avoid generating two back traces for current
360 * given that one is already made above
361 */
362 trigger_allbutself_cpu_backtrace();
363
364 clear_bit(0, &soft_lockup_nmi_warn);
365 /* Barrier to sync with other cpus */
366 smp_mb__after_atomic();
367 }
368
330 if (softlockup_panic) 369 if (softlockup_panic)
331 panic("softlockup: hung tasks"); 370 panic("softlockup: hung tasks");
332 __this_cpu_write(soft_watchdog_warn, true); 371 __this_cpu_write(soft_watchdog_warn, true);
@@ -527,10 +566,8 @@ static void update_timers_all_cpus(void)
527 int cpu; 566 int cpu;
528 567
529 get_online_cpus(); 568 get_online_cpus();
530 preempt_disable();
531 for_each_online_cpu(cpu) 569 for_each_online_cpu(cpu)
532 update_timers(cpu); 570 update_timers(cpu);
533 preempt_enable();
534 put_online_cpus(); 571 put_online_cpus();
535} 572}
536 573
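The watchdog changes above add a softlockup_all_cpu_backtrace= switch and use a single shared bit so only the first CPU that detects a soft lockup dumps the other CPUs' stacks. A hedged sketch of that gating, wrapping the helpers used in the hunk (hypothetical function and variable names):

#include <linux/atomic.h>
#include <linux/bitops.h>
#include <linux/nmi.h>          /* trigger_allbutself_cpu_backtrace() */

/* Bit 0 marks "a backtrace dump is already in progress". */
static unsigned long example_backtrace_lock;

static void example_report_lockup(void)
{
        if (test_and_set_bit(0, &example_backtrace_lock))
                return;         /* another CPU is already dumping; back off */

        trigger_allbutself_cpu_backtrace();     /* NMI backtrace of other CPUs */

        clear_bit(0, &example_backtrace_lock);
        smp_mb__after_atomic();                 /* barrier to sync with other CPUs */
}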
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 0ee63af30bd1..35974ac69600 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -65,15 +65,12 @@ enum {
65 * be executing on any CPU. The pool behaves as an unbound one. 65 * be executing on any CPU. The pool behaves as an unbound one.
66 * 66 *
67 * Note that DISASSOCIATED should be flipped only while holding 67 * Note that DISASSOCIATED should be flipped only while holding
68 * manager_mutex to avoid changing binding state while 68 * attach_mutex to avoid changing binding state while
69 * create_worker() is in progress. 69 * worker_attach_to_pool() is in progress.
70 */ 70 */
71 POOL_MANAGE_WORKERS = 1 << 0, /* need to manage workers */
72 POOL_DISASSOCIATED = 1 << 2, /* cpu can't serve workers */ 71 POOL_DISASSOCIATED = 1 << 2, /* cpu can't serve workers */
73 POOL_FREEZING = 1 << 3, /* freeze in progress */
74 72
75 /* worker flags */ 73 /* worker flags */
76 WORKER_STARTED = 1 << 0, /* started */
77 WORKER_DIE = 1 << 1, /* die die die */ 74 WORKER_DIE = 1 << 1, /* die die die */
78 WORKER_IDLE = 1 << 2, /* is idle */ 75 WORKER_IDLE = 1 << 2, /* is idle */
79 WORKER_PREP = 1 << 3, /* preparing to run works */ 76 WORKER_PREP = 1 << 3, /* preparing to run works */
@@ -100,10 +97,10 @@ enum {
100 97
101 /* 98 /*
102 * Rescue workers are used only on emergencies and shared by 99 * Rescue workers are used only on emergencies and shared by
103 * all cpus. Give -20. 100 * all cpus. Give MIN_NICE.
104 */ 101 */
105 RESCUER_NICE_LEVEL = -20, 102 RESCUER_NICE_LEVEL = MIN_NICE,
106 HIGHPRI_NICE_LEVEL = -20, 103 HIGHPRI_NICE_LEVEL = MIN_NICE,
107 104
108 WQ_NAME_LEN = 24, 105 WQ_NAME_LEN = 24,
109}; 106};
@@ -124,8 +121,7 @@ enum {
124 * cpu or grabbing pool->lock is enough for read access. If 121 * cpu or grabbing pool->lock is enough for read access. If
125 * POOL_DISASSOCIATED is set, it's identical to L. 122 * POOL_DISASSOCIATED is set, it's identical to L.
126 * 123 *
127 * MG: pool->manager_mutex and pool->lock protected. Writes require both 124 * A: pool->attach_mutex protected.
128 * locks. Reads can happen under either lock.
129 * 125 *
130 * PL: wq_pool_mutex protected. 126 * PL: wq_pool_mutex protected.
131 * 127 *
@@ -163,8 +159,11 @@ struct worker_pool {
163 159
164 /* see manage_workers() for details on the two manager mutexes */ 160 /* see manage_workers() for details on the two manager mutexes */
165 struct mutex manager_arb; /* manager arbitration */ 161 struct mutex manager_arb; /* manager arbitration */
166 struct mutex manager_mutex; /* manager exclusion */ 162 struct mutex attach_mutex; /* attach/detach exclusion */
167 struct idr worker_idr; /* MG: worker IDs and iteration */ 163 struct list_head workers; /* A: attached workers */
164 struct completion *detach_completion; /* all workers detached */
165
166 struct ida worker_ida; /* worker IDs for task name */
168 167
169 struct workqueue_attrs *attrs; /* I: worker attributes */ 168 struct workqueue_attrs *attrs; /* I: worker attributes */
170 struct hlist_node hash_node; /* PL: unbound_pool_hash node */ 169 struct hlist_node hash_node; /* PL: unbound_pool_hash node */
@@ -340,16 +339,6 @@ static void copy_workqueue_attrs(struct workqueue_attrs *to,
340 lockdep_is_held(&wq->mutex), \ 339 lockdep_is_held(&wq->mutex), \
341 "sched RCU or wq->mutex should be held") 340 "sched RCU or wq->mutex should be held")
342 341
343#ifdef CONFIG_LOCKDEP
344#define assert_manager_or_pool_lock(pool) \
345 WARN_ONCE(debug_locks && \
346 !lockdep_is_held(&(pool)->manager_mutex) && \
347 !lockdep_is_held(&(pool)->lock), \
348 "pool->manager_mutex or ->lock should be held")
349#else
350#define assert_manager_or_pool_lock(pool) do { } while (0)
351#endif
352
353#define for_each_cpu_worker_pool(pool, cpu) \ 342#define for_each_cpu_worker_pool(pool, cpu) \
354 for ((pool) = &per_cpu(cpu_worker_pools, cpu)[0]; \ 343 for ((pool) = &per_cpu(cpu_worker_pools, cpu)[0]; \
355 (pool) < &per_cpu(cpu_worker_pools, cpu)[NR_STD_WORKER_POOLS]; \ 344 (pool) < &per_cpu(cpu_worker_pools, cpu)[NR_STD_WORKER_POOLS]; \
@@ -375,17 +364,16 @@ static void copy_workqueue_attrs(struct workqueue_attrs *to,
375/** 364/**
376 * for_each_pool_worker - iterate through all workers of a worker_pool 365 * for_each_pool_worker - iterate through all workers of a worker_pool
377 * @worker: iteration cursor 366 * @worker: iteration cursor
378 * @wi: integer used for iteration
379 * @pool: worker_pool to iterate workers of 367 * @pool: worker_pool to iterate workers of
380 * 368 *
381 * This must be called with either @pool->manager_mutex or ->lock held. 369 * This must be called with @pool->attach_mutex.
382 * 370 *
383 * The if/else clause exists only for the lockdep assertion and can be 371 * The if/else clause exists only for the lockdep assertion and can be
384 * ignored. 372 * ignored.
385 */ 373 */
386#define for_each_pool_worker(worker, wi, pool) \ 374#define for_each_pool_worker(worker, pool) \
387 idr_for_each_entry(&(pool)->worker_idr, (worker), (wi)) \ 375 list_for_each_entry((worker), &(pool)->workers, node) \
388 if (({ assert_manager_or_pool_lock((pool)); false; })) { } \ 376 if (({ lockdep_assert_held(&pool->attach_mutex); false; })) { } \
389 else 377 else
390 378
391/** 379/**
@@ -763,13 +751,6 @@ static bool need_to_create_worker(struct worker_pool *pool)
763 return need_more_worker(pool) && !may_start_working(pool); 751 return need_more_worker(pool) && !may_start_working(pool);
764} 752}
765 753
766/* Do I need to be the manager? */
767static bool need_to_manage_workers(struct worker_pool *pool)
768{
769 return need_to_create_worker(pool) ||
770 (pool->flags & POOL_MANAGE_WORKERS);
771}
772
773/* Do we have too many workers and should some go away? */ 754/* Do we have too many workers and should some go away? */
774static bool too_many_workers(struct worker_pool *pool) 755static bool too_many_workers(struct worker_pool *pool)
775{ 756{
@@ -791,8 +772,8 @@ static bool too_many_workers(struct worker_pool *pool)
791 * Wake up functions. 772 * Wake up functions.
792 */ 773 */
793 774
794/* Return the first worker. Safe with preemption disabled */ 775/* Return the first idle worker. Safe with preemption disabled */
795static struct worker *first_worker(struct worker_pool *pool) 776static struct worker *first_idle_worker(struct worker_pool *pool)
796{ 777{
797 if (unlikely(list_empty(&pool->idle_list))) 778 if (unlikely(list_empty(&pool->idle_list)))
798 return NULL; 779 return NULL;
@@ -811,7 +792,7 @@ static struct worker *first_worker(struct worker_pool *pool)
811 */ 792 */
812static void wake_up_worker(struct worker_pool *pool) 793static void wake_up_worker(struct worker_pool *pool)
813{ 794{
814 struct worker *worker = first_worker(pool); 795 struct worker *worker = first_idle_worker(pool);
815 796
816 if (likely(worker)) 797 if (likely(worker))
817 wake_up_process(worker->task); 798 wake_up_process(worker->task);
@@ -885,7 +866,7 @@ struct task_struct *wq_worker_sleeping(struct task_struct *task, int cpu)
885 */ 866 */
886 if (atomic_dec_and_test(&pool->nr_running) && 867 if (atomic_dec_and_test(&pool->nr_running) &&
887 !list_empty(&pool->worklist)) 868 !list_empty(&pool->worklist))
888 to_wakeup = first_worker(pool); 869 to_wakeup = first_idle_worker(pool);
889 return to_wakeup ? to_wakeup->task : NULL; 870 return to_wakeup ? to_wakeup->task : NULL;
890} 871}
891 872
@@ -1621,70 +1602,6 @@ static void worker_leave_idle(struct worker *worker)
1621 list_del_init(&worker->entry); 1602 list_del_init(&worker->entry);
1622} 1603}
1623 1604
1624/**
1625 * worker_maybe_bind_and_lock - try to bind %current to worker_pool and lock it
1626 * @pool: target worker_pool
1627 *
1628 * Bind %current to the cpu of @pool if it is associated and lock @pool.
1629 *
1630 * Works which are scheduled while the cpu is online must at least be
1631 * scheduled to a worker which is bound to the cpu so that if they are
1632 * flushed from cpu callbacks while cpu is going down, they are
1633 * guaranteed to execute on the cpu.
1634 *
1635 * This function is to be used by unbound workers and rescuers to bind
1636 * themselves to the target cpu and may race with cpu going down or
1637 * coming online. kthread_bind() can't be used because it may put the
1638 * worker to already dead cpu and set_cpus_allowed_ptr() can't be used
1639 * verbatim as it's best effort and blocking and pool may be
1640 * [dis]associated in the meantime.
1641 *
1642 * This function tries set_cpus_allowed() and locks pool and verifies the
1643 * binding against %POOL_DISASSOCIATED which is set during
1644 * %CPU_DOWN_PREPARE and cleared during %CPU_ONLINE, so if the worker
1645 * enters idle state or fetches works without dropping lock, it can
1646 * guarantee the scheduling requirement described in the first paragraph.
1647 *
1648 * CONTEXT:
1649 * Might sleep. Called without any lock but returns with pool->lock
1650 * held.
1651 *
1652 * Return:
1653 * %true if the associated pool is online (@worker is successfully
1654 * bound), %false if offline.
1655 */
1656static bool worker_maybe_bind_and_lock(struct worker_pool *pool)
1657__acquires(&pool->lock)
1658{
1659 while (true) {
1660 /*
1661 * The following call may fail, succeed or succeed
1662 * without actually migrating the task to the cpu if
1663 * it races with cpu hotunplug operation. Verify
1664 * against POOL_DISASSOCIATED.
1665 */
1666 if (!(pool->flags & POOL_DISASSOCIATED))
1667 set_cpus_allowed_ptr(current, pool->attrs->cpumask);
1668
1669 spin_lock_irq(&pool->lock);
1670 if (pool->flags & POOL_DISASSOCIATED)
1671 return false;
1672 if (task_cpu(current) == pool->cpu &&
1673 cpumask_equal(&current->cpus_allowed, pool->attrs->cpumask))
1674 return true;
1675 spin_unlock_irq(&pool->lock);
1676
1677 /*
1678 * We've raced with CPU hot[un]plug. Give it a breather
1679 * and retry migration. cond_resched() is required here;
1680 * otherwise, we might deadlock against cpu_stop trying to
1681 * bring down the CPU on non-preemptive kernel.
1682 */
1683 cpu_relax();
1684 cond_resched();
1685 }
1686}
1687
1688static struct worker *alloc_worker(void) 1605static struct worker *alloc_worker(void)
1689{ 1606{
1690 struct worker *worker; 1607 struct worker *worker;
@@ -1693,6 +1610,7 @@ static struct worker *alloc_worker(void)
1693 if (worker) { 1610 if (worker) {
1694 INIT_LIST_HEAD(&worker->entry); 1611 INIT_LIST_HEAD(&worker->entry);
1695 INIT_LIST_HEAD(&worker->scheduled); 1612 INIT_LIST_HEAD(&worker->scheduled);
1613 INIT_LIST_HEAD(&worker->node);
1696 /* on creation a worker is in !idle && prep state */ 1614 /* on creation a worker is in !idle && prep state */
1697 worker->flags = WORKER_PREP; 1615 worker->flags = WORKER_PREP;
1698 } 1616 }
@@ -1700,12 +1618,68 @@ static struct worker *alloc_worker(void)
1700} 1618}
1701 1619
1702/** 1620/**
1621 * worker_attach_to_pool() - attach a worker to a pool
1622 * @worker: worker to be attached
1623 * @pool: the target pool
1624 *
1625 * Attach @worker to @pool. Once attached, the %WORKER_UNBOUND flag and
1626 * cpu-binding of @worker are kept coordinated with the pool across
1627 * cpu-[un]hotplugs.
1628 */
1629static void worker_attach_to_pool(struct worker *worker,
1630 struct worker_pool *pool)
1631{
1632 mutex_lock(&pool->attach_mutex);
1633
1634 /*
1635 * set_cpus_allowed_ptr() will fail if the cpumask doesn't have any
1636 * online CPUs. It'll be re-applied when any of the CPUs come up.
1637 */
1638 set_cpus_allowed_ptr(worker->task, pool->attrs->cpumask);
1639
1640 /*
1641 * The pool->attach_mutex ensures %POOL_DISASSOCIATED remains
1642 * stable across this function. See the comments above the
1643 * flag definition for details.
1644 */
1645 if (pool->flags & POOL_DISASSOCIATED)
1646 worker->flags |= WORKER_UNBOUND;
1647
1648 list_add_tail(&worker->node, &pool->workers);
1649
1650 mutex_unlock(&pool->attach_mutex);
1651}
1652
1653/**
1654 * worker_detach_from_pool() - detach a worker from its pool
1655 * @worker: worker which is attached to its pool
1656 * @pool: the pool @worker is attached to
1657 *
1658 * Undo the attachment made by worker_attach_to_pool(). The detaching
1659 * worker must not access @pool afterwards unless it holds another
1660 * reference to it.
1661 */
1662static void worker_detach_from_pool(struct worker *worker,
1663 struct worker_pool *pool)
1664{
1665 struct completion *detach_completion = NULL;
1666
1667 mutex_lock(&pool->attach_mutex);
1668 list_del(&worker->node);
1669 if (list_empty(&pool->workers))
1670 detach_completion = pool->detach_completion;
1671 mutex_unlock(&pool->attach_mutex);
1672
1673 if (detach_completion)
1674 complete(detach_completion);
1675}
1676
1677/**
1703 * create_worker - create a new workqueue worker 1678 * create_worker - create a new workqueue worker
1704 * @pool: pool the new worker will belong to 1679 * @pool: pool the new worker will belong to
1705 * 1680 *
1706 * Create a new worker which is bound to @pool. The returned worker 1681 * Create a new worker which is attached to @pool. The new worker must be
1707 * can be started by calling start_worker() or destroyed using 1682 * started by start_worker().
1708 * destroy_worker().
1709 * 1683 *
1710 * CONTEXT: 1684 * CONTEXT:
1711 * Might sleep. Does GFP_KERNEL allocations. 1685 * Might sleep. Does GFP_KERNEL allocations.
@@ -1719,19 +1693,8 @@ static struct worker *create_worker(struct worker_pool *pool)
1719 int id = -1; 1693 int id = -1;
1720 char id_buf[16]; 1694 char id_buf[16];
1721 1695
1722 lockdep_assert_held(&pool->manager_mutex); 1696 /* ID is needed to determine kthread name */
1723 1697 id = ida_simple_get(&pool->worker_ida, 0, 0, GFP_KERNEL);
1724 /*
1725 * ID is needed to determine kthread name. Allocate ID first
1726 * without installing the pointer.
1727 */
1728 idr_preload(GFP_KERNEL);
1729 spin_lock_irq(&pool->lock);
1730
1731 id = idr_alloc(&pool->worker_idr, NULL, 0, 0, GFP_NOWAIT);
1732
1733 spin_unlock_irq(&pool->lock);
1734 idr_preload_end();
1735 if (id < 0) 1698 if (id < 0)
1736 goto fail; 1699 goto fail;
1737 1700
@@ -1758,33 +1721,14 @@ static struct worker *create_worker(struct worker_pool *pool)
1758 /* prevent userland from meddling with cpumask of workqueue workers */ 1721 /* prevent userland from meddling with cpumask of workqueue workers */
1759 worker->task->flags |= PF_NO_SETAFFINITY; 1722 worker->task->flags |= PF_NO_SETAFFINITY;
1760 1723
1761 /* 1724 /* successful, attach the worker to the pool */
1762 * set_cpus_allowed_ptr() will fail if the cpumask doesn't have any 1725 worker_attach_to_pool(worker, pool);
1763 * online CPUs. It'll be re-applied when any of the CPUs come up.
1764 */
1765 set_cpus_allowed_ptr(worker->task, pool->attrs->cpumask);
1766
1767 /*
1768 * The caller is responsible for ensuring %POOL_DISASSOCIATED
1769 * remains stable across this function. See the comments above the
1770 * flag definition for details.
1771 */
1772 if (pool->flags & POOL_DISASSOCIATED)
1773 worker->flags |= WORKER_UNBOUND;
1774
1775 /* successful, commit the pointer to idr */
1776 spin_lock_irq(&pool->lock);
1777 idr_replace(&pool->worker_idr, worker, worker->id);
1778 spin_unlock_irq(&pool->lock);
1779 1726
1780 return worker; 1727 return worker;
1781 1728
1782fail: 1729fail:
1783 if (id >= 0) { 1730 if (id >= 0)
1784 spin_lock_irq(&pool->lock); 1731 ida_simple_remove(&pool->worker_ida, id);
1785 idr_remove(&pool->worker_idr, id);
1786 spin_unlock_irq(&pool->lock);
1787 }
1788 kfree(worker); 1732 kfree(worker);
1789 return NULL; 1733 return NULL;
1790} 1734}
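create_worker() above drops the idr_preload()/idr_alloc() dance under pool->lock in favour of ida_simple_get()/ida_simple_remove(), which now only hand out the integer ID. A rough sketch of that allocation pattern follows, under the assumption that the ida was ida_init()'d at pool setup; the "example_*" names are illustrative, not workqueue symbols:

#include <linux/idr.h>
#include <linux/slab.h>

/*
 * Sketch of the ID-allocation pattern create_worker() switches to.  The
 * ida is assumed to have been ida_init()'d when the pool was set up;
 * "example_*" names are illustrative, not workqueue symbols.
 */
struct example_pool {
	struct ida worker_ida;
};

struct example_worker {
	int id;
};

static int example_make_worker(struct example_pool *pool)
{
	struct example_worker *worker;
	int id;

	/* no pool->lock or idr_preload() needed for the bare ID */
	id = ida_simple_get(&pool->worker_ida, 0, 0, GFP_KERNEL);
	if (id < 0)
		return id;

	worker = kzalloc(sizeof(*worker), GFP_KERNEL);
	if (!worker) {
		ida_simple_remove(&pool->worker_ida, id);	/* paired release */
		return -ENOMEM;
	}

	worker->id = id;
	/* ... name the kthread after @id and attach it to the pool ... */
	return 0;
}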
@@ -1800,7 +1744,6 @@ fail:
1800 */ 1744 */
1801static void start_worker(struct worker *worker) 1745static void start_worker(struct worker *worker)
1802{ 1746{
1803 worker->flags |= WORKER_STARTED;
1804 worker->pool->nr_workers++; 1747 worker->pool->nr_workers++;
1805 worker_enter_idle(worker); 1748 worker_enter_idle(worker);
1806 wake_up_process(worker->task); 1749 wake_up_process(worker->task);
@@ -1818,8 +1761,6 @@ static int create_and_start_worker(struct worker_pool *pool)
1818{ 1761{
1819 struct worker *worker; 1762 struct worker *worker;
1820 1763
1821 mutex_lock(&pool->manager_mutex);
1822
1823 worker = create_worker(pool); 1764 worker = create_worker(pool);
1824 if (worker) { 1765 if (worker) {
1825 spin_lock_irq(&pool->lock); 1766 spin_lock_irq(&pool->lock);
@@ -1827,8 +1768,6 @@ static int create_and_start_worker(struct worker_pool *pool)
1827 spin_unlock_irq(&pool->lock); 1768 spin_unlock_irq(&pool->lock);
1828 } 1769 }
1829 1770
1830 mutex_unlock(&pool->manager_mutex);
1831
1832 return worker ? 0 : -ENOMEM; 1771 return worker ? 0 : -ENOMEM;
1833} 1772}
1834 1773
@@ -1836,46 +1775,30 @@ static int create_and_start_worker(struct worker_pool *pool)
1836 * destroy_worker - destroy a workqueue worker 1775 * destroy_worker - destroy a workqueue worker
1837 * @worker: worker to be destroyed 1776 * @worker: worker to be destroyed
1838 * 1777 *
1839 * Destroy @worker and adjust @pool stats accordingly. 1778 * Destroy @worker and adjust @pool stats accordingly. The worker should
1779 * be idle.
1840 * 1780 *
1841 * CONTEXT: 1781 * CONTEXT:
1842 * spin_lock_irq(pool->lock) which is released and regrabbed. 1782 * spin_lock_irq(pool->lock).
1843 */ 1783 */
1844static void destroy_worker(struct worker *worker) 1784static void destroy_worker(struct worker *worker)
1845{ 1785{
1846 struct worker_pool *pool = worker->pool; 1786 struct worker_pool *pool = worker->pool;
1847 1787
1848 lockdep_assert_held(&pool->manager_mutex);
1849 lockdep_assert_held(&pool->lock); 1788 lockdep_assert_held(&pool->lock);
1850 1789
1851 /* sanity check frenzy */ 1790 /* sanity check frenzy */
1852 if (WARN_ON(worker->current_work) || 1791 if (WARN_ON(worker->current_work) ||
1853 WARN_ON(!list_empty(&worker->scheduled))) 1792 WARN_ON(!list_empty(&worker->scheduled)) ||
1793 WARN_ON(!(worker->flags & WORKER_IDLE)))
1854 return; 1794 return;
1855 1795
1856 if (worker->flags & WORKER_STARTED) 1796 pool->nr_workers--;
1857 pool->nr_workers--; 1797 pool->nr_idle--;
1858 if (worker->flags & WORKER_IDLE)
1859 pool->nr_idle--;
1860
1861 /*
1862 * Once WORKER_DIE is set, the kworker may destroy itself at any
1863 * point. Pin to ensure the task stays until we're done with it.
1864 */
1865 get_task_struct(worker->task);
1866 1798
1867 list_del_init(&worker->entry); 1799 list_del_init(&worker->entry);
1868 worker->flags |= WORKER_DIE; 1800 worker->flags |= WORKER_DIE;
1869 1801 wake_up_process(worker->task);
1870 idr_remove(&pool->worker_idr, worker->id);
1871
1872 spin_unlock_irq(&pool->lock);
1873
1874 kthread_stop(worker->task);
1875 put_task_struct(worker->task);
1876 kfree(worker);
1877
1878 spin_lock_irq(&pool->lock);
1879} 1802}
1880 1803
1881static void idle_worker_timeout(unsigned long __pool) 1804static void idle_worker_timeout(unsigned long __pool)
@@ -1884,7 +1807,7 @@ static void idle_worker_timeout(unsigned long __pool)
1884 1807
1885 spin_lock_irq(&pool->lock); 1808 spin_lock_irq(&pool->lock);
1886 1809
1887 if (too_many_workers(pool)) { 1810 while (too_many_workers(pool)) {
1888 struct worker *worker; 1811 struct worker *worker;
1889 unsigned long expires; 1812 unsigned long expires;
1890 1813
@@ -1892,13 +1815,12 @@ static void idle_worker_timeout(unsigned long __pool)
1892 worker = list_entry(pool->idle_list.prev, struct worker, entry); 1815 worker = list_entry(pool->idle_list.prev, struct worker, entry);
1893 expires = worker->last_active + IDLE_WORKER_TIMEOUT; 1816 expires = worker->last_active + IDLE_WORKER_TIMEOUT;
1894 1817
1895 if (time_before(jiffies, expires)) 1818 if (time_before(jiffies, expires)) {
1896 mod_timer(&pool->idle_timer, expires); 1819 mod_timer(&pool->idle_timer, expires);
1897 else { 1820 break;
1898 /* it's been idle for too long, wake up manager */
1899 pool->flags |= POOL_MANAGE_WORKERS;
1900 wake_up_worker(pool);
1901 } 1821 }
1822
1823 destroy_worker(worker);
1902 } 1824 }
1903 1825
1904 spin_unlock_irq(&pool->lock); 1826 spin_unlock_irq(&pool->lock);
@@ -1916,6 +1838,12 @@ static void send_mayday(struct work_struct *work)
1916 1838
1917 /* mayday mayday mayday */ 1839 /* mayday mayday mayday */
1918 if (list_empty(&pwq->mayday_node)) { 1840 if (list_empty(&pwq->mayday_node)) {
1841 /*
1842 * If @pwq is for an unbound wq, its base ref may be put at
1843 * any time due to an attribute change. Pin @pwq until the
1844 * rescuer is done with it.
1845 */
1846 get_pwq(pwq);
1919 list_add_tail(&pwq->mayday_node, &wq->maydays); 1847 list_add_tail(&pwq->mayday_node, &wq->maydays);
1920 wake_up_process(wq->rescuer->task); 1848 wake_up_process(wq->rescuer->task);
1921 } 1849 }
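send_mayday() now pins the pool_workqueue before putting it on @wq->maydays, so an attribute change on an unbound workqueue cannot drop the pwq's base reference while the rescuer still has it queued; the rescuer drops that pin once it is done (see the put_pwq() added further down). A loose sketch of the pin-while-queued rule, where every "example_*" name is illustrative rather than a real workqueue symbol and the refcount helpers are only declared:

#include <linux/list.h>

/*
 * Sketch of the pin-while-queued rule: take a reference before the pwq
 * goes on the mayday list and drop it only after the rescuer has
 * processed it.  "example_*" names are illustrative; the refcount
 * helpers stand in for get_pwq()/put_pwq() and are not implemented here.
 */
struct example_pwq {
	struct list_head mayday_node;
};

void example_get_pwq(struct example_pwq *pwq);		/* stand-in for get_pwq() */
void example_put_pwq(struct example_pwq *pwq);		/* stand-in for put_pwq() */

static void example_queue_mayday(struct example_pwq *pwq,
				 struct list_head *maydays)
{
	if (list_empty(&pwq->mayday_node)) {
		example_get_pwq(pwq);			/* pin for the rescuer */
		list_add_tail(&pwq->mayday_node, maydays);
	}
}

static void example_rescue_one(struct example_pwq *pwq)
{
	/* ... process the pwq's pending work items ... */
	example_put_pwq(pwq);				/* drop the mayday pin */
}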
@@ -2011,44 +1939,6 @@ restart:
2011} 1939}
2012 1940
2013/** 1941/**
2014 * maybe_destroy_worker - destroy workers which have been idle for a while
2015 * @pool: pool to destroy workers for
2016 *
2017 * Destroy @pool workers which have been idle for longer than
2018 * IDLE_WORKER_TIMEOUT.
2019 *
2020 * LOCKING:
2021 * spin_lock_irq(pool->lock) which may be released and regrabbed
2022 * multiple times. Called only from manager.
2023 *
2024 * Return:
2025 * %false if no action was taken and pool->lock stayed locked, %true
2026 * otherwise.
2027 */
2028static bool maybe_destroy_workers(struct worker_pool *pool)
2029{
2030 bool ret = false;
2031
2032 while (too_many_workers(pool)) {
2033 struct worker *worker;
2034 unsigned long expires;
2035
2036 worker = list_entry(pool->idle_list.prev, struct worker, entry);
2037 expires = worker->last_active + IDLE_WORKER_TIMEOUT;
2038
2039 if (time_before(jiffies, expires)) {
2040 mod_timer(&pool->idle_timer, expires);
2041 break;
2042 }
2043
2044 destroy_worker(worker);
2045 ret = true;
2046 }
2047
2048 return ret;
2049}
2050
2051/**
2052 * manage_workers - manage worker pool 1942 * manage_workers - manage worker pool
2053 * @worker: self 1943 * @worker: self
2054 * 1944 *
@@ -2077,8 +1967,6 @@ static bool manage_workers(struct worker *worker)
2077 bool ret = false; 1967 bool ret = false;
2078 1968
2079 /* 1969 /*
2080 * Managership is governed by two mutexes - manager_arb and
2081 * manager_mutex. manager_arb handles arbitration of manager role.
2082 * Anyone who successfully grabs manager_arb wins the arbitration 1970 * Anyone who successfully grabs manager_arb wins the arbitration
2083 * and becomes the manager. mutex_trylock() on pool->manager_arb 1971 * and becomes the manager. mutex_trylock() on pool->manager_arb
2084 * failure while holding pool->lock reliably indicates that someone 1972 * failure while holding pool->lock reliably indicates that someone
@@ -2087,40 +1975,12 @@ static bool manage_workers(struct worker *worker)
2087 * grabbing manager_arb is responsible for actually performing 1975 * grabbing manager_arb is responsible for actually performing
2088 * manager duties. If manager_arb is grabbed and released without 1976 * manager duties. If manager_arb is grabbed and released without
2089 * actual management, the pool may stall indefinitely. 1977 * actual management, the pool may stall indefinitely.
2090 *
2091 * manager_mutex is used for exclusion of actual management
2092 * operations. The holder of manager_mutex can be sure that none
2093 * of management operations, including creation and destruction of
2094 * workers, won't take place until the mutex is released. Because
2095 * manager_mutex doesn't interfere with manager role arbitration,
2096 * it is guaranteed that the pool's management, while may be
2097 * delayed, won't be disturbed by someone else grabbing
2098 * manager_mutex.
2099 */ 1978 */
2100 if (!mutex_trylock(&pool->manager_arb)) 1979 if (!mutex_trylock(&pool->manager_arb))
2101 return ret; 1980 return ret;
2102 1981
2103 /*
2104 * With manager arbitration won, manager_mutex would be free in
2105 * most cases. trylock first without dropping @pool->lock.
2106 */
2107 if (unlikely(!mutex_trylock(&pool->manager_mutex))) {
2108 spin_unlock_irq(&pool->lock);
2109 mutex_lock(&pool->manager_mutex);
2110 spin_lock_irq(&pool->lock);
2111 ret = true;
2112 }
2113
2114 pool->flags &= ~POOL_MANAGE_WORKERS;
2115
2116 /*
2117 * Destroy and then create so that may_start_working() is true
2118 * on return.
2119 */
2120 ret |= maybe_destroy_workers(pool);
2121 ret |= maybe_create_worker(pool); 1982 ret |= maybe_create_worker(pool);
2122 1983
2123 mutex_unlock(&pool->manager_mutex);
2124 mutex_unlock(&pool->manager_arb); 1984 mutex_unlock(&pool->manager_arb);
2125 return ret; 1985 return ret;
2126} 1986}
@@ -2308,6 +2168,11 @@ woke_up:
2308 spin_unlock_irq(&pool->lock); 2168 spin_unlock_irq(&pool->lock);
2309 WARN_ON_ONCE(!list_empty(&worker->entry)); 2169 WARN_ON_ONCE(!list_empty(&worker->entry));
2310 worker->task->flags &= ~PF_WQ_WORKER; 2170 worker->task->flags &= ~PF_WQ_WORKER;
2171
2172 set_task_comm(worker->task, "kworker/dying");
2173 ida_simple_remove(&pool->worker_ida, worker->id);
2174 worker_detach_from_pool(worker, pool);
2175 kfree(worker);
2311 return 0; 2176 return 0;
2312 } 2177 }
2313 2178
@@ -2355,9 +2220,6 @@ recheck:
2355 2220
2356 worker_set_flags(worker, WORKER_PREP, false); 2221 worker_set_flags(worker, WORKER_PREP, false);
2357sleep: 2222sleep:
2358 if (unlikely(need_to_manage_workers(pool)) && manage_workers(worker))
2359 goto recheck;
2360
2361 /* 2223 /*
2362 * pool->lock is held and there's no work to process and no need to 2224 * pool->lock is held and there's no work to process and no need to
2363 * manage, sleep. Workers are woken up only while holding 2225 * manage, sleep. Workers are woken up only while holding
@@ -2398,6 +2260,7 @@ static int rescuer_thread(void *__rescuer)
2398 struct worker *rescuer = __rescuer; 2260 struct worker *rescuer = __rescuer;
2399 struct workqueue_struct *wq = rescuer->rescue_wq; 2261 struct workqueue_struct *wq = rescuer->rescue_wq;
2400 struct list_head *scheduled = &rescuer->scheduled; 2262 struct list_head *scheduled = &rescuer->scheduled;
2263 bool should_stop;
2401 2264
2402 set_user_nice(current, RESCUER_NICE_LEVEL); 2265 set_user_nice(current, RESCUER_NICE_LEVEL);
2403 2266
@@ -2409,11 +2272,15 @@ static int rescuer_thread(void *__rescuer)
2409repeat: 2272repeat:
2410 set_current_state(TASK_INTERRUPTIBLE); 2273 set_current_state(TASK_INTERRUPTIBLE);
2411 2274
2412 if (kthread_should_stop()) { 2275 /*
2413 __set_current_state(TASK_RUNNING); 2276 * By the time the rescuer is requested to stop, the workqueue
2414 rescuer->task->flags &= ~PF_WQ_WORKER; 2277 * shouldn't have any work pending, but @wq->maydays may still have
2415 return 0; 2278 * pwq(s) queued. This can happen when non-rescuer workers consume
2416 } 2279 * all the work items before the rescuer gets to them. Go through
2280 * @wq->maydays processing before acting on should_stop so that the
2281 * list is always empty on exit.
2282 */
2283 should_stop = kthread_should_stop();
2417 2284
2418 /* see whether any pwq is asking for help */ 2285 /* see whether any pwq is asking for help */
2419 spin_lock_irq(&wq_mayday_lock); 2286 spin_lock_irq(&wq_mayday_lock);
@@ -2429,8 +2296,9 @@ repeat:
2429 2296
2430 spin_unlock_irq(&wq_mayday_lock); 2297 spin_unlock_irq(&wq_mayday_lock);
2431 2298
2432 /* migrate to the target cpu if possible */ 2299 worker_attach_to_pool(rescuer, pool);
2433 worker_maybe_bind_and_lock(pool); 2300
2301 spin_lock_irq(&pool->lock);
2434 rescuer->pool = pool; 2302 rescuer->pool = pool;
2435 2303
2436 /* 2304 /*
@@ -2443,6 +2311,17 @@ repeat:
2443 move_linked_works(work, scheduled, &n); 2311 move_linked_works(work, scheduled, &n);
2444 2312
2445 process_scheduled_works(rescuer); 2313 process_scheduled_works(rescuer);
2314 spin_unlock_irq(&pool->lock);
2315
2316 worker_detach_from_pool(rescuer, pool);
2317
2318 spin_lock_irq(&pool->lock);
2319
2320 /*
2321 * Put the reference grabbed by send_mayday(). @pool won't
2322 * go away while we're holding its lock.
2323 */
2324 put_pwq(pwq);
2446 2325
2447 /* 2326 /*
2448 * Leave this pool. If keep_working() is %true, notify a 2327 * Leave this pool. If keep_working() is %true, notify a
@@ -2459,6 +2338,12 @@ repeat:
2459 2338
2460 spin_unlock_irq(&wq_mayday_lock); 2339 spin_unlock_irq(&wq_mayday_lock);
2461 2340
2341 if (should_stop) {
2342 __set_current_state(TASK_RUNNING);
2343 rescuer->task->flags &= ~PF_WQ_WORKER;
2344 return 0;
2345 }
2346
2462 /* rescuers should never participate in concurrency management */ 2347 /* rescuers should never participate in concurrency management */
2463 WARN_ON_ONCE(!(rescuer->flags & WORKER_NOT_RUNNING)); 2348 WARN_ON_ONCE(!(rescuer->flags & WORKER_NOT_RUNNING));
2464 schedule(); 2349 schedule();
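The rescuer hunks above change the exit ordering: kthread_should_stop() is sampled before the mayday list is drained and only acted on afterwards, so @wq->maydays is guaranteed empty when the rescuer returns. A condensed sketch of that drain-before-exit loop, assuming kernel-thread context; it illustrates the ordering only and is not the real rescuer_thread():

#include <linux/kthread.h>
#include <linux/sched.h>

/*
 * Condensed sketch of the drain-before-exit ordering: sample the stop
 * request first, empty the mayday list, and only then act on it.
 */
static int example_rescuer_fn(void *arg)
{
	for (;;) {
		bool should_stop;

		set_current_state(TASK_INTERRUPTIBLE);

		/* sampled before draining so the list is empty on exit */
		should_stop = kthread_should_stop();

		/* ... walk and process every queued mayday request ... */

		if (should_stop) {
			__set_current_state(TASK_RUNNING);
			return 0;
		}

		schedule();
	}
}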
@@ -3399,6 +3284,7 @@ int workqueue_sysfs_register(struct workqueue_struct *wq)
3399 } 3284 }
3400 } 3285 }
3401 3286
3287 dev_set_uevent_suppress(&wq_dev->dev, false);
3402 kobject_uevent(&wq_dev->dev.kobj, KOBJ_ADD); 3288 kobject_uevent(&wq_dev->dev.kobj, KOBJ_ADD);
3403 return 0; 3289 return 0;
3404} 3290}
@@ -3527,9 +3413,10 @@ static int init_worker_pool(struct worker_pool *pool)
3527 (unsigned long)pool); 3413 (unsigned long)pool);
3528 3414
3529 mutex_init(&pool->manager_arb); 3415 mutex_init(&pool->manager_arb);
3530 mutex_init(&pool->manager_mutex); 3416 mutex_init(&pool->attach_mutex);
3531 idr_init(&pool->worker_idr); 3417 INIT_LIST_HEAD(&pool->workers);
3532 3418
3419 ida_init(&pool->worker_ida);
3533 INIT_HLIST_NODE(&pool->hash_node); 3420 INIT_HLIST_NODE(&pool->hash_node);
3534 pool->refcnt = 1; 3421 pool->refcnt = 1;
3535 3422
@@ -3544,7 +3431,7 @@ static void rcu_free_pool(struct rcu_head *rcu)
3544{ 3431{
3545 struct worker_pool *pool = container_of(rcu, struct worker_pool, rcu); 3432 struct worker_pool *pool = container_of(rcu, struct worker_pool, rcu);
3546 3433
3547 idr_destroy(&pool->worker_idr); 3434 ida_destroy(&pool->worker_ida);
3548 free_workqueue_attrs(pool->attrs); 3435 free_workqueue_attrs(pool->attrs);
3549 kfree(pool); 3436 kfree(pool);
3550} 3437}
@@ -3562,6 +3449,7 @@ static void rcu_free_pool(struct rcu_head *rcu)
3562 */ 3449 */
3563static void put_unbound_pool(struct worker_pool *pool) 3450static void put_unbound_pool(struct worker_pool *pool)
3564{ 3451{
3452 DECLARE_COMPLETION_ONSTACK(detach_completion);
3565 struct worker *worker; 3453 struct worker *worker;
3566 3454
3567 lockdep_assert_held(&wq_pool_mutex); 3455 lockdep_assert_held(&wq_pool_mutex);
@@ -3582,18 +3470,24 @@ static void put_unbound_pool(struct worker_pool *pool)
3582 /* 3470 /*
3583 * Become the manager and destroy all workers. Grabbing 3471 * Become the manager and destroy all workers. Grabbing
3584 * manager_arb prevents @pool's workers from blocking on 3472 * manager_arb prevents @pool's workers from blocking on
3585 * manager_mutex. 3473 * attach_mutex.
3586 */ 3474 */
3587 mutex_lock(&pool->manager_arb); 3475 mutex_lock(&pool->manager_arb);
3588 mutex_lock(&pool->manager_mutex);
3589 spin_lock_irq(&pool->lock);
3590 3476
3591 while ((worker = first_worker(pool))) 3477 spin_lock_irq(&pool->lock);
3478 while ((worker = first_idle_worker(pool)))
3592 destroy_worker(worker); 3479 destroy_worker(worker);
3593 WARN_ON(pool->nr_workers || pool->nr_idle); 3480 WARN_ON(pool->nr_workers || pool->nr_idle);
3594
3595 spin_unlock_irq(&pool->lock); 3481 spin_unlock_irq(&pool->lock);
3596 mutex_unlock(&pool->manager_mutex); 3482
3483 mutex_lock(&pool->attach_mutex);
3484 if (!list_empty(&pool->workers))
3485 pool->detach_completion = &detach_completion;
3486 mutex_unlock(&pool->attach_mutex);
3487
3488 if (pool->detach_completion)
3489 wait_for_completion(pool->detach_completion);
3490
3597 mutex_unlock(&pool->manager_arb); 3491 mutex_unlock(&pool->manager_arb);
3598 3492
3599 /* shut down the timers */ 3493 /* shut down the timers */
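put_unbound_pool() above no longer destroys workers synchronously via kthread_stop(); instead it arms pool->detach_completion under attach_mutex and waits, while the last worker to run worker_detach_from_pool() fires the completion. A small sketch of that handshake using illustrative "example_*" types (the real code lives in the two functions shown in this diff):

#include <linux/completion.h>
#include <linux/list.h>
#include <linux/mutex.h>

/*
 * Sketch of the detach handshake: the destroyer arms a completion under
 * the attach mutex if workers are still attached, and the last worker
 * to detach fires it.  "example_*" names are illustrative only.
 */
struct example_pool {
	struct mutex attach_mutex;
	struct list_head workers;
	struct completion *detach_completion;
};

static void example_detach(struct example_pool *pool, struct list_head *node)
{
	struct completion *done = NULL;

	mutex_lock(&pool->attach_mutex);
	list_del(node);
	if (list_empty(&pool->workers))
		done = pool->detach_completion;
	mutex_unlock(&pool->attach_mutex);

	if (done)
		complete(done);
}

static void example_destroy_pool(struct example_pool *pool)
{
	DECLARE_COMPLETION_ONSTACK(detach_completion);

	mutex_lock(&pool->attach_mutex);
	if (!list_empty(&pool->workers))
		pool->detach_completion = &detach_completion;
	mutex_unlock(&pool->attach_mutex);

	if (pool->detach_completion)
		wait_for_completion(pool->detach_completion);

	/* now no worker can be touching the pool; safe to free it */
}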
@@ -3639,9 +3533,6 @@ static struct worker_pool *get_unbound_pool(const struct workqueue_attrs *attrs)
3639 if (!pool || init_worker_pool(pool) < 0) 3533 if (!pool || init_worker_pool(pool) < 0)
3640 goto fail; 3534 goto fail;
3641 3535
3642 if (workqueue_freezing)
3643 pool->flags |= POOL_FREEZING;
3644
3645 lockdep_set_subclass(&pool->lock, 1); /* see put_pwq() */ 3536 lockdep_set_subclass(&pool->lock, 1); /* see put_pwq() */
3646 copy_workqueue_attrs(pool->attrs, attrs); 3537 copy_workqueue_attrs(pool->attrs, attrs);
3647 3538
@@ -3748,7 +3639,12 @@ static void pwq_adjust_max_active(struct pool_workqueue *pwq)
3748 3639
3749 spin_lock_irq(&pwq->pool->lock); 3640 spin_lock_irq(&pwq->pool->lock);
3750 3641
3751 if (!freezable || !(pwq->pool->flags & POOL_FREEZING)) { 3642 /*
3643 * During [un]freezing, the caller is responsible for ensuring that
3644 * this function is called at least once after @workqueue_freezing
3645 * is updated and visible.
3646 */
3647 if (!freezable || !workqueue_freezing) {
3752 pwq->max_active = wq->saved_max_active; 3648 pwq->max_active = wq->saved_max_active;
3753 3649
3754 while (!list_empty(&pwq->delayed_works) && 3650 while (!list_empty(&pwq->delayed_works) &&
@@ -4080,17 +3976,13 @@ static void wq_update_unbound_numa(struct workqueue_struct *wq, int cpu,
4080 * Let's determine what needs to be done. If the target cpumask is 3976 * Let's determine what needs to be done. If the target cpumask is
4081 * different from wq's, we need to compare it to @pwq's and create 3977 * different from wq's, we need to compare it to @pwq's and create
4082 * a new one if they don't match. If the target cpumask equals 3978 * a new one if they don't match. If the target cpumask equals
4083 * wq's, the default pwq should be used. If @pwq is already the 3979 * wq's, the default pwq should be used.
4084 * default one, nothing to do; otherwise, install the default one.
4085 */ 3980 */
4086 if (wq_calc_node_cpumask(wq->unbound_attrs, node, cpu_off, cpumask)) { 3981 if (wq_calc_node_cpumask(wq->unbound_attrs, node, cpu_off, cpumask)) {
4087 if (cpumask_equal(cpumask, pwq->pool->attrs->cpumask)) 3982 if (cpumask_equal(cpumask, pwq->pool->attrs->cpumask))
4088 goto out_unlock; 3983 goto out_unlock;
4089 } else { 3984 } else {
4090 if (pwq == wq->dfl_pwq) 3985 goto use_dfl_pwq;
4091 goto out_unlock;
4092 else
4093 goto use_dfl_pwq;
4094 } 3986 }
4095 3987
4096 mutex_unlock(&wq->mutex); 3988 mutex_unlock(&wq->mutex);
@@ -4098,9 +3990,10 @@ static void wq_update_unbound_numa(struct workqueue_struct *wq, int cpu,
4098 /* create a new pwq */ 3990 /* create a new pwq */
4099 pwq = alloc_unbound_pwq(wq, target_attrs); 3991 pwq = alloc_unbound_pwq(wq, target_attrs);
4100 if (!pwq) { 3992 if (!pwq) {
4101 pr_warning("workqueue: allocation failed while updating NUMA affinity of \"%s\"\n", 3993 pr_warn("workqueue: allocation failed while updating NUMA affinity of \"%s\"\n",
4102 wq->name); 3994 wq->name);
4103 goto out_unlock; 3995 mutex_lock(&wq->mutex);
3996 goto use_dfl_pwq;
4104 } 3997 }
4105 3998
4106 /* 3999 /*
@@ -4575,28 +4468,27 @@ static void wq_unbind_fn(struct work_struct *work)
4575 int cpu = smp_processor_id(); 4468 int cpu = smp_processor_id();
4576 struct worker_pool *pool; 4469 struct worker_pool *pool;
4577 struct worker *worker; 4470 struct worker *worker;
4578 int wi;
4579 4471
4580 for_each_cpu_worker_pool(pool, cpu) { 4472 for_each_cpu_worker_pool(pool, cpu) {
4581 WARN_ON_ONCE(cpu != smp_processor_id()); 4473 WARN_ON_ONCE(cpu != smp_processor_id());
4582 4474
4583 mutex_lock(&pool->manager_mutex); 4475 mutex_lock(&pool->attach_mutex);
4584 spin_lock_irq(&pool->lock); 4476 spin_lock_irq(&pool->lock);
4585 4477
4586 /* 4478 /*
4587 * We've blocked all manager operations. Make all workers 4479 * We've blocked all attach/detach operations. Make all workers
4588 * unbound and set DISASSOCIATED. Before this, all workers 4480 * unbound and set DISASSOCIATED. Before this, all workers
4589 * except for the ones which are still executing works from 4481 * except for the ones which are still executing works from
4590 * before the last CPU down must be on the cpu. After 4482 * before the last CPU down must be on the cpu. After
4591 * this, they may become diasporas. 4483 * this, they may become diasporas.
4592 */ 4484 */
4593 for_each_pool_worker(worker, wi, pool) 4485 for_each_pool_worker(worker, pool)
4594 worker->flags |= WORKER_UNBOUND; 4486 worker->flags |= WORKER_UNBOUND;
4595 4487
4596 pool->flags |= POOL_DISASSOCIATED; 4488 pool->flags |= POOL_DISASSOCIATED;
4597 4489
4598 spin_unlock_irq(&pool->lock); 4490 spin_unlock_irq(&pool->lock);
4599 mutex_unlock(&pool->manager_mutex); 4491 mutex_unlock(&pool->attach_mutex);
4600 4492
4601 /* 4493 /*
4602 * Call schedule() so that we cross rq->lock and thus can 4494 * Call schedule() so that we cross rq->lock and thus can
@@ -4636,9 +4528,8 @@ static void wq_unbind_fn(struct work_struct *work)
4636static void rebind_workers(struct worker_pool *pool) 4528static void rebind_workers(struct worker_pool *pool)
4637{ 4529{
4638 struct worker *worker; 4530 struct worker *worker;
4639 int wi;
4640 4531
4641 lockdep_assert_held(&pool->manager_mutex); 4532 lockdep_assert_held(&pool->attach_mutex);
4642 4533
4643 /* 4534 /*
4644 * Restore CPU affinity of all workers. As all idle workers should 4535 * Restore CPU affinity of all workers. As all idle workers should
@@ -4647,13 +4538,13 @@ static void rebind_workers(struct worker_pool *pool)
4647 * of all workers first and then clear UNBOUND. As we're called 4538 * of all workers first and then clear UNBOUND. As we're called
4648 * from CPU_ONLINE, the following shouldn't fail. 4539 * from CPU_ONLINE, the following shouldn't fail.
4649 */ 4540 */
4650 for_each_pool_worker(worker, wi, pool) 4541 for_each_pool_worker(worker, pool)
4651 WARN_ON_ONCE(set_cpus_allowed_ptr(worker->task, 4542 WARN_ON_ONCE(set_cpus_allowed_ptr(worker->task,
4652 pool->attrs->cpumask) < 0); 4543 pool->attrs->cpumask) < 0);
4653 4544
4654 spin_lock_irq(&pool->lock); 4545 spin_lock_irq(&pool->lock);
4655 4546
4656 for_each_pool_worker(worker, wi, pool) { 4547 for_each_pool_worker(worker, pool) {
4657 unsigned int worker_flags = worker->flags; 4548 unsigned int worker_flags = worker->flags;
4658 4549
4659 /* 4550 /*
@@ -4705,9 +4596,8 @@ static void restore_unbound_workers_cpumask(struct worker_pool *pool, int cpu)
4705{ 4596{
4706 static cpumask_t cpumask; 4597 static cpumask_t cpumask;
4707 struct worker *worker; 4598 struct worker *worker;
4708 int wi;
4709 4599
4710 lockdep_assert_held(&pool->manager_mutex); 4600 lockdep_assert_held(&pool->attach_mutex);
4711 4601
4712 /* is @cpu allowed for @pool? */ 4602 /* is @cpu allowed for @pool? */
4713 if (!cpumask_test_cpu(cpu, pool->attrs->cpumask)) 4603 if (!cpumask_test_cpu(cpu, pool->attrs->cpumask))
@@ -4719,7 +4609,7 @@ static void restore_unbound_workers_cpumask(struct worker_pool *pool, int cpu)
4719 return; 4609 return;
4720 4610
4721 /* as we're called from CPU_ONLINE, the following shouldn't fail */ 4611 /* as we're called from CPU_ONLINE, the following shouldn't fail */
4722 for_each_pool_worker(worker, wi, pool) 4612 for_each_pool_worker(worker, pool)
4723 WARN_ON_ONCE(set_cpus_allowed_ptr(worker->task, 4613 WARN_ON_ONCE(set_cpus_allowed_ptr(worker->task,
4724 pool->attrs->cpumask) < 0); 4614 pool->attrs->cpumask) < 0);
4725} 4615}
@@ -4752,7 +4642,7 @@ static int workqueue_cpu_up_callback(struct notifier_block *nfb,
4752 mutex_lock(&wq_pool_mutex); 4642 mutex_lock(&wq_pool_mutex);
4753 4643
4754 for_each_pool(pool, pi) { 4644 for_each_pool(pool, pi) {
4755 mutex_lock(&pool->manager_mutex); 4645 mutex_lock(&pool->attach_mutex);
4756 4646
4757 if (pool->cpu == cpu) { 4647 if (pool->cpu == cpu) {
4758 spin_lock_irq(&pool->lock); 4648 spin_lock_irq(&pool->lock);
@@ -4764,7 +4654,7 @@ static int workqueue_cpu_up_callback(struct notifier_block *nfb,
4764 restore_unbound_workers_cpumask(pool, cpu); 4654 restore_unbound_workers_cpumask(pool, cpu);
4765 } 4655 }
4766 4656
4767 mutex_unlock(&pool->manager_mutex); 4657 mutex_unlock(&pool->attach_mutex);
4768 } 4658 }
4769 4659
4770 /* update NUMA affinity of unbound workqueues */ 4660 /* update NUMA affinity of unbound workqueues */
@@ -4863,24 +4753,14 @@ EXPORT_SYMBOL_GPL(work_on_cpu);
4863 */ 4753 */
4864void freeze_workqueues_begin(void) 4754void freeze_workqueues_begin(void)
4865{ 4755{
4866 struct worker_pool *pool;
4867 struct workqueue_struct *wq; 4756 struct workqueue_struct *wq;
4868 struct pool_workqueue *pwq; 4757 struct pool_workqueue *pwq;
4869 int pi;
4870 4758
4871 mutex_lock(&wq_pool_mutex); 4759 mutex_lock(&wq_pool_mutex);
4872 4760
4873 WARN_ON_ONCE(workqueue_freezing); 4761 WARN_ON_ONCE(workqueue_freezing);
4874 workqueue_freezing = true; 4762 workqueue_freezing = true;
4875 4763
4876 /* set FREEZING */
4877 for_each_pool(pool, pi) {
4878 spin_lock_irq(&pool->lock);
4879 WARN_ON_ONCE(pool->flags & POOL_FREEZING);
4880 pool->flags |= POOL_FREEZING;
4881 spin_unlock_irq(&pool->lock);
4882 }
4883
4884 list_for_each_entry(wq, &workqueues, list) { 4764 list_for_each_entry(wq, &workqueues, list) {
4885 mutex_lock(&wq->mutex); 4765 mutex_lock(&wq->mutex);
4886 for_each_pwq(pwq, wq) 4766 for_each_pwq(pwq, wq)
@@ -4950,21 +4830,13 @@ void thaw_workqueues(void)
4950{ 4830{
4951 struct workqueue_struct *wq; 4831 struct workqueue_struct *wq;
4952 struct pool_workqueue *pwq; 4832 struct pool_workqueue *pwq;
4953 struct worker_pool *pool;
4954 int pi;
4955 4833
4956 mutex_lock(&wq_pool_mutex); 4834 mutex_lock(&wq_pool_mutex);
4957 4835
4958 if (!workqueue_freezing) 4836 if (!workqueue_freezing)
4959 goto out_unlock; 4837 goto out_unlock;
4960 4838
4961 /* clear FREEZING */ 4839 workqueue_freezing = false;
4962 for_each_pool(pool, pi) {
4963 spin_lock_irq(&pool->lock);
4964 WARN_ON_ONCE(!(pool->flags & POOL_FREEZING));
4965 pool->flags &= ~POOL_FREEZING;
4966 spin_unlock_irq(&pool->lock);
4967 }
4968 4840
4969 /* restore max_active and repopulate worklist */ 4841 /* restore max_active and repopulate worklist */
4970 list_for_each_entry(wq, &workqueues, list) { 4842 list_for_each_entry(wq, &workqueues, list) {
@@ -4974,7 +4846,6 @@ void thaw_workqueues(void)
4974 mutex_unlock(&wq->mutex); 4846 mutex_unlock(&wq->mutex);
4975 } 4847 }
4976 4848
4977 workqueue_freezing = false;
4978out_unlock: 4849out_unlock:
4979 mutex_unlock(&wq_pool_mutex); 4850 mutex_unlock(&wq_pool_mutex);
4980} 4851}
@@ -5009,7 +4880,7 @@ static void __init wq_numa_init(void)
5009 BUG_ON(!tbl); 4880 BUG_ON(!tbl);
5010 4881
5011 for_each_node(node) 4882 for_each_node(node)
5012 BUG_ON(!alloc_cpumask_var_node(&tbl[node], GFP_KERNEL, 4883 BUG_ON(!zalloc_cpumask_var_node(&tbl[node], GFP_KERNEL,
5013 node_online(node) ? node : NUMA_NO_NODE)); 4884 node_online(node) ? node : NUMA_NO_NODE));
5014 4885
5015 for_each_possible_cpu(cpu) { 4886 for_each_possible_cpu(cpu) {
diff --git a/kernel/workqueue_internal.h b/kernel/workqueue_internal.h
index 7e2204db0b1a..45215870ac6c 100644
--- a/kernel/workqueue_internal.h
+++ b/kernel/workqueue_internal.h
@@ -37,6 +37,8 @@ struct worker {
37 struct task_struct *task; /* I: worker task */ 37 struct task_struct *task; /* I: worker task */
38 struct worker_pool *pool; /* I: the associated pool */ 38 struct worker_pool *pool; /* I: the associated pool */
39 /* L: for rescuers */ 39 /* L: for rescuers */
40 struct list_head node; /* A: anchored at pool->workers */
41 /* A: runs through worker->node */
40 42
41 unsigned long last_active; /* L: last active timestamp */ 43 unsigned long last_active; /* L: last active timestamp */
42 unsigned int flags; /* X: flags */ 44 unsigned int flags; /* X: flags */