Diffstat (limited to 'kernel')
-rw-r--r--  kernel/Kconfig.locks | 4
-rw-r--r--  kernel/Makefile | 7
-rw-r--r--  kernel/acct.c | 94
-rw-r--r--  kernel/audit.c | 10
-rw-r--r--  kernel/audit.h | 17
-rw-r--r--  kernel/auditfilter.c | 25
-rw-r--r--  kernel/auditsc.c | 151
-rw-r--r--  kernel/bpf/core.c | 2
-rw-r--r--  kernel/bpf/syscall.c | 25
-rw-r--r--  kernel/cgroup.c | 14
-rw-r--r--  kernel/compat.c | 5
-rw-r--r--  kernel/cpu.c | 56
-rw-r--r--  kernel/cpuset.c | 44
-rw-r--r--  kernel/debug/debug_core.c | 52
-rw-r--r--  kernel/debug/kdb/kdb_bp.c | 37
-rw-r--r--  kernel/debug/kdb/kdb_debugger.c | 4
-rw-r--r--  kernel/debug/kdb/kdb_main.c | 269
-rw-r--r--  kernel/debug/kdb/kdb_private.h | 3
-rw-r--r--  kernel/events/Makefile | 2
-rw-r--r--  kernel/events/core.c | 524
-rw-r--r--  kernel/events/ring_buffer.c | 3
-rw-r--r--  kernel/exit.c | 15
-rw-r--r--  kernel/fork.c | 17
-rw-r--r--  kernel/futex.c | 8
-rw-r--r--  kernel/irq/manage.c | 3
-rw-r--r--  kernel/irq/proc.c | 11
-rw-r--r--  kernel/kexec.c | 25
-rw-r--r--  kernel/kprobes.c | 24
-rw-r--r--  kernel/livepatch/Kconfig | 18
-rw-r--r--  kernel/livepatch/Makefile | 3
-rw-r--r--  kernel/livepatch/core.c | 1015
-rw-r--r--  kernel/locking/Makefile | 11
-rw-r--r--  kernel/locking/mcs_spinlock.h | 16
-rw-r--r--  kernel/locking/mutex-debug.c | 2
-rw-r--r--  kernel/locking/mutex.c | 62
-rw-r--r--  kernel/locking/osq_lock.c (renamed from kernel/locking/mcs_spinlock.c) | 9
-rw-r--r--  kernel/locking/rtmutex.c | 7
-rw-r--r--  kernel/locking/rwsem-spinlock.c | 2
-rw-r--r--  kernel/locking/rwsem-xadd.c | 3
-rw-r--r--  kernel/locking/spinlock.c | 8
-rw-r--r--  kernel/module.c | 149
-rw-r--r--  kernel/notifier.c | 3
-rw-r--r--  kernel/padata.c | 11
-rw-r--r--  kernel/panic.c | 2
-rw-r--r--  kernel/params.c | 3
-rw-r--r--  kernel/power/Kconfig | 1
-rw-r--r--  kernel/power/process.c | 75
-rw-r--r--  kernel/power/qos.c | 91
-rw-r--r--  kernel/power/snapshot.c | 11
-rw-r--r--  kernel/power/suspend.c | 43
-rw-r--r--  kernel/printk/printk.c | 12
-rw-r--r--  kernel/profile.c | 3
-rw-r--r--  kernel/ptrace.c | 1
-rw-r--r--  kernel/range.c | 10
-rw-r--r--  kernel/rcu/Makefile | 3
-rw-r--r--  kernel/rcu/rcu.h | 6
-rw-r--r--  kernel/rcu/rcutorture.c | 66
-rw-r--r--  kernel/rcu/srcu.c | 2
-rw-r--r--  kernel/rcu/tiny.c | 113
-rw-r--r--  kernel/rcu/tiny_plugin.h | 9
-rw-r--r--  kernel/rcu/tree.c | 355
-rw-r--r--  kernel/rcu/tree.h | 62
-rw-r--r--  kernel/rcu/tree_plugin.h | 276
-rw-r--r--  kernel/rcu/tree_trace.c | 8
-rw-r--r--  kernel/resource.c | 25
-rw-r--r--  kernel/sched/Makefile | 2
-rw-r--r--  kernel/sched/clock.c | 13
-rw-r--r--  kernel/sched/completion.c | 18
-rw-r--r--  kernel/sched/core.c | 173
-rw-r--r--  kernel/sched/cpudeadline.c | 27
-rw-r--r--  kernel/sched/cpudeadline.h | 2
-rw-r--r--  kernel/sched/deadline.c | 79
-rw-r--r--  kernel/sched/debug.c | 1
-rw-r--r--  kernel/sched/fair.c | 15
-rw-r--r--  kernel/sched/idle.c | 19
-rw-r--r--  kernel/sched/rt.c | 26
-rw-r--r--  kernel/sched/sched.h | 22
-rw-r--r--  kernel/sched/stats.c | 11
-rw-r--r--  kernel/seccomp.c | 4
-rw-r--r--  kernel/signal.c | 6
-rw-r--r--  kernel/smpboot.c | 2
-rw-r--r--  kernel/softirq.c | 9
-rw-r--r--  kernel/sys.c | 4
-rw-r--r--  kernel/sysctl.c | 3
-rw-r--r--  kernel/taskstats.c | 13
-rw-r--r--  kernel/time/Makefile | 2
-rw-r--r--  kernel/time/alarmtimer.c | 2
-rw-r--r--  kernel/time/clocksource.c | 76
-rw-r--r--  kernel/time/hrtimer.c | 116
-rw-r--r--  kernel/time/ntp.c | 11
-rw-r--r--  kernel/time/posix-cpu-timers.c | 3
-rw-r--r--  kernel/time/tick-common.c | 50
-rw-r--r--  kernel/time/tick-sched.c | 11
-rw-r--r--  kernel/time/time.c | 4
-rw-r--r--  kernel/time/timecounter.c | 112
-rw-r--r--  kernel/time/timekeeping.c | 60
-rw-r--r--  kernel/time/timekeeping.h | 2
-rw-r--r--  kernel/trace/Makefile | 4
-rw-r--r--  kernel/trace/ftrace.c | 55
-rw-r--r--  kernel/trace/power-traces.c | 1
-rw-r--r--  kernel/trace/ring_buffer.c | 42
-rw-r--r--  kernel/trace/ring_buffer_benchmark.c | 18
-rw-r--r--  kernel/trace/trace.c | 195
-rw-r--r--  kernel/trace/trace.h | 2
-rw-r--r--  kernel/trace/trace_branch.c | 1
-rw-r--r--  kernel/trace/trace_event_perf.c | 4
-rw-r--r--  kernel/trace/trace_events.c | 71
-rw-r--r--  kernel/trace/trace_export.c | 2
-rw-r--r--  kernel/trace/trace_functions_graph.c | 2
-rw-r--r--  kernel/trace/trace_irqsoff.c | 2
-rw-r--r--  kernel/trace/trace_kdb.c | 4
-rw-r--r--  kernel/trace/trace_kprobe.c | 6
-rw-r--r--  kernel/trace/trace_nop.c | 2
-rw-r--r--  kernel/trace/trace_output.c | 44
-rw-r--r--  kernel/trace/trace_printk.c | 4
-rw-r--r--  kernel/trace/trace_sched_switch.c | 2
-rw-r--r--  kernel/trace/trace_sched_wakeup.c | 2
-rw-r--r--  kernel/trace/trace_seq.c | 2
-rw-r--r--  kernel/trace/trace_stack.c | 4
-rw-r--r--  kernel/trace/trace_stat.c | 2
-rw-r--r--  kernel/trace/trace_syscalls.c | 4
-rw-r--r--  kernel/trace/trace_uprobe.c | 4
-rw-r--r--  kernel/watchdog.c | 2
-rw-r--r--  kernel/workqueue.c | 30
124 files changed, 3529 insertions, 1827 deletions
diff --git a/kernel/Kconfig.locks b/kernel/Kconfig.locks
index 76768ee812b2..08561f1acd13 100644
--- a/kernel/Kconfig.locks
+++ b/kernel/Kconfig.locks
@@ -231,6 +231,10 @@ config RWSEM_SPIN_ON_OWNER
 	def_bool y
 	depends on SMP && RWSEM_XCHGADD_ALGORITHM && ARCH_SUPPORTS_ATOMIC_RMW
 
+config LOCK_SPIN_ON_OWNER
+	def_bool y
+	depends on MUTEX_SPIN_ON_OWNER || RWSEM_SPIN_ON_OWNER
+
 config ARCH_USE_QUEUE_RWLOCK
 	bool
 
diff --git a/kernel/Makefile b/kernel/Makefile
index a59481a3fa6c..1408b3353a3c 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -13,8 +13,8 @@ obj-y = fork.o exec_domain.o panic.o \
 
 ifdef CONFIG_FUNCTION_TRACER
 # Do not trace debug files and internal ftrace files
-CFLAGS_REMOVE_cgroup-debug.o = -pg
-CFLAGS_REMOVE_irq_work.o = -pg
+CFLAGS_REMOVE_cgroup-debug.o = $(CC_FLAGS_FTRACE)
+CFLAGS_REMOVE_irq_work.o = $(CC_FLAGS_FTRACE)
 endif
 
 # cond_syscall is currently not LTO compatible
@@ -26,6 +26,7 @@ obj-y += power/
 obj-y += printk/
 obj-y += irq/
 obj-y += rcu/
+obj-y += livepatch/
 
 obj-$(CONFIG_CHECKPOINT_RESTORE) += kcmp.o
 obj-$(CONFIG_FREEZER) += freezer.o
@@ -142,7 +143,7 @@ endif
 kernel/system_certificates.o: $(obj)/x509_certificate_list
 
 quiet_cmd_x509certs = CERTS $@
-      cmd_x509certs = cat $(X509_CERTIFICATES) /dev/null >$@ $(foreach X509,$(X509_CERTIFICATES),; echo " - Including cert $(X509)")
+      cmd_x509certs = cat $(X509_CERTIFICATES) /dev/null >$@ $(foreach X509,$(X509_CERTIFICATES),; $(kecho) " - Including cert $(X509)")
 
 targets += $(obj)/x509_certificate_list
 $(obj)/x509_certificate_list: $(X509_CERTIFICATES) $(obj)/.x509.list
diff --git a/kernel/acct.c b/kernel/acct.c
index 33738ef972f3..e6c10d1a4058 100644
--- a/kernel/acct.c
+++ b/kernel/acct.c
@@ -76,10 +76,11 @@ int acct_parm[3] = {4, 2, 30};
 /*
  * External references and all of the globals.
  */
-static void do_acct_process(struct bsd_acct_struct *acct);
 
 struct bsd_acct_struct {
 	struct fs_pin		pin;
+	atomic_long_t		count;
+	struct rcu_head		rcu;
 	struct mutex		lock;
 	int			active;
 	unsigned long		needcheck;
@@ -89,6 +90,8 @@ struct bsd_acct_struct {
 	struct completion	done;
 };
 
+static void do_acct_process(struct bsd_acct_struct *acct);
+
 /*
  * Check the amount of free space and suspend/resume accordingly.
  */
@@ -124,32 +127,56 @@ out:
 	return acct->active;
 }
 
+static void acct_put(struct bsd_acct_struct *p)
+{
+	if (atomic_long_dec_and_test(&p->count))
+		kfree_rcu(p, rcu);
+}
+
+static inline struct bsd_acct_struct *to_acct(struct fs_pin *p)
+{
+	return p ? container_of(p, struct bsd_acct_struct, pin) : NULL;
+}
+
 static struct bsd_acct_struct *acct_get(struct pid_namespace *ns)
 {
 	struct bsd_acct_struct *res;
 again:
 	smp_rmb();
 	rcu_read_lock();
-	res = ACCESS_ONCE(ns->bacct);
+	res = to_acct(ACCESS_ONCE(ns->bacct));
 	if (!res) {
 		rcu_read_unlock();
 		return NULL;
 	}
-	if (!atomic_long_inc_not_zero(&res->pin.count)) {
+	if (!atomic_long_inc_not_zero(&res->count)) {
 		rcu_read_unlock();
 		cpu_relax();
 		goto again;
 	}
 	rcu_read_unlock();
 	mutex_lock(&res->lock);
-	if (!res->ns) {
+	if (res != to_acct(ACCESS_ONCE(ns->bacct))) {
 		mutex_unlock(&res->lock);
-		pin_put(&res->pin);
+		acct_put(res);
 		goto again;
 	}
 	return res;
 }
 
+static void acct_pin_kill(struct fs_pin *pin)
+{
+	struct bsd_acct_struct *acct = to_acct(pin);
+	mutex_lock(&acct->lock);
+	do_acct_process(acct);
+	schedule_work(&acct->work);
+	wait_for_completion(&acct->done);
+	cmpxchg(&acct->ns->bacct, pin, NULL);
+	mutex_unlock(&acct->lock);
+	pin_remove(pin);
+	acct_put(acct);
+}
+
 static void close_work(struct work_struct *work)
 {
 	struct bsd_acct_struct *acct = container_of(work, struct bsd_acct_struct, work);
@@ -160,44 +187,13 @@ static void close_work(struct work_struct *work)
 	complete(&acct->done);
 }
 
-static void acct_kill(struct bsd_acct_struct *acct,
-		      struct bsd_acct_struct *new)
-{
-	if (acct) {
-		struct pid_namespace *ns = acct->ns;
-		do_acct_process(acct);
-		INIT_WORK(&acct->work, close_work);
-		init_completion(&acct->done);
-		schedule_work(&acct->work);
-		wait_for_completion(&acct->done);
-		pin_remove(&acct->pin);
-		ns->bacct = new;
-		acct->ns = NULL;
-		atomic_long_dec(&acct->pin.count);
-		mutex_unlock(&acct->lock);
-		pin_put(&acct->pin);
-	}
-}
-
-static void acct_pin_kill(struct fs_pin *pin)
-{
-	struct bsd_acct_struct *acct;
-	acct = container_of(pin, struct bsd_acct_struct, pin);
-	mutex_lock(&acct->lock);
-	if (!acct->ns) {
-		mutex_unlock(&acct->lock);
-		pin_put(pin);
-		acct = NULL;
-	}
-	acct_kill(acct, NULL);
-}
-
 static int acct_on(struct filename *pathname)
 {
 	struct file *file;
 	struct vfsmount *mnt, *internal;
 	struct pid_namespace *ns = task_active_pid_ns(current);
-	struct bsd_acct_struct *acct, *old;
+	struct bsd_acct_struct *acct;
+	struct fs_pin *old;
 	int err;
 
 	acct = kzalloc(sizeof(struct bsd_acct_struct), GFP_KERNEL);
@@ -238,21 +234,21 @@ static int acct_on(struct filename *pathname)
 	mnt = file->f_path.mnt;
 	file->f_path.mnt = internal;
 
-	atomic_long_set(&acct->pin.count, 1);
-	acct->pin.kill = acct_pin_kill;
+	atomic_long_set(&acct->count, 1);
+	init_fs_pin(&acct->pin, acct_pin_kill);
 	acct->file = file;
 	acct->needcheck = jiffies;
 	acct->ns = ns;
 	mutex_init(&acct->lock);
+	INIT_WORK(&acct->work, close_work);
+	init_completion(&acct->done);
 	mutex_lock_nested(&acct->lock, 1);	/* nobody has seen it yet */
 	pin_insert(&acct->pin, mnt);
 
-	old = acct_get(ns);
-	if (old)
-		acct_kill(old, acct);
-	else
-		ns->bacct = acct;
+	rcu_read_lock();
+	old = xchg(&ns->bacct, &acct->pin);
 	mutex_unlock(&acct->lock);
+	pin_kill(old);
 	mnt_drop_write(mnt);
 	mntput(mnt);
 	return 0;
@@ -288,7 +284,8 @@ SYSCALL_DEFINE1(acct, const char __user *, name)
 		mutex_unlock(&acct_on_mutex);
 		putname(tmp);
 	} else {
-		acct_kill(acct_get(task_active_pid_ns(current)), NULL);
+		rcu_read_lock();
+		pin_kill(task_active_pid_ns(current)->bacct);
 	}
 
 	return error;
@@ -296,7 +293,8 @@ SYSCALL_DEFINE1(acct, const char __user *, name)
 
 void acct_exit_ns(struct pid_namespace *ns)
 {
-	acct_kill(acct_get(ns), NULL);
+	rcu_read_lock();
+	pin_kill(ns->bacct);
 }
 
 /*
@@ -576,7 +574,7 @@ static void slow_acct_process(struct pid_namespace *ns)
 		if (acct) {
 			do_acct_process(acct);
 			mutex_unlock(&acct->lock);
-			pin_put(&acct->pin);
+			acct_put(acct);
 		}
 	}
 }
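The acct_get()/acct_put() pair above follows a common kernel pattern: look the object up under rcu_read_lock(), take a reference with atomic_long_inc_not_zero() so a dying object is never resurrected, and let the last put free it (deferred via kfree_rcu() so concurrent RCU readers drain first). A minimal userspace sketch of the same lookup-then-refcount idea, using C11 atomics in place of the kernel primitives; the names below are illustrative, not from the patch:

#include <stdatomic.h>
#include <stdlib.h>

struct obj {
	atomic_long count;	/* plays the role of bsd_acct_struct.count */
	int payload;
};

/* Take a reference only if the object is still live (count > 0). */
static int obj_tryget(struct obj *p)
{
	long c = atomic_load(&p->count);
	while (c > 0) {
		if (atomic_compare_exchange_weak(&p->count, &c, c + 1))
			return 1;	/* got a reference */
	}
	return 0;	/* object is already on its way to being freed */
}

/* Drop a reference; the last put frees the object. The kernel code defers
 * the free with kfree_rcu(); a plain free() stands in for that here. */
static void obj_put(struct obj *p)
{
	if (atomic_fetch_sub(&p->count, 1) == 1)
		free(p);
}

int main(void)
{
	struct obj *p = malloc(sizeof(*p));
	atomic_init(&p->count, 1);	/* creator's reference */
	if (obj_tryget(p))		/* a reader takes a reference */
		obj_put(p);
	obj_put(p);			/* creator drops the last reference */
	return 0;
}
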
diff --git a/kernel/audit.c b/kernel/audit.c
index f8f203e8018c..72ab759a0b43 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -429,7 +429,7 @@ static void kauditd_send_skb(struct sk_buff *skb)
  * This function doesn't consume an skb as might be expected since it has to
  * copy it anyways.
  */
-static void kauditd_send_multicast_skb(struct sk_buff *skb)
+static void kauditd_send_multicast_skb(struct sk_buff *skb, gfp_t gfp_mask)
 {
 	struct sk_buff *copy;
 	struct audit_net *aunet = net_generic(&init_net, audit_net_id);
@@ -448,11 +448,11 @@ static void kauditd_send_multicast_skb(struct sk_buff *skb)
 	 * no reason for new multicast clients to continue with this
 	 * non-compliance.
 	 */
-	copy = skb_copy(skb, GFP_KERNEL);
+	copy = skb_copy(skb, gfp_mask);
 	if (!copy)
 		return;
 
-	nlmsg_multicast(sock, copy, 0, AUDIT_NLGRP_READLOG, GFP_KERNEL);
+	nlmsg_multicast(sock, copy, 0, AUDIT_NLGRP_READLOG, gfp_mask);
 }
 
 /*
@@ -1100,7 +1100,7 @@ static void audit_receive(struct sk_buff *skb)
 }
 
 /* Run custom bind function on netlink socket group connect or bind requests. */
-static int audit_bind(int group)
+static int audit_bind(struct net *net, int group)
 {
 	if (!capable(CAP_AUDIT_READ))
 		return -EPERM;
@@ -1940,7 +1940,7 @@ void audit_log_end(struct audit_buffer *ab)
 		struct nlmsghdr *nlh = nlmsg_hdr(ab->skb);
 
 		nlh->nlmsg_len = ab->skb->len;
-		kauditd_send_multicast_skb(ab->skb);
+		kauditd_send_multicast_skb(ab->skb, ab->gfp_mask);
 
 		/*
 		 * The original kaudit unicast socket sends up messages with
diff --git a/kernel/audit.h b/kernel/audit.h
index 3cdffad5a1d9..1caa0d345d90 100644
--- a/kernel/audit.h
+++ b/kernel/audit.h
@@ -24,12 +24,6 @@
 #include <linux/skbuff.h>
 #include <uapi/linux/mqueue.h>
 
-/* 0 = no checking
-   1 = put_count checking
-   2 = verbose put_count checking
-*/
-#define AUDIT_DEBUG 0
-
 /* AUDIT_NAMES is the number of slots we reserve in the audit_context
  * for saving names from getname(). If we get more names we will allocate
  * a name dynamically and also add those to the list anchored by names_list. */
@@ -74,9 +68,8 @@ struct audit_cap_data {
 	};
 };
 
-/* When fs/namei.c:getname() is called, we store the pointer in name and
- * we don't let putname() free it (instead we free all of the saved
- * pointers at syscall exit time).
+/* When fs/namei.c:getname() is called, we store the pointer in name and bump
+ * the refcnt in the associated filename struct.
  *
  * Further, in fs/namei.c:path_lookup() we store the inode and device.
  */
@@ -86,7 +79,6 @@ struct audit_names {
 	struct filename		*name;
 	int			name_len;	/* number of chars to log */
 	bool			hidden;		/* don't log this record */
-	bool			name_put;	/* call __putname()? */
 
 	unsigned long		ino;
 	dev_t			dev;
@@ -208,11 +200,6 @@ struct audit_context {
 	};
 	int fds[2];
 	struct audit_proctitle proctitle;
-
-#if AUDIT_DEBUG
-	int		put_count;
-	int		ino_count;
-#endif
 };
 
 extern u32 audit_ever_enabled;
diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c
index 3598e13f2a65..72e1660a79a3 100644
--- a/kernel/auditfilter.c
+++ b/kernel/auditfilter.c
@@ -425,7 +425,6 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data,
 		goto exit_nofree;
 
 	bufp = data->buf;
-	entry->rule.vers_ops = 2;
 	for (i = 0; i < data->field_count; i++) {
 		struct audit_field *f = &entry->rule.fields[i];
 
@@ -442,19 +441,7 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data,
 		if ((f->type == AUDIT_LOGINUID) && (f->val == AUDIT_UID_UNSET)) {
 			f->type = AUDIT_LOGINUID_SET;
 			f->val = 0;
-		}
-
-		if ((f->type == AUDIT_PID) || (f->type == AUDIT_PPID)) {
-			struct pid *pid;
-			rcu_read_lock();
-			pid = find_vpid(f->val);
-			if (!pid) {
-				rcu_read_unlock();
-				err = -ESRCH;
-				goto exit_free;
-			}
-			f->val = pid_nr(pid);
-			rcu_read_unlock();
+			entry->rule.pflags |= AUDIT_LOGINUID_LEGACY;
 		}
 
 		err = audit_field_valid(entry, f);
@@ -630,6 +617,13 @@ static struct audit_rule_data *audit_krule_to_data(struct audit_krule *krule)
 			data->buflen += data->values[i] =
 				audit_pack_string(&bufp, krule->filterkey);
 			break;
+		case AUDIT_LOGINUID_SET:
+			if (krule->pflags & AUDIT_LOGINUID_LEGACY && !f->val) {
+				data->fields[i] = AUDIT_LOGINUID;
+				data->values[i] = AUDIT_UID_UNSET;
+				break;
+			}
+			/* fallthrough if set */
 		default:
 			data->values[i] = f->val;
 		}
@@ -646,6 +640,7 @@ static int audit_compare_rule(struct audit_krule *a, struct audit_krule *b)
 	int i;
 
 	if (a->flags != b->flags ||
+	    a->pflags != b->pflags ||
 	    a->listnr != b->listnr ||
 	    a->action != b->action ||
 	    a->field_count != b->field_count)
@@ -762,8 +757,8 @@ struct audit_entry *audit_dupe_rule(struct audit_krule *old)
 		return ERR_PTR(-ENOMEM);
 
 	new = &entry->rule;
-	new->vers_ops = old->vers_ops;
 	new->flags = old->flags;
+	new->pflags = old->pflags;
 	new->listnr = old->listnr;
 	new->action = old->action;
 	for (i = 0; i < AUDIT_BITMASK_SIZE; i++)
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index c75522a83678..dc4ae70a7413 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -72,6 +72,8 @@
 #include <linux/fs_struct.h>
 #include <linux/compat.h>
 #include <linux/ctype.h>
+#include <linux/string.h>
+#include <uapi/linux/limits.h>
 
 #include "audit.h"
 
@@ -864,33 +866,10 @@ static inline void audit_free_names(struct audit_context *context)
 {
 	struct audit_names *n, *next;
 
-#if AUDIT_DEBUG == 2
-	if (context->put_count + context->ino_count != context->name_count) {
-		int i = 0;
-
-		pr_err("%s:%d(:%d): major=%d in_syscall=%d"
-		       " name_count=%d put_count=%d ino_count=%d"
-		       " [NOT freeing]\n", __FILE__, __LINE__,
-		       context->serial, context->major, context->in_syscall,
-		       context->name_count, context->put_count,
-		       context->ino_count);
-		list_for_each_entry(n, &context->names_list, list) {
-			pr_err("names[%d] = %p = %s\n", i++, n->name,
-			       n->name->name ?: "(null)");
-		}
-		dump_stack();
-		return;
-	}
-#endif
-#if AUDIT_DEBUG
-	context->put_count = 0;
-	context->ino_count = 0;
-#endif
-
 	list_for_each_entry_safe(n, next, &context->names_list, list) {
 		list_del(&n->list);
-		if (n->name && n->name_put)
-			final_putname(n->name);
+		if (n->name)
+			putname(n->name);
 		if (n->should_free)
 			kfree(n);
 	}
@@ -1709,9 +1688,6 @@ static struct audit_names *audit_alloc_name(struct audit_context *context,
 	list_add_tail(&aname->list, &context->names_list);
 
 	context->name_count++;
-#if AUDIT_DEBUG
-	context->ino_count++;
-#endif
 	return aname;
 }
 
@@ -1732,8 +1708,10 @@ __audit_reusename(const __user char *uptr)
 	list_for_each_entry(n, &context->names_list, list) {
 		if (!n->name)
 			continue;
-		if (n->name->uptr == uptr)
+		if (n->name->uptr == uptr) {
+			n->name->refcnt++;
 			return n->name;
+		}
 	}
 	return NULL;
 }
@@ -1750,19 +1728,8 @@ void __audit_getname(struct filename *name)
 	struct audit_context *context = current->audit_context;
 	struct audit_names *n;
 
-	if (!context->in_syscall) {
-#if AUDIT_DEBUG == 2
-		pr_err("%s:%d(:%d): ignoring getname(%p)\n",
-		       __FILE__, __LINE__, context->serial, name);
-		dump_stack();
-#endif
+	if (!context->in_syscall)
 		return;
-	}
-
-#if AUDIT_DEBUG
-	/* The filename _must_ have a populated ->name */
-	BUG_ON(!name->name);
-#endif
 
 	n = audit_alloc_name(context, AUDIT_TYPE_UNKNOWN);
 	if (!n)
@@ -1770,56 +1737,13 @@ void __audit_getname(struct filename *name)
 
 	n->name = name;
 	n->name_len = AUDIT_NAME_FULL;
-	n->name_put = true;
 	name->aname = n;
+	name->refcnt++;
 
 	if (!context->pwd.dentry)
 		get_fs_pwd(current->fs, &context->pwd);
 }
 
-/* audit_putname - intercept a putname request
- * @name: name to intercept and delay for putname
- *
- * If we have stored the name from getname in the audit context,
- * then we delay the putname until syscall exit.
- * Called from include/linux/fs.h:putname().
- */
-void audit_putname(struct filename *name)
-{
-	struct audit_context *context = current->audit_context;
-
-	BUG_ON(!context);
-	if (!name->aname || !context->in_syscall) {
-#if AUDIT_DEBUG == 2
-		pr_err("%s:%d(:%d): final_putname(%p)\n",
-		       __FILE__, __LINE__, context->serial, name);
-		if (context->name_count) {
-			struct audit_names *n;
-			int i = 0;
-
-			list_for_each_entry(n, &context->names_list, list)
-				pr_err("name[%d] = %p = %s\n", i++, n->name,
-				       n->name->name ?: "(null)");
-		}
-#endif
-		final_putname(name);
-	}
-#if AUDIT_DEBUG
-	else {
-		++context->put_count;
-		if (context->put_count > context->name_count) {
-			pr_err("%s:%d(:%d): major=%d in_syscall=%d putname(%p)"
-			       " name_count=%d put_count=%d\n",
-			       __FILE__, __LINE__,
-			       context->serial, context->major,
-			       context->in_syscall, name->name,
-			       context->name_count, context->put_count);
-			dump_stack();
-		}
-	}
-#endif
-}
-
 /**
  * __audit_inode - store the inode and device from a lookup
  * @name: name being audited
@@ -1840,10 +1764,6 @@ void __audit_inode(struct filename *name, const struct dentry *dentry,
 	if (!name)
 		goto out_alloc;
 
-#if AUDIT_DEBUG
-	/* The struct filename _must_ have a populated ->name */
-	BUG_ON(!name->name);
-#endif
 	/*
 	 * If we have a pointer to an audit_names entry already, then we can
 	 * just use it directly if the type is correct.
@@ -1861,8 +1781,17 @@ void __audit_inode(struct filename *name, const struct dentry *dentry,
 	}
 
 	list_for_each_entry_reverse(n, &context->names_list, list) {
-		/* does the name pointer match? */
-		if (!n->name || n->name->name != name->name)
+		if (n->ino) {
+			/* valid inode number, use that for the comparison */
+			if (n->ino != inode->i_ino ||
+			    n->dev != inode->i_sb->s_dev)
+				continue;
+		} else if (n->name) {
+			/* inode number has not been set, check the name */
+			if (strcmp(n->name->name, name->name))
+				continue;
+		} else
+			/* no inode and no name (?!) ... this is odd ... */
 			continue;
 
 		/* match the correct record type */
@@ -1877,12 +1806,15 @@ void __audit_inode(struct filename *name, const struct dentry *dentry,
 	}
 
 out_alloc:
-	/* unable to find the name from a previous getname(). Allocate a new
-	 * anonymous entry.
-	 */
-	n = audit_alloc_name(context, AUDIT_TYPE_NORMAL);
+	/* unable to find an entry with both a matching name and type */
+	n = audit_alloc_name(context, AUDIT_TYPE_UNKNOWN);
 	if (!n)
 		return;
+	if (name) {
+		n->name = name;
+		name->refcnt++;
+	}
+
 out:
 	if (parent) {
 		n->name_len = n->name ? parent_len(n->name->name) : AUDIT_NAME_FULL;
@@ -1933,11 +1865,16 @@ void __audit_inode_child(const struct inode *parent,
 
 	/* look for a parent entry first */
 	list_for_each_entry(n, &context->names_list, list) {
-		if (!n->name || n->type != AUDIT_TYPE_PARENT)
+		if (!n->name ||
+		    (n->type != AUDIT_TYPE_PARENT &&
+		     n->type != AUDIT_TYPE_UNKNOWN))
 			continue;
 
-		if (n->ino == parent->i_ino &&
-		    !audit_compare_dname_path(dname, n->name->name, n->name_len)) {
+		if (n->ino == parent->i_ino && n->dev == parent->i_sb->s_dev &&
+		    !audit_compare_dname_path(dname,
+					      n->name->name, n->name_len)) {
+			if (n->type == AUDIT_TYPE_UNKNOWN)
+				n->type = AUDIT_TYPE_PARENT;
 			found_parent = n;
 			break;
 		}
@@ -1946,11 +1883,8 @@ void __audit_inode_child(const struct inode *parent,
 	/* is there a matching child entry? */
 	list_for_each_entry(n, &context->names_list, list) {
 		/* can only match entries that have a name */
-		if (!n->name || n->type != type)
-			continue;
-
-		/* if we found a parent, make sure this one is a child of it */
-		if (found_parent && (n->name != found_parent->name))
+		if (!n->name ||
+		    (n->type != type && n->type != AUDIT_TYPE_UNKNOWN))
 			continue;
 
 		if (!strcmp(dname, n->name->name) ||
@@ -1958,6 +1892,8 @@ void __audit_inode_child(const struct inode *parent,
 				  found_parent ?
 				  found_parent->name_len :
 				  AUDIT_NAME_FULL)) {
+			if (n->type == AUDIT_TYPE_UNKNOWN)
+				n->type = type;
 			found_child = n;
 			break;
 		}
@@ -1982,10 +1918,10 @@ void __audit_inode_child(const struct inode *parent,
 		if (found_parent) {
 			found_child->name = found_parent->name;
 			found_child->name_len = AUDIT_NAME_FULL;
-			/* don't call __putname() */
-			found_child->name_put = false;
+			found_child->name->refcnt++;
 		}
 	}
+
 	if (inode)
 		audit_copy_inode(found_child, dentry, inode);
 	else
@@ -2368,7 +2304,6 @@ int __audit_log_bprm_fcaps(struct linux_binprm *bprm,
 	struct audit_aux_data_bprm_fcaps *ax;
 	struct audit_context *context = current->audit_context;
 	struct cpu_vfs_cap_data vcaps;
-	struct dentry *dentry;
 
 	ax = kmalloc(sizeof(*ax), GFP_KERNEL);
 	if (!ax)
@@ -2378,9 +2313,7 @@ int __audit_log_bprm_fcaps(struct linux_binprm *bprm,
 	ax->d.next = context->aux;
 	context->aux = (void *)ax;
 
-	dentry = dget(bprm->file->f_path.dentry);
-	get_vfs_caps_from_disk(dentry, &vcaps);
-	dput(dentry);
+	get_vfs_caps_from_disk(bprm->file->f_path.dentry, &vcaps);
 
 	ax->fcap.permitted = vcaps.permitted;
 	ax->fcap.inheritable = vcaps.inheritable;
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index d6594e457a25..a64e7a207d2b 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -163,7 +163,7 @@ bpf_jit_binary_alloc(unsigned int proglen, u8 **image_ptr,
 
 void bpf_jit_binary_free(struct bpf_binary_header *hdr)
 {
-	module_free(NULL, hdr);
+	module_memfree(hdr);
 }
 #endif /* CONFIG_BPF_JIT */
 
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 088ac0b1b106..536edc2be307 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -150,7 +150,7 @@ static int map_lookup_elem(union bpf_attr *attr)
 	int ufd = attr->map_fd;
 	struct fd f = fdget(ufd);
 	struct bpf_map *map;
-	void *key, *value;
+	void *key, *value, *ptr;
 	int err;
 
 	if (CHECK_ATTR(BPF_MAP_LOOKUP_ELEM))
@@ -169,20 +169,29 @@ static int map_lookup_elem(union bpf_attr *attr)
 	if (copy_from_user(key, ukey, map->key_size) != 0)
 		goto free_key;
 
-	err = -ENOENT;
-	rcu_read_lock();
-	value = map->ops->map_lookup_elem(map, key);
+	err = -ENOMEM;
+	value = kmalloc(map->value_size, GFP_USER);
 	if (!value)
-		goto err_unlock;
+		goto free_key;
+
+	rcu_read_lock();
+	ptr = map->ops->map_lookup_elem(map, key);
+	if (ptr)
+		memcpy(value, ptr, map->value_size);
+	rcu_read_unlock();
+
+	err = -ENOENT;
+	if (!ptr)
+		goto free_value;
 
 	err = -EFAULT;
 	if (copy_to_user(uvalue, value, map->value_size) != 0)
-		goto err_unlock;
+		goto free_value;
 
 	err = 0;
 
-err_unlock:
-	rcu_read_unlock();
+free_value:
+	kfree(value);
 free_key:
 	kfree(key);
 err_put:
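The reworked map_lookup_elem() above snapshots the map value into a private buffer while the RCU read lock is held and only afterwards copies it to user space, because copy_to_user() may fault and sleep. A small userspace analogue of the same "snapshot under the lock, do the slow copy outside" idea, with a mutex standing in for rcu_read_lock(); the names here are illustrative, not kernel API:

#include <pthread.h>
#include <stdio.h>
#include <string.h>

static pthread_mutex_t table_lock = PTHREAD_MUTEX_INITIALIZER;
static char table_value[64] = "hello";

/* Copy the current value into caller-owned storage under the lock,
 * then do the potentially slow output with the lock dropped. */
static void lookup_and_print(void)
{
	char snapshot[64];

	pthread_mutex_lock(&table_lock);
	memcpy(snapshot, table_value, sizeof(snapshot));
	pthread_mutex_unlock(&table_lock);

	/* Slow or blocking work touches only the private copy. */
	printf("value: %s\n", snapshot);
}

int main(void)
{
	lookup_and_print();
	return 0;
}
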
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index bb263d0caab3..29a7b2cc593e 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -1909,7 +1909,7 @@ static void cgroup_kill_sb(struct super_block *sb)
 	 *
 	 * And don't kill the default root.
 	 */
-	if (css_has_online_children(&root->cgrp.self) ||
+	if (!list_empty(&root->cgrp.self.children) ||
 	    root == &cgrp_dfl_root)
 		cgroup_put(&root->cgrp);
 	else
@@ -3077,7 +3077,7 @@ static int cgroup_add_file(struct cgroup *cgrp, struct cftype *cft)
 #endif
 	kn = __kernfs_create_file(cgrp->kn, cgroup_file_name(cgrp, cft, name),
 				  cgroup_file_mode(cft), 0, cft->kf_ops, cft,
-				  NULL, false, key);
+				  NULL, key);
 	if (IS_ERR(kn))
 		return PTR_ERR(kn);
 
@@ -4373,16 +4373,20 @@ static void css_free_work_fn(struct work_struct *work)
 {
 	struct cgroup_subsys_state *css =
 		container_of(work, struct cgroup_subsys_state, destroy_work);
+	struct cgroup_subsys *ss = css->ss;
 	struct cgroup *cgrp = css->cgroup;
 
 	percpu_ref_exit(&css->refcnt);
 
-	if (css->ss) {
+	if (ss) {
 		/* css free path */
+		int id = css->id;
+
 		if (css->parent)
 			css_put(css->parent);
 
-		css->ss->css_free(css);
+		ss->css_free(css);
+		cgroup_idr_remove(&ss->css_idr, id);
 		cgroup_put(cgrp);
 	} else {
 		/* cgroup free path */
@@ -4434,7 +4438,7 @@ static void css_release_work_fn(struct work_struct *work)
 
 	if (ss) {
 		/* css release path */
-		cgroup_idr_remove(&ss->css_idr, css->id);
+		cgroup_idr_replace(&ss->css_idr, NULL, css->id);
 		if (ss->css_released)
 			ss->css_released(css);
 	} else {
diff --git a/kernel/compat.c b/kernel/compat.c
index ebb3c369d03d..24f00610c575 100644
--- a/kernel/compat.c
+++ b/kernel/compat.c
@@ -276,8 +276,7 @@ COMPAT_SYSCALL_DEFINE2(nanosleep, struct compat_timespec __user *, rqtp,
 	 * core implementation decides to return random nonsense.
 	 */
 	if (ret == -ERESTART_RESTARTBLOCK) {
-		struct restart_block *restart
-			= &current_thread_info()->restart_block;
+		struct restart_block *restart = &current->restart_block;
 
 		restart->fn = compat_nanosleep_restart;
 		restart->nanosleep.compat_rmtp = rmtp;
@@ -860,7 +859,7 @@ COMPAT_SYSCALL_DEFINE4(clock_nanosleep, clockid_t, which_clock, int, flags,
 		return -EFAULT;
 
 	if (err == -ERESTART_RESTARTBLOCK) {
-		restart = &current_thread_info()->restart_block;
+		restart = &current->restart_block;
 		restart->fn = compat_clock_nanosleep_restart;
 		restart->nanosleep.compat_rmtp = rmtp;
 	}
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 5d220234b3ca..1972b161c61e 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -58,22 +58,23 @@ static int cpu_hotplug_disabled;
 
 static struct {
 	struct task_struct *active_writer;
-	struct mutex lock; /* Synchronizes accesses to refcount, */
+	/* wait queue to wake up the active_writer */
+	wait_queue_head_t wq;
+	/* verifies that no writer will get active while readers are active */
+	struct mutex lock;
 	/*
 	 * Also blocks the new readers during
 	 * an ongoing cpu hotplug operation.
 	 */
-	int refcount;
-	/* And allows lockless put_online_cpus(). */
-	atomic_t puts_pending;
+	atomic_t refcount;
 
 #ifdef CONFIG_DEBUG_LOCK_ALLOC
 	struct lockdep_map dep_map;
 #endif
 } cpu_hotplug = {
 	.active_writer = NULL,
+	.wq = __WAIT_QUEUE_HEAD_INITIALIZER(cpu_hotplug.wq),
 	.lock = __MUTEX_INITIALIZER(cpu_hotplug.lock),
-	.refcount = 0,
 #ifdef CONFIG_DEBUG_LOCK_ALLOC
 	.dep_map = {.name = "cpu_hotplug.lock" },
 #endif
@@ -86,15 +87,6 @@ static struct {
 #define cpuhp_lock_acquire()      lock_map_acquire(&cpu_hotplug.dep_map)
 #define cpuhp_lock_release()      lock_map_release(&cpu_hotplug.dep_map)
 
-static void apply_puts_pending(int max)
-{
-	int delta;
-
-	if (atomic_read(&cpu_hotplug.puts_pending) >= max) {
-		delta = atomic_xchg(&cpu_hotplug.puts_pending, 0);
-		cpu_hotplug.refcount -= delta;
-	}
-}
 
 void get_online_cpus(void)
 {
@@ -103,8 +95,7 @@ void get_online_cpus(void)
 		return;
 	cpuhp_lock_acquire_read();
 	mutex_lock(&cpu_hotplug.lock);
-	apply_puts_pending(65536);
-	cpu_hotplug.refcount++;
+	atomic_inc(&cpu_hotplug.refcount);
 	mutex_unlock(&cpu_hotplug.lock);
 }
 EXPORT_SYMBOL_GPL(get_online_cpus);
@@ -116,8 +107,7 @@ bool try_get_online_cpus(void)
 	if (!mutex_trylock(&cpu_hotplug.lock))
 		return false;
 	cpuhp_lock_acquire_tryread();
-	apply_puts_pending(65536);
-	cpu_hotplug.refcount++;
+	atomic_inc(&cpu_hotplug.refcount);
 	mutex_unlock(&cpu_hotplug.lock);
 	return true;
 }
@@ -125,20 +115,18 @@ EXPORT_SYMBOL_GPL(try_get_online_cpus);
 
 void put_online_cpus(void)
 {
+	int refcount;
+
 	if (cpu_hotplug.active_writer == current)
 		return;
-	if (!mutex_trylock(&cpu_hotplug.lock)) {
-		atomic_inc(&cpu_hotplug.puts_pending);
-		cpuhp_lock_release();
-		return;
-	}
 
-	if (WARN_ON(!cpu_hotplug.refcount))
-		cpu_hotplug.refcount++; /* try to fix things up */
+	refcount = atomic_dec_return(&cpu_hotplug.refcount);
+	if (WARN_ON(refcount < 0)) /* try to fix things up */
+		atomic_inc(&cpu_hotplug.refcount);
+
+	if (refcount <= 0 && waitqueue_active(&cpu_hotplug.wq))
+		wake_up(&cpu_hotplug.wq);
 
-	if (!--cpu_hotplug.refcount && unlikely(cpu_hotplug.active_writer))
-		wake_up_process(cpu_hotplug.active_writer);
-	mutex_unlock(&cpu_hotplug.lock);
 	cpuhp_lock_release();
 
 }
@@ -168,18 +156,20 @@ EXPORT_SYMBOL_GPL(put_online_cpus);
  */
 void cpu_hotplug_begin(void)
 {
-	cpu_hotplug.active_writer = current;
+	DEFINE_WAIT(wait);
 
+	cpu_hotplug.active_writer = current;
 	cpuhp_lock_acquire();
+
 	for (;;) {
 		mutex_lock(&cpu_hotplug.lock);
-		apply_puts_pending(1);
-		if (likely(!cpu_hotplug.refcount))
+		prepare_to_wait(&cpu_hotplug.wq, &wait, TASK_UNINTERRUPTIBLE);
+		if (likely(!atomic_read(&cpu_hotplug.refcount)))
 			break;
-		__set_current_state(TASK_UNINTERRUPTIBLE);
 		mutex_unlock(&cpu_hotplug.lock);
 		schedule();
 	}
+	finish_wait(&cpu_hotplug.wq, &wait);
 }
 
 void cpu_hotplug_done(void)
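The rewritten get/put_online_cpus() path above keeps a plain atomic reader count and lets the hotplug writer sleep on a wait queue until that count drains to zero, while the mutex keeps new readers out once the writer is committed. A compact userspace analogue of that reader-count/writer-wait shape built on pthreads; the names are illustrative and the condition variable stands in for cpu_hotplug.wq:

#include <pthread.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t wq = PTHREAD_COND_INITIALIZER;	/* ~cpu_hotplug.wq */
static int refcount;					/* active readers */
static int writer_active;

static void get_online(void)		/* reader side: ~get_online_cpus() */
{
	pthread_mutex_lock(&lock);	/* blocks while a writer holds the lock */
	refcount++;
	pthread_mutex_unlock(&lock);
}

static void put_online(void)		/* reader side: ~put_online_cpus() */
{
	pthread_mutex_lock(&lock);
	if (--refcount == 0 && writer_active)
		pthread_cond_signal(&wq);	/* wake the waiting writer */
	pthread_mutex_unlock(&lock);
}

static void hotplug_begin(void)		/* writer side: ~cpu_hotplug_begin() */
{
	pthread_mutex_lock(&lock);
	writer_active = 1;
	while (refcount)			/* wait for readers to drain */
		pthread_cond_wait(&wq, &lock);
	/* caller performs the hotplug operation with the lock still held */
}

static void hotplug_done(void)		/* writer side: ~cpu_hotplug_done() */
{
	writer_active = 0;
	pthread_mutex_unlock(&lock);
}

int main(void)
{
	get_online();
	put_online();
	hotplug_begin();
	hotplug_done();
	return 0;
}
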
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 64b257f6bca2..1d1fe9361d29 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -1707,40 +1707,27 @@ static int cpuset_common_seq_show(struct seq_file *sf, void *v)
 {
 	struct cpuset *cs = css_cs(seq_css(sf));
 	cpuset_filetype_t type = seq_cft(sf)->private;
-	ssize_t count;
-	char *buf, *s;
 	int ret = 0;
 
-	count = seq_get_buf(sf, &buf);
-	s = buf;
-
 	spin_lock_irq(&callback_lock);
 
 	switch (type) {
 	case FILE_CPULIST:
-		s += cpulist_scnprintf(s, count, cs->cpus_allowed);
+		seq_printf(sf, "%*pbl\n", cpumask_pr_args(cs->cpus_allowed));
 		break;
 	case FILE_MEMLIST:
-		s += nodelist_scnprintf(s, count, cs->mems_allowed);
+		seq_printf(sf, "%*pbl\n", nodemask_pr_args(&cs->mems_allowed));
 		break;
 	case FILE_EFFECTIVE_CPULIST:
-		s += cpulist_scnprintf(s, count, cs->effective_cpus);
+		seq_printf(sf, "%*pbl\n", cpumask_pr_args(cs->effective_cpus));
 		break;
 	case FILE_EFFECTIVE_MEMLIST:
-		s += nodelist_scnprintf(s, count, cs->effective_mems);
+		seq_printf(sf, "%*pbl\n", nodemask_pr_args(&cs->effective_mems));
 		break;
 	default:
 		ret = -EINVAL;
-		goto out_unlock;
 	}
 
-	if (s < buf + count - 1) {
-		*s++ = '\n';
-		seq_commit(sf, s - buf);
-	} else {
-		seq_commit(sf, -1);
-	}
-out_unlock:
 	spin_unlock_irq(&callback_lock);
 	return ret;
 }
@@ -2400,7 +2387,7 @@ void cpuset_cpus_allowed_fallback(struct task_struct *tsk)
 	 */
 }
 
-void cpuset_init_current_mems_allowed(void)
+void __init cpuset_init_current_mems_allowed(void)
 {
 	nodes_setall(current->mems_allowed);
 }
@@ -2610,8 +2597,6 @@ int cpuset_mems_allowed_intersects(const struct task_struct *tsk1,
 	return nodes_intersects(tsk1->mems_allowed, tsk2->mems_allowed);
 }
 
-#define CPUSET_NODELIST_LEN	(256)
-
 /**
  * cpuset_print_task_mems_allowed - prints task's cpuset and mems_allowed
  * @tsk: pointer to task_struct of some task.
@@ -2621,23 +2606,16 @@ int cpuset_mems_allowed_intersects(const struct task_struct *tsk1,
  */
 void cpuset_print_task_mems_allowed(struct task_struct *tsk)
 {
-	/* Statically allocated to prevent using excess stack. */
-	static char cpuset_nodelist[CPUSET_NODELIST_LEN];
-	static DEFINE_SPINLOCK(cpuset_buffer_lock);
 	struct cgroup *cgrp;
 
-	spin_lock(&cpuset_buffer_lock);
 	rcu_read_lock();
 
 	cgrp = task_cs(tsk)->css.cgroup;
-	nodelist_scnprintf(cpuset_nodelist, CPUSET_NODELIST_LEN,
-			   tsk->mems_allowed);
 	pr_info("%s cpuset=", tsk->comm);
 	pr_cont_cgroup_name(cgrp);
-	pr_cont(" mems_allowed=%s\n", cpuset_nodelist);
+	pr_cont(" mems_allowed=%*pbl\n", nodemask_pr_args(&tsk->mems_allowed));
 
 	rcu_read_unlock();
-	spin_unlock(&cpuset_buffer_lock);
 }
 
 /*
@@ -2715,10 +2693,8 @@ out:
 /* Display task mems_allowed in /proc/<pid>/status file. */
 void cpuset_task_status_allowed(struct seq_file *m, struct task_struct *task)
 {
-	seq_puts(m, "Mems_allowed:\t");
-	seq_nodemask(m, &task->mems_allowed);
-	seq_puts(m, "\n");
-	seq_puts(m, "Mems_allowed_list:\t");
-	seq_nodemask_list(m, &task->mems_allowed);
-	seq_puts(m, "\n");
+	seq_printf(m, "Mems_allowed:\t%*pb\n",
+		   nodemask_pr_args(&task->mems_allowed));
+	seq_printf(m, "Mems_allowed_list:\t%*pbl\n",
+		   nodemask_pr_args(&task->mems_allowed));
 }
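The seq_printf("%*pbl", ...) conversions above rely on the kernel's bitmap printf extension, which renders a cpumask or nodemask as a ranged list such as 0-3,8 instead of requiring a caller-managed scratch buffer. The standalone C sketch below only illustrates the output format those conversions produce; it is not the kernel implementation and the helper name is made up for this example:

#include <stdio.h>

/* Print the set bits of 'mask' as a ranged list, e.g. 0xf3 -> "0-1,4-7". */
static void print_bitmask_list(unsigned long mask, int nbits)
{
	int bit = 0, first = 1;

	while (bit < nbits) {
		if (!(mask & (1UL << bit))) {
			bit++;
			continue;
		}
		int start = bit;
		while (bit + 1 < nbits && (mask & (1UL << (bit + 1))))
			bit++;
		printf("%s%d", first ? "" : ",", start);
		if (bit > start)
			printf("-%d", bit);
		first = 0;
		bit++;
	}
	putchar('\n');
}

int main(void)
{
	print_bitmask_list(0xf3, 64);	/* prints: 0-1,4-7 */
	return 0;
}
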
diff --git a/kernel/debug/debug_core.c b/kernel/debug/debug_core.c
index 1adf62b39b96..07ce18ca71e0 100644
--- a/kernel/debug/debug_core.c
+++ b/kernel/debug/debug_core.c
@@ -27,6 +27,9 @@
  * version 2. This program is licensed "as is" without any warranty of any
  * kind, whether express or implied.
  */
+
+#define pr_fmt(fmt) "KGDB: " fmt
+
 #include <linux/pid_namespace.h>
 #include <linux/clocksource.h>
 #include <linux/serial_core.h>
@@ -196,8 +199,8 @@ int __weak kgdb_validate_break_address(unsigned long addr)
 		return err;
 	err = kgdb_arch_remove_breakpoint(&tmp);
 	if (err)
-		printk(KERN_ERR "KGDB: Critical breakpoint error, kernel "
-		       "memory destroyed at: %lx", addr);
+		pr_err("Critical breakpoint error, kernel memory destroyed at: %lx\n",
+		       addr);
 	return err;
 }
 
@@ -256,8 +259,8 @@ int dbg_activate_sw_breakpoints(void)
 		error = kgdb_arch_set_breakpoint(&kgdb_break[i]);
 		if (error) {
 			ret = error;
-			printk(KERN_INFO "KGDB: BP install failed: %lx",
-			       kgdb_break[i].bpt_addr);
+			pr_info("BP install failed: %lx\n",
+				kgdb_break[i].bpt_addr);
 			continue;
 		}
 
@@ -319,8 +322,8 @@ int dbg_deactivate_sw_breakpoints(void)
 			continue;
 		error = kgdb_arch_remove_breakpoint(&kgdb_break[i]);
 		if (error) {
-			printk(KERN_INFO "KGDB: BP remove failed: %lx\n",
-			       kgdb_break[i].bpt_addr);
+			pr_info("BP remove failed: %lx\n",
+				kgdb_break[i].bpt_addr);
 			ret = error;
 		}
 
@@ -367,7 +370,7 @@ int dbg_remove_all_break(void)
 			goto setundefined;
 		error = kgdb_arch_remove_breakpoint(&kgdb_break[i]);
 		if (error)
-			printk(KERN_ERR "KGDB: breakpoint remove failed: %lx\n",
+			pr_err("breakpoint remove failed: %lx\n",
 			       kgdb_break[i].bpt_addr);
 setundefined:
 		kgdb_break[i].state = BP_UNDEFINED;
@@ -400,9 +403,9 @@ static int kgdb_io_ready(int print_wait)
 	if (print_wait) {
 #ifdef CONFIG_KGDB_KDB
 		if (!dbg_kdb_mode)
-			printk(KERN_CRIT "KGDB: waiting... or $3#33 for KDB\n");
+			pr_crit("waiting... or $3#33 for KDB\n");
 #else
-		printk(KERN_CRIT "KGDB: Waiting for remote debugger\n");
+		pr_crit("Waiting for remote debugger\n");
 #endif
 	}
 	return 1;
@@ -430,8 +433,7 @@ static int kgdb_reenter_check(struct kgdb_state *ks)
 		exception_level = 0;
 		kgdb_skipexception(ks->ex_vector, ks->linux_regs);
 		dbg_activate_sw_breakpoints();
-		printk(KERN_CRIT "KGDB: re-enter error: breakpoint removed %lx\n",
-			addr);
+		pr_crit("re-enter error: breakpoint removed %lx\n", addr);
 		WARN_ON_ONCE(1);
 
 		return 1;
@@ -444,7 +446,7 @@ static int kgdb_reenter_check(struct kgdb_state *ks)
 		panic("Recursive entry to debugger");
 	}
 
-	printk(KERN_CRIT "KGDB: re-enter exception: ALL breakpoints killed\n");
+	pr_crit("re-enter exception: ALL breakpoints killed\n");
 #ifdef CONFIG_KGDB_KDB
 	/* Allow kdb to debug itself one level */
 	return 0;
@@ -471,6 +473,7 @@ static int kgdb_cpu_enter(struct kgdb_state *ks, struct pt_regs *regs,
 	int cpu;
 	int trace_on = 0;
 	int online_cpus = num_online_cpus();
+	u64 time_left;
 
 	kgdb_info[ks->cpu].enter_kgdb++;
 	kgdb_info[ks->cpu].exception_state |= exception_state;
@@ -595,9 +598,13 @@ return_normal:
 	/*
 	 * Wait for the other CPUs to be notified and be waiting for us:
 	 */
-	while (kgdb_do_roundup && (atomic_read(&masters_in_kgdb) +
-			atomic_read(&slaves_in_kgdb)) != online_cpus)
+	time_left = loops_per_jiffy * HZ;
+	while (kgdb_do_roundup && --time_left &&
+	       (atomic_read(&masters_in_kgdb) + atomic_read(&slaves_in_kgdb)) !=
+			online_cpus)
 		cpu_relax();
+	if (!time_left)
+		pr_crit("KGDB: Timed out waiting for secondary CPUs.\n");
 
 	/*
 	 * At this point the primary processor is completely
@@ -795,15 +802,15 @@ static struct console kgdbcons = {
 static void sysrq_handle_dbg(int key)
 {
 	if (!dbg_io_ops) {
-		printk(KERN_CRIT "ERROR: No KGDB I/O module available\n");
+		pr_crit("ERROR: No KGDB I/O module available\n");
 		return;
 	}
 	if (!kgdb_connected) {
#ifdef CONFIG_KGDB_KDB
 		if (!dbg_kdb_mode)
-			printk(KERN_CRIT "KGDB or $3#33 for KDB\n");
+			pr_crit("KGDB or $3#33 for KDB\n");
 #else
-		printk(KERN_CRIT "Entering KGDB\n");
+		pr_crit("Entering KGDB\n");
 #endif
 	}
 
@@ -945,7 +952,7 @@ static void kgdb_initial_breakpoint(void)
 {
 	kgdb_break_asap = 0;
 
-	printk(KERN_CRIT "kgdb: Waiting for connection from remote gdb...\n");
+	pr_crit("Waiting for connection from remote gdb...\n");
 	kgdb_breakpoint();
 }
 
@@ -964,8 +971,7 @@ int kgdb_register_io_module(struct kgdb_io *new_dbg_io_ops)
 	if (dbg_io_ops) {
 		spin_unlock(&kgdb_registration_lock);
 
-		printk(KERN_ERR "kgdb: Another I/O driver is already "
-		       "registered with KGDB.\n");
+		pr_err("Another I/O driver is already registered with KGDB\n");
 		return -EBUSY;
 	}
 
@@ -981,8 +987,7 @@ int kgdb_register_io_module(struct kgdb_io *new_dbg_io_ops)
 
 	spin_unlock(&kgdb_registration_lock);
 
-	printk(KERN_INFO "kgdb: Registered I/O driver %s.\n",
-	       new_dbg_io_ops->name);
+	pr_info("Registered I/O driver %s\n", new_dbg_io_ops->name);
 
 	/* Arm KGDB now. */
 	kgdb_register_callbacks();
@@ -1017,8 +1022,7 @@ void kgdb_unregister_io_module(struct kgdb_io *old_dbg_io_ops)
 
 	spin_unlock(&kgdb_registration_lock);
 
-	printk(KERN_INFO
-		"kgdb: Unregistered I/O driver %s, debugger disabled.\n",
+	pr_info("Unregistered I/O driver %s, debugger disabled\n",
 		old_dbg_io_ops->name);
 }
 EXPORT_SYMBOL_GPL(kgdb_unregister_io_module);
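The conversion above defines pr_fmt() at the top of the file so every later pr_err()/pr_info()/pr_crit() call automatically carries the "KGDB: " prefix, instead of repeating it in each format string. A tiny standalone illustration of the same macro-prefixing trick using printf rather than the kernel's printk machinery (the pr_* macro bodies here are simplified stand-ins):

#include <stdio.h>

/* Prepend a subsystem tag to every format string at compile time. */
#define pr_fmt(fmt) "KGDB: " fmt
#define pr_err(fmt, ...)  fprintf(stderr, pr_fmt(fmt), ##__VA_ARGS__)
#define pr_info(fmt, ...) printf(pr_fmt(fmt), ##__VA_ARGS__)

int main(void)
{
	pr_info("Registered I/O driver %s\n", "kgdboc");
	pr_err("BP install failed: %lx\n", 0xdeadbeefUL);
	return 0;
}
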
diff --git a/kernel/debug/kdb/kdb_bp.c b/kernel/debug/kdb/kdb_bp.c
index b20d544f20c2..e1dbf4a2c69e 100644
--- a/kernel/debug/kdb/kdb_bp.c
+++ b/kernel/debug/kdb/kdb_bp.c
@@ -531,22 +531,29 @@ void __init kdb_initbptab(void)
 	for (i = 0, bp = kdb_breakpoints; i < KDB_MAXBPT; i++, bp++)
 		bp->bp_free = 1;
 
-	kdb_register_repeat("bp", kdb_bp, "[<vaddr>]",
-		"Set/Display breakpoints", 0, KDB_REPEAT_NO_ARGS);
-	kdb_register_repeat("bl", kdb_bp, "[<vaddr>]",
-		"Display breakpoints", 0, KDB_REPEAT_NO_ARGS);
+	kdb_register_flags("bp", kdb_bp, "[<vaddr>]",
+		"Set/Display breakpoints", 0,
+		KDB_ENABLE_FLOW_CTRL | KDB_REPEAT_NO_ARGS);
+	kdb_register_flags("bl", kdb_bp, "[<vaddr>]",
+		"Display breakpoints", 0,
+		KDB_ENABLE_FLOW_CTRL | KDB_REPEAT_NO_ARGS);
 	if (arch_kgdb_ops.flags & KGDB_HW_BREAKPOINT)
-		kdb_register_repeat("bph", kdb_bp, "[<vaddr>]",
-		"[datar [length]|dataw [length]]   Set hw brk", 0, KDB_REPEAT_NO_ARGS);
-	kdb_register_repeat("bc", kdb_bc, "<bpnum>",
-		"Clear Breakpoint", 0, KDB_REPEAT_NONE);
-	kdb_register_repeat("be", kdb_bc, "<bpnum>",
-		"Enable Breakpoint", 0, KDB_REPEAT_NONE);
-	kdb_register_repeat("bd", kdb_bc, "<bpnum>",
-		"Disable Breakpoint", 0, KDB_REPEAT_NONE);
-
-	kdb_register_repeat("ss", kdb_ss, "",
-		"Single Step", 1, KDB_REPEAT_NO_ARGS);
+		kdb_register_flags("bph", kdb_bp, "[<vaddr>]",
+		"[datar [length]|dataw [length]]   Set hw brk", 0,
+		KDB_ENABLE_FLOW_CTRL | KDB_REPEAT_NO_ARGS);
+	kdb_register_flags("bc", kdb_bc, "<bpnum>",
+		"Clear Breakpoint", 0,
+		KDB_ENABLE_FLOW_CTRL);
+	kdb_register_flags("be", kdb_bc, "<bpnum>",
+		"Enable Breakpoint", 0,
+		KDB_ENABLE_FLOW_CTRL);
+	kdb_register_flags("bd", kdb_bc, "<bpnum>",
+		"Disable Breakpoint", 0,
+		KDB_ENABLE_FLOW_CTRL);
+
+	kdb_register_flags("ss", kdb_ss, "",
+		"Single Step", 1,
+		KDB_ENABLE_FLOW_CTRL | KDB_REPEAT_NO_ARGS);
 	/*
 	 * Architecture dependent initialization.
 	 */
diff --git a/kernel/debug/kdb/kdb_debugger.c b/kernel/debug/kdb/kdb_debugger.c
index 8859ca34dcfe..15e1a7af5dd0 100644
--- a/kernel/debug/kdb/kdb_debugger.c
+++ b/kernel/debug/kdb/kdb_debugger.c
@@ -129,6 +129,10 @@ int kdb_stub(struct kgdb_state *ks)
129 ks->pass_exception = 1; 129 ks->pass_exception = 1;
130 KDB_FLAG_SET(CATASTROPHIC); 130 KDB_FLAG_SET(CATASTROPHIC);
131 } 131 }
132 /* set CATASTROPHIC if the system contains unresponsive processors */
133 for_each_online_cpu(i)
134 if (!kgdb_info[i].enter_kgdb)
135 KDB_FLAG_SET(CATASTROPHIC);
132 if (KDB_STATE(SSBPT) && reason == KDB_REASON_SSTEP) { 136 if (KDB_STATE(SSBPT) && reason == KDB_REASON_SSTEP) {
133 KDB_STATE_CLEAR(SSBPT); 137 KDB_STATE_CLEAR(SSBPT);
134 KDB_STATE_CLEAR(DOING_SS); 138 KDB_STATE_CLEAR(DOING_SS);
diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c
index 379650b984f8..7b40c5f07dce 100644
--- a/kernel/debug/kdb/kdb_main.c
+++ b/kernel/debug/kdb/kdb_main.c
@@ -12,6 +12,7 @@
12 */ 12 */
13 13
14#include <linux/ctype.h> 14#include <linux/ctype.h>
15#include <linux/types.h>
15#include <linux/string.h> 16#include <linux/string.h>
16#include <linux/kernel.h> 17#include <linux/kernel.h>
17#include <linux/kmsg_dump.h> 18#include <linux/kmsg_dump.h>
@@ -23,6 +24,7 @@
23#include <linux/vmalloc.h> 24#include <linux/vmalloc.h>
24#include <linux/atomic.h> 25#include <linux/atomic.h>
25#include <linux/module.h> 26#include <linux/module.h>
27#include <linux/moduleparam.h>
26#include <linux/mm.h> 28#include <linux/mm.h>
27#include <linux/init.h> 29#include <linux/init.h>
28#include <linux/kallsyms.h> 30#include <linux/kallsyms.h>
@@ -42,6 +44,12 @@
42#include <linux/slab.h> 44#include <linux/slab.h>
43#include "kdb_private.h" 45#include "kdb_private.h"
44 46
47#undef MODULE_PARAM_PREFIX
48#define MODULE_PARAM_PREFIX "kdb."
49
50static int kdb_cmd_enabled = CONFIG_KDB_DEFAULT_ENABLE;
51module_param_named(cmd_enable, kdb_cmd_enabled, int, 0600);
52
45#define GREP_LEN 256 53#define GREP_LEN 256
46char kdb_grep_string[GREP_LEN]; 54char kdb_grep_string[GREP_LEN];
47int kdb_grepping_flag; 55int kdb_grepping_flag;
@@ -121,6 +129,7 @@ static kdbmsg_t kdbmsgs[] = {
121 KDBMSG(BADLENGTH, "Invalid length field"), 129 KDBMSG(BADLENGTH, "Invalid length field"),
122 KDBMSG(NOBP, "No Breakpoint exists"), 130 KDBMSG(NOBP, "No Breakpoint exists"),
123 KDBMSG(BADADDR, "Invalid address"), 131 KDBMSG(BADADDR, "Invalid address"),
132 KDBMSG(NOPERM, "Permission denied"),
124}; 133};
125#undef KDBMSG 134#undef KDBMSG
126 135
@@ -188,6 +197,26 @@ struct task_struct *kdb_curr_task(int cpu)
188} 197}
189 198
190/* 199/*
200 * Check whether the flags of the current command and the permissions
 201 * of the kdb console allow a command to be run.
202 */
203static inline bool kdb_check_flags(kdb_cmdflags_t flags, int permissions,
204 bool no_args)
205{
206 /* permissions comes from userspace so needs massaging slightly */
207 permissions &= KDB_ENABLE_MASK;
208 permissions |= KDB_ENABLE_ALWAYS_SAFE;
209
210 /* some commands change group when launched with no arguments */
211 if (no_args)
212 permissions |= permissions << KDB_ENABLE_NO_ARGS_SHIFT;
213
214 flags |= KDB_ENABLE_ALL;
215
216 return permissions & flags;
217}
218
219/*
191 * kdbgetenv - This function will return the character string value of 220 * kdbgetenv - This function will return the character string value of
192 * an environment variable. 221 * an environment variable.
193 * Parameters: 222 * Parameters:
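The kdb_check_flags() test above is a pure bitmask check, so it can be illustrated in isolation. A minimal user-space sketch, using invented flag values rather than the real KDB_ENABLE_* constants: a command may run when the console's permission mask, widened by the always-safe bits and (for argument-less invocations) by the no-args group, overlaps the command's own flags.

    #include <stdbool.h>
    #include <stdio.h>

    /* invented flag values, standing in for the KDB_ENABLE_* constants */
    #define ENABLE_ALL            0x01
    #define ENABLE_MEM_READ       0x02
    #define ENABLE_FLOW_CTRL      0x04
    #define ENABLE_ALWAYS_SAFE    0x08
    #define ENABLE_MASK           0x0f
    #define ENABLE_NO_ARGS_SHIFT  4         /* no-args variants live one nibble up */

    static bool check_flags(unsigned int cmd_flags, unsigned int permissions,
                            bool no_args)
    {
            permissions &= ENABLE_MASK;             /* sanitize the console setting */
            permissions |= ENABLE_ALWAYS_SAFE;      /* always-safe commands always pass */

            if (no_args)                            /* weaker group for bare invocations */
                    permissions |= permissions << ENABLE_NO_ARGS_SHIFT;

            cmd_flags |= ENABLE_ALL;                /* an "enable all" console passes anything */

            return permissions & cmd_flags;
    }

    int main(void)
    {
            /* console only grants memory reads */
            printf("%d\n", check_flags(ENABLE_FLOW_CTRL, ENABLE_MEM_READ, false));  /* 0 */
            printf("%d\n", check_flags(ENABLE_MEM_READ,  ENABLE_MEM_READ, false));  /* 1 */
            return 0;
    }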
@@ -476,6 +505,15 @@ int kdbgetaddrarg(int argc, const char **argv, int *nextarg,
476 kdb_symtab_t symtab; 505 kdb_symtab_t symtab;
477 506
478 /* 507 /*
508 * If the enable flags prohibit both arbitrary memory access
509 * and flow control then there are no reasonable grounds to
510 * provide symbol lookup.
511 */
512 if (!kdb_check_flags(KDB_ENABLE_MEM_READ | KDB_ENABLE_FLOW_CTRL,
513 kdb_cmd_enabled, false))
514 return KDB_NOPERM;
515
516 /*
479 * Process arguments which follow the following syntax: 517 * Process arguments which follow the following syntax:
480 * 518 *
481 * symbol | numeric-address [+/- numeric-offset] 519 * symbol | numeric-address [+/- numeric-offset]
@@ -641,8 +679,13 @@ static int kdb_defcmd2(const char *cmdstr, const char *argv0)
641 if (!s->count) 679 if (!s->count)
642 s->usable = 0; 680 s->usable = 0;
643 if (s->usable) 681 if (s->usable)
644 kdb_register(s->name, kdb_exec_defcmd, 682 /* macros are always safe because when executed each
645 s->usage, s->help, 0); 683 * internal command re-enters kdb_parse() and is
684 * safety checked individually.
685 */
686 kdb_register_flags(s->name, kdb_exec_defcmd, s->usage,
687 s->help, 0,
688 KDB_ENABLE_ALWAYS_SAFE);
646 return 0; 689 return 0;
647 } 690 }
648 if (!s->usable) 691 if (!s->usable)
@@ -1003,25 +1046,22 @@ int kdb_parse(const char *cmdstr)
1003 1046
1004 if (i < kdb_max_commands) { 1047 if (i < kdb_max_commands) {
1005 int result; 1048 int result;
1049
1050 if (!kdb_check_flags(tp->cmd_flags, kdb_cmd_enabled, argc <= 1))
1051 return KDB_NOPERM;
1052
1006 KDB_STATE_SET(CMD); 1053 KDB_STATE_SET(CMD);
1007 result = (*tp->cmd_func)(argc-1, (const char **)argv); 1054 result = (*tp->cmd_func)(argc-1, (const char **)argv);
1008 if (result && ignore_errors && result > KDB_CMD_GO) 1055 if (result && ignore_errors && result > KDB_CMD_GO)
1009 result = 0; 1056 result = 0;
1010 KDB_STATE_CLEAR(CMD); 1057 KDB_STATE_CLEAR(CMD);
1011 switch (tp->cmd_repeat) { 1058
1012 case KDB_REPEAT_NONE: 1059 if (tp->cmd_flags & KDB_REPEAT_WITH_ARGS)
1013 argc = 0; 1060 return result;
1014 if (argv[0]) 1061
1015 *(argv[0]) = '\0'; 1062 argc = tp->cmd_flags & KDB_REPEAT_NO_ARGS ? 1 : 0;
1016 break; 1063 if (argv[argc])
1017 case KDB_REPEAT_NO_ARGS: 1064 *(argv[argc]) = '\0';
1018 argc = 1;
1019 if (argv[1])
1020 *(argv[1]) = '\0';
1021 break;
1022 case KDB_REPEAT_WITH_ARGS:
1023 break;
1024 }
1025 return result; 1065 return result;
1026 } 1066 }
1027 1067
@@ -1921,10 +1961,14 @@ static int kdb_rm(int argc, const char **argv)
1921 */ 1961 */
1922static int kdb_sr(int argc, const char **argv) 1962static int kdb_sr(int argc, const char **argv)
1923{ 1963{
1964 bool check_mask =
1965 !kdb_check_flags(KDB_ENABLE_ALL, kdb_cmd_enabled, false);
1966
1924 if (argc != 1) 1967 if (argc != 1)
1925 return KDB_ARGCOUNT; 1968 return KDB_ARGCOUNT;
1969
1926 kdb_trap_printk++; 1970 kdb_trap_printk++;
1927 __handle_sysrq(*argv[1], false); 1971 __handle_sysrq(*argv[1], check_mask);
1928 kdb_trap_printk--; 1972 kdb_trap_printk--;
1929 1973
1930 return 0; 1974 return 0;
@@ -1979,7 +2023,7 @@ static int kdb_lsmod(int argc, const char **argv)
1979 kdb_printf("%-20s%8u 0x%p ", mod->name, 2023 kdb_printf("%-20s%8u 0x%p ", mod->name,
1980 mod->core_size, (void *)mod); 2024 mod->core_size, (void *)mod);
1981#ifdef CONFIG_MODULE_UNLOAD 2025#ifdef CONFIG_MODULE_UNLOAD
1982 kdb_printf("%4ld ", module_refcount(mod)); 2026 kdb_printf("%4d ", module_refcount(mod));
1983#endif 2027#endif
1984 if (mod->state == MODULE_STATE_GOING) 2028 if (mod->state == MODULE_STATE_GOING)
1985 kdb_printf(" (Unloading)"); 2029 kdb_printf(" (Unloading)");
@@ -2157,6 +2201,8 @@ static void kdb_cpu_status(void)
2157 for (start_cpu = -1, i = 0; i < NR_CPUS; i++) { 2201 for (start_cpu = -1, i = 0; i < NR_CPUS; i++) {
2158 if (!cpu_online(i)) { 2202 if (!cpu_online(i)) {
2159 state = 'F'; /* cpu is offline */ 2203 state = 'F'; /* cpu is offline */
2204 } else if (!kgdb_info[i].enter_kgdb) {
2205 state = 'D'; /* cpu is online but unresponsive */
2160 } else { 2206 } else {
2161 state = ' '; /* cpu is responding to kdb */ 2207 state = ' '; /* cpu is responding to kdb */
2162 if (kdb_task_state_char(KDB_TSK(i)) == 'I') 2208 if (kdb_task_state_char(KDB_TSK(i)) == 'I')
@@ -2210,7 +2256,7 @@ static int kdb_cpu(int argc, const char **argv)
2210 /* 2256 /*
2211 * Validate cpunum 2257 * Validate cpunum
2212 */ 2258 */
2213 if ((cpunum > NR_CPUS) || !cpu_online(cpunum)) 2259 if ((cpunum > NR_CPUS) || !kgdb_info[cpunum].enter_kgdb)
2214 return KDB_BADCPUNUM; 2260 return KDB_BADCPUNUM;
2215 2261
2216 dbg_switch_cpu = cpunum; 2262 dbg_switch_cpu = cpunum;
@@ -2375,6 +2421,8 @@ static int kdb_help(int argc, const char **argv)
2375 return 0; 2421 return 0;
2376 if (!kt->cmd_name) 2422 if (!kt->cmd_name)
2377 continue; 2423 continue;
2424 if (!kdb_check_flags(kt->cmd_flags, kdb_cmd_enabled, true))
2425 continue;
2378 if (strlen(kt->cmd_usage) > 20) 2426 if (strlen(kt->cmd_usage) > 20)
2379 space = "\n "; 2427 space = "\n ";
2380 kdb_printf("%-15.15s %-20s%s%s\n", kt->cmd_name, 2428 kdb_printf("%-15.15s %-20s%s%s\n", kt->cmd_name,
@@ -2629,7 +2677,7 @@ static int kdb_grep_help(int argc, const char **argv)
2629} 2677}
2630 2678
2631/* 2679/*
2632 * kdb_register_repeat - This function is used to register a kernel 2680 * kdb_register_flags - This function is used to register a kernel
2633 * debugger command. 2681 * debugger command.
2634 * Inputs: 2682 * Inputs:
2635 * cmd Command name 2683 * cmd Command name
@@ -2641,12 +2689,12 @@ static int kdb_grep_help(int argc, const char **argv)
2641 * zero for success, one if a duplicate command. 2689 * zero for success, one if a duplicate command.
2642 */ 2690 */
2643#define kdb_command_extend 50 /* arbitrary */ 2691#define kdb_command_extend 50 /* arbitrary */
2644int kdb_register_repeat(char *cmd, 2692int kdb_register_flags(char *cmd,
2645 kdb_func_t func, 2693 kdb_func_t func,
2646 char *usage, 2694 char *usage,
2647 char *help, 2695 char *help,
2648 short minlen, 2696 short minlen,
2649 kdb_repeat_t repeat) 2697 kdb_cmdflags_t flags)
2650{ 2698{
2651 int i; 2699 int i;
2652 kdbtab_t *kp; 2700 kdbtab_t *kp;
@@ -2694,19 +2742,18 @@ int kdb_register_repeat(char *cmd,
2694 kp->cmd_func = func; 2742 kp->cmd_func = func;
2695 kp->cmd_usage = usage; 2743 kp->cmd_usage = usage;
2696 kp->cmd_help = help; 2744 kp->cmd_help = help;
2697 kp->cmd_flags = 0;
2698 kp->cmd_minlen = minlen; 2745 kp->cmd_minlen = minlen;
2699 kp->cmd_repeat = repeat; 2746 kp->cmd_flags = flags;
2700 2747
2701 return 0; 2748 return 0;
2702} 2749}
2703EXPORT_SYMBOL_GPL(kdb_register_repeat); 2750EXPORT_SYMBOL_GPL(kdb_register_flags);
2704 2751
2705 2752
2706/* 2753/*
2707 * kdb_register - Compatibility register function for commands that do 2754 * kdb_register - Compatibility register function for commands that do
2708 * not need to specify a repeat state. Equivalent to 2755 * not need to specify a repeat state. Equivalent to
2709 * kdb_register_repeat with KDB_REPEAT_NONE. 2756 * kdb_register_flags with flags set to 0.
2710 * Inputs: 2757 * Inputs:
2711 * cmd Command name 2758 * cmd Command name
2712 * func Function to execute the command 2759 * func Function to execute the command
@@ -2721,8 +2768,7 @@ int kdb_register(char *cmd,
2721 char *help, 2768 char *help,
2722 short minlen) 2769 short minlen)
2723{ 2770{
2724 return kdb_register_repeat(cmd, func, usage, help, minlen, 2771 return kdb_register_flags(cmd, func, usage, help, minlen, 0);
2725 KDB_REPEAT_NONE);
2726} 2772}
2727EXPORT_SYMBOL_GPL(kdb_register); 2773EXPORT_SYMBOL_GPL(kdb_register);
2728 2774
@@ -2764,80 +2810,109 @@ static void __init kdb_inittab(void)
2764 for_each_kdbcmd(kp, i) 2810 for_each_kdbcmd(kp, i)
2765 kp->cmd_name = NULL; 2811 kp->cmd_name = NULL;
2766 2812
2767 kdb_register_repeat("md", kdb_md, "<vaddr>", 2813 kdb_register_flags("md", kdb_md, "<vaddr>",
2768 "Display Memory Contents, also mdWcN, e.g. md8c1", 1, 2814 "Display Memory Contents, also mdWcN, e.g. md8c1", 1,
2769 KDB_REPEAT_NO_ARGS); 2815 KDB_ENABLE_MEM_READ | KDB_REPEAT_NO_ARGS);
2770 kdb_register_repeat("mdr", kdb_md, "<vaddr> <bytes>", 2816 kdb_register_flags("mdr", kdb_md, "<vaddr> <bytes>",
2771 "Display Raw Memory", 0, KDB_REPEAT_NO_ARGS); 2817 "Display Raw Memory", 0,
2772 kdb_register_repeat("mdp", kdb_md, "<paddr> <bytes>", 2818 KDB_ENABLE_MEM_READ | KDB_REPEAT_NO_ARGS);
2773 "Display Physical Memory", 0, KDB_REPEAT_NO_ARGS); 2819 kdb_register_flags("mdp", kdb_md, "<paddr> <bytes>",
2774 kdb_register_repeat("mds", kdb_md, "<vaddr>", 2820 "Display Physical Memory", 0,
2775 "Display Memory Symbolically", 0, KDB_REPEAT_NO_ARGS); 2821 KDB_ENABLE_MEM_READ | KDB_REPEAT_NO_ARGS);
2776 kdb_register_repeat("mm", kdb_mm, "<vaddr> <contents>", 2822 kdb_register_flags("mds", kdb_md, "<vaddr>",
2777 "Modify Memory Contents", 0, KDB_REPEAT_NO_ARGS); 2823 "Display Memory Symbolically", 0,
2778 kdb_register_repeat("go", kdb_go, "[<vaddr>]", 2824 KDB_ENABLE_MEM_READ | KDB_REPEAT_NO_ARGS);
2779 "Continue Execution", 1, KDB_REPEAT_NONE); 2825 kdb_register_flags("mm", kdb_mm, "<vaddr> <contents>",
2780 kdb_register_repeat("rd", kdb_rd, "", 2826 "Modify Memory Contents", 0,
2781 "Display Registers", 0, KDB_REPEAT_NONE); 2827 KDB_ENABLE_MEM_WRITE | KDB_REPEAT_NO_ARGS);
2782 kdb_register_repeat("rm", kdb_rm, "<reg> <contents>", 2828 kdb_register_flags("go", kdb_go, "[<vaddr>]",
2783 "Modify Registers", 0, KDB_REPEAT_NONE); 2829 "Continue Execution", 1,
2784 kdb_register_repeat("ef", kdb_ef, "<vaddr>", 2830 KDB_ENABLE_REG_WRITE | KDB_ENABLE_ALWAYS_SAFE_NO_ARGS);
2785 "Display exception frame", 0, KDB_REPEAT_NONE); 2831 kdb_register_flags("rd", kdb_rd, "",
2786 kdb_register_repeat("bt", kdb_bt, "[<vaddr>]", 2832 "Display Registers", 0,
2787 "Stack traceback", 1, KDB_REPEAT_NONE); 2833 KDB_ENABLE_REG_READ);
2788 kdb_register_repeat("btp", kdb_bt, "<pid>", 2834 kdb_register_flags("rm", kdb_rm, "<reg> <contents>",
2789 "Display stack for process <pid>", 0, KDB_REPEAT_NONE); 2835 "Modify Registers", 0,
2790 kdb_register_repeat("bta", kdb_bt, "[D|R|S|T|C|Z|E|U|I|M|A]", 2836 KDB_ENABLE_REG_WRITE);
2791 "Backtrace all processes matching state flag", 0, KDB_REPEAT_NONE); 2837 kdb_register_flags("ef", kdb_ef, "<vaddr>",
2792 kdb_register_repeat("btc", kdb_bt, "", 2838 "Display exception frame", 0,
2793 "Backtrace current process on each cpu", 0, KDB_REPEAT_NONE); 2839 KDB_ENABLE_MEM_READ);
2794 kdb_register_repeat("btt", kdb_bt, "<vaddr>", 2840 kdb_register_flags("bt", kdb_bt, "[<vaddr>]",
2841 "Stack traceback", 1,
2842 KDB_ENABLE_MEM_READ | KDB_ENABLE_INSPECT_NO_ARGS);
2843 kdb_register_flags("btp", kdb_bt, "<pid>",
2844 "Display stack for process <pid>", 0,
2845 KDB_ENABLE_INSPECT);
2846 kdb_register_flags("bta", kdb_bt, "[D|R|S|T|C|Z|E|U|I|M|A]",
2847 "Backtrace all processes matching state flag", 0,
2848 KDB_ENABLE_INSPECT);
2849 kdb_register_flags("btc", kdb_bt, "",
2850 "Backtrace current process on each cpu", 0,
2851 KDB_ENABLE_INSPECT);
2852 kdb_register_flags("btt", kdb_bt, "<vaddr>",
2795 "Backtrace process given its struct task address", 0, 2853 "Backtrace process given its struct task address", 0,
2796 KDB_REPEAT_NONE); 2854 KDB_ENABLE_MEM_READ | KDB_ENABLE_INSPECT_NO_ARGS);
2797 kdb_register_repeat("env", kdb_env, "", 2855 kdb_register_flags("env", kdb_env, "",
2798 "Show environment variables", 0, KDB_REPEAT_NONE); 2856 "Show environment variables", 0,
2799 kdb_register_repeat("set", kdb_set, "", 2857 KDB_ENABLE_ALWAYS_SAFE);
2800 "Set environment variables", 0, KDB_REPEAT_NONE); 2858 kdb_register_flags("set", kdb_set, "",
2801 kdb_register_repeat("help", kdb_help, "", 2859 "Set environment variables", 0,
2802 "Display Help Message", 1, KDB_REPEAT_NONE); 2860 KDB_ENABLE_ALWAYS_SAFE);
2803 kdb_register_repeat("?", kdb_help, "", 2861 kdb_register_flags("help", kdb_help, "",
2804 "Display Help Message", 0, KDB_REPEAT_NONE); 2862 "Display Help Message", 1,
2805 kdb_register_repeat("cpu", kdb_cpu, "<cpunum>", 2863 KDB_ENABLE_ALWAYS_SAFE);
2806 "Switch to new cpu", 0, KDB_REPEAT_NONE); 2864 kdb_register_flags("?", kdb_help, "",
2807 kdb_register_repeat("kgdb", kdb_kgdb, "", 2865 "Display Help Message", 0,
2808 "Enter kgdb mode", 0, KDB_REPEAT_NONE); 2866 KDB_ENABLE_ALWAYS_SAFE);
2809 kdb_register_repeat("ps", kdb_ps, "[<flags>|A]", 2867 kdb_register_flags("cpu", kdb_cpu, "<cpunum>",
2810 "Display active task list", 0, KDB_REPEAT_NONE); 2868 "Switch to new cpu", 0,
2811 kdb_register_repeat("pid", kdb_pid, "<pidnum>", 2869 KDB_ENABLE_ALWAYS_SAFE_NO_ARGS);
2812 "Switch to another task", 0, KDB_REPEAT_NONE); 2870 kdb_register_flags("kgdb", kdb_kgdb, "",
2813 kdb_register_repeat("reboot", kdb_reboot, "", 2871 "Enter kgdb mode", 0, 0);
2814 "Reboot the machine immediately", 0, KDB_REPEAT_NONE); 2872 kdb_register_flags("ps", kdb_ps, "[<flags>|A]",
2873 "Display active task list", 0,
2874 KDB_ENABLE_INSPECT);
2875 kdb_register_flags("pid", kdb_pid, "<pidnum>",
2876 "Switch to another task", 0,
2877 KDB_ENABLE_INSPECT);
2878 kdb_register_flags("reboot", kdb_reboot, "",
2879 "Reboot the machine immediately", 0,
2880 KDB_ENABLE_REBOOT);
2815#if defined(CONFIG_MODULES) 2881#if defined(CONFIG_MODULES)
2816 kdb_register_repeat("lsmod", kdb_lsmod, "", 2882 kdb_register_flags("lsmod", kdb_lsmod, "",
2817 "List loaded kernel modules", 0, KDB_REPEAT_NONE); 2883 "List loaded kernel modules", 0,
2884 KDB_ENABLE_INSPECT);
2818#endif 2885#endif
2819#if defined(CONFIG_MAGIC_SYSRQ) 2886#if defined(CONFIG_MAGIC_SYSRQ)
2820 kdb_register_repeat("sr", kdb_sr, "<key>", 2887 kdb_register_flags("sr", kdb_sr, "<key>",
2821 "Magic SysRq key", 0, KDB_REPEAT_NONE); 2888 "Magic SysRq key", 0,
2889 KDB_ENABLE_ALWAYS_SAFE);
2822#endif 2890#endif
2823#if defined(CONFIG_PRINTK) 2891#if defined(CONFIG_PRINTK)
2824 kdb_register_repeat("dmesg", kdb_dmesg, "[lines]", 2892 kdb_register_flags("dmesg", kdb_dmesg, "[lines]",
2825 "Display syslog buffer", 0, KDB_REPEAT_NONE); 2893 "Display syslog buffer", 0,
2894 KDB_ENABLE_ALWAYS_SAFE);
2826#endif 2895#endif
2827 if (arch_kgdb_ops.enable_nmi) { 2896 if (arch_kgdb_ops.enable_nmi) {
2828 kdb_register_repeat("disable_nmi", kdb_disable_nmi, "", 2897 kdb_register_flags("disable_nmi", kdb_disable_nmi, "",
2829 "Disable NMI entry to KDB", 0, KDB_REPEAT_NONE); 2898 "Disable NMI entry to KDB", 0,
2830 } 2899 KDB_ENABLE_ALWAYS_SAFE);
2831 kdb_register_repeat("defcmd", kdb_defcmd, "name \"usage\" \"help\"", 2900 }
2832 "Define a set of commands, down to endefcmd", 0, KDB_REPEAT_NONE); 2901 kdb_register_flags("defcmd", kdb_defcmd, "name \"usage\" \"help\"",
2833 kdb_register_repeat("kill", kdb_kill, "<-signal> <pid>", 2902 "Define a set of commands, down to endefcmd", 0,
2834 "Send a signal to a process", 0, KDB_REPEAT_NONE); 2903 KDB_ENABLE_ALWAYS_SAFE);
2835 kdb_register_repeat("summary", kdb_summary, "", 2904 kdb_register_flags("kill", kdb_kill, "<-signal> <pid>",
2836 "Summarize the system", 4, KDB_REPEAT_NONE); 2905 "Send a signal to a process", 0,
2837 kdb_register_repeat("per_cpu", kdb_per_cpu, "<sym> [<bytes>] [<cpu>]", 2906 KDB_ENABLE_SIGNAL);
2838 "Display per_cpu variables", 3, KDB_REPEAT_NONE); 2907 kdb_register_flags("summary", kdb_summary, "",
2839 kdb_register_repeat("grephelp", kdb_grep_help, "", 2908 "Summarize the system", 4,
2840 "Display help on | grep", 0, KDB_REPEAT_NONE); 2909 KDB_ENABLE_ALWAYS_SAFE);
2910 kdb_register_flags("per_cpu", kdb_per_cpu, "<sym> [<bytes>] [<cpu>]",
2911 "Display per_cpu variables", 3,
2912 KDB_ENABLE_MEM_READ);
2913 kdb_register_flags("grephelp", kdb_grep_help, "",
2914 "Display help on | grep", 0,
2915 KDB_ENABLE_ALWAYS_SAFE);
2841} 2916}
2842 2917
2843/* Execute any commands defined in kdb_cmds. */ 2918/* Execute any commands defined in kdb_cmds. */
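With kdb_register_repeat() gone, callers register commands through kdb_register_flags() (or the zero-flag kdb_register() wrapper). A hedged sketch of what a module-side caller might look like; the command name, handler and flag choice are hypothetical, while kdb_register_flags() and the flag names come from this patch and kdb_unregister() is the pre-existing removal helper.

    #include <linux/init.h>
    #include <linux/kdb.h>
    #include <linux/module.h>

    /* hypothetical command handler */
    static int kdb_hello(int argc, const char **argv)
    {
            kdb_printf("hello from kdb, argc=%d\n", argc);
            return 0;
    }

    static int __init kdb_hello_init(void)
    {
            /* an inspection-class command that may auto-repeat without arguments */
            return kdb_register_flags("hello", kdb_hello, "[<arg>]",
                                      "Print a greeting", 0,
                                      KDB_ENABLE_INSPECT | KDB_REPEAT_NO_ARGS);
    }

    static void __exit kdb_hello_exit(void)
    {
            kdb_unregister("hello");
    }

    module_init(kdb_hello_init);
    module_exit(kdb_hello_exit);
    MODULE_LICENSE("GPL");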
diff --git a/kernel/debug/kdb/kdb_private.h b/kernel/debug/kdb/kdb_private.h
index 7afd3c8c41d5..eaacd1693954 100644
--- a/kernel/debug/kdb/kdb_private.h
+++ b/kernel/debug/kdb/kdb_private.h
@@ -172,10 +172,9 @@ typedef struct _kdbtab {
172 kdb_func_t cmd_func; /* Function to execute command */ 172 kdb_func_t cmd_func; /* Function to execute command */
173 char *cmd_usage; /* Usage String for this command */ 173 char *cmd_usage; /* Usage String for this command */
174 char *cmd_help; /* Help message for this command */ 174 char *cmd_help; /* Help message for this command */
175 short cmd_flags; /* Parsing flags */
176 short cmd_minlen; /* Minimum legal # command 175 short cmd_minlen; /* Minimum legal # command
177 * chars required */ 176 * chars required */
178 kdb_repeat_t cmd_repeat; /* Does command auto repeat on enter? */ 177 kdb_cmdflags_t cmd_flags; /* Command behaviour flags */
179} kdbtab_t; 178} kdbtab_t;
180 179
181extern int kdb_bt(int, const char **); /* KDB display back trace */ 180extern int kdb_bt(int, const char **); /* KDB display back trace */
diff --git a/kernel/events/Makefile b/kernel/events/Makefile
index 103f5d147b2f..2925188f50ea 100644
--- a/kernel/events/Makefile
+++ b/kernel/events/Makefile
@@ -1,5 +1,5 @@
1ifdef CONFIG_FUNCTION_TRACER 1ifdef CONFIG_FUNCTION_TRACER
2CFLAGS_REMOVE_core.o = -pg 2CFLAGS_REMOVE_core.o = $(CC_FLAGS_FTRACE)
3endif 3endif
4 4
5obj-y := core.o ring_buffer.o callchain.o 5obj-y := core.o ring_buffer.o callchain.o
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 4c1ee7f2bebc..f04daabfd1cf 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -872,22 +872,32 @@ void perf_pmu_enable(struct pmu *pmu)
872 pmu->pmu_enable(pmu); 872 pmu->pmu_enable(pmu);
873} 873}
874 874
875static DEFINE_PER_CPU(struct list_head, rotation_list); 875static DEFINE_PER_CPU(struct list_head, active_ctx_list);
876 876
877/* 877/*
878 * perf_pmu_rotate_start() and perf_rotate_context() are fully serialized 878 * perf_event_ctx_activate(), perf_event_ctx_deactivate(), and
879 * because they're strictly cpu affine and rotate_start is called with IRQs 879 * perf_event_task_tick() are fully serialized because they're strictly cpu
880 * disabled, while rotate_context is called from IRQ context. 880 * affine and perf_event_ctx{activate,deactivate} are called with IRQs
881 * disabled, while perf_event_task_tick is called from IRQ context.
881 */ 882 */
882static void perf_pmu_rotate_start(struct pmu *pmu) 883static void perf_event_ctx_activate(struct perf_event_context *ctx)
883{ 884{
884 struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context); 885 struct list_head *head = this_cpu_ptr(&active_ctx_list);
885 struct list_head *head = this_cpu_ptr(&rotation_list);
886 886
887 WARN_ON(!irqs_disabled()); 887 WARN_ON(!irqs_disabled());
888 888
889 if (list_empty(&cpuctx->rotation_list)) 889 WARN_ON(!list_empty(&ctx->active_ctx_list));
890 list_add(&cpuctx->rotation_list, head); 890
891 list_add(&ctx->active_ctx_list, head);
892}
893
894static void perf_event_ctx_deactivate(struct perf_event_context *ctx)
895{
896 WARN_ON(!irqs_disabled());
897
898 WARN_ON(list_empty(&ctx->active_ctx_list));
899
900 list_del_init(&ctx->active_ctx_list);
891} 901}
892 902
893static void get_ctx(struct perf_event_context *ctx) 903static void get_ctx(struct perf_event_context *ctx)
@@ -907,6 +917,84 @@ static void put_ctx(struct perf_event_context *ctx)
907} 917}
908 918
909/* 919/*
920 * Because of perf_event::ctx migration in sys_perf_event_open::move_group and
921 * perf_pmu_migrate_context() we need some magic.
922 *
923 * Those places that change perf_event::ctx will hold both
924 * perf_event_ctx::mutex of the 'old' and 'new' ctx value.
925 *
926 * Lock ordering is by mutex address. There is one other site where
927 * perf_event_context::mutex nests and that is put_event(). But remember that
928 * that is a parent<->child context relation, and migration does not affect
929 * children, therefore these two orderings should not interact.
930 *
931 * The change in perf_event::ctx does not affect children (as claimed above)
932 * because the sys_perf_event_open() case will install a new event and break
933 * the ctx parent<->child relation, and perf_pmu_migrate_context() is only
934 * concerned with cpuctx and that doesn't have children.
935 *
936 * The places that change perf_event::ctx will issue:
937 *
938 * perf_remove_from_context();
939 * synchronize_rcu();
940 * perf_install_in_context();
941 *
 942 * to effect the change. The remove_from_context() + synchronize_rcu() should
943 * quiesce the event, after which we can install it in the new location. This
944 * means that only external vectors (perf_fops, prctl) can perturb the event
945 * while in transit. Therefore all such accessors should also acquire
946 * perf_event_context::mutex to serialize against this.
947 *
948 * However; because event->ctx can change while we're waiting to acquire
949 * ctx->mutex we must be careful and use the below perf_event_ctx_lock()
950 * function.
951 *
952 * Lock order:
953 * task_struct::perf_event_mutex
954 * perf_event_context::mutex
955 * perf_event_context::lock
956 * perf_event::child_mutex;
957 * perf_event::mmap_mutex
958 * mmap_sem
959 */
960static struct perf_event_context *
961perf_event_ctx_lock_nested(struct perf_event *event, int nesting)
962{
963 struct perf_event_context *ctx;
964
965again:
966 rcu_read_lock();
967 ctx = ACCESS_ONCE(event->ctx);
968 if (!atomic_inc_not_zero(&ctx->refcount)) {
969 rcu_read_unlock();
970 goto again;
971 }
972 rcu_read_unlock();
973
974 mutex_lock_nested(&ctx->mutex, nesting);
975 if (event->ctx != ctx) {
976 mutex_unlock(&ctx->mutex);
977 put_ctx(ctx);
978 goto again;
979 }
980
981 return ctx;
982}
983
984static inline struct perf_event_context *
985perf_event_ctx_lock(struct perf_event *event)
986{
987 return perf_event_ctx_lock_nested(event, 0);
988}
989
990static void perf_event_ctx_unlock(struct perf_event *event,
991 struct perf_event_context *ctx)
992{
993 mutex_unlock(&ctx->mutex);
994 put_ctx(ctx);
995}
996
997/*
910 * This must be done under the ctx->lock, such as to serialize against 998 * This must be done under the ctx->lock, such as to serialize against
911 * context_equiv(), therefore we cannot call put_ctx() since that might end up 999 * context_equiv(), therefore we cannot call put_ctx() since that might end up
912 * calling scheduler related locks and ctx->lock nests inside those. 1000 * calling scheduler related locks and ctx->lock nests inside those.
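The retry loop in perf_event_ctx_lock_nested() is the heart of the comment above: pin whatever ctx the event currently points at, sleep on its mutex, and start over if the ctx was swizzled in the meantime. Below is a user-space analogue of the same pattern with illustrative names; C11 atomics and a pthread mutex stand in for RCU, the kernel refcount and perf_event_context::mutex, so this is a sketch of the idea, not the kernel code.

    #include <pthread.h>
    #include <stdatomic.h>
    #include <stdlib.h>

    struct ctx {
            pthread_mutex_t mutex;
            atomic_int refcount;            /* 0 means the ctx is going away */
    };

    struct event {
            _Atomic(struct ctx *) ctx;      /* may be re-pointed concurrently */
    };

    static void put_ctx(struct ctx *ctx)
    {
            if (atomic_fetch_sub(&ctx->refcount, 1) == 1)
                    free(ctx);
    }

    /* Return the event's current ctx, pinned and locked; retry if it moved
     * while we were sleeping on the mutex. */
    static struct ctx *event_ctx_lock(struct event *event)
    {
            struct ctx *ctx;
            int ref;

    again:
            ctx = atomic_load(&event->ctx);
            ref = atomic_load(&ctx->refcount);
            do {
                    if (ref == 0)           /* being freed: look the ctx up again */
                            goto again;
            } while (!atomic_compare_exchange_weak(&ctx->refcount, &ref, ref + 1));

            pthread_mutex_lock(&ctx->mutex);
            if (atomic_load(&event->ctx) != ctx) {
                    pthread_mutex_unlock(&ctx->mutex);
                    put_ctx(ctx);           /* raced with a move: unwind and retry */
                    goto again;
            }
            return ctx;
    }

    static void event_ctx_unlock(struct ctx *ctx)
    {
            pthread_mutex_unlock(&ctx->mutex);
            put_ctx(ctx);
    }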
@@ -1155,8 +1243,6 @@ list_add_event(struct perf_event *event, struct perf_event_context *ctx)
1155 ctx->nr_branch_stack++; 1243 ctx->nr_branch_stack++;
1156 1244
1157 list_add_rcu(&event->event_entry, &ctx->event_list); 1245 list_add_rcu(&event->event_entry, &ctx->event_list);
1158 if (!ctx->nr_events)
1159 perf_pmu_rotate_start(ctx->pmu);
1160 ctx->nr_events++; 1246 ctx->nr_events++;
1161 if (event->attr.inherit_stat) 1247 if (event->attr.inherit_stat)
1162 ctx->nr_stat++; 1248 ctx->nr_stat++;
@@ -1275,6 +1361,8 @@ static void perf_group_attach(struct perf_event *event)
1275 if (group_leader == event) 1361 if (group_leader == event)
1276 return; 1362 return;
1277 1363
1364 WARN_ON_ONCE(group_leader->ctx != event->ctx);
1365
1278 if (group_leader->group_flags & PERF_GROUP_SOFTWARE && 1366 if (group_leader->group_flags & PERF_GROUP_SOFTWARE &&
1279 !is_software_event(event)) 1367 !is_software_event(event))
1280 group_leader->group_flags &= ~PERF_GROUP_SOFTWARE; 1368 group_leader->group_flags &= ~PERF_GROUP_SOFTWARE;
@@ -1296,6 +1384,10 @@ static void
1296list_del_event(struct perf_event *event, struct perf_event_context *ctx) 1384list_del_event(struct perf_event *event, struct perf_event_context *ctx)
1297{ 1385{
1298 struct perf_cpu_context *cpuctx; 1386 struct perf_cpu_context *cpuctx;
1387
1388 WARN_ON_ONCE(event->ctx != ctx);
1389 lockdep_assert_held(&ctx->lock);
1390
1299 /* 1391 /*
1300 * We can have double detach due to exit/hot-unplug + close. 1392 * We can have double detach due to exit/hot-unplug + close.
1301 */ 1393 */
@@ -1380,6 +1472,8 @@ static void perf_group_detach(struct perf_event *event)
1380 1472
1381 /* Inherit group flags from the previous leader */ 1473 /* Inherit group flags from the previous leader */
1382 sibling->group_flags = event->group_flags; 1474 sibling->group_flags = event->group_flags;
1475
1476 WARN_ON_ONCE(sibling->ctx != event->ctx);
1383 } 1477 }
1384 1478
1385out: 1479out:
@@ -1442,6 +1536,10 @@ event_sched_out(struct perf_event *event,
1442{ 1536{
1443 u64 tstamp = perf_event_time(event); 1537 u64 tstamp = perf_event_time(event);
1444 u64 delta; 1538 u64 delta;
1539
1540 WARN_ON_ONCE(event->ctx != ctx);
1541 lockdep_assert_held(&ctx->lock);
1542
1445 /* 1543 /*
1446 * An event which could not be activated because of 1544 * An event which could not be activated because of
1447 * filter mismatch still needs to have its timings 1545 * filter mismatch still needs to have its timings
@@ -1471,7 +1569,8 @@ event_sched_out(struct perf_event *event,
1471 1569
1472 if (!is_software_event(event)) 1570 if (!is_software_event(event))
1473 cpuctx->active_oncpu--; 1571 cpuctx->active_oncpu--;
1474 ctx->nr_active--; 1572 if (!--ctx->nr_active)
1573 perf_event_ctx_deactivate(ctx);
1475 if (event->attr.freq && event->attr.sample_freq) 1574 if (event->attr.freq && event->attr.sample_freq)
1476 ctx->nr_freq--; 1575 ctx->nr_freq--;
1477 if (event->attr.exclusive || !cpuctx->active_oncpu) 1576 if (event->attr.exclusive || !cpuctx->active_oncpu)
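The new 0 <-> 1 tests on ctx->nr_active are what drive the per-CPU active_ctx_list: the first event scheduled in puts the context on the list, the last one scheduled out takes it off (the matching test appears in event_sched_in() below). A tiny standalone illustration of that counting idiom, with made-up names:

    #include <stdio.h>

    struct ctx { int nr_active; };

    static void ctx_activate(struct ctx *c)   { printf("-> on the active list\n"); }
    static void ctx_deactivate(struct ctx *c) { printf("<- off the active list\n"); }

    static void sched_in(struct ctx *c)
    {
            if (!c->nr_active++)            /* 0 -> 1: first active event */
                    ctx_activate(c);
    }

    static void sched_out(struct ctx *c)
    {
            if (!--c->nr_active)            /* 1 -> 0: last active event */
                    ctx_deactivate(c);
    }

    int main(void)
    {
            struct ctx c = { 0 };

            sched_in(&c);                   /* prints: on the list */
            sched_in(&c);                   /* silent: already listed */
            sched_out(&c);                  /* silent: one event still active */
            sched_out(&c);                  /* prints: off the list */
            return 0;
    }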
@@ -1654,7 +1753,7 @@ int __perf_event_disable(void *info)
1654 * is the current context on this CPU and preemption is disabled, 1753 * is the current context on this CPU and preemption is disabled,
1655 * hence we can't get into perf_event_task_sched_out for this context. 1754 * hence we can't get into perf_event_task_sched_out for this context.
1656 */ 1755 */
1657void perf_event_disable(struct perf_event *event) 1756static void _perf_event_disable(struct perf_event *event)
1658{ 1757{
1659 struct perf_event_context *ctx = event->ctx; 1758 struct perf_event_context *ctx = event->ctx;
1660 struct task_struct *task = ctx->task; 1759 struct task_struct *task = ctx->task;
@@ -1695,6 +1794,19 @@ retry:
1695 } 1794 }
1696 raw_spin_unlock_irq(&ctx->lock); 1795 raw_spin_unlock_irq(&ctx->lock);
1697} 1796}
1797
1798/*
1799 * Strictly speaking kernel users cannot create groups and therefore this
1800 * interface does not need the perf_event_ctx_lock() magic.
1801 */
1802void perf_event_disable(struct perf_event *event)
1803{
1804 struct perf_event_context *ctx;
1805
1806 ctx = perf_event_ctx_lock(event);
1807 _perf_event_disable(event);
1808 perf_event_ctx_unlock(event, ctx);
1809}
1698EXPORT_SYMBOL_GPL(perf_event_disable); 1810EXPORT_SYMBOL_GPL(perf_event_disable);
1699 1811
1700static void perf_set_shadow_time(struct perf_event *event, 1812static void perf_set_shadow_time(struct perf_event *event,
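perf_event_disable() is now a thin wrapper that takes the context lock and calls _perf_event_disable(); the same split is applied to enable, reset and refresh further down so that perf_ioctl() can take the lock once and call the underscore variants directly. A generic sketch of the idiom with invented names, where an assert stands in for lockdep_assert_held():

    #include <pthread.h>
    #include <assert.h>

    struct obj {
            pthread_mutex_t lock;
            int lock_held;                  /* poor man's lockdep_assert_held() */
            int enabled;
    };

    /* underscore variant: does the work, demands that the lock is already held */
    static void _obj_enable(struct obj *o)
    {
            assert(o->lock_held);
            o->enabled = 1;
    }

    /* public variant: the only place that takes the lock */
    void obj_enable(struct obj *o)
    {
            pthread_mutex_lock(&o->lock);
            o->lock_held = 1;

            _obj_enable(o);

            o->lock_held = 0;
            pthread_mutex_unlock(&o->lock);
    }

    /* an ioctl-style multiplexer can now lock once and call several _helpers */
    void obj_ioctl(struct obj *o, int cmd)
    {
            pthread_mutex_lock(&o->lock);
            o->lock_held = 1;

            if (cmd == 1)
                    _obj_enable(o);

            o->lock_held = 0;
            pthread_mutex_unlock(&o->lock);
    }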
@@ -1782,7 +1894,8 @@ event_sched_in(struct perf_event *event,
1782 1894
1783 if (!is_software_event(event)) 1895 if (!is_software_event(event))
1784 cpuctx->active_oncpu++; 1896 cpuctx->active_oncpu++;
1785 ctx->nr_active++; 1897 if (!ctx->nr_active++)
1898 perf_event_ctx_activate(ctx);
1786 if (event->attr.freq && event->attr.sample_freq) 1899 if (event->attr.freq && event->attr.sample_freq)
1787 ctx->nr_freq++; 1900 ctx->nr_freq++;
1788 1901
@@ -2158,7 +2271,7 @@ unlock:
2158 * perf_event_for_each_child or perf_event_for_each as described 2271 * perf_event_for_each_child or perf_event_for_each as described
2159 * for perf_event_disable. 2272 * for perf_event_disable.
2160 */ 2273 */
2161void perf_event_enable(struct perf_event *event) 2274static void _perf_event_enable(struct perf_event *event)
2162{ 2275{
2163 struct perf_event_context *ctx = event->ctx; 2276 struct perf_event_context *ctx = event->ctx;
2164 struct task_struct *task = ctx->task; 2277 struct task_struct *task = ctx->task;
@@ -2214,9 +2327,21 @@ retry:
2214out: 2327out:
2215 raw_spin_unlock_irq(&ctx->lock); 2328 raw_spin_unlock_irq(&ctx->lock);
2216} 2329}
2330
2331/*
2332 * See perf_event_disable();
2333 */
2334void perf_event_enable(struct perf_event *event)
2335{
2336 struct perf_event_context *ctx;
2337
2338 ctx = perf_event_ctx_lock(event);
2339 _perf_event_enable(event);
2340 perf_event_ctx_unlock(event, ctx);
2341}
2217EXPORT_SYMBOL_GPL(perf_event_enable); 2342EXPORT_SYMBOL_GPL(perf_event_enable);
2218 2343
2219int perf_event_refresh(struct perf_event *event, int refresh) 2344static int _perf_event_refresh(struct perf_event *event, int refresh)
2220{ 2345{
2221 /* 2346 /*
2222 * not supported on inherited events 2347 * not supported on inherited events
@@ -2225,10 +2350,25 @@ int perf_event_refresh(struct perf_event *event, int refresh)
2225 return -EINVAL; 2350 return -EINVAL;
2226 2351
2227 atomic_add(refresh, &event->event_limit); 2352 atomic_add(refresh, &event->event_limit);
2228 perf_event_enable(event); 2353 _perf_event_enable(event);
2229 2354
2230 return 0; 2355 return 0;
2231} 2356}
2357
2358/*
2359 * See perf_event_disable()
2360 */
2361int perf_event_refresh(struct perf_event *event, int refresh)
2362{
2363 struct perf_event_context *ctx;
2364 int ret;
2365
2366 ctx = perf_event_ctx_lock(event);
2367 ret = _perf_event_refresh(event, refresh);
2368 perf_event_ctx_unlock(event, ctx);
2369
2370 return ret;
2371}
2232EXPORT_SYMBOL_GPL(perf_event_refresh); 2372EXPORT_SYMBOL_GPL(perf_event_refresh);
2233 2373
2234static void ctx_sched_out(struct perf_event_context *ctx, 2374static void ctx_sched_out(struct perf_event_context *ctx,
@@ -2612,12 +2752,6 @@ static void perf_event_context_sched_in(struct perf_event_context *ctx,
2612 2752
2613 perf_pmu_enable(ctx->pmu); 2753 perf_pmu_enable(ctx->pmu);
2614 perf_ctx_unlock(cpuctx, ctx); 2754 perf_ctx_unlock(cpuctx, ctx);
2615
2616 /*
2617 * Since these rotations are per-cpu, we need to ensure the
2618 * cpu-context we got scheduled on is actually rotating.
2619 */
2620 perf_pmu_rotate_start(ctx->pmu);
2621} 2755}
2622 2756
2623/* 2757/*
@@ -2905,25 +3039,18 @@ static void rotate_ctx(struct perf_event_context *ctx)
2905 list_rotate_left(&ctx->flexible_groups); 3039 list_rotate_left(&ctx->flexible_groups);
2906} 3040}
2907 3041
2908/*
2909 * perf_pmu_rotate_start() and perf_rotate_context() are fully serialized
2910 * because they're strictly cpu affine and rotate_start is called with IRQs
2911 * disabled, while rotate_context is called from IRQ context.
2912 */
2913static int perf_rotate_context(struct perf_cpu_context *cpuctx) 3042static int perf_rotate_context(struct perf_cpu_context *cpuctx)
2914{ 3043{
2915 struct perf_event_context *ctx = NULL; 3044 struct perf_event_context *ctx = NULL;
2916 int rotate = 0, remove = 1; 3045 int rotate = 0;
2917 3046
2918 if (cpuctx->ctx.nr_events) { 3047 if (cpuctx->ctx.nr_events) {
2919 remove = 0;
2920 if (cpuctx->ctx.nr_events != cpuctx->ctx.nr_active) 3048 if (cpuctx->ctx.nr_events != cpuctx->ctx.nr_active)
2921 rotate = 1; 3049 rotate = 1;
2922 } 3050 }
2923 3051
2924 ctx = cpuctx->task_ctx; 3052 ctx = cpuctx->task_ctx;
2925 if (ctx && ctx->nr_events) { 3053 if (ctx && ctx->nr_events) {
2926 remove = 0;
2927 if (ctx->nr_events != ctx->nr_active) 3054 if (ctx->nr_events != ctx->nr_active)
2928 rotate = 1; 3055 rotate = 1;
2929 } 3056 }
@@ -2947,8 +3074,6 @@ static int perf_rotate_context(struct perf_cpu_context *cpuctx)
2947 perf_pmu_enable(cpuctx->ctx.pmu); 3074 perf_pmu_enable(cpuctx->ctx.pmu);
2948 perf_ctx_unlock(cpuctx, cpuctx->task_ctx); 3075 perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
2949done: 3076done:
2950 if (remove)
2951 list_del_init(&cpuctx->rotation_list);
2952 3077
2953 return rotate; 3078 return rotate;
2954} 3079}
@@ -2966,9 +3091,8 @@ bool perf_event_can_stop_tick(void)
2966 3091
2967void perf_event_task_tick(void) 3092void perf_event_task_tick(void)
2968{ 3093{
2969 struct list_head *head = this_cpu_ptr(&rotation_list); 3094 struct list_head *head = this_cpu_ptr(&active_ctx_list);
2970 struct perf_cpu_context *cpuctx, *tmp; 3095 struct perf_event_context *ctx, *tmp;
2971 struct perf_event_context *ctx;
2972 int throttled; 3096 int throttled;
2973 3097
2974 WARN_ON(!irqs_disabled()); 3098 WARN_ON(!irqs_disabled());
@@ -2976,14 +3100,8 @@ void perf_event_task_tick(void)
2976 __this_cpu_inc(perf_throttled_seq); 3100 __this_cpu_inc(perf_throttled_seq);
2977 throttled = __this_cpu_xchg(perf_throttled_count, 0); 3101 throttled = __this_cpu_xchg(perf_throttled_count, 0);
2978 3102
2979 list_for_each_entry_safe(cpuctx, tmp, head, rotation_list) { 3103 list_for_each_entry_safe(ctx, tmp, head, active_ctx_list)
2980 ctx = &cpuctx->ctx;
2981 perf_adjust_freq_unthr_context(ctx, throttled); 3104 perf_adjust_freq_unthr_context(ctx, throttled);
2982
2983 ctx = cpuctx->task_ctx;
2984 if (ctx)
2985 perf_adjust_freq_unthr_context(ctx, throttled);
2986 }
2987} 3105}
2988 3106
2989static int event_enable_on_exec(struct perf_event *event, 3107static int event_enable_on_exec(struct perf_event *event,
@@ -3142,6 +3260,7 @@ static void __perf_event_init_context(struct perf_event_context *ctx)
3142{ 3260{
3143 raw_spin_lock_init(&ctx->lock); 3261 raw_spin_lock_init(&ctx->lock);
3144 mutex_init(&ctx->mutex); 3262 mutex_init(&ctx->mutex);
3263 INIT_LIST_HEAD(&ctx->active_ctx_list);
3145 INIT_LIST_HEAD(&ctx->pinned_groups); 3264 INIT_LIST_HEAD(&ctx->pinned_groups);
3146 INIT_LIST_HEAD(&ctx->flexible_groups); 3265 INIT_LIST_HEAD(&ctx->flexible_groups);
3147 INIT_LIST_HEAD(&ctx->event_list); 3266 INIT_LIST_HEAD(&ctx->event_list);
@@ -3421,7 +3540,16 @@ static void perf_remove_from_owner(struct perf_event *event)
3421 rcu_read_unlock(); 3540 rcu_read_unlock();
3422 3541
3423 if (owner) { 3542 if (owner) {
3424 mutex_lock(&owner->perf_event_mutex); 3543 /*
3544 * If we're here through perf_event_exit_task() we're already
3545 * holding ctx->mutex which would be an inversion wrt. the
3546 * normal lock order.
3547 *
 3548 * However we can safely take this lock because it's the child
3549 * ctx->mutex.
3550 */
3551 mutex_lock_nested(&owner->perf_event_mutex, SINGLE_DEPTH_NESTING);
3552
3425 /* 3553 /*
3426 * We have to re-check the event->owner field, if it is cleared 3554 * We have to re-check the event->owner field, if it is cleared
3427 * we raced with perf_event_exit_task(), acquiring the mutex 3555 * we raced with perf_event_exit_task(), acquiring the mutex
@@ -3440,7 +3568,7 @@ static void perf_remove_from_owner(struct perf_event *event)
3440 */ 3568 */
3441static void put_event(struct perf_event *event) 3569static void put_event(struct perf_event *event)
3442{ 3570{
3443 struct perf_event_context *ctx = event->ctx; 3571 struct perf_event_context *ctx;
3444 3572
3445 if (!atomic_long_dec_and_test(&event->refcount)) 3573 if (!atomic_long_dec_and_test(&event->refcount))
3446 return; 3574 return;
@@ -3448,7 +3576,6 @@ static void put_event(struct perf_event *event)
3448 if (!is_kernel_event(event)) 3576 if (!is_kernel_event(event))
3449 perf_remove_from_owner(event); 3577 perf_remove_from_owner(event);
3450 3578
3451 WARN_ON_ONCE(ctx->parent_ctx);
3452 /* 3579 /*
3453 * There are two ways this annotation is useful: 3580 * There are two ways this annotation is useful:
3454 * 3581 *
@@ -3461,7 +3588,8 @@ static void put_event(struct perf_event *event)
3461 * the last filedesc died, so there is no possibility 3588 * the last filedesc died, so there is no possibility
3462 * to trigger the AB-BA case. 3589 * to trigger the AB-BA case.
3463 */ 3590 */
3464 mutex_lock_nested(&ctx->mutex, SINGLE_DEPTH_NESTING); 3591 ctx = perf_event_ctx_lock_nested(event, SINGLE_DEPTH_NESTING);
3592 WARN_ON_ONCE(ctx->parent_ctx);
3465 perf_remove_from_context(event, true); 3593 perf_remove_from_context(event, true);
3466 mutex_unlock(&ctx->mutex); 3594 mutex_unlock(&ctx->mutex);
3467 3595
@@ -3547,12 +3675,13 @@ static int perf_event_read_group(struct perf_event *event,
3547 u64 read_format, char __user *buf) 3675 u64 read_format, char __user *buf)
3548{ 3676{
3549 struct perf_event *leader = event->group_leader, *sub; 3677 struct perf_event *leader = event->group_leader, *sub;
3550 int n = 0, size = 0, ret = -EFAULT;
3551 struct perf_event_context *ctx = leader->ctx; 3678 struct perf_event_context *ctx = leader->ctx;
3552 u64 values[5]; 3679 int n = 0, size = 0, ret;
3553 u64 count, enabled, running; 3680 u64 count, enabled, running;
3681 u64 values[5];
3682
3683 lockdep_assert_held(&ctx->mutex);
3554 3684
3555 mutex_lock(&ctx->mutex);
3556 count = perf_event_read_value(leader, &enabled, &running); 3685 count = perf_event_read_value(leader, &enabled, &running);
3557 3686
3558 values[n++] = 1 + leader->nr_siblings; 3687 values[n++] = 1 + leader->nr_siblings;
@@ -3567,7 +3696,7 @@ static int perf_event_read_group(struct perf_event *event,
3567 size = n * sizeof(u64); 3696 size = n * sizeof(u64);
3568 3697
3569 if (copy_to_user(buf, values, size)) 3698 if (copy_to_user(buf, values, size))
3570 goto unlock; 3699 return -EFAULT;
3571 3700
3572 ret = size; 3701 ret = size;
3573 3702
@@ -3581,14 +3710,11 @@ static int perf_event_read_group(struct perf_event *event,
3581 size = n * sizeof(u64); 3710 size = n * sizeof(u64);
3582 3711
3583 if (copy_to_user(buf + ret, values, size)) { 3712 if (copy_to_user(buf + ret, values, size)) {
3584 ret = -EFAULT; 3713 return -EFAULT;
3585 goto unlock;
3586 } 3714 }
3587 3715
3588 ret += size; 3716 ret += size;
3589 } 3717 }
3590unlock:
3591 mutex_unlock(&ctx->mutex);
3592 3718
3593 return ret; 3719 return ret;
3594} 3720}
@@ -3660,8 +3786,14 @@ static ssize_t
3660perf_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) 3786perf_read(struct file *file, char __user *buf, size_t count, loff_t *ppos)
3661{ 3787{
3662 struct perf_event *event = file->private_data; 3788 struct perf_event *event = file->private_data;
3789 struct perf_event_context *ctx;
3790 int ret;
3791
3792 ctx = perf_event_ctx_lock(event);
3793 ret = perf_read_hw(event, buf, count);
3794 perf_event_ctx_unlock(event, ctx);
3663 3795
3664 return perf_read_hw(event, buf, count); 3796 return ret;
3665} 3797}
3666 3798
3667static unsigned int perf_poll(struct file *file, poll_table *wait) 3799static unsigned int perf_poll(struct file *file, poll_table *wait)
@@ -3687,7 +3819,7 @@ static unsigned int perf_poll(struct file *file, poll_table *wait)
3687 return events; 3819 return events;
3688} 3820}
3689 3821
3690static void perf_event_reset(struct perf_event *event) 3822static void _perf_event_reset(struct perf_event *event)
3691{ 3823{
3692 (void)perf_event_read(event); 3824 (void)perf_event_read(event);
3693 local64_set(&event->count, 0); 3825 local64_set(&event->count, 0);
@@ -3706,6 +3838,7 @@ static void perf_event_for_each_child(struct perf_event *event,
3706 struct perf_event *child; 3838 struct perf_event *child;
3707 3839
3708 WARN_ON_ONCE(event->ctx->parent_ctx); 3840 WARN_ON_ONCE(event->ctx->parent_ctx);
3841
3709 mutex_lock(&event->child_mutex); 3842 mutex_lock(&event->child_mutex);
3710 func(event); 3843 func(event);
3711 list_for_each_entry(child, &event->child_list, child_list) 3844 list_for_each_entry(child, &event->child_list, child_list)
@@ -3719,14 +3852,13 @@ static void perf_event_for_each(struct perf_event *event,
3719 struct perf_event_context *ctx = event->ctx; 3852 struct perf_event_context *ctx = event->ctx;
3720 struct perf_event *sibling; 3853 struct perf_event *sibling;
3721 3854
3722 WARN_ON_ONCE(ctx->parent_ctx); 3855 lockdep_assert_held(&ctx->mutex);
3723 mutex_lock(&ctx->mutex); 3856
3724 event = event->group_leader; 3857 event = event->group_leader;
3725 3858
3726 perf_event_for_each_child(event, func); 3859 perf_event_for_each_child(event, func);
3727 list_for_each_entry(sibling, &event->sibling_list, group_entry) 3860 list_for_each_entry(sibling, &event->sibling_list, group_entry)
3728 perf_event_for_each_child(sibling, func); 3861 perf_event_for_each_child(sibling, func);
3729 mutex_unlock(&ctx->mutex);
3730} 3862}
3731 3863
3732static int perf_event_period(struct perf_event *event, u64 __user *arg) 3864static int perf_event_period(struct perf_event *event, u64 __user *arg)
@@ -3796,25 +3928,24 @@ static int perf_event_set_output(struct perf_event *event,
3796 struct perf_event *output_event); 3928 struct perf_event *output_event);
3797static int perf_event_set_filter(struct perf_event *event, void __user *arg); 3929static int perf_event_set_filter(struct perf_event *event, void __user *arg);
3798 3930
3799static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg) 3931static long _perf_ioctl(struct perf_event *event, unsigned int cmd, unsigned long arg)
3800{ 3932{
3801 struct perf_event *event = file->private_data;
3802 void (*func)(struct perf_event *); 3933 void (*func)(struct perf_event *);
3803 u32 flags = arg; 3934 u32 flags = arg;
3804 3935
3805 switch (cmd) { 3936 switch (cmd) {
3806 case PERF_EVENT_IOC_ENABLE: 3937 case PERF_EVENT_IOC_ENABLE:
3807 func = perf_event_enable; 3938 func = _perf_event_enable;
3808 break; 3939 break;
3809 case PERF_EVENT_IOC_DISABLE: 3940 case PERF_EVENT_IOC_DISABLE:
3810 func = perf_event_disable; 3941 func = _perf_event_disable;
3811 break; 3942 break;
3812 case PERF_EVENT_IOC_RESET: 3943 case PERF_EVENT_IOC_RESET:
3813 func = perf_event_reset; 3944 func = _perf_event_reset;
3814 break; 3945 break;
3815 3946
3816 case PERF_EVENT_IOC_REFRESH: 3947 case PERF_EVENT_IOC_REFRESH:
3817 return perf_event_refresh(event, arg); 3948 return _perf_event_refresh(event, arg);
3818 3949
3819 case PERF_EVENT_IOC_PERIOD: 3950 case PERF_EVENT_IOC_PERIOD:
3820 return perf_event_period(event, (u64 __user *)arg); 3951 return perf_event_period(event, (u64 __user *)arg);
@@ -3861,6 +3992,19 @@ static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
3861 return 0; 3992 return 0;
3862} 3993}
3863 3994
3995static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
3996{
3997 struct perf_event *event = file->private_data;
3998 struct perf_event_context *ctx;
3999 long ret;
4000
4001 ctx = perf_event_ctx_lock(event);
4002 ret = _perf_ioctl(event, cmd, arg);
4003 perf_event_ctx_unlock(event, ctx);
4004
4005 return ret;
4006}
4007
3864#ifdef CONFIG_COMPAT 4008#ifdef CONFIG_COMPAT
3865static long perf_compat_ioctl(struct file *file, unsigned int cmd, 4009static long perf_compat_ioctl(struct file *file, unsigned int cmd,
3866 unsigned long arg) 4010 unsigned long arg)
@@ -3883,11 +4027,15 @@ static long perf_compat_ioctl(struct file *file, unsigned int cmd,
3883 4027
3884int perf_event_task_enable(void) 4028int perf_event_task_enable(void)
3885{ 4029{
4030 struct perf_event_context *ctx;
3886 struct perf_event *event; 4031 struct perf_event *event;
3887 4032
3888 mutex_lock(&current->perf_event_mutex); 4033 mutex_lock(&current->perf_event_mutex);
3889 list_for_each_entry(event, &current->perf_event_list, owner_entry) 4034 list_for_each_entry(event, &current->perf_event_list, owner_entry) {
3890 perf_event_for_each_child(event, perf_event_enable); 4035 ctx = perf_event_ctx_lock(event);
4036 perf_event_for_each_child(event, _perf_event_enable);
4037 perf_event_ctx_unlock(event, ctx);
4038 }
3891 mutex_unlock(&current->perf_event_mutex); 4039 mutex_unlock(&current->perf_event_mutex);
3892 4040
3893 return 0; 4041 return 0;
@@ -3895,11 +4043,15 @@ int perf_event_task_enable(void)
3895 4043
3896int perf_event_task_disable(void) 4044int perf_event_task_disable(void)
3897{ 4045{
4046 struct perf_event_context *ctx;
3898 struct perf_event *event; 4047 struct perf_event *event;
3899 4048
3900 mutex_lock(&current->perf_event_mutex); 4049 mutex_lock(&current->perf_event_mutex);
3901 list_for_each_entry(event, &current->perf_event_list, owner_entry) 4050 list_for_each_entry(event, &current->perf_event_list, owner_entry) {
3902 perf_event_for_each_child(event, perf_event_disable); 4051 ctx = perf_event_ctx_lock(event);
4052 perf_event_for_each_child(event, _perf_event_disable);
4053 perf_event_ctx_unlock(event, ctx);
4054 }
3903 mutex_unlock(&current->perf_event_mutex); 4055 mutex_unlock(&current->perf_event_mutex);
3904 4056
3905 return 0; 4057 return 0;
@@ -3949,7 +4101,8 @@ unlock:
3949 rcu_read_unlock(); 4101 rcu_read_unlock();
3950} 4102}
3951 4103
3952void __weak arch_perf_update_userpage(struct perf_event_mmap_page *userpg, u64 now) 4104void __weak arch_perf_update_userpage(
4105 struct perf_event *event, struct perf_event_mmap_page *userpg, u64 now)
3953{ 4106{
3954} 4107}
3955 4108
@@ -3999,7 +4152,7 @@ void perf_event_update_userpage(struct perf_event *event)
3999 userpg->time_running = running + 4152 userpg->time_running = running +
4000 atomic64_read(&event->child_total_time_running); 4153 atomic64_read(&event->child_total_time_running);
4001 4154
4002 arch_perf_update_userpage(userpg, now); 4155 arch_perf_update_userpage(event, userpg, now);
4003 4156
4004 barrier(); 4157 barrier();
4005 ++userpg->lock; 4158 ++userpg->lock;
@@ -4141,6 +4294,9 @@ static void perf_mmap_open(struct vm_area_struct *vma)
4141 4294
4142 atomic_inc(&event->mmap_count); 4295 atomic_inc(&event->mmap_count);
4143 atomic_inc(&event->rb->mmap_count); 4296 atomic_inc(&event->rb->mmap_count);
4297
4298 if (event->pmu->event_mapped)
4299 event->pmu->event_mapped(event);
4144} 4300}
4145 4301
4146/* 4302/*
@@ -4160,6 +4316,9 @@ static void perf_mmap_close(struct vm_area_struct *vma)
4160 int mmap_locked = rb->mmap_locked; 4316 int mmap_locked = rb->mmap_locked;
4161 unsigned long size = perf_data_size(rb); 4317 unsigned long size = perf_data_size(rb);
4162 4318
4319 if (event->pmu->event_unmapped)
4320 event->pmu->event_unmapped(event);
4321
4163 atomic_dec(&rb->mmap_count); 4322 atomic_dec(&rb->mmap_count);
4164 4323
4165 if (!atomic_dec_and_mutex_lock(&event->mmap_count, &event->mmap_mutex)) 4324 if (!atomic_dec_and_mutex_lock(&event->mmap_count, &event->mmap_mutex))
@@ -4361,6 +4520,9 @@ unlock:
4361 vma->vm_flags |= VM_DONTCOPY | VM_DONTEXPAND | VM_DONTDUMP; 4520 vma->vm_flags |= VM_DONTCOPY | VM_DONTEXPAND | VM_DONTDUMP;
4362 vma->vm_ops = &perf_mmap_vmops; 4521 vma->vm_ops = &perf_mmap_vmops;
4363 4522
4523 if (event->pmu->event_mapped)
4524 event->pmu->event_mapped(event);
4525
4364 return ret; 4526 return ret;
4365} 4527}
4366 4528
@@ -4461,18 +4623,14 @@ perf_output_sample_regs(struct perf_output_handle *handle,
4461} 4623}
4462 4624
4463static void perf_sample_regs_user(struct perf_regs *regs_user, 4625static void perf_sample_regs_user(struct perf_regs *regs_user,
4464 struct pt_regs *regs) 4626 struct pt_regs *regs,
4627 struct pt_regs *regs_user_copy)
4465{ 4628{
4466 if (!user_mode(regs)) { 4629 if (user_mode(regs)) {
4467 if (current->mm) 4630 regs_user->abi = perf_reg_abi(current);
4468 regs = task_pt_regs(current);
4469 else
4470 regs = NULL;
4471 }
4472
4473 if (regs) {
4474 regs_user->abi = perf_reg_abi(current);
4475 regs_user->regs = regs; 4631 regs_user->regs = regs;
4632 } else if (current->mm) {
4633 perf_get_regs_user(regs_user, regs, regs_user_copy);
4476 } else { 4634 } else {
4477 regs_user->abi = PERF_SAMPLE_REGS_ABI_NONE; 4635 regs_user->abi = PERF_SAMPLE_REGS_ABI_NONE;
4478 regs_user->regs = NULL; 4636 regs_user->regs = NULL;
@@ -4951,7 +5109,8 @@ void perf_prepare_sample(struct perf_event_header *header,
4951 } 5109 }
4952 5110
4953 if (sample_type & (PERF_SAMPLE_REGS_USER | PERF_SAMPLE_STACK_USER)) 5111 if (sample_type & (PERF_SAMPLE_REGS_USER | PERF_SAMPLE_STACK_USER))
4954 perf_sample_regs_user(&data->regs_user, regs); 5112 perf_sample_regs_user(&data->regs_user, regs,
5113 &data->regs_user_copy);
4955 5114
4956 if (sample_type & PERF_SAMPLE_REGS_USER) { 5115 if (sample_type & PERF_SAMPLE_REGS_USER) {
4957 /* regs dump ABI info */ 5116 /* regs dump ABI info */
@@ -5892,6 +6051,8 @@ end:
5892 rcu_read_unlock(); 6051 rcu_read_unlock();
5893} 6052}
5894 6053
6054DEFINE_PER_CPU(struct pt_regs, __perf_regs[4]);
6055
5895int perf_swevent_get_recursion_context(void) 6056int perf_swevent_get_recursion_context(void)
5896{ 6057{
5897 struct swevent_htable *swhash = this_cpu_ptr(&swevent_htable); 6058 struct swevent_htable *swhash = this_cpu_ptr(&swevent_htable);
@@ -5907,21 +6068,30 @@ inline void perf_swevent_put_recursion_context(int rctx)
5907 put_recursion_context(swhash->recursion, rctx); 6068 put_recursion_context(swhash->recursion, rctx);
5908} 6069}
5909 6070
5910void __perf_sw_event(u32 event_id, u64 nr, struct pt_regs *regs, u64 addr) 6071void ___perf_sw_event(u32 event_id, u64 nr, struct pt_regs *regs, u64 addr)
5911{ 6072{
5912 struct perf_sample_data data; 6073 struct perf_sample_data data;
5913 int rctx;
5914 6074
5915 preempt_disable_notrace(); 6075 if (WARN_ON_ONCE(!regs))
5916 rctx = perf_swevent_get_recursion_context();
5917 if (rctx < 0)
5918 return; 6076 return;
5919 6077
5920 perf_sample_data_init(&data, addr, 0); 6078 perf_sample_data_init(&data, addr, 0);
5921
5922 do_perf_sw_event(PERF_TYPE_SOFTWARE, event_id, nr, &data, regs); 6079 do_perf_sw_event(PERF_TYPE_SOFTWARE, event_id, nr, &data, regs);
6080}
6081
6082void __perf_sw_event(u32 event_id, u64 nr, struct pt_regs *regs, u64 addr)
6083{
6084 int rctx;
6085
6086 preempt_disable_notrace();
6087 rctx = perf_swevent_get_recursion_context();
6088 if (unlikely(rctx < 0))
6089 goto fail;
6090
6091 ___perf_sw_event(event_id, nr, regs, addr);
5923 6092
5924 perf_swevent_put_recursion_context(rctx); 6093 perf_swevent_put_recursion_context(rctx);
6094fail:
5925 preempt_enable_notrace(); 6095 preempt_enable_notrace();
5926} 6096}
5927 6097
@@ -6779,12 +6949,10 @@ skip_type:
6779 __perf_event_init_context(&cpuctx->ctx); 6949 __perf_event_init_context(&cpuctx->ctx);
6780 lockdep_set_class(&cpuctx->ctx.mutex, &cpuctx_mutex); 6950 lockdep_set_class(&cpuctx->ctx.mutex, &cpuctx_mutex);
6781 lockdep_set_class(&cpuctx->ctx.lock, &cpuctx_lock); 6951 lockdep_set_class(&cpuctx->ctx.lock, &cpuctx_lock);
6782 cpuctx->ctx.type = cpu_context;
6783 cpuctx->ctx.pmu = pmu; 6952 cpuctx->ctx.pmu = pmu;
6784 6953
6785 __perf_cpu_hrtimer_init(cpuctx, cpu); 6954 __perf_cpu_hrtimer_init(cpuctx, cpu);
6786 6955
6787 INIT_LIST_HEAD(&cpuctx->rotation_list);
6788 cpuctx->unique_pmu = pmu; 6956 cpuctx->unique_pmu = pmu;
6789 } 6957 }
6790 6958
@@ -6857,6 +7025,20 @@ void perf_pmu_unregister(struct pmu *pmu)
6857} 7025}
6858EXPORT_SYMBOL_GPL(perf_pmu_unregister); 7026EXPORT_SYMBOL_GPL(perf_pmu_unregister);
6859 7027
7028static int perf_try_init_event(struct pmu *pmu, struct perf_event *event)
7029{
7030 int ret;
7031
7032 if (!try_module_get(pmu->module))
7033 return -ENODEV;
7034 event->pmu = pmu;
7035 ret = pmu->event_init(event);
7036 if (ret)
7037 module_put(pmu->module);
7038
7039 return ret;
7040}
7041
6860struct pmu *perf_init_event(struct perf_event *event) 7042struct pmu *perf_init_event(struct perf_event *event)
6861{ 7043{
6862 struct pmu *pmu = NULL; 7044 struct pmu *pmu = NULL;
@@ -6869,24 +7051,14 @@ struct pmu *perf_init_event(struct perf_event *event)
6869 pmu = idr_find(&pmu_idr, event->attr.type); 7051 pmu = idr_find(&pmu_idr, event->attr.type);
6870 rcu_read_unlock(); 7052 rcu_read_unlock();
6871 if (pmu) { 7053 if (pmu) {
6872 if (!try_module_get(pmu->module)) { 7054 ret = perf_try_init_event(pmu, event);
6873 pmu = ERR_PTR(-ENODEV);
6874 goto unlock;
6875 }
6876 event->pmu = pmu;
6877 ret = pmu->event_init(event);
6878 if (ret) 7055 if (ret)
6879 pmu = ERR_PTR(ret); 7056 pmu = ERR_PTR(ret);
6880 goto unlock; 7057 goto unlock;
6881 } 7058 }
6882 7059
6883 list_for_each_entry_rcu(pmu, &pmus, entry) { 7060 list_for_each_entry_rcu(pmu, &pmus, entry) {
6884 if (!try_module_get(pmu->module)) { 7061 ret = perf_try_init_event(pmu, event);
6885 pmu = ERR_PTR(-ENODEV);
6886 goto unlock;
6887 }
6888 event->pmu = pmu;
6889 ret = pmu->event_init(event);
6890 if (!ret) 7062 if (!ret)
6891 goto unlock; 7063 goto unlock;
6892 7064
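perf_try_init_event() factors out an acquire/try/rollback sequence: pin the PMU's module, attempt event_init(), and drop the reference again on failure so that both call sites stay balanced. A hedged user-space sketch of the same shape, with invented names:

    #include <errno.h>

    /* invented provider interface standing in for a struct pmu */
    struct provider {
            int  (*get)(struct provider *p);        /* pin; 0 on success */
            void (*put)(struct provider *p);        /* unpin */
            int  (*init)(void *obj);                /* may fail */
    };

    static int try_init_with(struct provider *p, void *obj)
    {
            int ret;

            if (p->get(p))                  /* pin the provider first */
                    return -ENODEV;

            ret = p->init(obj);
            if (ret)
                    p->put(p);              /* failure: undo the pin here, not at the caller */

            return ret;                     /* success: the reference travels with obj */
    }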
@@ -7250,6 +7422,15 @@ out:
7250 return ret; 7422 return ret;
7251} 7423}
7252 7424
7425static void mutex_lock_double(struct mutex *a, struct mutex *b)
7426{
7427 if (b < a)
7428 swap(a, b);
7429
7430 mutex_lock(a);
7431 mutex_lock_nested(b, SINGLE_DEPTH_NESTING);
7432}
7433
7253/** 7434/**
7254 * sys_perf_event_open - open a performance event, associate it to a task/cpu 7435 * sys_perf_event_open - open a performance event, associate it to a task/cpu
7255 * 7436 *
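mutex_lock_double() avoids AB-BA deadlock by always taking the two context mutexes in address order, no matter which one the caller names first. A user-space analogue with pthread mutexes and invented helper names; as with the two contexts here, the two locks are assumed to be distinct.

    #include <pthread.h>

    static void lock_pair(pthread_mutex_t *a, pthread_mutex_t *b)
    {
            if (b < a) {                    /* sort the two locks by address */
                    pthread_mutex_t *tmp = a;

                    a = b;
                    b = tmp;
            }
            pthread_mutex_lock(a);
            pthread_mutex_lock(b);
    }

    static void unlock_pair(pthread_mutex_t *a, pthread_mutex_t *b)
    {
            pthread_mutex_unlock(a);        /* release order is not significant */
            pthread_mutex_unlock(b);
    }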
@@ -7265,7 +7446,7 @@ SYSCALL_DEFINE5(perf_event_open,
7265 struct perf_event *group_leader = NULL, *output_event = NULL; 7446 struct perf_event *group_leader = NULL, *output_event = NULL;
7266 struct perf_event *event, *sibling; 7447 struct perf_event *event, *sibling;
7267 struct perf_event_attr attr; 7448 struct perf_event_attr attr;
7268 struct perf_event_context *ctx; 7449 struct perf_event_context *ctx, *uninitialized_var(gctx);
7269 struct file *event_file = NULL; 7450 struct file *event_file = NULL;
7270 struct fd group = {NULL, 0}; 7451 struct fd group = {NULL, 0};
7271 struct task_struct *task = NULL; 7452 struct task_struct *task = NULL;
@@ -7423,7 +7604,19 @@ SYSCALL_DEFINE5(perf_event_open,
7423 * task or CPU context: 7604 * task or CPU context:
7424 */ 7605 */
7425 if (move_group) { 7606 if (move_group) {
7426 if (group_leader->ctx->type != ctx->type) 7607 /*
7608 * Make sure we're both on the same task, or both
7609 * per-cpu events.
7610 */
7611 if (group_leader->ctx->task != ctx->task)
7612 goto err_context;
7613
7614 /*
 7615		 * Make sure both events are for the same CPU;
 7616		 * grouping events for different CPUs is broken, since
 7617		 * you can never concurrently schedule them anyhow.
7618 */
7619 if (group_leader->cpu != event->cpu)
7427 goto err_context; 7620 goto err_context;
7428 } else { 7621 } else {
7429 if (group_leader->ctx != ctx) 7622 if (group_leader->ctx != ctx)
@@ -7451,43 +7644,68 @@ SYSCALL_DEFINE5(perf_event_open,
7451 } 7644 }
7452 7645
7453 if (move_group) { 7646 if (move_group) {
7454 struct perf_event_context *gctx = group_leader->ctx; 7647 gctx = group_leader->ctx;
7455
7456 mutex_lock(&gctx->mutex);
7457 perf_remove_from_context(group_leader, false);
7458 7648
7459 /* 7649 /*
7460 * Removing from the context ends up with disabled 7650 * See perf_event_ctx_lock() for comments on the details
7461 * event. What we want here is event in the initial 7651 * of swizzling perf_event::ctx.
7462 * startup state, ready to be add into new context.
7463 */ 7652 */
7464 perf_event__state_init(group_leader); 7653 mutex_lock_double(&gctx->mutex, &ctx->mutex);
7654
7655 perf_remove_from_context(group_leader, false);
7656
7465 list_for_each_entry(sibling, &group_leader->sibling_list, 7657 list_for_each_entry(sibling, &group_leader->sibling_list,
7466 group_entry) { 7658 group_entry) {
7467 perf_remove_from_context(sibling, false); 7659 perf_remove_from_context(sibling, false);
7468 perf_event__state_init(sibling);
7469 put_ctx(gctx); 7660 put_ctx(gctx);
7470 } 7661 }
7471 mutex_unlock(&gctx->mutex); 7662 } else {
7472 put_ctx(gctx); 7663 mutex_lock(&ctx->mutex);
7473 } 7664 }
7474 7665
7475 WARN_ON_ONCE(ctx->parent_ctx); 7666 WARN_ON_ONCE(ctx->parent_ctx);
7476 mutex_lock(&ctx->mutex);
7477 7667
7478 if (move_group) { 7668 if (move_group) {
7669 /*
7670 * Wait for everybody to stop referencing the events through
 7671		 * the old lists, before installing them on the new lists.
7672 */
7479 synchronize_rcu(); 7673 synchronize_rcu();
7480 perf_install_in_context(ctx, group_leader, group_leader->cpu); 7674
7481 get_ctx(ctx); 7675 /*
7676 * Install the group siblings before the group leader.
7677 *
 7678		 * Because a group leader will try to install the entire group
 7679		 * (through the sibling list, which is still intact), we can
7680 * end up with siblings installed in the wrong context.
7681 *
7682 * By installing siblings first we NO-OP because they're not
7683 * reachable through the group lists.
7684 */
7482 list_for_each_entry(sibling, &group_leader->sibling_list, 7685 list_for_each_entry(sibling, &group_leader->sibling_list,
7483 group_entry) { 7686 group_entry) {
7687 perf_event__state_init(sibling);
7484 perf_install_in_context(ctx, sibling, sibling->cpu); 7688 perf_install_in_context(ctx, sibling, sibling->cpu);
7485 get_ctx(ctx); 7689 get_ctx(ctx);
7486 } 7690 }
7691
7692 /*
7693 * Removing from the context ends up with disabled
7694 * event. What we want here is event in the initial
7695 * startup state, ready to be add into new context.
7696 */
7697 perf_event__state_init(group_leader);
7698 perf_install_in_context(ctx, group_leader, group_leader->cpu);
7699 get_ctx(ctx);
7487 } 7700 }
7488 7701
7489 perf_install_in_context(ctx, event, event->cpu); 7702 perf_install_in_context(ctx, event, event->cpu);
7490 perf_unpin_context(ctx); 7703 perf_unpin_context(ctx);
7704
7705 if (move_group) {
7706 mutex_unlock(&gctx->mutex);
7707 put_ctx(gctx);
7708 }
7491 mutex_unlock(&ctx->mutex); 7709 mutex_unlock(&ctx->mutex);
7492 7710
7493 put_online_cpus(); 7711 put_online_cpus();
@@ -7595,7 +7813,11 @@ void perf_pmu_migrate_context(struct pmu *pmu, int src_cpu, int dst_cpu)
7595 src_ctx = &per_cpu_ptr(pmu->pmu_cpu_context, src_cpu)->ctx; 7813 src_ctx = &per_cpu_ptr(pmu->pmu_cpu_context, src_cpu)->ctx;
7596 dst_ctx = &per_cpu_ptr(pmu->pmu_cpu_context, dst_cpu)->ctx; 7814 dst_ctx = &per_cpu_ptr(pmu->pmu_cpu_context, dst_cpu)->ctx;
7597 7815
7598 mutex_lock(&src_ctx->mutex); 7816 /*
7817 * See perf_event_ctx_lock() for comments on the details
7818 * of swizzling perf_event::ctx.
7819 */
7820 mutex_lock_double(&src_ctx->mutex, &dst_ctx->mutex);
7599 list_for_each_entry_safe(event, tmp, &src_ctx->event_list, 7821 list_for_each_entry_safe(event, tmp, &src_ctx->event_list,
7600 event_entry) { 7822 event_entry) {
7601 perf_remove_from_context(event, false); 7823 perf_remove_from_context(event, false);
@@ -7603,11 +7825,36 @@ void perf_pmu_migrate_context(struct pmu *pmu, int src_cpu, int dst_cpu)
7603 put_ctx(src_ctx); 7825 put_ctx(src_ctx);
7604 list_add(&event->migrate_entry, &events); 7826 list_add(&event->migrate_entry, &events);
7605 } 7827 }
7606 mutex_unlock(&src_ctx->mutex);
7607 7828
7829 /*
7830 * Wait for the events to quiesce before re-instating them.
7831 */
7608 synchronize_rcu(); 7832 synchronize_rcu();
7609 7833
7610 mutex_lock(&dst_ctx->mutex); 7834 /*
7835 * Re-instate events in 2 passes.
7836 *
7837 * Skip over group leaders and only install siblings on this first
 7838	 * pass; siblings will not get enabled without a leader, but a
7839 * leader will enable its siblings, even if those are still on the old
7840 * context.
7841 */
7842 list_for_each_entry_safe(event, tmp, &events, migrate_entry) {
7843 if (event->group_leader == event)
7844 continue;
7845
7846 list_del(&event->migrate_entry);
7847 if (event->state >= PERF_EVENT_STATE_OFF)
7848 event->state = PERF_EVENT_STATE_INACTIVE;
7849 account_event_cpu(event, dst_cpu);
7850 perf_install_in_context(dst_ctx, event, dst_cpu);
7851 get_ctx(dst_ctx);
7852 }
7853
7854 /*
7855 * Once all the siblings are setup properly, install the group leaders
7856 * to make it go.
7857 */
7611 list_for_each_entry_safe(event, tmp, &events, migrate_entry) { 7858 list_for_each_entry_safe(event, tmp, &events, migrate_entry) {
7612 list_del(&event->migrate_entry); 7859 list_del(&event->migrate_entry);
7613 if (event->state >= PERF_EVENT_STATE_OFF) 7860 if (event->state >= PERF_EVENT_STATE_OFF)
@@ -7617,6 +7864,7 @@ void perf_pmu_migrate_context(struct pmu *pmu, int src_cpu, int dst_cpu)
7617 get_ctx(dst_ctx); 7864 get_ctx(dst_ctx);
7618 } 7865 }
7619 mutex_unlock(&dst_ctx->mutex); 7866 mutex_unlock(&dst_ctx->mutex);
7867 mutex_unlock(&src_ctx->mutex);
7620} 7868}
7621EXPORT_SYMBOL_GPL(perf_pmu_migrate_context); 7869EXPORT_SYMBOL_GPL(perf_pmu_migrate_context);
7622 7870
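perf_pmu_migrate_context() now re-installs the migrated events in two passes: siblings first (they stay inert without their leader), leaders last (installing a leader switches the whole group on at once). A small userspace sketch of that ordering, using made-up event names and structures:

#include <stdio.h>

/* Sketch of the two-pass re-install: pass 1 installs every sibling, pass 2
 * installs the leaders, so no group is activated before it is complete. */
struct event {
        const char *name;
        struct event *group_leader;     /* points to itself for a leader */
};

static void install(struct event *e)
{
        printf("install %s%s\n", e->name,
               e->group_leader == e ? " (leader)" : "");
}

int main(void)
{
        struct event leader = { "cycles",       &leader };
        struct event sib1   = { "instructions", &leader };
        struct event sib2   = { "branches",     &leader };
        struct event *events[] = { &leader, &sib1, &sib2 };
        int i, n = 3;

        for (i = 0; i < n; i++)         /* pass 1: siblings only */
                if (events[i]->group_leader != events[i])
                        install(events[i]);

        for (i = 0; i < n; i++)         /* pass 2: leaders last */
                if (events[i]->group_leader == events[i])
                        install(events[i]);
        return 0;
}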
@@ -7803,14 +8051,19 @@ static void perf_free_event(struct perf_event *event,
7803 8051
7804 put_event(parent); 8052 put_event(parent);
7805 8053
8054 raw_spin_lock_irq(&ctx->lock);
7806 perf_group_detach(event); 8055 perf_group_detach(event);
7807 list_del_event(event, ctx); 8056 list_del_event(event, ctx);
8057 raw_spin_unlock_irq(&ctx->lock);
7808 free_event(event); 8058 free_event(event);
7809} 8059}
7810 8060
7811/* 8061/*
7812 * free an unexposed, unused context as created by inheritance by 8062 * Free an unexposed, unused context as created by inheritance by
7813 * perf_event_init_task below, used by fork() in case of fail. 8063 * perf_event_init_task below, used by fork() in case of fail.
8064 *
8065 * Not all locks are strictly required, but take them anyway to be nice and
8066 * help out with the lockdep assertions.
7814 */ 8067 */
7815void perf_event_free_task(struct task_struct *task) 8068void perf_event_free_task(struct task_struct *task)
7816{ 8069{
@@ -8129,7 +8382,7 @@ static void __init perf_event_init_all_cpus(void)
8129 for_each_possible_cpu(cpu) { 8382 for_each_possible_cpu(cpu) {
8130 swhash = &per_cpu(swevent_htable, cpu); 8383 swhash = &per_cpu(swevent_htable, cpu);
8131 mutex_init(&swhash->hlist_mutex); 8384 mutex_init(&swhash->hlist_mutex);
8132 INIT_LIST_HEAD(&per_cpu(rotation_list, cpu)); 8385 INIT_LIST_HEAD(&per_cpu(active_ctx_list, cpu));
8133 } 8386 }
8134} 8387}
8135 8388
@@ -8150,22 +8403,11 @@ static void perf_event_init_cpu(int cpu)
8150} 8403}
8151 8404
8152#if defined CONFIG_HOTPLUG_CPU || defined CONFIG_KEXEC 8405#if defined CONFIG_HOTPLUG_CPU || defined CONFIG_KEXEC
8153static void perf_pmu_rotate_stop(struct pmu *pmu)
8154{
8155 struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
8156
8157 WARN_ON(!irqs_disabled());
8158
8159 list_del_init(&cpuctx->rotation_list);
8160}
8161
8162static void __perf_event_exit_context(void *__info) 8406static void __perf_event_exit_context(void *__info)
8163{ 8407{
8164 struct remove_event re = { .detach_group = true }; 8408 struct remove_event re = { .detach_group = true };
8165 struct perf_event_context *ctx = __info; 8409 struct perf_event_context *ctx = __info;
8166 8410
8167 perf_pmu_rotate_stop(ctx->pmu);
8168
8169 rcu_read_lock(); 8411 rcu_read_lock();
8170 list_for_each_entry_rcu(re.event, &ctx->event_list, event_entry) 8412 list_for_each_entry_rcu(re.event, &ctx->event_list, event_entry)
8171 __perf_remove_from_context(&re); 8413 __perf_remove_from_context(&re);
@@ -8276,6 +8518,18 @@ void __init perf_event_init(void)
8276 != 1024); 8518 != 1024);
8277} 8519}
8278 8520
8521ssize_t perf_event_sysfs_show(struct device *dev, struct device_attribute *attr,
8522 char *page)
8523{
8524 struct perf_pmu_events_attr *pmu_attr =
8525 container_of(attr, struct perf_pmu_events_attr, attr);
8526
8527 if (pmu_attr->event_str)
8528 return sprintf(page, "%s\n", pmu_attr->event_str);
8529
8530 return 0;
8531}
8532
8279static int __init perf_event_sysfs_init(void) 8533static int __init perf_event_sysfs_init(void)
8280{ 8534{
8281 struct pmu *pmu; 8535 struct pmu *pmu;
diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c
index 146a5792b1d2..eadb95ce7aac 100644
--- a/kernel/events/ring_buffer.c
+++ b/kernel/events/ring_buffer.c
@@ -13,12 +13,13 @@
13#include <linux/vmalloc.h> 13#include <linux/vmalloc.h>
14#include <linux/slab.h> 14#include <linux/slab.h>
15#include <linux/circ_buf.h> 15#include <linux/circ_buf.h>
16#include <linux/poll.h>
16 17
17#include "internal.h" 18#include "internal.h"
18 19
19static void perf_output_wakeup(struct perf_output_handle *handle) 20static void perf_output_wakeup(struct perf_output_handle *handle)
20{ 21{
21 atomic_set(&handle->rb->poll, POLL_IN); 22 atomic_set(&handle->rb->poll, POLLIN);
22 23
23 handle->event->pending_wakeup = 1; 24 handle->event->pending_wakeup = 1;
24 irq_work_queue(&handle->event->pending); 25 irq_work_queue(&handle->event->pending);
diff --git a/kernel/exit.c b/kernel/exit.c
index 1ea4369890a3..feff10bbb307 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -435,7 +435,8 @@ static void exit_mm(struct task_struct *tsk)
435 task_unlock(tsk); 435 task_unlock(tsk);
436 mm_update_next_owner(mm); 436 mm_update_next_owner(mm);
437 mmput(mm); 437 mmput(mm);
438 clear_thread_flag(TIF_MEMDIE); 438 if (test_thread_flag(TIF_MEMDIE))
439 unmark_oom_victim();
439} 440}
440 441
441static struct task_struct *find_alive_thread(struct task_struct *p) 442static struct task_struct *find_alive_thread(struct task_struct *p)
@@ -1287,9 +1288,15 @@ static int wait_task_continued(struct wait_opts *wo, struct task_struct *p)
1287static int wait_consider_task(struct wait_opts *wo, int ptrace, 1288static int wait_consider_task(struct wait_opts *wo, int ptrace,
1288 struct task_struct *p) 1289 struct task_struct *p)
1289{ 1290{
1291 /*
1292 * We can race with wait_task_zombie() from another thread.
1293 * Ensure that EXIT_ZOMBIE -> EXIT_DEAD/EXIT_TRACE transition
1294 * can't confuse the checks below.
1295 */
1296 int exit_state = ACCESS_ONCE(p->exit_state);
1290 int ret; 1297 int ret;
1291 1298
1292 if (unlikely(p->exit_state == EXIT_DEAD)) 1299 if (unlikely(exit_state == EXIT_DEAD))
1293 return 0; 1300 return 0;
1294 1301
1295 ret = eligible_child(wo, p); 1302 ret = eligible_child(wo, p);
@@ -1310,7 +1317,7 @@ static int wait_consider_task(struct wait_opts *wo, int ptrace,
1310 return 0; 1317 return 0;
1311 } 1318 }
1312 1319
1313 if (unlikely(p->exit_state == EXIT_TRACE)) { 1320 if (unlikely(exit_state == EXIT_TRACE)) {
1314 /* 1321 /*
1315 * ptrace == 0 means we are the natural parent. In this case 1322 * ptrace == 0 means we are the natural parent. In this case
1316 * we should clear notask_error, debugger will notify us. 1323 * we should clear notask_error, debugger will notify us.
@@ -1337,7 +1344,7 @@ static int wait_consider_task(struct wait_opts *wo, int ptrace,
1337 } 1344 }
1338 1345
1339 /* slay zombie? */ 1346 /* slay zombie? */
1340 if (p->exit_state == EXIT_ZOMBIE) { 1347 if (exit_state == EXIT_ZOMBIE) {
1341 /* we don't reap group leaders with subthreads */ 1348 /* we don't reap group leaders with subthreads */
1342 if (!delay_group_leader(p)) { 1349 if (!delay_group_leader(p)) {
1343 /* 1350 /*
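The wait_consider_task() change above reads p->exit_state exactly once with ACCESS_ONCE() and then only tests the local copy, so a concurrent EXIT_ZOMBIE -> EXIT_DEAD/EXIT_TRACE transition cannot make the later checks contradict the earlier ones. A userspace analogue of the read-once-then-test pattern, using C11 atomics as a stand-in (the states and return values are illustrative):

#include <stdatomic.h>
#include <stdio.h>

enum { EXIT_ZOMBIE = 1, EXIT_DEAD = 2, EXIT_TRACE = 3 };

static _Atomic int exit_state = EXIT_ZOMBIE;    /* may change concurrently */

static int consider(void)
{
        /* one load; every later check uses the local snapshot */
        int state = atomic_load_explicit(&exit_state, memory_order_relaxed);

        if (state == EXIT_DEAD)
                return 0;               /* nothing to do */
        if (state == EXIT_TRACE)
                return 1;               /* handled by the tracer path */
        if (state == EXIT_ZOMBIE)
                return 2;               /* reap it */
        return -1;
}

int main(void)
{
        printf("decision=%d\n", consider());
        return 0;
}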
diff --git a/kernel/fork.c b/kernel/fork.c
index 4dc2ddade9f1..cf65139615a0 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -438,12 +438,8 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
438 atomic_inc(&mapping->i_mmap_writable); 438 atomic_inc(&mapping->i_mmap_writable);
439 flush_dcache_mmap_lock(mapping); 439 flush_dcache_mmap_lock(mapping);
440 /* insert tmp into the share list, just after mpnt */ 440 /* insert tmp into the share list, just after mpnt */
441 if (unlikely(tmp->vm_flags & VM_NONLINEAR)) 441 vma_interval_tree_insert_after(tmp, mpnt,
442 vma_nonlinear_insert(tmp, 442 &mapping->i_mmap);
443 &mapping->i_mmap_nonlinear);
444 else
445 vma_interval_tree_insert_after(tmp, mpnt,
446 &mapping->i_mmap);
447 flush_dcache_mmap_unlock(mapping); 443 flush_dcache_mmap_unlock(mapping);
448 i_mmap_unlock_write(mapping); 444 i_mmap_unlock_write(mapping);
449 } 445 }
@@ -559,6 +555,7 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p)
559 INIT_LIST_HEAD(&mm->mmlist); 555 INIT_LIST_HEAD(&mm->mmlist);
560 mm->core_state = NULL; 556 mm->core_state = NULL;
561 atomic_long_set(&mm->nr_ptes, 0); 557 atomic_long_set(&mm->nr_ptes, 0);
558 mm_nr_pmds_init(mm);
562 mm->map_count = 0; 559 mm->map_count = 0;
563 mm->locked_vm = 0; 560 mm->locked_vm = 0;
564 mm->pinned_vm = 0; 561 mm->pinned_vm = 0;
@@ -607,6 +604,14 @@ static void check_mm(struct mm_struct *mm)
607 printk(KERN_ALERT "BUG: Bad rss-counter state " 604 printk(KERN_ALERT "BUG: Bad rss-counter state "
608 "mm:%p idx:%d val:%ld\n", mm, i, x); 605 "mm:%p idx:%d val:%ld\n", mm, i, x);
609 } 606 }
607
608 if (atomic_long_read(&mm->nr_ptes))
609 pr_alert("BUG: non-zero nr_ptes on freeing mm: %ld\n",
610 atomic_long_read(&mm->nr_ptes));
611 if (mm_nr_pmds(mm))
612 pr_alert("BUG: non-zero nr_pmds on freeing mm: %ld\n",
613 mm_nr_pmds(mm));
614
610#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS 615#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS
611 VM_BUG_ON_MM(mm->pmd_huge_pte, mm); 616 VM_BUG_ON_MM(mm->pmd_huge_pte, mm);
612#endif 617#endif
diff --git a/kernel/futex.c b/kernel/futex.c
index 63678b573d61..2a5e3830e953 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -2217,7 +2217,7 @@ retry:
2217 if (!abs_time) 2217 if (!abs_time)
2218 goto out; 2218 goto out;
2219 2219
2220 restart = &current_thread_info()->restart_block; 2220 restart = &current->restart_block;
2221 restart->fn = futex_wait_restart; 2221 restart->fn = futex_wait_restart;
2222 restart->futex.uaddr = uaddr; 2222 restart->futex.uaddr = uaddr;
2223 restart->futex.val = val; 2223 restart->futex.val = val;
@@ -2258,7 +2258,7 @@ static long futex_wait_restart(struct restart_block *restart)
2258 * if there are waiters then it will block, it does PI, etc. (Due to 2258 * if there are waiters then it will block, it does PI, etc. (Due to
2259 * races the kernel might see a 0 value of the futex too.) 2259 * races the kernel might see a 0 value of the futex too.)
2260 */ 2260 */
2261static int futex_lock_pi(u32 __user *uaddr, unsigned int flags, int detect, 2261static int futex_lock_pi(u32 __user *uaddr, unsigned int flags,
2262 ktime_t *time, int trylock) 2262 ktime_t *time, int trylock)
2263{ 2263{
2264 struct hrtimer_sleeper timeout, *to = NULL; 2264 struct hrtimer_sleeper timeout, *to = NULL;
@@ -2953,11 +2953,11 @@ long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout,
2953 case FUTEX_WAKE_OP: 2953 case FUTEX_WAKE_OP:
2954 return futex_wake_op(uaddr, flags, uaddr2, val, val2, val3); 2954 return futex_wake_op(uaddr, flags, uaddr2, val, val2, val3);
2955 case FUTEX_LOCK_PI: 2955 case FUTEX_LOCK_PI:
2956 return futex_lock_pi(uaddr, flags, val, timeout, 0); 2956 return futex_lock_pi(uaddr, flags, timeout, 0);
2957 case FUTEX_UNLOCK_PI: 2957 case FUTEX_UNLOCK_PI:
2958 return futex_unlock_pi(uaddr, flags); 2958 return futex_unlock_pi(uaddr, flags);
2959 case FUTEX_TRYLOCK_PI: 2959 case FUTEX_TRYLOCK_PI:
2960 return futex_lock_pi(uaddr, flags, 0, timeout, 1); 2960 return futex_lock_pi(uaddr, flags, NULL, 1);
2961 case FUTEX_WAIT_REQUEUE_PI: 2961 case FUTEX_WAIT_REQUEUE_PI:
2962 val3 = FUTEX_BITSET_MATCH_ANY; 2962 val3 = FUTEX_BITSET_MATCH_ANY;
2963 return futex_wait_requeue_pi(uaddr, flags, val, timeout, val3, 2963 return futex_wait_requeue_pi(uaddr, flags, val, timeout, val3,
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 80692373abd6..196a06fbc122 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -243,6 +243,9 @@ int irq_set_affinity_hint(unsigned int irq, const struct cpumask *m)
243 return -EINVAL; 243 return -EINVAL;
244 desc->affinity_hint = m; 244 desc->affinity_hint = m;
245 irq_put_desc_unlock(desc, flags); 245 irq_put_desc_unlock(desc, flags);
 246	/* set the initial affinity to prevent every interrupt from landing on CPU0 */
247 if (m)
248 __irq_set_affinity(irq, m, false);
246 return 0; 249 return 0;
247} 250}
248EXPORT_SYMBOL_GPL(irq_set_affinity_hint); 251EXPORT_SYMBOL_GPL(irq_set_affinity_hint);
diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c
index 9dc9bfd8a678..df2f4642d1e7 100644
--- a/kernel/irq/proc.c
+++ b/kernel/irq/proc.c
@@ -46,10 +46,9 @@ static int show_irq_affinity(int type, struct seq_file *m, void *v)
46 mask = desc->pending_mask; 46 mask = desc->pending_mask;
47#endif 47#endif
48 if (type) 48 if (type)
49 seq_cpumask_list(m, mask); 49 seq_printf(m, "%*pbl\n", cpumask_pr_args(mask));
50 else 50 else
51 seq_cpumask(m, mask); 51 seq_printf(m, "%*pb\n", cpumask_pr_args(mask));
52 seq_putc(m, '\n');
53 return 0; 52 return 0;
54} 53}
55 54
@@ -67,8 +66,7 @@ static int irq_affinity_hint_proc_show(struct seq_file *m, void *v)
67 cpumask_copy(mask, desc->affinity_hint); 66 cpumask_copy(mask, desc->affinity_hint);
68 raw_spin_unlock_irqrestore(&desc->lock, flags); 67 raw_spin_unlock_irqrestore(&desc->lock, flags);
69 68
70 seq_cpumask(m, mask); 69 seq_printf(m, "%*pb\n", cpumask_pr_args(mask));
71 seq_putc(m, '\n');
72 free_cpumask_var(mask); 70 free_cpumask_var(mask);
73 71
74 return 0; 72 return 0;
@@ -186,8 +184,7 @@ static const struct file_operations irq_affinity_list_proc_fops = {
186 184
187static int default_affinity_show(struct seq_file *m, void *v) 185static int default_affinity_show(struct seq_file *m, void *v)
188{ 186{
189 seq_cpumask(m, irq_default_affinity); 187 seq_printf(m, "%*pb\n", cpumask_pr_args(irq_default_affinity));
190 seq_putc(m, '\n');
191 return 0; 188 return 0;
192} 189}
193 190
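The seq_cpumask()/seq_cpumask_list() helpers are replaced by the %*pb and %*pbl printk format extensions, which print a cpumask either as a hex bitmap or as a human-readable range list. A userspace stand-in that shows the two output styles for a small mask (the real format specifiers are kernel-only; everything below is illustrative):

#include <stdio.h>
#include <stdint.h>

/* Print a 64-bit mask as a hex bitmap (like %*pb) and as a range list
 * (like %*pbl). The kernel versions also handle masks wider than a word. */
static void print_mask_hex(uint64_t mask)
{
        printf("%llx\n", (unsigned long long)mask);
}

static void print_mask_list(uint64_t mask)
{
        int first = 1;

        for (int cpu = 0; cpu < 64; cpu++) {
                if (!(mask & (1ULL << cpu)))
                        continue;
                int end = cpu;
                while (end + 1 < 64 && (mask & (1ULL << (end + 1))))
                        end++;
                printf("%s%d", first ? "" : ",", cpu);
                if (end > cpu)
                        printf("-%d", end);
                first = 0;
                cpu = end;
        }
        printf("\n");
}

int main(void)
{
        uint64_t mask = 0xf5;           /* CPUs 0, 2, 4-7 */

        print_mask_hex(mask);           /* "f5"       (like %*pb)  */
        print_mask_list(mask);          /* "0,2,4-7"  (like %*pbl) */
        return 0;
}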
diff --git a/kernel/kexec.c b/kernel/kexec.c
index 9a8a01abbaed..38c25b1f2fd5 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -444,7 +444,7 @@ arch_kexec_apply_relocations(const Elf_Ehdr *ehdr, Elf_Shdr *sechdrs,
444} 444}
445 445
446/* 446/*
447 * Free up memory used by kernel, initrd, and comand line. This is temporary 447 * Free up memory used by kernel, initrd, and command line. This is temporary
448 * memory allocation which is not needed any more after these buffers have 448 * memory allocation which is not needed any more after these buffers have
449 * been loaded into separate segments and have been copied elsewhere. 449 * been loaded into separate segments and have been copied elsewhere.
450 */ 450 */
@@ -856,8 +856,6 @@ static int kimage_set_destination(struct kimage *image,
856 856
857 destination &= PAGE_MASK; 857 destination &= PAGE_MASK;
858 result = kimage_add_entry(image, destination | IND_DESTINATION); 858 result = kimage_add_entry(image, destination | IND_DESTINATION);
859 if (result == 0)
860 image->destination = destination;
861 859
862 return result; 860 return result;
863} 861}
@@ -869,8 +867,6 @@ static int kimage_add_page(struct kimage *image, unsigned long page)
869 867
870 page &= PAGE_MASK; 868 page &= PAGE_MASK;
871 result = kimage_add_entry(image, page | IND_SOURCE); 869 result = kimage_add_entry(image, page | IND_SOURCE);
872 if (result == 0)
873 image->destination += PAGE_SIZE;
874 870
875 return result; 871 return result;
876} 872}
@@ -1288,19 +1284,22 @@ SYSCALL_DEFINE4(kexec_load, unsigned long, entry, unsigned long, nr_segments,
1288 if (nr_segments > 0) { 1284 if (nr_segments > 0) {
1289 unsigned long i; 1285 unsigned long i;
1290 1286
1291 /* Loading another kernel to reboot into */ 1287 if (flags & KEXEC_ON_CRASH) {
1292 if ((flags & KEXEC_ON_CRASH) == 0) 1288 /*
1293 result = kimage_alloc_init(&image, entry, nr_segments, 1289 * Loading another kernel to switch to if this one
1294 segments, flags); 1290 * crashes. Free any current crash dump kernel before
1295 /* Loading another kernel to switch to if this one crashes */
1296 else if (flags & KEXEC_ON_CRASH) {
1297 /* Free any current crash dump kernel before
1298 * we corrupt it. 1291 * we corrupt it.
1299 */ 1292 */
1293
1300 kimage_free(xchg(&kexec_crash_image, NULL)); 1294 kimage_free(xchg(&kexec_crash_image, NULL));
1301 result = kimage_alloc_init(&image, entry, nr_segments, 1295 result = kimage_alloc_init(&image, entry, nr_segments,
1302 segments, flags); 1296 segments, flags);
1303 crash_map_reserved_pages(); 1297 crash_map_reserved_pages();
1298 } else {
1299 /* Loading another kernel to reboot into. */
1300
1301 result = kimage_alloc_init(&image, entry, nr_segments,
1302 segments, flags);
1304 } 1303 }
1305 if (result) 1304 if (result)
1306 goto out; 1305 goto out;
@@ -2512,7 +2511,7 @@ static int kexec_apply_relocations(struct kimage *image)
2512 continue; 2511 continue;
2513 2512
2514 /* 2513 /*
2515 * Respective archicture needs to provide support for applying 2514 * Respective architecture needs to provide support for applying
2516 * relocations of type SHT_RELA/SHT_REL. 2515 * relocations of type SHT_RELA/SHT_REL.
2517 */ 2516 */
2518 if (sechdrs[i].sh_type == SHT_RELA) 2517 if (sechdrs[i].sh_type == SHT_RELA)
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index 06f58309fed2..c90e417bb963 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -127,7 +127,7 @@ static void *alloc_insn_page(void)
127 127
128static void free_insn_page(void *page) 128static void free_insn_page(void *page)
129{ 129{
130 module_free(NULL, page); 130 module_memfree(page);
131} 131}
132 132
133struct kprobe_insn_cache kprobe_insn_slots = { 133struct kprobe_insn_cache kprobe_insn_slots = {
@@ -717,7 +717,7 @@ static void prepare_optimized_kprobe(struct kprobe *p)
717 struct optimized_kprobe *op; 717 struct optimized_kprobe *op;
718 718
719 op = container_of(p, struct optimized_kprobe, kp); 719 op = container_of(p, struct optimized_kprobe, kp);
720 arch_prepare_optimized_kprobe(op); 720 arch_prepare_optimized_kprobe(op, p);
721} 721}
722 722
723/* Allocate new optimized_kprobe and try to prepare optimized instructions */ 723/* Allocate new optimized_kprobe and try to prepare optimized instructions */
@@ -731,7 +731,7 @@ static struct kprobe *alloc_aggr_kprobe(struct kprobe *p)
731 731
732 INIT_LIST_HEAD(&op->list); 732 INIT_LIST_HEAD(&op->list);
733 op->kp.addr = p->addr; 733 op->kp.addr = p->addr;
734 arch_prepare_optimized_kprobe(op); 734 arch_prepare_optimized_kprobe(op, p);
735 735
736 return &op->kp; 736 return &op->kp;
737} 737}
@@ -869,7 +869,8 @@ static void __disarm_kprobe(struct kprobe *p, bool reopt)
869{ 869{
870 struct kprobe *_p; 870 struct kprobe *_p;
871 871
872 unoptimize_kprobe(p, false); /* Try to unoptimize */ 872 /* Try to unoptimize */
873 unoptimize_kprobe(p, kprobes_all_disarmed);
873 874
874 if (!kprobe_queued(p)) { 875 if (!kprobe_queued(p)) {
875 arch_disarm_kprobe(p); 876 arch_disarm_kprobe(p);
@@ -1571,7 +1572,13 @@ static struct kprobe *__disable_kprobe(struct kprobe *p)
1571 1572
1572 /* Try to disarm and disable this/parent probe */ 1573 /* Try to disarm and disable this/parent probe */
1573 if (p == orig_p || aggr_kprobe_disabled(orig_p)) { 1574 if (p == orig_p || aggr_kprobe_disabled(orig_p)) {
1574 disarm_kprobe(orig_p, true); 1575 /*
1576 * If kprobes_all_disarmed is set, orig_p
1577 * should have already been disarmed, so
1578 * skip unneed disarming process.
1579 */
1580 if (!kprobes_all_disarmed)
1581 disarm_kprobe(orig_p, true);
1575 orig_p->flags |= KPROBE_FLAG_DISABLED; 1582 orig_p->flags |= KPROBE_FLAG_DISABLED;
1576 } 1583 }
1577 } 1584 }
@@ -2320,6 +2327,12 @@ static void arm_all_kprobes(void)
2320 if (!kprobes_all_disarmed) 2327 if (!kprobes_all_disarmed)
2321 goto already_enabled; 2328 goto already_enabled;
2322 2329
2330 /*
2331 * optimize_kprobe() called by arm_kprobe() checks
2332 * kprobes_all_disarmed, so set kprobes_all_disarmed before
2333 * arm_kprobe.
 2334	 * calling arm_kprobe().
2335 kprobes_all_disarmed = false;
2323 /* Arming kprobes doesn't optimize kprobe itself */ 2336 /* Arming kprobes doesn't optimize kprobe itself */
2324 for (i = 0; i < KPROBE_TABLE_SIZE; i++) { 2337 for (i = 0; i < KPROBE_TABLE_SIZE; i++) {
2325 head = &kprobe_table[i]; 2338 head = &kprobe_table[i];
@@ -2328,7 +2341,6 @@ static void arm_all_kprobes(void)
2328 arm_kprobe(p); 2341 arm_kprobe(p);
2329 } 2342 }
2330 2343
2331 kprobes_all_disarmed = false;
2332 printk(KERN_INFO "Kprobes globally enabled\n"); 2344 printk(KERN_INFO "Kprobes globally enabled\n");
2333 2345
2334already_enabled: 2346already_enabled:
diff --git a/kernel/livepatch/Kconfig b/kernel/livepatch/Kconfig
new file mode 100644
index 000000000000..045022557936
--- /dev/null
+++ b/kernel/livepatch/Kconfig
@@ -0,0 +1,18 @@
1config HAVE_LIVEPATCH
2 bool
3 help
4 Arch supports kernel live patching
5
6config LIVEPATCH
7 bool "Kernel Live Patching"
8 depends on DYNAMIC_FTRACE_WITH_REGS
9 depends on MODULES
10 depends on SYSFS
11 depends on KALLSYMS_ALL
12 depends on HAVE_LIVEPATCH
13 help
14 Say Y here if you want to support kernel live patching.
15 This option has no runtime impact until a kernel "patch"
16 module uses the interface provided by this option to register
17 a patch, causing calls to patched functions to be redirected
18 to new function code contained in the patch module.
diff --git a/kernel/livepatch/Makefile b/kernel/livepatch/Makefile
new file mode 100644
index 000000000000..e8780c0901d9
--- /dev/null
+++ b/kernel/livepatch/Makefile
@@ -0,0 +1,3 @@
1obj-$(CONFIG_LIVEPATCH) += livepatch.o
2
3livepatch-objs := core.o
diff --git a/kernel/livepatch/core.c b/kernel/livepatch/core.c
new file mode 100644
index 000000000000..ff7f47d026ac
--- /dev/null
+++ b/kernel/livepatch/core.c
@@ -0,0 +1,1015 @@
1/*
2 * core.c - Kernel Live Patching Core
3 *
4 * Copyright (C) 2014 Seth Jennings <sjenning@redhat.com>
5 * Copyright (C) 2014 SUSE
6 *
7 * This program is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License
9 * as published by the Free Software Foundation; either version 2
10 * of the License, or (at your option) any later version.
11 *
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, see <http://www.gnu.org/licenses/>.
19 */
20
21#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
22
23#include <linux/module.h>
24#include <linux/kernel.h>
25#include <linux/mutex.h>
26#include <linux/slab.h>
27#include <linux/ftrace.h>
28#include <linux/list.h>
29#include <linux/kallsyms.h>
30#include <linux/livepatch.h>
31
32/**
33 * struct klp_ops - structure for tracking registered ftrace ops structs
34 *
35 * A single ftrace_ops is shared between all enabled replacement functions
36 * (klp_func structs) which have the same old_addr. This allows the switch
37 * between function versions to happen instantaneously by updating the klp_ops
38 * struct's func_stack list. The winner is the klp_func at the top of the
39 * func_stack (front of the list).
40 *
41 * @node: node for the global klp_ops list
42 * @func_stack: list head for the stack of klp_func's (active func is on top)
43 * @fops: registered ftrace ops struct
44 */
45struct klp_ops {
46 struct list_head node;
47 struct list_head func_stack;
48 struct ftrace_ops fops;
49};
50
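The comment above describes the core dispatch idea: every active replacement for one old_addr sits on a func_stack, and the ftrace handler always redirects to whichever klp_func is on top. A self-contained userspace sketch of that top-of-stack dispatch, where plain function pointers stand in for patched code (nothing here is the livepatch API):

#include <stdio.h>
#include <stddef.h>

/* Several replacement functions for the same patched address are kept on a
 * stack; the dispatcher runs the one on top, so the newest enabled patch
 * wins instantly and popping it falls back to the previous version. */
struct func {
        void (*new_func)(void);
        struct func *next;              /* next (older) entry on the stack */
};

static struct func *func_stack;         /* top of the stack, or NULL */

static void push(struct func *f) { f->next = func_stack; func_stack = f; }
static void pop(void)            { if (func_stack) func_stack = func_stack->next; }

static void handler(void)               /* stands in for klp_ftrace_handler() */
{
        if (func_stack)
                func_stack->new_func(); /* redirect to the active version */
        else
                printf("original function\n");
}

static void v1(void) { printf("patched: v1\n"); }
static void v2(void) { printf("patched: v2\n"); }

int main(void)
{
        struct func f1 = { v1, NULL }, f2 = { v2, NULL };

        handler();              /* original */
        push(&f1); handler();   /* v1 */
        push(&f2); handler();   /* v2 wins while enabled */
        pop();     handler();   /* back to v1 */
        return 0;
}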
51/*
52 * The klp_mutex protects the global lists and state transitions of any
53 * structure reachable from them. References to any structure must be obtained
54 * under mutex protection (except in klp_ftrace_handler(), which uses RCU to
55 * ensure it gets consistent data).
56 */
57static DEFINE_MUTEX(klp_mutex);
58
59static LIST_HEAD(klp_patches);
60static LIST_HEAD(klp_ops);
61
62static struct kobject *klp_root_kobj;
63
64static struct klp_ops *klp_find_ops(unsigned long old_addr)
65{
66 struct klp_ops *ops;
67 struct klp_func *func;
68
69 list_for_each_entry(ops, &klp_ops, node) {
70 func = list_first_entry(&ops->func_stack, struct klp_func,
71 stack_node);
72 if (func->old_addr == old_addr)
73 return ops;
74 }
75
76 return NULL;
77}
78
79static bool klp_is_module(struct klp_object *obj)
80{
81 return obj->name;
82}
83
84static bool klp_is_object_loaded(struct klp_object *obj)
85{
86 return !obj->name || obj->mod;
87}
88
89/* sets obj->mod if object is not vmlinux and module is found */
90static void klp_find_object_module(struct klp_object *obj)
91{
92 if (!klp_is_module(obj))
93 return;
94
95 mutex_lock(&module_mutex);
96 /*
97 * We don't need to take a reference on the module here because we have
98 * the klp_mutex, which is also taken by the module notifier. This
99 * prevents any module from unloading until we release the klp_mutex.
100 */
101 obj->mod = find_module(obj->name);
102 mutex_unlock(&module_mutex);
103}
104
105/* klp_mutex must be held by caller */
106static bool klp_is_patch_registered(struct klp_patch *patch)
107{
108 struct klp_patch *mypatch;
109
110 list_for_each_entry(mypatch, &klp_patches, list)
111 if (mypatch == patch)
112 return true;
113
114 return false;
115}
116
117static bool klp_initialized(void)
118{
119 return klp_root_kobj;
120}
121
122struct klp_find_arg {
123 const char *objname;
124 const char *name;
125 unsigned long addr;
126 /*
127 * If count == 0, the symbol was not found. If count == 1, a unique
128 * match was found and addr is set. If count > 1, there is
129 * unresolvable ambiguity among "count" number of symbols with the same
130 * name in the same object.
131 */
132 unsigned long count;
133};
134
135static int klp_find_callback(void *data, const char *name,
136 struct module *mod, unsigned long addr)
137{
138 struct klp_find_arg *args = data;
139
140 if ((mod && !args->objname) || (!mod && args->objname))
141 return 0;
142
143 if (strcmp(args->name, name))
144 return 0;
145
146 if (args->objname && strcmp(args->objname, mod->name))
147 return 0;
148
149 /*
150 * args->addr might be overwritten if another match is found
151 * but klp_find_object_symbol() handles this and only returns the
152 * addr if count == 1.
153 */
154 args->addr = addr;
155 args->count++;
156
157 return 0;
158}
159
160static int klp_find_object_symbol(const char *objname, const char *name,
161 unsigned long *addr)
162{
163 struct klp_find_arg args = {
164 .objname = objname,
165 .name = name,
166 .addr = 0,
167 .count = 0
168 };
169
170 kallsyms_on_each_symbol(klp_find_callback, &args);
171
172 if (args.count == 0)
173 pr_err("symbol '%s' not found in symbol table\n", name);
174 else if (args.count > 1)
175 pr_err("unresolvable ambiguity (%lu matches) on symbol '%s' in object '%s'\n",
176 args.count, name, objname);
177 else {
178 *addr = args.addr;
179 return 0;
180 }
181
182 *addr = 0;
183 return -EINVAL;
184}
185
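klp_find_object_symbol() walks every kallsyms entry, counts the name matches, and only trusts the recorded address when the match is unique; zero matches and multiple matches are both errors. The same rule applied to a toy symbol table, purely for illustration:

#include <stdio.h>
#include <stddef.h>
#include <string.h>

/* Count matching names; accept the address only when exactly one matches. */
struct sym { const char *name; unsigned long addr; };

static const struct sym table[] = {
        { "do_one_thing", 0x1000 },
        { "helper",       0x2000 },
        { "helper",       0x3000 },     /* duplicate name: ambiguous */
};

static int find_symbol(const char *name, unsigned long *addr)
{
        unsigned long count = 0;
        size_t i;

        for (i = 0; i < sizeof(table) / sizeof(table[0]); i++) {
                if (strcmp(table[i].name, name))
                        continue;
                *addr = table[i].addr;  /* only valid if count ends up as 1 */
                count++;
        }

        if (count == 1)
                return 0;
        fprintf(stderr, "%s: %s\n", name,
                count ? "unresolvable ambiguity" : "not found");
        *addr = 0;
        return -1;
}

int main(void)
{
        unsigned long addr;

        if (!find_symbol("do_one_thing", &addr))
                printf("do_one_thing at 0x%lx\n", addr);
        find_symbol("helper",  &addr);  /* fails: two matches */
        find_symbol("missing", &addr);  /* fails: zero matches */
        return 0;
}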
186struct klp_verify_args {
187 const char *name;
188 const unsigned long addr;
189};
190
191static int klp_verify_callback(void *data, const char *name,
192 struct module *mod, unsigned long addr)
193{
194 struct klp_verify_args *args = data;
195
196 if (!mod &&
197 !strcmp(args->name, name) &&
198 args->addr == addr)
199 return 1;
200
201 return 0;
202}
203
204static int klp_verify_vmlinux_symbol(const char *name, unsigned long addr)
205{
206 struct klp_verify_args args = {
207 .name = name,
208 .addr = addr,
209 };
210
211 if (kallsyms_on_each_symbol(klp_verify_callback, &args))
212 return 0;
213
214 pr_err("symbol '%s' not found at specified address 0x%016lx, kernel mismatch?\n",
215 name, addr);
216 return -EINVAL;
217}
218
219static int klp_find_verify_func_addr(struct klp_object *obj,
220 struct klp_func *func)
221{
222 int ret;
223
224#if defined(CONFIG_RANDOMIZE_BASE)
225 /* KASLR is enabled, disregard old_addr from user */
226 func->old_addr = 0;
227#endif
228
229 if (!func->old_addr || klp_is_module(obj))
230 ret = klp_find_object_symbol(obj->name, func->old_name,
231 &func->old_addr);
232 else
233 ret = klp_verify_vmlinux_symbol(func->old_name,
234 func->old_addr);
235
236 return ret;
237}
238
239/*
240 * external symbols are located outside the parent object (where the parent
241 * object is either vmlinux or the kmod being patched).
242 */
243static int klp_find_external_symbol(struct module *pmod, const char *name,
244 unsigned long *addr)
245{
246 const struct kernel_symbol *sym;
247
248 /* first, check if it's an exported symbol */
249 preempt_disable();
250 sym = find_symbol(name, NULL, NULL, true, true);
251 preempt_enable();
252 if (sym) {
253 *addr = sym->value;
254 return 0;
255 }
256
257 /* otherwise check if it's in another .o within the patch module */
258 return klp_find_object_symbol(pmod->name, name, addr);
259}
260
261static int klp_write_object_relocations(struct module *pmod,
262 struct klp_object *obj)
263{
264 int ret;
265 struct klp_reloc *reloc;
266
267 if (WARN_ON(!klp_is_object_loaded(obj)))
268 return -EINVAL;
269
270 if (WARN_ON(!obj->relocs))
271 return -EINVAL;
272
273 for (reloc = obj->relocs; reloc->name; reloc++) {
274 if (!klp_is_module(obj)) {
275 ret = klp_verify_vmlinux_symbol(reloc->name,
276 reloc->val);
277 if (ret)
278 return ret;
279 } else {
280 /* module, reloc->val needs to be discovered */
281 if (reloc->external)
282 ret = klp_find_external_symbol(pmod,
283 reloc->name,
284 &reloc->val);
285 else
286 ret = klp_find_object_symbol(obj->mod->name,
287 reloc->name,
288 &reloc->val);
289 if (ret)
290 return ret;
291 }
292 ret = klp_write_module_reloc(pmod, reloc->type, reloc->loc,
293 reloc->val + reloc->addend);
294 if (ret) {
295 pr_err("relocation failed for symbol '%s' at 0x%016lx (%d)\n",
296 reloc->name, reloc->val, ret);
297 return ret;
298 }
299 }
300
301 return 0;
302}
303
304static void notrace klp_ftrace_handler(unsigned long ip,
305 unsigned long parent_ip,
306 struct ftrace_ops *fops,
307 struct pt_regs *regs)
308{
309 struct klp_ops *ops;
310 struct klp_func *func;
311
312 ops = container_of(fops, struct klp_ops, fops);
313
314 rcu_read_lock();
315 func = list_first_or_null_rcu(&ops->func_stack, struct klp_func,
316 stack_node);
317 rcu_read_unlock();
318
319 if (WARN_ON_ONCE(!func))
320 return;
321
322 klp_arch_set_pc(regs, (unsigned long)func->new_func);
323}
324
325static int klp_disable_func(struct klp_func *func)
326{
327 struct klp_ops *ops;
328 int ret;
329
330 if (WARN_ON(func->state != KLP_ENABLED))
331 return -EINVAL;
332
333 if (WARN_ON(!func->old_addr))
334 return -EINVAL;
335
336 ops = klp_find_ops(func->old_addr);
337 if (WARN_ON(!ops))
338 return -EINVAL;
339
340 if (list_is_singular(&ops->func_stack)) {
341 ret = unregister_ftrace_function(&ops->fops);
342 if (ret) {
343 pr_err("failed to unregister ftrace handler for function '%s' (%d)\n",
344 func->old_name, ret);
345 return ret;
346 }
347
348 ret = ftrace_set_filter_ip(&ops->fops, func->old_addr, 1, 0);
349 if (ret)
350 pr_warn("function unregister succeeded but failed to clear the filter\n");
351
352 list_del_rcu(&func->stack_node);
353 list_del(&ops->node);
354 kfree(ops);
355 } else {
356 list_del_rcu(&func->stack_node);
357 }
358
359 func->state = KLP_DISABLED;
360
361 return 0;
362}
363
364static int klp_enable_func(struct klp_func *func)
365{
366 struct klp_ops *ops;
367 int ret;
368
369 if (WARN_ON(!func->old_addr))
370 return -EINVAL;
371
372 if (WARN_ON(func->state != KLP_DISABLED))
373 return -EINVAL;
374
375 ops = klp_find_ops(func->old_addr);
376 if (!ops) {
377 ops = kzalloc(sizeof(*ops), GFP_KERNEL);
378 if (!ops)
379 return -ENOMEM;
380
381 ops->fops.func = klp_ftrace_handler;
382 ops->fops.flags = FTRACE_OPS_FL_SAVE_REGS |
383 FTRACE_OPS_FL_DYNAMIC |
384 FTRACE_OPS_FL_IPMODIFY;
385
386 list_add(&ops->node, &klp_ops);
387
388 INIT_LIST_HEAD(&ops->func_stack);
389 list_add_rcu(&func->stack_node, &ops->func_stack);
390
391 ret = ftrace_set_filter_ip(&ops->fops, func->old_addr, 0, 0);
392 if (ret) {
393 pr_err("failed to set ftrace filter for function '%s' (%d)\n",
394 func->old_name, ret);
395 goto err;
396 }
397
398 ret = register_ftrace_function(&ops->fops);
399 if (ret) {
400 pr_err("failed to register ftrace handler for function '%s' (%d)\n",
401 func->old_name, ret);
402 ftrace_set_filter_ip(&ops->fops, func->old_addr, 1, 0);
403 goto err;
404 }
405
406
407 } else {
408 list_add_rcu(&func->stack_node, &ops->func_stack);
409 }
410
411 func->state = KLP_ENABLED;
412
413 return 0;
414
415err:
416 list_del_rcu(&func->stack_node);
417 list_del(&ops->node);
418 kfree(ops);
419 return ret;
420}
421
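klp_enable_func() sets the ftrace filter for old_addr first and only then registers the handler; if registration fails, the filter is cleared again so nothing is left half-enabled. A tiny sketch of that two-step setup with rollback, using fake step functions that simulate failure (illustrative only):

#include <stdio.h>

static int set_filter(int fail_step)
{
        if (fail_step == 1)
                return -1;
        printf("filter set\n");
        return 0;
}

static void clear_filter(void)
{
        printf("filter cleared\n");
}

static int register_handler(int fail_step)
{
        if (fail_step == 2)
                return -1;
        printf("handler registered\n");
        return 0;
}

/* Step 1 must be undone when step 2 fails, leaving the system untouched. */
static int enable_func(int fail_step)
{
        int ret;

        ret = set_filter(fail_step);
        if (ret)
                return ret;             /* nothing to undo yet */

        ret = register_handler(fail_step);
        if (ret) {
                clear_filter();         /* roll back step 1 */
                return ret;
        }
        return 0;
}

int main(void)
{
        printf("ok path:     %d\n", enable_func(0));
        printf("fail step 2: %d\n", enable_func(2));
        return 0;
}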
422static int klp_disable_object(struct klp_object *obj)
423{
424 struct klp_func *func;
425 int ret;
426
427 for (func = obj->funcs; func->old_name; func++) {
428 if (func->state != KLP_ENABLED)
429 continue;
430
431 ret = klp_disable_func(func);
432 if (ret)
433 return ret;
434 }
435
436 obj->state = KLP_DISABLED;
437
438 return 0;
439}
440
441static int klp_enable_object(struct klp_object *obj)
442{
443 struct klp_func *func;
444 int ret;
445
446 if (WARN_ON(obj->state != KLP_DISABLED))
447 return -EINVAL;
448
449 if (WARN_ON(!klp_is_object_loaded(obj)))
450 return -EINVAL;
451
452 for (func = obj->funcs; func->old_name; func++) {
453 ret = klp_enable_func(func);
454 if (ret)
455 goto unregister;
456 }
457 obj->state = KLP_ENABLED;
458
459 return 0;
460
461unregister:
462 WARN_ON(klp_disable_object(obj));
463 return ret;
464}
465
466static int __klp_disable_patch(struct klp_patch *patch)
467{
468 struct klp_object *obj;
469 int ret;
470
471 /* enforce stacking: only the last enabled patch can be disabled */
472 if (!list_is_last(&patch->list, &klp_patches) &&
473 list_next_entry(patch, list)->state == KLP_ENABLED)
474 return -EBUSY;
475
476 pr_notice("disabling patch '%s'\n", patch->mod->name);
477
478 for (obj = patch->objs; obj->funcs; obj++) {
479 if (obj->state != KLP_ENABLED)
480 continue;
481
482 ret = klp_disable_object(obj);
483 if (ret)
484 return ret;
485 }
486
487 patch->state = KLP_DISABLED;
488
489 return 0;
490}
491
492/**
493 * klp_disable_patch() - disables a registered patch
494 * @patch: The registered, enabled patch to be disabled
495 *
496 * Unregisters the patched functions from ftrace.
497 *
498 * Return: 0 on success, otherwise error
499 */
500int klp_disable_patch(struct klp_patch *patch)
501{
502 int ret;
503
504 mutex_lock(&klp_mutex);
505
506 if (!klp_is_patch_registered(patch)) {
507 ret = -EINVAL;
508 goto err;
509 }
510
511 if (patch->state == KLP_DISABLED) {
512 ret = -EINVAL;
513 goto err;
514 }
515
516 ret = __klp_disable_patch(patch);
517
518err:
519 mutex_unlock(&klp_mutex);
520 return ret;
521}
522EXPORT_SYMBOL_GPL(klp_disable_patch);
523
524static int __klp_enable_patch(struct klp_patch *patch)
525{
526 struct klp_object *obj;
527 int ret;
528
529 if (WARN_ON(patch->state != KLP_DISABLED))
530 return -EINVAL;
531
532 /* enforce stacking: only the first disabled patch can be enabled */
533 if (patch->list.prev != &klp_patches &&
534 list_prev_entry(patch, list)->state == KLP_DISABLED)
535 return -EBUSY;
536
537 pr_notice_once("tainting kernel with TAINT_LIVEPATCH\n");
538 add_taint(TAINT_LIVEPATCH, LOCKDEP_STILL_OK);
539
540 pr_notice("enabling patch '%s'\n", patch->mod->name);
541
542 for (obj = patch->objs; obj->funcs; obj++) {
543 klp_find_object_module(obj);
544
545 if (!klp_is_object_loaded(obj))
546 continue;
547
548 ret = klp_enable_object(obj);
549 if (ret)
550 goto unregister;
551 }
552
553 patch->state = KLP_ENABLED;
554
555 return 0;
556
557unregister:
558 WARN_ON(__klp_disable_patch(patch));
559 return ret;
560}
561
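__klp_disable_patch() and __klp_enable_patch() enforce strict stacking: patches form an ordered list, only the most recently enabled patch may be disabled, and only the oldest disabled patch may be enabled. A toy model of those two checks, with array indices standing in for the patch list and a plain "busy" standing in for -EBUSY:

#include <stdio.h>

enum state { DISABLED, ENABLED };

#define NPATCH 3
static enum state patches[NPATCH];      /* index 0 = oldest, NPATCH-1 = newest */

static int can_disable(int i)
{
        /* every later patch must already be disabled */
        for (int j = i + 1; j < NPATCH; j++)
                if (patches[j] == ENABLED)
                        return 0;
        return patches[i] == ENABLED;
}

static int can_enable(int i)
{
        /* every earlier patch must already be enabled */
        for (int j = 0; j < i; j++)
                if (patches[j] == DISABLED)
                        return 0;
        return patches[i] == DISABLED;
}

int main(void)
{
        patches[0] = ENABLED;
        patches[1] = ENABLED;
        patches[2] = DISABLED;

        printf("disable patch 0: %s\n", can_disable(0) ? "ok" : "busy");
        printf("disable patch 1: %s\n", can_disable(1) ? "ok" : "busy");
        printf("enable  patch 2: %s\n", can_enable(2)  ? "ok" : "busy");
        return 0;
}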
562/**
563 * klp_enable_patch() - enables a registered patch
564 * @patch: The registered, disabled patch to be enabled
565 *
566 * Performs the needed symbol lookups and code relocations,
567 * then registers the patched functions with ftrace.
568 *
569 * Return: 0 on success, otherwise error
570 */
571int klp_enable_patch(struct klp_patch *patch)
572{
573 int ret;
574
575 mutex_lock(&klp_mutex);
576
577 if (!klp_is_patch_registered(patch)) {
578 ret = -EINVAL;
579 goto err;
580 }
581
582 ret = __klp_enable_patch(patch);
583
584err:
585 mutex_unlock(&klp_mutex);
586 return ret;
587}
588EXPORT_SYMBOL_GPL(klp_enable_patch);
589
590/*
591 * Sysfs Interface
592 *
593 * /sys/kernel/livepatch
594 * /sys/kernel/livepatch/<patch>
595 * /sys/kernel/livepatch/<patch>/enabled
596 * /sys/kernel/livepatch/<patch>/<object>
597 * /sys/kernel/livepatch/<patch>/<object>/<func>
598 */
599
600static ssize_t enabled_store(struct kobject *kobj, struct kobj_attribute *attr,
601 const char *buf, size_t count)
602{
603 struct klp_patch *patch;
604 int ret;
605 unsigned long val;
606
607 ret = kstrtoul(buf, 10, &val);
608 if (ret)
609 return -EINVAL;
610
611 if (val != KLP_DISABLED && val != KLP_ENABLED)
612 return -EINVAL;
613
614 patch = container_of(kobj, struct klp_patch, kobj);
615
616 mutex_lock(&klp_mutex);
617
618 if (val == patch->state) {
619 /* already in requested state */
620 ret = -EINVAL;
621 goto err;
622 }
623
624 if (val == KLP_ENABLED) {
625 ret = __klp_enable_patch(patch);
626 if (ret)
627 goto err;
628 } else {
629 ret = __klp_disable_patch(patch);
630 if (ret)
631 goto err;
632 }
633
634 mutex_unlock(&klp_mutex);
635
636 return count;
637
638err:
639 mutex_unlock(&klp_mutex);
640 return ret;
641}
642
643static ssize_t enabled_show(struct kobject *kobj,
644 struct kobj_attribute *attr, char *buf)
645{
646 struct klp_patch *patch;
647
648 patch = container_of(kobj, struct klp_patch, kobj);
649 return snprintf(buf, PAGE_SIZE-1, "%d\n", patch->state);
650}
651
652static struct kobj_attribute enabled_kobj_attr = __ATTR_RW(enabled);
653static struct attribute *klp_patch_attrs[] = {
654 &enabled_kobj_attr.attr,
655 NULL
656};
657
658static void klp_kobj_release_patch(struct kobject *kobj)
659{
660 /*
661 * Once we have a consistency model we'll need to module_put() the
662 * patch module here. See klp_register_patch() for more details.
663 */
664}
665
666static struct kobj_type klp_ktype_patch = {
667 .release = klp_kobj_release_patch,
668 .sysfs_ops = &kobj_sysfs_ops,
669 .default_attrs = klp_patch_attrs,
670};
671
672static void klp_kobj_release_func(struct kobject *kobj)
673{
674}
675
676static struct kobj_type klp_ktype_func = {
677 .release = klp_kobj_release_func,
678 .sysfs_ops = &kobj_sysfs_ops,
679};
680
681/*
682 * Free all functions' kobjects in the array up to some limit. When limit is
683 * NULL, all kobjects are freed.
684 */
685static void klp_free_funcs_limited(struct klp_object *obj,
686 struct klp_func *limit)
687{
688 struct klp_func *func;
689
690 for (func = obj->funcs; func->old_name && func != limit; func++)
691 kobject_put(&func->kobj);
692}
693
694/* Clean up when a patched object is unloaded */
695static void klp_free_object_loaded(struct klp_object *obj)
696{
697 struct klp_func *func;
698
699 obj->mod = NULL;
700
701 for (func = obj->funcs; func->old_name; func++)
702 func->old_addr = 0;
703}
704
705/*
706 * Free all objects' kobjects in the array up to some limit. When limit is
707 * NULL, all kobjects are freed.
708 */
709static void klp_free_objects_limited(struct klp_patch *patch,
710 struct klp_object *limit)
711{
712 struct klp_object *obj;
713
714 for (obj = patch->objs; obj->funcs && obj != limit; obj++) {
715 klp_free_funcs_limited(obj, NULL);
716 kobject_put(obj->kobj);
717 }
718}
719
720static void klp_free_patch(struct klp_patch *patch)
721{
722 klp_free_objects_limited(patch, NULL);
723 if (!list_empty(&patch->list))
724 list_del(&patch->list);
725 kobject_put(&patch->kobj);
726}
727
728static int klp_init_func(struct klp_object *obj, struct klp_func *func)
729{
730 INIT_LIST_HEAD(&func->stack_node);
731 func->state = KLP_DISABLED;
732
733 return kobject_init_and_add(&func->kobj, &klp_ktype_func,
734 obj->kobj, func->old_name);
735}
736
 737/* parts of the initialization that are done only when the object is loaded */
738static int klp_init_object_loaded(struct klp_patch *patch,
739 struct klp_object *obj)
740{
741 struct klp_func *func;
742 int ret;
743
744 if (obj->relocs) {
745 ret = klp_write_object_relocations(patch->mod, obj);
746 if (ret)
747 return ret;
748 }
749
750 for (func = obj->funcs; func->old_name; func++) {
751 ret = klp_find_verify_func_addr(obj, func);
752 if (ret)
753 return ret;
754 }
755
756 return 0;
757}
758
759static int klp_init_object(struct klp_patch *patch, struct klp_object *obj)
760{
761 struct klp_func *func;
762 int ret;
763 const char *name;
764
765 if (!obj->funcs)
766 return -EINVAL;
767
768 obj->state = KLP_DISABLED;
769
770 klp_find_object_module(obj);
771
772 name = klp_is_module(obj) ? obj->name : "vmlinux";
773 obj->kobj = kobject_create_and_add(name, &patch->kobj);
774 if (!obj->kobj)
775 return -ENOMEM;
776
777 for (func = obj->funcs; func->old_name; func++) {
778 ret = klp_init_func(obj, func);
779 if (ret)
780 goto free;
781 }
782
783 if (klp_is_object_loaded(obj)) {
784 ret = klp_init_object_loaded(patch, obj);
785 if (ret)
786 goto free;
787 }
788
789 return 0;
790
791free:
792 klp_free_funcs_limited(obj, func);
793 kobject_put(obj->kobj);
794 return ret;
795}
796
797static int klp_init_patch(struct klp_patch *patch)
798{
799 struct klp_object *obj;
800 int ret;
801
802 if (!patch->objs)
803 return -EINVAL;
804
805 mutex_lock(&klp_mutex);
806
807 patch->state = KLP_DISABLED;
808
809 ret = kobject_init_and_add(&patch->kobj, &klp_ktype_patch,
810 klp_root_kobj, patch->mod->name);
811 if (ret)
812 goto unlock;
813
814 for (obj = patch->objs; obj->funcs; obj++) {
815 ret = klp_init_object(patch, obj);
816 if (ret)
817 goto free;
818 }
819
820 list_add_tail(&patch->list, &klp_patches);
821
822 mutex_unlock(&klp_mutex);
823
824 return 0;
825
826free:
827 klp_free_objects_limited(patch, obj);
828 kobject_put(&patch->kobj);
829unlock:
830 mutex_unlock(&klp_mutex);
831 return ret;
832}
833
834/**
835 * klp_unregister_patch() - unregisters a patch
836 * @patch: Disabled patch to be unregistered
837 *
838 * Frees the data structures and removes the sysfs interface.
839 *
840 * Return: 0 on success, otherwise error
841 */
842int klp_unregister_patch(struct klp_patch *patch)
843{
844 int ret = 0;
845
846 mutex_lock(&klp_mutex);
847
848 if (!klp_is_patch_registered(patch)) {
849 ret = -EINVAL;
850 goto out;
851 }
852
853 if (patch->state == KLP_ENABLED) {
854 ret = -EBUSY;
855 goto out;
856 }
857
858 klp_free_patch(patch);
859
860out:
861 mutex_unlock(&klp_mutex);
862 return ret;
863}
864EXPORT_SYMBOL_GPL(klp_unregister_patch);
865
866/**
867 * klp_register_patch() - registers a patch
868 * @patch: Patch to be registered
869 *
870 * Initializes the data structure associated with the patch and
871 * creates the sysfs interface.
872 *
873 * Return: 0 on success, otherwise error
874 */
875int klp_register_patch(struct klp_patch *patch)
876{
877 int ret;
878
879 if (!klp_initialized())
880 return -ENODEV;
881
882 if (!patch || !patch->mod)
883 return -EINVAL;
884
885 /*
886 * A reference is taken on the patch module to prevent it from being
887 * unloaded. Right now, we don't allow patch modules to unload since
888 * there is currently no method to determine if a thread is still
889 * running in the patched code contained in the patch module once
890 * the ftrace registration is successful.
891 */
892 if (!try_module_get(patch->mod))
893 return -ENODEV;
894
895 ret = klp_init_patch(patch);
896 if (ret)
897 module_put(patch->mod);
898
899 return ret;
900}
901EXPORT_SYMBOL_GPL(klp_register_patch);
902
903static void klp_module_notify_coming(struct klp_patch *patch,
904 struct klp_object *obj)
905{
906 struct module *pmod = patch->mod;
907 struct module *mod = obj->mod;
908 int ret;
909
910 ret = klp_init_object_loaded(patch, obj);
911 if (ret)
912 goto err;
913
914 if (patch->state == KLP_DISABLED)
915 return;
916
917 pr_notice("applying patch '%s' to loading module '%s'\n",
918 pmod->name, mod->name);
919
920 ret = klp_enable_object(obj);
921 if (!ret)
922 return;
923
924err:
925 pr_warn("failed to apply patch '%s' to module '%s' (%d)\n",
926 pmod->name, mod->name, ret);
927}
928
929static void klp_module_notify_going(struct klp_patch *patch,
930 struct klp_object *obj)
931{
932 struct module *pmod = patch->mod;
933 struct module *mod = obj->mod;
934 int ret;
935
936 if (patch->state == KLP_DISABLED)
937 goto disabled;
938
939 pr_notice("reverting patch '%s' on unloading module '%s'\n",
940 pmod->name, mod->name);
941
942 ret = klp_disable_object(obj);
943 if (ret)
944 pr_warn("failed to revert patch '%s' on module '%s' (%d)\n",
945 pmod->name, mod->name, ret);
946
947disabled:
948 klp_free_object_loaded(obj);
949}
950
951static int klp_module_notify(struct notifier_block *nb, unsigned long action,
952 void *data)
953{
954 struct module *mod = data;
955 struct klp_patch *patch;
956 struct klp_object *obj;
957
958 if (action != MODULE_STATE_COMING && action != MODULE_STATE_GOING)
959 return 0;
960
961 mutex_lock(&klp_mutex);
962
963 list_for_each_entry(patch, &klp_patches, list) {
964 for (obj = patch->objs; obj->funcs; obj++) {
965 if (!klp_is_module(obj) || strcmp(obj->name, mod->name))
966 continue;
967
968 if (action == MODULE_STATE_COMING) {
969 obj->mod = mod;
970 klp_module_notify_coming(patch, obj);
971 } else /* MODULE_STATE_GOING */
972 klp_module_notify_going(patch, obj);
973
974 break;
975 }
976 }
977
978 mutex_unlock(&klp_mutex);
979
980 return 0;
981}
982
983static struct notifier_block klp_module_nb = {
984 .notifier_call = klp_module_notify,
985 .priority = INT_MIN+1, /* called late but before ftrace notifier */
986};
987
988static int klp_init(void)
989{
990 int ret;
991
992 ret = klp_check_compiler_support();
993 if (ret) {
994 pr_info("Your compiler is too old; turning off.\n");
995 return -EINVAL;
996 }
997
998 ret = register_module_notifier(&klp_module_nb);
999 if (ret)
1000 return ret;
1001
1002 klp_root_kobj = kobject_create_and_add("livepatch", kernel_kobj);
1003 if (!klp_root_kobj) {
1004 ret = -ENOMEM;
1005 goto unregister;
1006 }
1007
1008 return 0;
1009
1010unregister:
1011 unregister_module_notifier(&klp_module_nb);
1012 return ret;
1013}
1014
1015module_init(klp_init);
diff --git a/kernel/locking/Makefile b/kernel/locking/Makefile
index 8541bfdfd232..de7a416cca2a 100644
--- a/kernel/locking/Makefile
+++ b/kernel/locking/Makefile
@@ -1,11 +1,11 @@
1 1
2obj-y += mutex.o semaphore.o rwsem.o mcs_spinlock.o 2obj-y += mutex.o semaphore.o rwsem.o
3 3
4ifdef CONFIG_FUNCTION_TRACER 4ifdef CONFIG_FUNCTION_TRACER
5CFLAGS_REMOVE_lockdep.o = -pg 5CFLAGS_REMOVE_lockdep.o = $(CC_FLAGS_FTRACE)
6CFLAGS_REMOVE_lockdep_proc.o = -pg 6CFLAGS_REMOVE_lockdep_proc.o = $(CC_FLAGS_FTRACE)
7CFLAGS_REMOVE_mutex-debug.o = -pg 7CFLAGS_REMOVE_mutex-debug.o = $(CC_FLAGS_FTRACE)
8CFLAGS_REMOVE_rtmutex-debug.o = -pg 8CFLAGS_REMOVE_rtmutex-debug.o = $(CC_FLAGS_FTRACE)
9endif 9endif
10 10
11obj-$(CONFIG_DEBUG_MUTEXES) += mutex-debug.o 11obj-$(CONFIG_DEBUG_MUTEXES) += mutex-debug.o
@@ -14,6 +14,7 @@ ifeq ($(CONFIG_PROC_FS),y)
14obj-$(CONFIG_LOCKDEP) += lockdep_proc.o 14obj-$(CONFIG_LOCKDEP) += lockdep_proc.o
15endif 15endif
16obj-$(CONFIG_SMP) += spinlock.o 16obj-$(CONFIG_SMP) += spinlock.o
17obj-$(CONFIG_LOCK_SPIN_ON_OWNER) += osq_lock.o
17obj-$(CONFIG_SMP) += lglock.o 18obj-$(CONFIG_SMP) += lglock.o
18obj-$(CONFIG_PROVE_LOCKING) += spinlock.o 19obj-$(CONFIG_PROVE_LOCKING) += spinlock.o
19obj-$(CONFIG_RT_MUTEXES) += rtmutex.o 20obj-$(CONFIG_RT_MUTEXES) += rtmutex.o
diff --git a/kernel/locking/mcs_spinlock.h b/kernel/locking/mcs_spinlock.h
index 4d60986fcbee..d1fe2ba5bac9 100644
--- a/kernel/locking/mcs_spinlock.h
+++ b/kernel/locking/mcs_spinlock.h
@@ -108,20 +108,4 @@ void mcs_spin_unlock(struct mcs_spinlock **lock, struct mcs_spinlock *node)
108 arch_mcs_spin_unlock_contended(&next->locked); 108 arch_mcs_spin_unlock_contended(&next->locked);
109} 109}
110 110
111/*
112 * Cancellable version of the MCS lock above.
113 *
114 * Intended for adaptive spinning of sleeping locks:
115 * mutex_lock()/rwsem_down_{read,write}() etc.
116 */
117
118struct optimistic_spin_node {
119 struct optimistic_spin_node *next, *prev;
120 int locked; /* 1 if lock acquired */
121 int cpu; /* encoded CPU # value */
122};
123
124extern bool osq_lock(struct optimistic_spin_queue *lock);
125extern void osq_unlock(struct optimistic_spin_queue *lock);
126
127#endif /* __LINUX_MCS_SPINLOCK_H */ 111#endif /* __LINUX_MCS_SPINLOCK_H */
diff --git a/kernel/locking/mutex-debug.c b/kernel/locking/mutex-debug.c
index 5cf6731b98e9..3ef3736002d8 100644
--- a/kernel/locking/mutex-debug.c
+++ b/kernel/locking/mutex-debug.c
@@ -80,13 +80,13 @@ void debug_mutex_unlock(struct mutex *lock)
80 DEBUG_LOCKS_WARN_ON(lock->owner != current); 80 DEBUG_LOCKS_WARN_ON(lock->owner != current);
81 81
82 DEBUG_LOCKS_WARN_ON(!lock->wait_list.prev && !lock->wait_list.next); 82 DEBUG_LOCKS_WARN_ON(!lock->wait_list.prev && !lock->wait_list.next);
83 mutex_clear_owner(lock);
84 } 83 }
85 84
86 /* 85 /*
87 * __mutex_slowpath_needs_to_unlock() is explicitly 0 for debug 86 * __mutex_slowpath_needs_to_unlock() is explicitly 0 for debug
88 * mutexes so that we can do it here after we've verified state. 87 * mutexes so that we can do it here after we've verified state.
89 */ 88 */
89 mutex_clear_owner(lock);
90 atomic_set(&lock->count, 1); 90 atomic_set(&lock->count, 1);
91} 91}
92 92
diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c
index 454195194d4a..94674e5919cb 100644
--- a/kernel/locking/mutex.c
+++ b/kernel/locking/mutex.c
@@ -81,7 +81,7 @@ __visible void __sched __mutex_lock_slowpath(atomic_t *lock_count);
81 * The mutex must later on be released by the same task that 81 * The mutex must later on be released by the same task that
82 * acquired it. Recursive locking is not allowed. The task 82 * acquired it. Recursive locking is not allowed. The task
83 * may not exit without first unlocking the mutex. Also, kernel 83 * may not exit without first unlocking the mutex. Also, kernel
84 * memory where the mutex resides mutex must not be freed with 84 * memory where the mutex resides must not be freed with
85 * the mutex still locked. The mutex must first be initialized 85 * the mutex still locked. The mutex must first be initialized
86 * (or statically defined) before it can be locked. memset()-ing 86 * (or statically defined) before it can be locked. memset()-ing
87 * the mutex to 0 is not allowed. 87 * the mutex to 0 is not allowed.
@@ -147,7 +147,7 @@ static __always_inline void ww_mutex_lock_acquired(struct ww_mutex *ww,
147} 147}
148 148
149/* 149/*
150 * after acquiring lock with fastpath or when we lost out in contested 150 * After acquiring lock with fastpath or when we lost out in contested
151 * slowpath, set ctx and wake up any waiters so they can recheck. 151 * slowpath, set ctx and wake up any waiters so they can recheck.
152 * 152 *
153 * This function is never called when CONFIG_DEBUG_LOCK_ALLOC is set, 153 * This function is never called when CONFIG_DEBUG_LOCK_ALLOC is set,
@@ -191,19 +191,32 @@ ww_mutex_set_context_fastpath(struct ww_mutex *lock,
191 spin_unlock_mutex(&lock->base.wait_lock, flags); 191 spin_unlock_mutex(&lock->base.wait_lock, flags);
192} 192}
193 193
194
195#ifdef CONFIG_MUTEX_SPIN_ON_OWNER
196/* 194/*
197 * In order to avoid a stampede of mutex spinners from acquiring the mutex 195 * After acquiring lock in the slowpath set ctx and wake up any
198 * more or less simultaneously, the spinners need to acquire a MCS lock 196 * waiters so they can recheck.
199 * first before spinning on the owner field.
200 * 197 *
198 * Callers must hold the mutex wait_lock.
201 */ 199 */
200static __always_inline void
201ww_mutex_set_context_slowpath(struct ww_mutex *lock,
202 struct ww_acquire_ctx *ctx)
203{
204 struct mutex_waiter *cur;
202 205
203/* 206 ww_mutex_lock_acquired(lock, ctx);
204 * Mutex spinning code migrated from kernel/sched/core.c 207 lock->ctx = ctx;
205 */ 208
209 /*
210 * Give any possible sleeping processes the chance to wake up,
211 * so they can recheck if they have to back off.
212 */
213 list_for_each_entry(cur, &lock->base.wait_list, list) {
214 debug_mutex_wake_waiter(&lock->base, cur);
215 wake_up_process(cur->task);
216 }
217}
206 218
219#ifdef CONFIG_MUTEX_SPIN_ON_OWNER
207static inline bool owner_running(struct mutex *lock, struct task_struct *owner) 220static inline bool owner_running(struct mutex *lock, struct task_struct *owner)
208{ 221{
209 if (lock->owner != owner) 222 if (lock->owner != owner)
@@ -307,6 +320,11 @@ static bool mutex_optimistic_spin(struct mutex *lock,
307 if (!mutex_can_spin_on_owner(lock)) 320 if (!mutex_can_spin_on_owner(lock))
308 goto done; 321 goto done;
309 322
323 /*
324 * In order to avoid a stampede of mutex spinners trying to
325 * acquire the mutex all at once, the spinners need to take a
326 * MCS (queued) lock first before spinning on the owner field.
327 */
310 if (!osq_lock(&lock->osq)) 328 if (!osq_lock(&lock->osq))
311 goto done; 329 goto done;
312 330
@@ -469,7 +487,7 @@ void __sched ww_mutex_unlock(struct ww_mutex *lock)
469EXPORT_SYMBOL(ww_mutex_unlock); 487EXPORT_SYMBOL(ww_mutex_unlock);
470 488
471static inline int __sched 489static inline int __sched
472__mutex_lock_check_stamp(struct mutex *lock, struct ww_acquire_ctx *ctx) 490__ww_mutex_lock_check_stamp(struct mutex *lock, struct ww_acquire_ctx *ctx)
473{ 491{
474 struct ww_mutex *ww = container_of(lock, struct ww_mutex, base); 492 struct ww_mutex *ww = container_of(lock, struct ww_mutex, base);
475 struct ww_acquire_ctx *hold_ctx = ACCESS_ONCE(ww->ctx); 493 struct ww_acquire_ctx *hold_ctx = ACCESS_ONCE(ww->ctx);
@@ -557,7 +575,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
557 } 575 }
558 576
559 if (use_ww_ctx && ww_ctx->acquired > 0) { 577 if (use_ww_ctx && ww_ctx->acquired > 0) {
560 ret = __mutex_lock_check_stamp(lock, ww_ctx); 578 ret = __ww_mutex_lock_check_stamp(lock, ww_ctx);
561 if (ret) 579 if (ret)
562 goto err; 580 goto err;
563 } 581 }
@@ -569,6 +587,8 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
569 schedule_preempt_disabled(); 587 schedule_preempt_disabled();
570 spin_lock_mutex(&lock->wait_lock, flags); 588 spin_lock_mutex(&lock->wait_lock, flags);
571 } 589 }
590 __set_task_state(task, TASK_RUNNING);
591
572 mutex_remove_waiter(lock, &waiter, current_thread_info()); 592 mutex_remove_waiter(lock, &waiter, current_thread_info());
573 /* set it to 0 if there are no waiters left: */ 593 /* set it to 0 if there are no waiters left: */
574 if (likely(list_empty(&lock->wait_list))) 594 if (likely(list_empty(&lock->wait_list)))
@@ -582,23 +602,7 @@ skip_wait:
582 602
583 if (use_ww_ctx) { 603 if (use_ww_ctx) {
584 struct ww_mutex *ww = container_of(lock, struct ww_mutex, base); 604 struct ww_mutex *ww = container_of(lock, struct ww_mutex, base);
585 struct mutex_waiter *cur; 605 ww_mutex_set_context_slowpath(ww, ww_ctx);
586
587 /*
588 * This branch gets optimized out for the common case,
589 * and is only important for ww_mutex_lock.
590 */
591 ww_mutex_lock_acquired(ww, ww_ctx);
592 ww->ctx = ww_ctx;
593
594 /*
595 * Give any possible sleeping processes the chance to wake up,
596 * so they can recheck if they have to back off.
597 */
598 list_for_each_entry(cur, &lock->wait_list, list) {
599 debug_mutex_wake_waiter(lock, cur);
600 wake_up_process(cur->task);
601 }
602 } 606 }
603 607
604 spin_unlock_mutex(&lock->wait_lock, flags); 608 spin_unlock_mutex(&lock->wait_lock, flags);
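The new ww_mutex_set_context_slowpath() above publishes the acquire context and wakes every waiter so it can re-check whether it has to back off. For orientation, a minimal caller-side sketch of that backoff protocol using the standard ww_mutex API (ww_acquire_init(), ww_mutex_lock(), ww_mutex_lock_slow(), ww_acquire_done()); the lock_two() helper, its two mutex arguments and the ww_class parameter are hypothetical and not part of this patch:

#include <linux/kernel.h>       /* swap() */
#include <linux/ww_mutex.h>

static int lock_two(struct ww_mutex *a, struct ww_mutex *b,
                    struct ww_class *class, struct ww_acquire_ctx *ctx)
{
        int ret;

        ww_acquire_init(ctx, class);

        ret = ww_mutex_lock(a, ctx);    /* nothing held yet: cannot hit -EDEADLK */
        if (ret)
                return ret;             /* only the _interruptible variants fail here */

        ret = ww_mutex_lock(b, ctx);
        while (ret == -EDEADLK) {
                /*
                 * A context with an older stamp owns b.  Back off: drop what
                 * we hold and sleep on b.  ww_mutex_lock_slow() is exactly the
                 * sleeper that ww_mutex_set_context_slowpath()/fastpath()
                 * wakes so it can re-check whether it still has to back off.
                 */
                ww_mutex_unlock(a);
                ww_mutex_lock_slow(b, ctx);
                swap(a, b);                     /* the lock we now hold is "a" */
                ret = ww_mutex_lock(b, ctx);    /* retry the other one */
        }

        ww_acquire_done(ctx);   /* both locks held, no further ww_mutex_lock() */
        return 0;
}

/* The caller later unlocks both mutexes and then calls ww_acquire_fini(ctx). */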
diff --git a/kernel/locking/mcs_spinlock.c b/kernel/locking/osq_lock.c
index 9887a905a762..c112d00341b0 100644
--- a/kernel/locking/mcs_spinlock.c
+++ b/kernel/locking/osq_lock.c
@@ -1,8 +1,6 @@
1#include <linux/percpu.h> 1#include <linux/percpu.h>
2#include <linux/sched.h> 2#include <linux/sched.h>
3#include "mcs_spinlock.h" 3#include <linux/osq_lock.h>
4
5#ifdef CONFIG_SMP
6 4
7/* 5/*
8 * An MCS like lock especially tailored for optimistic spinning for sleeping 6 * An MCS like lock especially tailored for optimistic spinning for sleeping
@@ -111,7 +109,7 @@ bool osq_lock(struct optimistic_spin_queue *lock)
111 * cmpxchg in an attempt to undo our queueing. 109 * cmpxchg in an attempt to undo our queueing.
112 */ 110 */
113 111
114 while (!smp_load_acquire(&node->locked)) { 112 while (!ACCESS_ONCE(node->locked)) {
115 /* 113 /*
116 * If we need to reschedule bail... so we can block. 114 * If we need to reschedule bail... so we can block.
117 */ 115 */
@@ -203,6 +201,3 @@ void osq_unlock(struct optimistic_spin_queue *lock)
203 if (next) 201 if (next)
204 ACCESS_ONCE(next->locked) = 1; 202 ACCESS_ONCE(next->locked) = 1;
205} 203}
206
207#endif
208
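The renamed file keeps its original purpose: an MCS-style queued lock used only for optimistic spinning, now spinning on the node with a plain ACCESS_ONCE() load. As orientation only, a textbook MCS lock sketch showing the queue-node idea osq_lock()/osq_unlock() specialize; per-CPU nodes, the need_resched() bailout and unqueueing are what the kernel version adds on top. mcs_lock, mcs_node and the helpers are illustrative names, and memory barriers are omitted for brevity:

#include <linux/atomic.h>       /* xchg(), cmpxchg() */
#include <linux/compiler.h>     /* ACCESS_ONCE() */
#include <asm/processor.h>      /* cpu_relax() */

struct mcs_node {
        struct mcs_node *next;
        int locked;
};

struct mcs_lock {
        struct mcs_node *tail;          /* last waiter in the queue, or NULL */
};

static void mcs_lock_acquire(struct mcs_lock *lock, struct mcs_node *node)
{
        struct mcs_node *prev;

        node->next = NULL;
        node->locked = 0;

        prev = xchg(&lock->tail, node);         /* append ourselves */
        if (!prev)
                return;                         /* queue was empty: lock is ours */

        ACCESS_ONCE(prev->next) = node;         /* link behind our predecessor */
        while (!ACCESS_ONCE(node->locked))      /* spin on our own cache line */
                cpu_relax();
}

static void mcs_lock_release(struct mcs_lock *lock, struct mcs_node *node)
{
        struct mcs_node *next = ACCESS_ONCE(node->next);

        if (!next) {
                /* No known successor: try to mark the queue empty. */
                if (cmpxchg(&lock->tail, node, NULL) == node)
                        return;
                /* A new waiter raced in; wait for it to link itself. */
                while (!(next = ACCESS_ONCE(node->next)))
                        cpu_relax();
        }
        ACCESS_ONCE(next->locked) = 1;          /* hand the lock over */
}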
diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c
index 7c98873a3077..3059bc2f022d 100644
--- a/kernel/locking/rtmutex.c
+++ b/kernel/locking/rtmutex.c
@@ -1130,6 +1130,7 @@ __rt_mutex_slowlock(struct rt_mutex *lock, int state,
1130 set_current_state(state); 1130 set_current_state(state);
1131 } 1131 }
1132 1132
1133 __set_current_state(TASK_RUNNING);
1133 return ret; 1134 return ret;
1134} 1135}
1135 1136
@@ -1188,10 +1189,9 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state,
1188 ret = task_blocks_on_rt_mutex(lock, &waiter, current, chwalk); 1189 ret = task_blocks_on_rt_mutex(lock, &waiter, current, chwalk);
1189 1190
1190 if (likely(!ret)) 1191 if (likely(!ret))
1192 /* sleep on the mutex */
1191 ret = __rt_mutex_slowlock(lock, state, timeout, &waiter); 1193 ret = __rt_mutex_slowlock(lock, state, timeout, &waiter);
1192 1194
1193 set_current_state(TASK_RUNNING);
1194
1195 if (unlikely(ret)) { 1195 if (unlikely(ret)) {
1196 remove_waiter(lock, &waiter); 1196 remove_waiter(lock, &waiter);
1197 rt_mutex_handle_deadlock(ret, chwalk, &waiter); 1197 rt_mutex_handle_deadlock(ret, chwalk, &waiter);
@@ -1626,10 +1626,9 @@ int rt_mutex_finish_proxy_lock(struct rt_mutex *lock,
1626 1626
1627 set_current_state(TASK_INTERRUPTIBLE); 1627 set_current_state(TASK_INTERRUPTIBLE);
1628 1628
1629 /* sleep on the mutex */
1629 ret = __rt_mutex_slowlock(lock, TASK_INTERRUPTIBLE, to, waiter); 1630 ret = __rt_mutex_slowlock(lock, TASK_INTERRUPTIBLE, to, waiter);
1630 1631
1631 set_current_state(TASK_RUNNING);
1632
1633 if (unlikely(ret)) 1632 if (unlikely(ret))
1634 remove_waiter(lock, waiter); 1633 remove_waiter(lock, waiter);
1635 1634
diff --git a/kernel/locking/rwsem-spinlock.c b/kernel/locking/rwsem-spinlock.c
index 2c93571162cb..2555ae15ec14 100644
--- a/kernel/locking/rwsem-spinlock.c
+++ b/kernel/locking/rwsem-spinlock.c
@@ -154,7 +154,7 @@ void __sched __down_read(struct rw_semaphore *sem)
154 set_task_state(tsk, TASK_UNINTERRUPTIBLE); 154 set_task_state(tsk, TASK_UNINTERRUPTIBLE);
155 } 155 }
156 156
157 tsk->state = TASK_RUNNING; 157 __set_task_state(tsk, TASK_RUNNING);
158 out: 158 out:
159 ; 159 ;
160} 160}
diff --git a/kernel/locking/rwsem-xadd.c b/kernel/locking/rwsem-xadd.c
index 7628c3fc37ca..2f7cc4076f50 100644
--- a/kernel/locking/rwsem-xadd.c
+++ b/kernel/locking/rwsem-xadd.c
@@ -242,8 +242,7 @@ struct rw_semaphore __sched *rwsem_down_read_failed(struct rw_semaphore *sem)
242 schedule(); 242 schedule();
243 } 243 }
244 244
245 tsk->state = TASK_RUNNING; 245 __set_task_state(tsk, TASK_RUNNING);
246
247 return sem; 246 return sem;
248} 247}
249EXPORT_SYMBOL(rwsem_down_read_failed); 248EXPORT_SYMBOL(rwsem_down_read_failed);
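The rtmutex and rwsem hunks above, together with the __set_task_state(TASK_RUNNING) added to __mutex_lock_common earlier, converge on the same wait-loop shape: set the sleeping state, test the condition, schedule, and on exit return to TASK_RUNNING with the barrier-less helper, since no condition check follows that store. A hedged restatement of that shape, with the atomic 'cond' as a placeholder:

#include <linux/atomic.h>
#include <linux/sched.h>

static void wait_for(atomic_t *cond)
{
        for (;;) {
                /*
                 * set_current_state() contains the barrier that orders the
                 * state store against the condition load, so a wakeup between
                 * the two cannot be missed.
                 */
                set_current_state(TASK_UNINTERRUPTIBLE);
                if (atomic_read(cond))
                        break;
                schedule();
        }
        /* Nothing is checked after this, so the barrier-less store suffices. */
        __set_current_state(TASK_RUNNING);
}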
diff --git a/kernel/locking/spinlock.c b/kernel/locking/spinlock.c
index 4b082b5cac9e..db3ccb1dd614 100644
--- a/kernel/locking/spinlock.c
+++ b/kernel/locking/spinlock.c
@@ -363,6 +363,14 @@ void __lockfunc _raw_spin_lock_nested(raw_spinlock_t *lock, int subclass)
363} 363}
364EXPORT_SYMBOL(_raw_spin_lock_nested); 364EXPORT_SYMBOL(_raw_spin_lock_nested);
365 365
366void __lockfunc _raw_spin_lock_bh_nested(raw_spinlock_t *lock, int subclass)
367{
368 __local_bh_disable_ip(_RET_IP_, SOFTIRQ_LOCK_OFFSET);
369 spin_acquire(&lock->dep_map, subclass, 0, _RET_IP_);
370 LOCK_CONTENDED(lock, do_raw_spin_trylock, do_raw_spin_lock);
371}
372EXPORT_SYMBOL(_raw_spin_lock_bh_nested);
373
366unsigned long __lockfunc _raw_spin_lock_irqsave_nested(raw_spinlock_t *lock, 374unsigned long __lockfunc _raw_spin_lock_irqsave_nested(raw_spinlock_t *lock,
367 int subclass) 375 int subclass)
368{ 376{
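_raw_spin_lock_bh_nested() fills the remaining gap in the nested-subclass API: code that takes two locks of the same lock class with bottom halves disabled can now annotate the second acquisition for lockdep. A hedged usage sketch, assuming the spin_lock_bh_nested() wrapper that fronts this helper; the bucket structure and relink() are hypothetical:

#include <linux/lockdep.h>      /* SINGLE_DEPTH_NESTING */
#include <linux/spinlock.h>

struct bucket {
        spinlock_t lock;
        /* ... */
};

static void relink(struct bucket *old, struct bucket *new)
{
        spin_lock_bh(&old->lock);
        /*
         * new->lock belongs to the same lock class as old->lock; the nested
         * subclass tells lockdep this second acquisition is intentional
         * rather than a recursive deadlock.
         */
        spin_lock_bh_nested(&new->lock, SINGLE_DEPTH_NESTING);

        /* ... move entries from old to new ... */

        spin_unlock_bh(&new->lock);
        spin_unlock_bh(&old->lock);
}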
diff --git a/kernel/module.c b/kernel/module.c
index 3965511ae133..b34813f725e9 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -56,6 +56,7 @@
56#include <linux/async.h> 56#include <linux/async.h>
57#include <linux/percpu.h> 57#include <linux/percpu.h>
58#include <linux/kmemleak.h> 58#include <linux/kmemleak.h>
59#include <linux/kasan.h>
59#include <linux/jump_label.h> 60#include <linux/jump_label.h>
60#include <linux/pfn.h> 61#include <linux/pfn.h>
61#include <linux/bsearch.h> 62#include <linux/bsearch.h>
@@ -772,9 +773,18 @@ static int try_stop_module(struct module *mod, int flags, int *forced)
772 return 0; 773 return 0;
773} 774}
774 775
775unsigned long module_refcount(struct module *mod) 776/**
777 * module_refcount - return the refcount or -1 if unloading
778 *
779 * @mod: the module we're checking
780 *
781 * Returns:
782 * -1 if the module is in the process of unloading
783 * otherwise the number of references in the kernel to the module
784 */
785int module_refcount(struct module *mod)
776{ 786{
777 return (unsigned long)atomic_read(&mod->refcnt) - MODULE_REF_BASE; 787 return atomic_read(&mod->refcnt) - MODULE_REF_BASE;
778} 788}
779EXPORT_SYMBOL(module_refcount); 789EXPORT_SYMBOL(module_refcount);
780 790
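module_refcount() now reports "unloading" in-band as -1 instead of letting the count run below MODULE_REF_BASE, so the return type becomes int and the /proc and sysfs format strings below follow. A hedged caller-side sketch with this patch applied (report_refs() and its messages are illustrative):

#include <linux/module.h>
#include <linux/printk.h>

static void report_refs(struct module *mod)
{
        int refs = module_refcount(mod);        /* -1 while unloading */

        if (refs < 0)
                pr_info("%s: unload in progress\n", mod->name);
        else
                pr_info("%s: %d reference(s)\n", mod->name, refs);
}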
@@ -856,7 +866,7 @@ static inline void print_unload_info(struct seq_file *m, struct module *mod)
856 struct module_use *use; 866 struct module_use *use;
857 int printed_something = 0; 867 int printed_something = 0;
858 868
859 seq_printf(m, " %lu ", module_refcount(mod)); 869 seq_printf(m, " %i ", module_refcount(mod));
860 870
861 /* 871 /*
862 * Always include a trailing , so userspace can differentiate 872 * Always include a trailing , so userspace can differentiate
@@ -908,7 +918,7 @@ EXPORT_SYMBOL_GPL(symbol_put_addr);
908static ssize_t show_refcnt(struct module_attribute *mattr, 918static ssize_t show_refcnt(struct module_attribute *mattr,
909 struct module_kobject *mk, char *buffer) 919 struct module_kobject *mk, char *buffer)
910{ 920{
911 return sprintf(buffer, "%lu\n", module_refcount(mk->mod)); 921 return sprintf(buffer, "%i\n", module_refcount(mk->mod));
912} 922}
913 923
914static struct module_attribute modinfo_refcnt = 924static struct module_attribute modinfo_refcnt =
@@ -1216,6 +1226,12 @@ static const struct kernel_symbol *resolve_symbol(struct module *mod,
1216 const unsigned long *crc; 1226 const unsigned long *crc;
1217 int err; 1227 int err;
1218 1228
1229 /*
1230 * The module_mutex should not be a heavily contended lock;
1231 * if we get the occasional sleep here, we'll go an extra iteration
1232 * in the wait_event_interruptible(), which is harmless.
1233 */
1234 sched_annotate_sleep();
1219 mutex_lock(&module_mutex); 1235 mutex_lock(&module_mutex);
1220 sym = find_symbol(name, &owner, &crc, 1236 sym = find_symbol(name, &owner, &crc,
1221 !(mod->taints & (1 << TAINT_PROPRIETARY_MODULE)), true); 1237 !(mod->taints & (1 << TAINT_PROPRIETARY_MODULE)), true);
@@ -1795,15 +1811,20 @@ static void unset_module_core_ro_nx(struct module *mod) { }
1795static void unset_module_init_ro_nx(struct module *mod) { } 1811static void unset_module_init_ro_nx(struct module *mod) { }
1796#endif 1812#endif
1797 1813
1798void __weak module_free(struct module *mod, void *module_region) 1814void __weak module_memfree(void *module_region)
1799{ 1815{
1800 vfree(module_region); 1816 vfree(module_region);
1817 kasan_module_free(module_region);
1801} 1818}
1802 1819
1803void __weak module_arch_cleanup(struct module *mod) 1820void __weak module_arch_cleanup(struct module *mod)
1804{ 1821{
1805} 1822}
1806 1823
1824void __weak module_arch_freeing_init(struct module *mod)
1825{
1826}
1827
1807/* Free a module, remove from lists, etc. */ 1828/* Free a module, remove from lists, etc. */
1808static void free_module(struct module *mod) 1829static void free_module(struct module *mod)
1809{ 1830{
@@ -1841,7 +1862,8 @@ static void free_module(struct module *mod)
1841 1862
1842 /* This may be NULL, but that's OK */ 1863 /* This may be NULL, but that's OK */
1843 unset_module_init_ro_nx(mod); 1864 unset_module_init_ro_nx(mod);
1844 module_free(mod, mod->module_init); 1865 module_arch_freeing_init(mod);
1866 module_memfree(mod->module_init);
1845 kfree(mod->args); 1867 kfree(mod->args);
1846 percpu_modfree(mod); 1868 percpu_modfree(mod);
1847 1869
@@ -1850,7 +1872,7 @@ static void free_module(struct module *mod)
1850 1872
1851 /* Finally, free the core (containing the module structure) */ 1873 /* Finally, free the core (containing the module structure) */
1852 unset_module_core_ro_nx(mod); 1874 unset_module_core_ro_nx(mod);
1853 module_free(mod, mod->module_core); 1875 module_memfree(mod->module_core);
1854 1876
1855#ifdef CONFIG_MPU 1877#ifdef CONFIG_MPU
1856 update_protections(current->mm); 1878 update_protections(current->mm);
@@ -2785,7 +2807,7 @@ static int move_module(struct module *mod, struct load_info *info)
2785 */ 2807 */
2786 kmemleak_ignore(ptr); 2808 kmemleak_ignore(ptr);
2787 if (!ptr) { 2809 if (!ptr) {
2788 module_free(mod, mod->module_core); 2810 module_memfree(mod->module_core);
2789 return -ENOMEM; 2811 return -ENOMEM;
2790 } 2812 }
2791 memset(ptr, 0, mod->init_size); 2813 memset(ptr, 0, mod->init_size);
@@ -2930,8 +2952,9 @@ static struct module *layout_and_allocate(struct load_info *info, int flags)
2930static void module_deallocate(struct module *mod, struct load_info *info) 2952static void module_deallocate(struct module *mod, struct load_info *info)
2931{ 2953{
2932 percpu_modfree(mod); 2954 percpu_modfree(mod);
2933 module_free(mod, mod->module_init); 2955 module_arch_freeing_init(mod);
2934 module_free(mod, mod->module_core); 2956 module_memfree(mod->module_init);
2957 module_memfree(mod->module_core);
2935} 2958}
2936 2959
2937int __weak module_finalize(const Elf_Ehdr *hdr, 2960int __weak module_finalize(const Elf_Ehdr *hdr,
@@ -2963,6 +2986,12 @@ static bool finished_loading(const char *name)
2963 struct module *mod; 2986 struct module *mod;
2964 bool ret; 2987 bool ret;
2965 2988
2989 /*
2990 * The module_mutex should not be a heavily contended lock;
2991 * if we get the occasional sleep here, we'll go an extra iteration
2992 * in the wait_event_interruptible(), which is harmless.
2993 */
2994 sched_annotate_sleep();
2966 mutex_lock(&module_mutex); 2995 mutex_lock(&module_mutex);
2967 mod = find_module_all(name, strlen(name), true); 2996 mod = find_module_all(name, strlen(name), true);
2968 ret = !mod || mod->state == MODULE_STATE_LIVE 2997 ret = !mod || mod->state == MODULE_STATE_LIVE
@@ -2983,10 +3012,36 @@ static void do_mod_ctors(struct module *mod)
2983#endif 3012#endif
2984} 3013}
2985 3014
2986/* This is where the real work happens */ 3015/* For freeing module_init on success, in case kallsyms traversing */
2987static int do_init_module(struct module *mod) 3016struct mod_initfree {
3017 struct rcu_head rcu;
3018 void *module_init;
3019};
3020
3021static void do_free_init(struct rcu_head *head)
3022{
3023 struct mod_initfree *m = container_of(head, struct mod_initfree, rcu);
3024 module_memfree(m->module_init);
3025 kfree(m);
3026}
3027
3028/*
3029 * This is where the real work happens.
3030 *
3031 * Keep it uninlined to provide a reliable breakpoint target, e.g. for the gdb
3032 * helper command 'lx-symbols'.
3033 */
3034static noinline int do_init_module(struct module *mod)
2988{ 3035{
2989 int ret = 0; 3036 int ret = 0;
3037 struct mod_initfree *freeinit;
3038
3039 freeinit = kmalloc(sizeof(*freeinit), GFP_KERNEL);
3040 if (!freeinit) {
3041 ret = -ENOMEM;
3042 goto fail;
3043 }
3044 freeinit->module_init = mod->module_init;
2990 3045
2991 /* 3046 /*
2992 * We want to find out whether @mod uses async during init. Clear 3047 * We want to find out whether @mod uses async during init. Clear
@@ -2999,18 +3054,7 @@ static int do_init_module(struct module *mod)
2999 if (mod->init != NULL) 3054 if (mod->init != NULL)
3000 ret = do_one_initcall(mod->init); 3055 ret = do_one_initcall(mod->init);
3001 if (ret < 0) { 3056 if (ret < 0) {
3002 /* 3057 goto fail_free_freeinit;
3003 * Init routine failed: abort. Try to protect us from
3004 * buggy refcounters.
3005 */
3006 mod->state = MODULE_STATE_GOING;
3007 synchronize_sched();
3008 module_put(mod);
3009 blocking_notifier_call_chain(&module_notify_list,
3010 MODULE_STATE_GOING, mod);
3011 free_module(mod);
3012 wake_up_all(&module_wq);
3013 return ret;
3014 } 3058 }
3015 if (ret > 0) { 3059 if (ret > 0) {
3016 pr_warn("%s: '%s'->init suspiciously returned %d, it should " 3060 pr_warn("%s: '%s'->init suspiciously returned %d, it should "
@@ -3055,15 +3099,35 @@ static int do_init_module(struct module *mod)
3055 mod->strtab = mod->core_strtab; 3099 mod->strtab = mod->core_strtab;
3056#endif 3100#endif
3057 unset_module_init_ro_nx(mod); 3101 unset_module_init_ro_nx(mod);
3058 module_free(mod, mod->module_init); 3102 module_arch_freeing_init(mod);
3059 mod->module_init = NULL; 3103 mod->module_init = NULL;
3060 mod->init_size = 0; 3104 mod->init_size = 0;
3061 mod->init_ro_size = 0; 3105 mod->init_ro_size = 0;
3062 mod->init_text_size = 0; 3106 mod->init_text_size = 0;
3107 /*
3108 * We want to free module_init, but be aware that kallsyms may be
3109 * walking this with preempt disabled. In all the failure paths,
3110 * we call synchronize_rcu/synchronize_sched, but we don't want
3111 * to slow down the success path, so use actual RCU here.
3112 */
3113 call_rcu(&freeinit->rcu, do_free_init);
3063 mutex_unlock(&module_mutex); 3114 mutex_unlock(&module_mutex);
3064 wake_up_all(&module_wq); 3115 wake_up_all(&module_wq);
3065 3116
3066 return 0; 3117 return 0;
3118
3119fail_free_freeinit:
3120 kfree(freeinit);
3121fail:
3122 /* Try to protect us from buggy refcounters. */
3123 mod->state = MODULE_STATE_GOING;
3124 synchronize_sched();
3125 module_put(mod);
3126 blocking_notifier_call_chain(&module_notify_list,
3127 MODULE_STATE_GOING, mod);
3128 free_module(mod);
3129 wake_up_all(&module_wq);
3130 return ret;
3067} 3131}
3068 3132
3069static int may_init_module(void) 3133static int may_init_module(void)
@@ -3075,32 +3139,6 @@ static int may_init_module(void)
3075} 3139}
3076 3140
3077/* 3141/*
3078 * Can't use wait_event_interruptible() because our condition
3079 * 'finished_loading()' contains a blocking primitive itself (mutex_lock).
3080 */
3081static int wait_finished_loading(struct module *mod)
3082{
3083 DEFINE_WAIT_FUNC(wait, woken_wake_function);
3084 int ret = 0;
3085
3086 add_wait_queue(&module_wq, &wait);
3087 for (;;) {
3088 if (finished_loading(mod->name))
3089 break;
3090
3091 if (signal_pending(current)) {
3092 ret = -ERESTARTSYS;
3093 break;
3094 }
3095
3096 wait_woken(&wait, TASK_INTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT);
3097 }
3098 remove_wait_queue(&module_wq, &wait);
3099
3100 return ret;
3101}
3102
3103/*
3104 * We try to place it in the list now to make sure it's unique before 3142 * We try to place it in the list now to make sure it's unique before
3105 * we dedicate too many resources. In particular, temporary percpu 3143 * we dedicate too many resources. In particular, temporary percpu
3106 * memory exhaustion. 3144 * memory exhaustion.
@@ -3120,8 +3158,8 @@ again:
3120 || old->state == MODULE_STATE_UNFORMED) { 3158 || old->state == MODULE_STATE_UNFORMED) {
3121 /* Wait in case it fails to load. */ 3159 /* Wait in case it fails to load. */
3122 mutex_unlock(&module_mutex); 3160 mutex_unlock(&module_mutex);
3123 3161 err = wait_event_interruptible(module_wq,
3124 err = wait_finished_loading(mod); 3162 finished_loading(mod->name));
3125 if (err) 3163 if (err)
3126 goto out_unlocked; 3164 goto out_unlocked;
3127 goto again; 3165 goto again;
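The open-coded wait_finished_loading() existed only because the wait condition, finished_loading(), takes module_mutex and can therefore sleep while the waiter is already in TASK_INTERRUPTIBLE, which the nested-sleep debugging code would flag. With sched_annotate_sleep() added to finished_loading() above, the plain wait_event_interruptible() can come back. A hedged sketch of that general shape (state_done(), wq, state_lock and done are placeholders):

#include <linux/kernel.h>       /* sched_annotate_sleep() */
#include <linux/mutex.h>
#include <linux/wait.h>

static bool state_done(struct mutex *state_lock, bool *done)
{
        bool ret;

        /* The condition sleeps (mutex_lock) inside the wait loop; tell the
         * nested-sleep debug check that this is expected and harmless. */
        sched_annotate_sleep();
        mutex_lock(state_lock);
        ret = *done;
        mutex_unlock(state_lock);
        return ret;
}

static int wait_until_done(wait_queue_head_t *wq, struct mutex *state_lock,
                           bool *done)
{
        return wait_event_interruptible(*wq, state_done(state_lock, done));
}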
@@ -3220,7 +3258,7 @@ static int load_module(struct load_info *info, const char __user *uargs,
3220 mod->sig_ok = info->sig_ok; 3258 mod->sig_ok = info->sig_ok;
3221 if (!mod->sig_ok) { 3259 if (!mod->sig_ok) {
3222 pr_notice_once("%s: module verification failed: signature " 3260 pr_notice_once("%s: module verification failed: signature "
3223 "and/or required key missing - tainting " 3261 "and/or required key missing - tainting "
3224 "kernel\n", mod->name); 3262 "kernel\n", mod->name);
3225 add_taint_module(mod, TAINT_UNSIGNED_MODULE, LOCKDEP_STILL_OK); 3263 add_taint_module(mod, TAINT_UNSIGNED_MODULE, LOCKDEP_STILL_OK);
3226 } 3264 }
@@ -3311,6 +3349,9 @@ static int load_module(struct load_info *info, const char __user *uargs,
3311 module_bug_cleanup(mod); 3349 module_bug_cleanup(mod);
3312 mutex_unlock(&module_mutex); 3350 mutex_unlock(&module_mutex);
3313 3351
3352 /* Free lock-classes: */
3353 lockdep_free_key_range(mod->module_core, mod->core_size);
3354
3314 /* we can't deallocate the module until we clear memory protection */ 3355 /* we can't deallocate the module until we clear memory protection */
3315 unset_module_init_ro_nx(mod); 3356 unset_module_init_ro_nx(mod);
3316 unset_module_core_ro_nx(mod); 3357 unset_module_core_ro_nx(mod);
diff --git a/kernel/notifier.c b/kernel/notifier.c
index 4803da6eab62..ae9fc7cc360e 100644
--- a/kernel/notifier.c
+++ b/kernel/notifier.c
@@ -402,6 +402,7 @@ int raw_notifier_call_chain(struct raw_notifier_head *nh,
402} 402}
403EXPORT_SYMBOL_GPL(raw_notifier_call_chain); 403EXPORT_SYMBOL_GPL(raw_notifier_call_chain);
404 404
405#ifdef CONFIG_SRCU
405/* 406/*
406 * SRCU notifier chain routines. Registration and unregistration 407 * SRCU notifier chain routines. Registration and unregistration
407 * use a mutex, and call_chain is synchronized by SRCU (no locks). 408 * use a mutex, and call_chain is synchronized by SRCU (no locks).
@@ -528,6 +529,8 @@ void srcu_init_notifier_head(struct srcu_notifier_head *nh)
528} 529}
529EXPORT_SYMBOL_GPL(srcu_init_notifier_head); 530EXPORT_SYMBOL_GPL(srcu_init_notifier_head);
530 531
532#endif /* CONFIG_SRCU */
533
531static ATOMIC_NOTIFIER_HEAD(die_chain); 534static ATOMIC_NOTIFIER_HEAD(die_chain);
532 535
533int notrace notify_die(enum die_val val, const char *str, 536int notrace notify_die(enum die_val val, const char *str,
diff --git a/kernel/padata.c b/kernel/padata.c
index 161402f0b517..b38bea9c466a 100644
--- a/kernel/padata.c
+++ b/kernel/padata.c
@@ -917,15 +917,10 @@ static ssize_t show_cpumask(struct padata_instance *pinst,
917 else 917 else
918 cpumask = pinst->cpumask.pcpu; 918 cpumask = pinst->cpumask.pcpu;
919 919
920 len = bitmap_scnprintf(buf, PAGE_SIZE, cpumask_bits(cpumask), 920 len = snprintf(buf, PAGE_SIZE, "%*pb\n",
921 nr_cpu_ids); 921 nr_cpu_ids, cpumask_bits(cpumask));
922 if (PAGE_SIZE - len < 2)
923 len = -EINVAL;
924 else
925 len += sprintf(buf + len, "\n");
926
927 mutex_unlock(&pinst->lock); 922 mutex_unlock(&pinst->lock);
928 return len; 923 return len < PAGE_SIZE ? len : -EINVAL;
929} 924}
930 925
931static ssize_t store_cpumask(struct padata_instance *pinst, 926static ssize_t store_cpumask(struct padata_instance *pinst,
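bitmap_scnprintf() plus the manual newline and overflow handling collapses into a single snprintf() using the "%*pb" extension: the field width carries the number of valid bits and the pointer argument is the bitmap itself ("%*pbl" prints the list form). The same conversion appears via cpumask_pr_args() in the profile.c hunk further down. A hedged example (print_mask() is illustrative):

#include <linux/cpumask.h>
#include <linux/printk.h>

static void print_mask(const struct cpumask *mask)
{
        /* Width = number of valid bits, pointer = the bitmap itself ... */
        pr_info("cpus: %*pb\n", nr_cpu_ids, cpumask_bits(mask));
        /* ... or the helper that expands to exactly that pair, list form: */
        pr_info("cpus: %*pbl\n", cpumask_pr_args(mask));
}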
diff --git a/kernel/panic.c b/kernel/panic.c
index 4d8d6f906dec..8136ad76e5fd 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -226,6 +226,7 @@ static const struct tnt tnts[] = {
226 { TAINT_OOT_MODULE, 'O', ' ' }, 226 { TAINT_OOT_MODULE, 'O', ' ' },
227 { TAINT_UNSIGNED_MODULE, 'E', ' ' }, 227 { TAINT_UNSIGNED_MODULE, 'E', ' ' },
228 { TAINT_SOFTLOCKUP, 'L', ' ' }, 228 { TAINT_SOFTLOCKUP, 'L', ' ' },
229 { TAINT_LIVEPATCH, 'K', ' ' },
229}; 230};
230 231
231/** 232/**
@@ -246,6 +247,7 @@ static const struct tnt tnts[] = {
246 * 'O' - Out-of-tree module has been loaded. 247 * 'O' - Out-of-tree module has been loaded.
247 * 'E' - Unsigned module has been loaded. 248 * 'E' - Unsigned module has been loaded.
248 * 'L' - A soft lockup has previously occurred. 249 * 'L' - A soft lockup has previously occurred.
250 * 'K' - Kernel has been live patched.
249 * 251 *
250 * The string is overwritten by the next call to print_tainted(). 252 * The string is overwritten by the next call to print_tainted().
251 */ 253 */
diff --git a/kernel/params.c b/kernel/params.c
index 0af9b2c4e56c..728e05b167de 100644
--- a/kernel/params.c
+++ b/kernel/params.c
@@ -642,12 +642,15 @@ static __modinit int add_sysfs_param(struct module_kobject *mk,
642 mk->mp->grp.attrs = new_attrs; 642 mk->mp->grp.attrs = new_attrs;
643 643
644 /* Tack new one on the end. */ 644 /* Tack new one on the end. */
645 memset(&mk->mp->attrs[mk->mp->num], 0, sizeof(mk->mp->attrs[0]));
645 sysfs_attr_init(&mk->mp->attrs[mk->mp->num].mattr.attr); 646 sysfs_attr_init(&mk->mp->attrs[mk->mp->num].mattr.attr);
646 mk->mp->attrs[mk->mp->num].param = kp; 647 mk->mp->attrs[mk->mp->num].param = kp;
647 mk->mp->attrs[mk->mp->num].mattr.show = param_attr_show; 648 mk->mp->attrs[mk->mp->num].mattr.show = param_attr_show;
648 /* Do not allow runtime DAC changes to make param writable. */ 649 /* Do not allow runtime DAC changes to make param writable. */
649 if ((kp->perm & (S_IWUSR | S_IWGRP | S_IWOTH)) != 0) 650 if ((kp->perm & (S_IWUSR | S_IWGRP | S_IWOTH)) != 0)
650 mk->mp->attrs[mk->mp->num].mattr.store = param_attr_store; 651 mk->mp->attrs[mk->mp->num].mattr.store = param_attr_store;
652 else
653 mk->mp->attrs[mk->mp->num].mattr.store = NULL;
651 mk->mp->attrs[mk->mp->num].mattr.attr.name = (char *)name; 654 mk->mp->attrs[mk->mp->num].mattr.attr.name = (char *)name;
652 mk->mp->attrs[mk->mp->num].mattr.attr.mode = kp->perm; 655 mk->mp->attrs[mk->mp->num].mattr.attr.mode = kp->perm;
653 mk->mp->num++; 656 mk->mp->num++;
diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig
index 48b28d387c7f..7e01f78f0417 100644
--- a/kernel/power/Kconfig
+++ b/kernel/power/Kconfig
@@ -251,6 +251,7 @@ config APM_EMULATION
251 251
252config PM_OPP 252config PM_OPP
253 bool 253 bool
254 select SRCU
254 ---help--- 255 ---help---
255 SOCs have a standard set of tuples consisting of frequency and 256 SOCs have a standard set of tuples consisting of frequency and
256 voltage pairs that the device will support per voltage domain. This 257 voltage pairs that the device will support per voltage domain. This
diff --git a/kernel/power/process.c b/kernel/power/process.c
index 5a6ec8678b9a..564f786df470 100644
--- a/kernel/power/process.c
+++ b/kernel/power/process.c
@@ -84,8 +84,8 @@ static int try_to_freeze_tasks(bool user_only)
84 elapsed_msecs = elapsed_msecs64; 84 elapsed_msecs = elapsed_msecs64;
85 85
86 if (todo) { 86 if (todo) {
87 printk("\n"); 87 pr_cont("\n");
88 printk(KERN_ERR "Freezing of tasks %s after %d.%03d seconds " 88 pr_err("Freezing of tasks %s after %d.%03d seconds "
89 "(%d tasks refusing to freeze, wq_busy=%d):\n", 89 "(%d tasks refusing to freeze, wq_busy=%d):\n",
90 wakeup ? "aborted" : "failed", 90 wakeup ? "aborted" : "failed",
91 elapsed_msecs / 1000, elapsed_msecs % 1000, 91 elapsed_msecs / 1000, elapsed_msecs % 1000,
@@ -101,37 +101,13 @@ static int try_to_freeze_tasks(bool user_only)
101 read_unlock(&tasklist_lock); 101 read_unlock(&tasklist_lock);
102 } 102 }
103 } else { 103 } else {
104 printk("(elapsed %d.%03d seconds) ", elapsed_msecs / 1000, 104 pr_cont("(elapsed %d.%03d seconds) ", elapsed_msecs / 1000,
105 elapsed_msecs % 1000); 105 elapsed_msecs % 1000);
106 } 106 }
107 107
108 return todo ? -EBUSY : 0; 108 return todo ? -EBUSY : 0;
109} 109}
110 110
111static bool __check_frozen_processes(void)
112{
113 struct task_struct *g, *p;
114
115 for_each_process_thread(g, p)
116 if (p != current && !freezer_should_skip(p) && !frozen(p))
117 return false;
118
119 return true;
120}
121
122/*
123 * Returns true if all freezable tasks (except for current) are frozen already
124 */
125static bool check_frozen_processes(void)
126{
127 bool ret;
128
129 read_lock(&tasklist_lock);
130 ret = __check_frozen_processes();
131 read_unlock(&tasklist_lock);
132 return ret;
133}
134
135/** 111/**
136 * freeze_processes - Signal user space processes to enter the refrigerator. 112 * freeze_processes - Signal user space processes to enter the refrigerator.
137 * The current thread will not be frozen. The same process that calls 113 * The current thread will not be frozen. The same process that calls
@@ -142,7 +118,6 @@ static bool check_frozen_processes(void)
142int freeze_processes(void) 118int freeze_processes(void)
143{ 119{
144 int error; 120 int error;
145 int oom_kills_saved;
146 121
147 error = __usermodehelper_disable(UMH_FREEZING); 122 error = __usermodehelper_disable(UMH_FREEZING);
148 if (error) 123 if (error)
@@ -155,31 +130,24 @@ int freeze_processes(void)
155 atomic_inc(&system_freezing_cnt); 130 atomic_inc(&system_freezing_cnt);
156 131
157 pm_wakeup_clear(); 132 pm_wakeup_clear();
158 printk("Freezing user space processes ... "); 133 pr_info("Freezing user space processes ... ");
159 pm_freezing = true; 134 pm_freezing = true;
160 oom_kills_saved = oom_kills_count();
161 error = try_to_freeze_tasks(true); 135 error = try_to_freeze_tasks(true);
162 if (!error) { 136 if (!error) {
163 __usermodehelper_set_disable_depth(UMH_DISABLED); 137 __usermodehelper_set_disable_depth(UMH_DISABLED);
164 oom_killer_disable(); 138 pr_cont("done.");
165
166 /*
167 * There might have been an OOM kill while we were
168 * freezing tasks and the killed task might be still
169 * on the way out so we have to double check for race.
170 */
171 if (oom_kills_count() != oom_kills_saved &&
172 !check_frozen_processes()) {
173 __usermodehelper_set_disable_depth(UMH_ENABLED);
174 printk("OOM in progress.");
175 error = -EBUSY;
176 } else {
177 printk("done.");
178 }
179 } 139 }
180 printk("\n"); 140 pr_cont("\n");
181 BUG_ON(in_atomic()); 141 BUG_ON(in_atomic());
182 142
143 /*
144 * Now that the whole userspace is frozen we need to disbale
145 * the OOM killer to disallow any further interference with
146 * killable tasks.
147 */
148 if (!error && !oom_killer_disable())
149 error = -EBUSY;
150
183 if (error) 151 if (error)
184 thaw_processes(); 152 thaw_processes();
185 return error; 153 return error;
@@ -197,13 +165,14 @@ int freeze_kernel_threads(void)
197{ 165{
198 int error; 166 int error;
199 167
200 printk("Freezing remaining freezable tasks ... "); 168 pr_info("Freezing remaining freezable tasks ... ");
169
201 pm_nosig_freezing = true; 170 pm_nosig_freezing = true;
202 error = try_to_freeze_tasks(false); 171 error = try_to_freeze_tasks(false);
203 if (!error) 172 if (!error)
204 printk("done."); 173 pr_cont("done.");
205 174
206 printk("\n"); 175 pr_cont("\n");
207 BUG_ON(in_atomic()); 176 BUG_ON(in_atomic());
208 177
209 if (error) 178 if (error)
@@ -224,7 +193,7 @@ void thaw_processes(void)
224 193
225 oom_killer_enable(); 194 oom_killer_enable();
226 195
227 printk("Restarting tasks ... "); 196 pr_info("Restarting tasks ... ");
228 197
229 __usermodehelper_set_disable_depth(UMH_FREEZING); 198 __usermodehelper_set_disable_depth(UMH_FREEZING);
230 thaw_workqueues(); 199 thaw_workqueues();
@@ -243,7 +212,7 @@ void thaw_processes(void)
243 usermodehelper_enable(); 212 usermodehelper_enable();
244 213
245 schedule(); 214 schedule();
246 printk("done.\n"); 215 pr_cont("done.\n");
247 trace_suspend_resume(TPS("thaw_processes"), 0, false); 216 trace_suspend_resume(TPS("thaw_processes"), 0, false);
248} 217}
249 218
@@ -252,7 +221,7 @@ void thaw_kernel_threads(void)
252 struct task_struct *g, *p; 221 struct task_struct *g, *p;
253 222
254 pm_nosig_freezing = false; 223 pm_nosig_freezing = false;
255 printk("Restarting kernel threads ... "); 224 pr_info("Restarting kernel threads ... ");
256 225
257 thaw_workqueues(); 226 thaw_workqueues();
258 227
@@ -264,5 +233,5 @@ void thaw_kernel_threads(void)
264 read_unlock(&tasklist_lock); 233 read_unlock(&tasklist_lock);
265 234
266 schedule(); 235 schedule();
267 printk("done.\n"); 236 pr_cont("done.\n");
268} 237}
diff --git a/kernel/power/qos.c b/kernel/power/qos.c
index 5f4c006c4b1e..97b0df71303e 100644
--- a/kernel/power/qos.c
+++ b/kernel/power/qos.c
@@ -41,6 +41,8 @@
41#include <linux/platform_device.h> 41#include <linux/platform_device.h>
42#include <linux/init.h> 42#include <linux/init.h>
43#include <linux/kernel.h> 43#include <linux/kernel.h>
44#include <linux/debugfs.h>
45#include <linux/seq_file.h>
44 46
45#include <linux/uaccess.h> 47#include <linux/uaccess.h>
46#include <linux/export.h> 48#include <linux/export.h>
@@ -182,6 +184,81 @@ static inline void pm_qos_set_value(struct pm_qos_constraints *c, s32 value)
182 c->target_value = value; 184 c->target_value = value;
183} 185}
184 186
187static inline int pm_qos_get_value(struct pm_qos_constraints *c);
188static int pm_qos_dbg_show_requests(struct seq_file *s, void *unused)
189{
190 struct pm_qos_object *qos = (struct pm_qos_object *)s->private;
191 struct pm_qos_constraints *c;
192 struct pm_qos_request *req;
193 char *type;
194 unsigned long flags;
195 int tot_reqs = 0;
196 int active_reqs = 0;
197
198 if (IS_ERR_OR_NULL(qos)) {
199 pr_err("%s: bad qos param!\n", __func__);
200 return -EINVAL;
201 }
202 c = qos->constraints;
203 if (IS_ERR_OR_NULL(c)) {
204 pr_err("%s: Bad constraints on qos?\n", __func__);
205 return -EINVAL;
206 }
207
208 /* Lock to ensure we have a snapshot */
209 spin_lock_irqsave(&pm_qos_lock, flags);
210 if (plist_head_empty(&c->list)) {
211 seq_puts(s, "Empty!\n");
212 goto out;
213 }
214
215 switch (c->type) {
216 case PM_QOS_MIN:
217 type = "Minimum";
218 break;
219 case PM_QOS_MAX:
220 type = "Maximum";
221 break;
222 case PM_QOS_SUM:
223 type = "Sum";
224 break;
225 default:
226 type = "Unknown";
227 }
228
229 plist_for_each_entry(req, &c->list, node) {
230 char *state = "Default";
231
232 if ((req->node).prio != c->default_value) {
233 active_reqs++;
234 state = "Active";
235 }
236 tot_reqs++;
237 seq_printf(s, "%d: %d: %s\n", tot_reqs,
238 (req->node).prio, state);
239 }
240
241 seq_printf(s, "Type=%s, Value=%d, Requests: active=%d / total=%d\n",
242 type, pm_qos_get_value(c), active_reqs, tot_reqs);
243
244out:
245 spin_unlock_irqrestore(&pm_qos_lock, flags);
246 return 0;
247}
248
249static int pm_qos_dbg_open(struct inode *inode, struct file *file)
250{
251 return single_open(file, pm_qos_dbg_show_requests,
252 inode->i_private);
253}
254
255static const struct file_operations pm_qos_debug_fops = {
256 .open = pm_qos_dbg_open,
257 .read = seq_read,
258 .llseek = seq_lseek,
259 .release = single_release,
260};
261
185/** 262/**
186 * pm_qos_update_target - manages the constraints list and calls the notifiers 263 * pm_qos_update_target - manages the constraints list and calls the notifiers
187 * if needed 264 * if needed
@@ -509,12 +586,17 @@ int pm_qos_remove_notifier(int pm_qos_class, struct notifier_block *notifier)
509EXPORT_SYMBOL_GPL(pm_qos_remove_notifier); 586EXPORT_SYMBOL_GPL(pm_qos_remove_notifier);
510 587
511/* User space interface to PM QoS classes via misc devices */ 588/* User space interface to PM QoS classes via misc devices */
512static int register_pm_qos_misc(struct pm_qos_object *qos) 589static int register_pm_qos_misc(struct pm_qos_object *qos, struct dentry *d)
513{ 590{
514 qos->pm_qos_power_miscdev.minor = MISC_DYNAMIC_MINOR; 591 qos->pm_qos_power_miscdev.minor = MISC_DYNAMIC_MINOR;
515 qos->pm_qos_power_miscdev.name = qos->name; 592 qos->pm_qos_power_miscdev.name = qos->name;
516 qos->pm_qos_power_miscdev.fops = &pm_qos_power_fops; 593 qos->pm_qos_power_miscdev.fops = &pm_qos_power_fops;
517 594
595 if (d) {
596 (void)debugfs_create_file(qos->name, S_IRUGO, d,
597 (void *)qos, &pm_qos_debug_fops);
598 }
599
518 return misc_register(&qos->pm_qos_power_miscdev); 600 return misc_register(&qos->pm_qos_power_miscdev);
519} 601}
520 602
@@ -608,11 +690,16 @@ static int __init pm_qos_power_init(void)
608{ 690{
609 int ret = 0; 691 int ret = 0;
610 int i; 692 int i;
693 struct dentry *d;
611 694
612 BUILD_BUG_ON(ARRAY_SIZE(pm_qos_array) != PM_QOS_NUM_CLASSES); 695 BUILD_BUG_ON(ARRAY_SIZE(pm_qos_array) != PM_QOS_NUM_CLASSES);
613 696
697 d = debugfs_create_dir("pm_qos", NULL);
698 if (IS_ERR_OR_NULL(d))
699 d = NULL;
700
614 for (i = PM_QOS_CPU_DMA_LATENCY; i < PM_QOS_NUM_CLASSES; i++) { 701 for (i = PM_QOS_CPU_DMA_LATENCY; i < PM_QOS_NUM_CLASSES; i++) {
615 ret = register_pm_qos_misc(pm_qos_array[i]); 702 ret = register_pm_qos_misc(pm_qos_array[i], d);
616 if (ret < 0) { 703 if (ret < 0) {
617 printk(KERN_ERR "pm_qos_param: %s setup failed\n", 704 printk(KERN_ERR "pm_qos_param: %s setup failed\n",
618 pm_qos_array[i]->name); 705 pm_qos_array[i]->name);
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index 0c40c16174b4..c24d5a23bf93 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -1472,9 +1472,9 @@ static inline unsigned long preallocate_highmem_fraction(unsigned long nr_pages,
1472/** 1472/**
1473 * free_unnecessary_pages - Release preallocated pages not needed for the image 1473 * free_unnecessary_pages - Release preallocated pages not needed for the image
1474 */ 1474 */
1475static void free_unnecessary_pages(void) 1475static unsigned long free_unnecessary_pages(void)
1476{ 1476{
1477 unsigned long save, to_free_normal, to_free_highmem; 1477 unsigned long save, to_free_normal, to_free_highmem, free;
1478 1478
1479 save = count_data_pages(); 1479 save = count_data_pages();
1480 if (alloc_normal >= save) { 1480 if (alloc_normal >= save) {
@@ -1495,6 +1495,7 @@ static void free_unnecessary_pages(void)
1495 else 1495 else
1496 to_free_normal = 0; 1496 to_free_normal = 0;
1497 } 1497 }
1498 free = to_free_normal + to_free_highmem;
1498 1499
1499 memory_bm_position_reset(&copy_bm); 1500 memory_bm_position_reset(&copy_bm);
1500 1501
@@ -1518,6 +1519,8 @@ static void free_unnecessary_pages(void)
1518 swsusp_unset_page_free(page); 1519 swsusp_unset_page_free(page);
1519 __free_page(page); 1520 __free_page(page);
1520 } 1521 }
1522
1523 return free;
1521} 1524}
1522 1525
1523/** 1526/**
@@ -1707,7 +1710,7 @@ int hibernate_preallocate_memory(void)
1707 * pages in memory, but we have allocated more. Release the excessive 1710 * pages in memory, but we have allocated more. Release the excessive
1708 * ones now. 1711 * ones now.
1709 */ 1712 */
1710 free_unnecessary_pages(); 1713 pages -= free_unnecessary_pages();
1711 1714
1712 out: 1715 out:
1713 stop = ktime_get(); 1716 stop = ktime_get();
@@ -2310,8 +2313,6 @@ static inline void free_highmem_data(void)
2310 free_image_page(buffer, PG_UNSAFE_CLEAR); 2313 free_image_page(buffer, PG_UNSAFE_CLEAR);
2311} 2314}
2312#else 2315#else
2313static inline int get_safe_write_buffer(void) { return 0; }
2314
2315static unsigned int 2316static unsigned int
2316count_highmem_image_pages(struct memory_bitmap *bm) { return 0; } 2317count_highmem_image_pages(struct memory_bitmap *bm) { return 0; }
2317 2318
diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c
index c347e3ce3a55..b7d6b3a721b1 100644
--- a/kernel/power/suspend.c
+++ b/kernel/power/suspend.c
@@ -37,7 +37,9 @@ const char *pm_states[PM_SUSPEND_MAX];
37static const struct platform_suspend_ops *suspend_ops; 37static const struct platform_suspend_ops *suspend_ops;
38static const struct platform_freeze_ops *freeze_ops; 38static const struct platform_freeze_ops *freeze_ops;
39static DECLARE_WAIT_QUEUE_HEAD(suspend_freeze_wait_head); 39static DECLARE_WAIT_QUEUE_HEAD(suspend_freeze_wait_head);
40static bool suspend_freeze_wake; 40
41enum freeze_state __read_mostly suspend_freeze_state;
42static DEFINE_SPINLOCK(suspend_freeze_lock);
41 43
42void freeze_set_ops(const struct platform_freeze_ops *ops) 44void freeze_set_ops(const struct platform_freeze_ops *ops)
43{ 45{
@@ -48,22 +50,49 @@ void freeze_set_ops(const struct platform_freeze_ops *ops)
48 50
49static void freeze_begin(void) 51static void freeze_begin(void)
50{ 52{
51 suspend_freeze_wake = false; 53 suspend_freeze_state = FREEZE_STATE_NONE;
52} 54}
53 55
54static void freeze_enter(void) 56static void freeze_enter(void)
55{ 57{
56 cpuidle_use_deepest_state(true); 58 spin_lock_irq(&suspend_freeze_lock);
59 if (pm_wakeup_pending())
60 goto out;
61
62 suspend_freeze_state = FREEZE_STATE_ENTER;
63 spin_unlock_irq(&suspend_freeze_lock);
64
65 get_online_cpus();
57 cpuidle_resume(); 66 cpuidle_resume();
58 wait_event(suspend_freeze_wait_head, suspend_freeze_wake); 67
68 /* Push all the CPUs into the idle loop. */
69 wake_up_all_idle_cpus();
70 pr_debug("PM: suspend-to-idle\n");
71 /* Make the current CPU wait so it can enter the idle loop too. */
72 wait_event(suspend_freeze_wait_head,
73 suspend_freeze_state == FREEZE_STATE_WAKE);
74 pr_debug("PM: resume from suspend-to-idle\n");
75
59 cpuidle_pause(); 76 cpuidle_pause();
60 cpuidle_use_deepest_state(false); 77 put_online_cpus();
78
79 spin_lock_irq(&suspend_freeze_lock);
80
81 out:
82 suspend_freeze_state = FREEZE_STATE_NONE;
83 spin_unlock_irq(&suspend_freeze_lock);
61} 84}
62 85
63void freeze_wake(void) 86void freeze_wake(void)
64{ 87{
65 suspend_freeze_wake = true; 88 unsigned long flags;
66 wake_up(&suspend_freeze_wait_head); 89
90 spin_lock_irqsave(&suspend_freeze_lock, flags);
91 if (suspend_freeze_state > FREEZE_STATE_NONE) {
92 suspend_freeze_state = FREEZE_STATE_WAKE;
93 wake_up(&suspend_freeze_wait_head);
94 }
95 spin_unlock_irqrestore(&suspend_freeze_lock, flags);
67} 96}
68EXPORT_SYMBOL_GPL(freeze_wake); 97EXPORT_SYMBOL_GPL(freeze_wake);
69 98
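The single suspend_freeze_wake flag becomes a small state machine guarded by suspend_freeze_lock, so a wakeup that races with entry is either honoured (the pm_wakeup_pending() check before committing to FREEZE_STATE_ENTER) or correctly ignored when no freeze is in progress. A hedged restatement of the states and transitions; the enum itself is declared outside this diff, most likely in kernel/power/power.h:

enum freeze_state {
        FREEZE_STATE_NONE,      /* not in suspend-to-idle */
        FREEZE_STATE_ENTER,     /* CPUs are being pushed into the idle loop */
        FREEZE_STATE_WAKE,      /* a wakeup was requested, leave idle */
};

/*
 * freeze_begin():  any state -> NONE
 * freeze_enter():  NONE -> ENTER, unless pm_wakeup_pending(); then wait
 *                  until freeze_wake() moves the state to WAKE
 * freeze_wake():   ENTER or WAKE -> WAKE, plus wake_up(); ignored in NONE
 * Every transition is made under suspend_freeze_lock.
 */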
diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index 02d6b6d28796..c06df7de0963 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -935,8 +935,8 @@ static int __init ignore_loglevel_setup(char *str)
935 935
936early_param("ignore_loglevel", ignore_loglevel_setup); 936early_param("ignore_loglevel", ignore_loglevel_setup);
937module_param(ignore_loglevel, bool, S_IRUGO | S_IWUSR); 937module_param(ignore_loglevel, bool, S_IRUGO | S_IWUSR);
938MODULE_PARM_DESC(ignore_loglevel, "ignore loglevel setting, to" 938MODULE_PARM_DESC(ignore_loglevel,
939 "print all kernel messages to the console."); 939 "ignore loglevel setting (prints all kernel messages to the console)");
940 940
941#ifdef CONFIG_BOOT_PRINTK_DELAY 941#ifdef CONFIG_BOOT_PRINTK_DELAY
942 942
@@ -1419,16 +1419,16 @@ static void call_console_drivers(int level, const char *text, size_t len)
1419} 1419}
1420 1420
1421/* 1421/*
1422 * Zap console related locks when oopsing. Only zap at most once 1422 * Zap console related locks when oopsing.
1423 * every 10 seconds, to leave time for slow consoles to print a 1423 * To leave time for slow consoles to print a full oops,
1424 * full oops. 1424 * only zap at most once every 30 seconds.
1425 */ 1425 */
1426static void zap_locks(void) 1426static void zap_locks(void)
1427{ 1427{
1428 static unsigned long oops_timestamp; 1428 static unsigned long oops_timestamp;
1429 1429
1430 if (time_after_eq(jiffies, oops_timestamp) && 1430 if (time_after_eq(jiffies, oops_timestamp) &&
1431 !time_after(jiffies, oops_timestamp + 30 * HZ)) 1431 !time_after(jiffies, oops_timestamp + 30 * HZ))
1432 return; 1432 return;
1433 1433
1434 oops_timestamp = jiffies; 1434 oops_timestamp = jiffies;
diff --git a/kernel/profile.c b/kernel/profile.c
index 54bf5ba26420..a7bcd28d6e9f 100644
--- a/kernel/profile.c
+++ b/kernel/profile.c
@@ -422,8 +422,7 @@ void profile_tick(int type)
422 422
423static int prof_cpu_mask_proc_show(struct seq_file *m, void *v) 423static int prof_cpu_mask_proc_show(struct seq_file *m, void *v)
424{ 424{
425 seq_cpumask(m, prof_cpu_mask); 425 seq_printf(m, "%*pb\n", cpumask_pr_args(prof_cpu_mask));
426 seq_putc(m, '\n');
427 return 0; 426 return 0;
428} 427}
429 428
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 1eb9d90c3af9..227fec36b12a 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -1077,7 +1077,6 @@ int generic_ptrace_pokedata(struct task_struct *tsk, unsigned long addr,
1077} 1077}
1078 1078
1079#if defined CONFIG_COMPAT 1079#if defined CONFIG_COMPAT
1080#include <linux/compat.h>
1081 1080
1082int compat_ptrace_request(struct task_struct *child, compat_long_t request, 1081int compat_ptrace_request(struct task_struct *child, compat_long_t request,
1083 compat_ulong_t addr, compat_ulong_t data) 1082 compat_ulong_t addr, compat_ulong_t data)
diff --git a/kernel/range.c b/kernel/range.c
index 322ea8e93e4b..82cfc285b046 100644
--- a/kernel/range.c
+++ b/kernel/range.c
@@ -113,12 +113,12 @@ static int cmp_range(const void *x1, const void *x2)
113{ 113{
114 const struct range *r1 = x1; 114 const struct range *r1 = x1;
115 const struct range *r2 = x2; 115 const struct range *r2 = x2;
116 s64 start1, start2;
117 116
118 start1 = r1->start; 117 if (r1->start < r2->start)
119 start2 = r2->start; 118 return -1;
120 119 if (r1->start > r2->start)
121 return start1 - start2; 120 return 1;
121 return 0;
122} 122}
123 123
124int clean_sort_range(struct range *range, int az) 124int clean_sort_range(struct range *range, int az)
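The old comparator returned start1 - start2 through an int, so for 64-bit starts the sign of the difference could be lost in the narrowing and sort() could be told that clearly different ranges were equal or even inverted; for example, with r1->start == 0 and r2->start == 1ULL << 32 the old code computed -0x100000000 and truncated it to 0. The explicit three-way compare above avoids that. The same idiom in isolation (cmp_u64() is illustrative):

#include <linux/types.h>

static int cmp_u64(u64 a, u64 b)
{
        /* Never return a - b through an int: the narrowing can discard the
         * bits that carry the sign of the 64-bit difference. */
        if (a < b)
                return -1;
        if (a > b)
                return 1;
        return 0;
}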
diff --git a/kernel/rcu/Makefile b/kernel/rcu/Makefile
index e6fae503d1bc..50a808424b06 100644
--- a/kernel/rcu/Makefile
+++ b/kernel/rcu/Makefile
@@ -1,4 +1,5 @@
1obj-y += update.o srcu.o 1obj-y += update.o
2obj-$(CONFIG_SRCU) += srcu.o
2obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o 3obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o
3obj-$(CONFIG_TREE_RCU) += tree.o 4obj-$(CONFIG_TREE_RCU) += tree.o
4obj-$(CONFIG_PREEMPT_RCU) += tree.o 5obj-$(CONFIG_PREEMPT_RCU) += tree.o
diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h
index 07bb02eda844..80adef7d4c3d 100644
--- a/kernel/rcu/rcu.h
+++ b/kernel/rcu/rcu.h
@@ -137,4 +137,10 @@ int rcu_jiffies_till_stall_check(void);
137 137
138void rcu_early_boot_tests(void); 138void rcu_early_boot_tests(void);
139 139
140/*
141 * This function really isn't for public consumption, but RCU is special in
142 * that context switches can allow the state machine to make progress.
143 */
144extern void resched_cpu(int cpu);
145
140#endif /* __LINUX_RCU_H */ 146#endif /* __LINUX_RCU_H */
diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c
index 4d559baf06e0..30d42aa55d83 100644
--- a/kernel/rcu/rcutorture.c
+++ b/kernel/rcu/rcutorture.c
@@ -244,7 +244,8 @@ struct rcu_torture_ops {
244 int (*readlock)(void); 244 int (*readlock)(void);
245 void (*read_delay)(struct torture_random_state *rrsp); 245 void (*read_delay)(struct torture_random_state *rrsp);
246 void (*readunlock)(int idx); 246 void (*readunlock)(int idx);
247 int (*completed)(void); 247 unsigned long (*started)(void);
248 unsigned long (*completed)(void);
248 void (*deferred_free)(struct rcu_torture *p); 249 void (*deferred_free)(struct rcu_torture *p);
249 void (*sync)(void); 250 void (*sync)(void);
250 void (*exp_sync)(void); 251 void (*exp_sync)(void);
@@ -296,11 +297,6 @@ static void rcu_torture_read_unlock(int idx) __releases(RCU)
296 rcu_read_unlock(); 297 rcu_read_unlock();
297} 298}
298 299
299static int rcu_torture_completed(void)
300{
301 return rcu_batches_completed();
302}
303
304/* 300/*
305 * Update callback in the pipe. This should be invoked after a grace period. 301 * Update callback in the pipe. This should be invoked after a grace period.
306 */ 302 */
@@ -356,7 +352,7 @@ rcu_torture_cb(struct rcu_head *p)
356 cur_ops->deferred_free(rp); 352 cur_ops->deferred_free(rp);
357} 353}
358 354
359static int rcu_no_completed(void) 355static unsigned long rcu_no_completed(void)
360{ 356{
361 return 0; 357 return 0;
362} 358}
@@ -377,7 +373,8 @@ static struct rcu_torture_ops rcu_ops = {
377 .readlock = rcu_torture_read_lock, 373 .readlock = rcu_torture_read_lock,
378 .read_delay = rcu_read_delay, 374 .read_delay = rcu_read_delay,
379 .readunlock = rcu_torture_read_unlock, 375 .readunlock = rcu_torture_read_unlock,
380 .completed = rcu_torture_completed, 376 .started = rcu_batches_started,
377 .completed = rcu_batches_completed,
381 .deferred_free = rcu_torture_deferred_free, 378 .deferred_free = rcu_torture_deferred_free,
382 .sync = synchronize_rcu, 379 .sync = synchronize_rcu,
383 .exp_sync = synchronize_rcu_expedited, 380 .exp_sync = synchronize_rcu_expedited,
@@ -407,11 +404,6 @@ static void rcu_bh_torture_read_unlock(int idx) __releases(RCU_BH)
407 rcu_read_unlock_bh(); 404 rcu_read_unlock_bh();
408} 405}
409 406
410static int rcu_bh_torture_completed(void)
411{
412 return rcu_batches_completed_bh();
413}
414
415static void rcu_bh_torture_deferred_free(struct rcu_torture *p) 407static void rcu_bh_torture_deferred_free(struct rcu_torture *p)
416{ 408{
417 call_rcu_bh(&p->rtort_rcu, rcu_torture_cb); 409 call_rcu_bh(&p->rtort_rcu, rcu_torture_cb);
@@ -423,7 +415,8 @@ static struct rcu_torture_ops rcu_bh_ops = {
423 .readlock = rcu_bh_torture_read_lock, 415 .readlock = rcu_bh_torture_read_lock,
424 .read_delay = rcu_read_delay, /* just reuse rcu's version. */ 416 .read_delay = rcu_read_delay, /* just reuse rcu's version. */
425 .readunlock = rcu_bh_torture_read_unlock, 417 .readunlock = rcu_bh_torture_read_unlock,
426 .completed = rcu_bh_torture_completed, 418 .started = rcu_batches_started_bh,
419 .completed = rcu_batches_completed_bh,
427 .deferred_free = rcu_bh_torture_deferred_free, 420 .deferred_free = rcu_bh_torture_deferred_free,
428 .sync = synchronize_rcu_bh, 421 .sync = synchronize_rcu_bh,
429 .exp_sync = synchronize_rcu_bh_expedited, 422 .exp_sync = synchronize_rcu_bh_expedited,
@@ -466,6 +459,7 @@ static struct rcu_torture_ops rcu_busted_ops = {
466 .readlock = rcu_torture_read_lock, 459 .readlock = rcu_torture_read_lock,
467 .read_delay = rcu_read_delay, /* just reuse rcu's version. */ 460 .read_delay = rcu_read_delay, /* just reuse rcu's version. */
468 .readunlock = rcu_torture_read_unlock, 461 .readunlock = rcu_torture_read_unlock,
462 .started = rcu_no_completed,
469 .completed = rcu_no_completed, 463 .completed = rcu_no_completed,
470 .deferred_free = rcu_busted_torture_deferred_free, 464 .deferred_free = rcu_busted_torture_deferred_free,
471 .sync = synchronize_rcu_busted, 465 .sync = synchronize_rcu_busted,
@@ -510,7 +504,7 @@ static void srcu_torture_read_unlock(int idx) __releases(&srcu_ctl)
510 srcu_read_unlock(&srcu_ctl, idx); 504 srcu_read_unlock(&srcu_ctl, idx);
511} 505}
512 506
513static int srcu_torture_completed(void) 507static unsigned long srcu_torture_completed(void)
514{ 508{
515 return srcu_batches_completed(&srcu_ctl); 509 return srcu_batches_completed(&srcu_ctl);
516} 510}
@@ -564,6 +558,7 @@ static struct rcu_torture_ops srcu_ops = {
564 .readlock = srcu_torture_read_lock, 558 .readlock = srcu_torture_read_lock,
565 .read_delay = srcu_read_delay, 559 .read_delay = srcu_read_delay,
566 .readunlock = srcu_torture_read_unlock, 560 .readunlock = srcu_torture_read_unlock,
561 .started = NULL,
567 .completed = srcu_torture_completed, 562 .completed = srcu_torture_completed,
568 .deferred_free = srcu_torture_deferred_free, 563 .deferred_free = srcu_torture_deferred_free,
569 .sync = srcu_torture_synchronize, 564 .sync = srcu_torture_synchronize,
@@ -600,7 +595,8 @@ static struct rcu_torture_ops sched_ops = {
600 .readlock = sched_torture_read_lock, 595 .readlock = sched_torture_read_lock,
601 .read_delay = rcu_read_delay, /* just reuse rcu's version. */ 596 .read_delay = rcu_read_delay, /* just reuse rcu's version. */
602 .readunlock = sched_torture_read_unlock, 597 .readunlock = sched_torture_read_unlock,
603 .completed = rcu_no_completed, 598 .started = rcu_batches_started_sched,
599 .completed = rcu_batches_completed_sched,
604 .deferred_free = rcu_sched_torture_deferred_free, 600 .deferred_free = rcu_sched_torture_deferred_free,
605 .sync = synchronize_sched, 601 .sync = synchronize_sched,
606 .exp_sync = synchronize_sched_expedited, 602 .exp_sync = synchronize_sched_expedited,
@@ -638,6 +634,7 @@ static struct rcu_torture_ops tasks_ops = {
638 .readlock = tasks_torture_read_lock, 634 .readlock = tasks_torture_read_lock,
639 .read_delay = rcu_read_delay, /* just reuse rcu's version. */ 635 .read_delay = rcu_read_delay, /* just reuse rcu's version. */
640 .readunlock = tasks_torture_read_unlock, 636 .readunlock = tasks_torture_read_unlock,
637 .started = rcu_no_completed,
641 .completed = rcu_no_completed, 638 .completed = rcu_no_completed,
642 .deferred_free = rcu_tasks_torture_deferred_free, 639 .deferred_free = rcu_tasks_torture_deferred_free,
643 .sync = synchronize_rcu_tasks, 640 .sync = synchronize_rcu_tasks,
@@ -1015,8 +1012,8 @@ static void rcutorture_trace_dump(void)
1015static void rcu_torture_timer(unsigned long unused) 1012static void rcu_torture_timer(unsigned long unused)
1016{ 1013{
1017 int idx; 1014 int idx;
1018 int completed; 1015 unsigned long started;
1019 int completed_end; 1016 unsigned long completed;
1020 static DEFINE_TORTURE_RANDOM(rand); 1017 static DEFINE_TORTURE_RANDOM(rand);
1021 static DEFINE_SPINLOCK(rand_lock); 1018 static DEFINE_SPINLOCK(rand_lock);
1022 struct rcu_torture *p; 1019 struct rcu_torture *p;
@@ -1024,7 +1021,10 @@ static void rcu_torture_timer(unsigned long unused)
1024 unsigned long long ts; 1021 unsigned long long ts;
1025 1022
1026 idx = cur_ops->readlock(); 1023 idx = cur_ops->readlock();
1027 completed = cur_ops->completed(); 1024 if (cur_ops->started)
1025 started = cur_ops->started();
1026 else
1027 started = cur_ops->completed();
1028 ts = rcu_trace_clock_local(); 1028 ts = rcu_trace_clock_local();
1029 p = rcu_dereference_check(rcu_torture_current, 1029 p = rcu_dereference_check(rcu_torture_current,
1030 rcu_read_lock_bh_held() || 1030 rcu_read_lock_bh_held() ||
@@ -1047,14 +1047,16 @@ static void rcu_torture_timer(unsigned long unused)
1047 /* Should not happen, but... */ 1047 /* Should not happen, but... */
1048 pipe_count = RCU_TORTURE_PIPE_LEN; 1048 pipe_count = RCU_TORTURE_PIPE_LEN;
1049 } 1049 }
1050 completed_end = cur_ops->completed(); 1050 completed = cur_ops->completed();
1051 if (pipe_count > 1) { 1051 if (pipe_count > 1) {
1052 do_trace_rcu_torture_read(cur_ops->name, &p->rtort_rcu, ts, 1052 do_trace_rcu_torture_read(cur_ops->name, &p->rtort_rcu, ts,
1053 completed, completed_end); 1053 started, completed);
1054 rcutorture_trace_dump(); 1054 rcutorture_trace_dump();
1055 } 1055 }
1056 __this_cpu_inc(rcu_torture_count[pipe_count]); 1056 __this_cpu_inc(rcu_torture_count[pipe_count]);
1057 completed = completed_end - completed; 1057 completed = completed - started;
1058 if (cur_ops->started)
1059 completed++;
1058 if (completed > RCU_TORTURE_PIPE_LEN) { 1060 if (completed > RCU_TORTURE_PIPE_LEN) {
1059 /* Should not happen, but... */ 1061 /* Should not happen, but... */
1060 completed = RCU_TORTURE_PIPE_LEN; 1062 completed = RCU_TORTURE_PIPE_LEN;
@@ -1073,8 +1075,8 @@ static void rcu_torture_timer(unsigned long unused)
1073static int 1075static int
1074rcu_torture_reader(void *arg) 1076rcu_torture_reader(void *arg)
1075{ 1077{
1076 int completed; 1078 unsigned long started;
1077 int completed_end; 1079 unsigned long completed;
1078 int idx; 1080 int idx;
1079 DEFINE_TORTURE_RANDOM(rand); 1081 DEFINE_TORTURE_RANDOM(rand);
1080 struct rcu_torture *p; 1082 struct rcu_torture *p;
@@ -1093,7 +1095,10 @@ rcu_torture_reader(void *arg)
1093 mod_timer(&t, jiffies + 1); 1095 mod_timer(&t, jiffies + 1);
1094 } 1096 }
1095 idx = cur_ops->readlock(); 1097 idx = cur_ops->readlock();
1096 completed = cur_ops->completed(); 1098 if (cur_ops->started)
1099 started = cur_ops->started();
1100 else
1101 started = cur_ops->completed();
1097 ts = rcu_trace_clock_local(); 1102 ts = rcu_trace_clock_local();
1098 p = rcu_dereference_check(rcu_torture_current, 1103 p = rcu_dereference_check(rcu_torture_current,
1099 rcu_read_lock_bh_held() || 1104 rcu_read_lock_bh_held() ||
@@ -1114,14 +1119,16 @@ rcu_torture_reader(void *arg)
1114 /* Should not happen, but... */ 1119 /* Should not happen, but... */
1115 pipe_count = RCU_TORTURE_PIPE_LEN; 1120 pipe_count = RCU_TORTURE_PIPE_LEN;
1116 } 1121 }
1117 completed_end = cur_ops->completed(); 1122 completed = cur_ops->completed();
1118 if (pipe_count > 1) { 1123 if (pipe_count > 1) {
1119 do_trace_rcu_torture_read(cur_ops->name, &p->rtort_rcu, 1124 do_trace_rcu_torture_read(cur_ops->name, &p->rtort_rcu,
1120 ts, completed, completed_end); 1125 ts, started, completed);
1121 rcutorture_trace_dump(); 1126 rcutorture_trace_dump();
1122 } 1127 }
1123 __this_cpu_inc(rcu_torture_count[pipe_count]); 1128 __this_cpu_inc(rcu_torture_count[pipe_count]);
1124 completed = completed_end - completed; 1129 completed = completed - started;
1130 if (cur_ops->started)
1131 completed++;
1125 if (completed > RCU_TORTURE_PIPE_LEN) { 1132 if (completed > RCU_TORTURE_PIPE_LEN) {
1126 /* Should not happen, but... */ 1133 /* Should not happen, but... */
1127 completed = RCU_TORTURE_PIPE_LEN; 1134 completed = RCU_TORTURE_PIPE_LEN;
@@ -1420,6 +1427,9 @@ static int rcu_torture_barrier(void *arg)
1420 cur_ops->cb_barrier(); /* Implies smp_mb() for wait_event(). */ 1427 cur_ops->cb_barrier(); /* Implies smp_mb() for wait_event(). */
1421 if (atomic_read(&barrier_cbs_invoked) != n_barrier_cbs) { 1428 if (atomic_read(&barrier_cbs_invoked) != n_barrier_cbs) {
1422 n_rcu_torture_barrier_error++; 1429 n_rcu_torture_barrier_error++;
1430 pr_err("barrier_cbs_invoked = %d, n_barrier_cbs = %d\n",
1431 atomic_read(&barrier_cbs_invoked),
1432 n_barrier_cbs);
1423 WARN_ON_ONCE(1); 1433 WARN_ON_ONCE(1);
1424 } 1434 }
1425 n_barrier_successes++; 1435 n_barrier_successes++;
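The reader and timer paths above now snapshot ->started() (falling back to ->completed()) before the read-side critical section and ->completed() after it, then add one when a separate start counter exists, presumably to keep the result comparable with the old completed-to-completed measurement, since a start counter can run one grace period ahead of the completion counter. A minimal user-space model of that bookkeeping follows; gp_started(), gp_completed() and PIPE_LEN are made-up stand-ins for cur_ops->started(), cur_ops->completed() and RCU_TORTURE_PIPE_LEN, not kernel symbols.

#include <stdio.h>

#define PIPE_LEN 10                        /* stand-in for RCU_TORTURE_PIPE_LEN */

static unsigned long gp_start_ctr = 3;     /* pretend grace-period start counter */
static unsigned long gp_done_ctr  = 2;     /* pretend grace-period completion counter */

static unsigned long gp_started(void)   { return gp_start_ctr; }
static unsigned long gp_completed(void) { return gp_done_ctr;  }

/* Model of the reader path: how many grace periods covered the read side? */
static unsigned long pipe_slot(int have_started_op)
{
	unsigned long started, completed;

	started = have_started_op ? gp_started() : gp_completed();
	/* ...read-side critical section; pretend two GPs complete meanwhile... */
	gp_start_ctr += 2;
	gp_done_ctr  += 2;
	completed = gp_completed();

	completed -= started;              /* unsigned delta survives counter wrap */
	if (have_started_op)
		completed++;               /* start counter may run one GP ahead   */
	if (completed > PIPE_LEN)          /* "should not happen, but..."          */
		completed = PIPE_LEN;
	return completed;
}

int main(void)
{
	printf("pipe slot with ->started(): %lu\n", pipe_slot(1));
	printf("pipe slot without:          %lu\n", pipe_slot(0));
	return 0;
}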
diff --git a/kernel/rcu/srcu.c b/kernel/rcu/srcu.c
index e037f3eb2f7b..445bf8ffe3fb 100644
--- a/kernel/rcu/srcu.c
+++ b/kernel/rcu/srcu.c
@@ -546,7 +546,7 @@ EXPORT_SYMBOL_GPL(srcu_barrier);
546 * Report the number of batches, correlated with, but not necessarily 546 * Report the number of batches, correlated with, but not necessarily
547 * precisely the same as, the number of grace periods that have elapsed. 547 * precisely the same as, the number of grace periods that have elapsed.
548 */ 548 */
549long srcu_batches_completed(struct srcu_struct *sp) 549unsigned long srcu_batches_completed(struct srcu_struct *sp)
550{ 550{
551 return sp->completed; 551 return sp->completed;
552} 552}
diff --git a/kernel/rcu/tiny.c b/kernel/rcu/tiny.c
index 0db5649f8817..cc9ceca7bde1 100644
--- a/kernel/rcu/tiny.c
+++ b/kernel/rcu/tiny.c
@@ -47,54 +47,14 @@ static void __call_rcu(struct rcu_head *head,
47 void (*func)(struct rcu_head *rcu), 47 void (*func)(struct rcu_head *rcu),
48 struct rcu_ctrlblk *rcp); 48 struct rcu_ctrlblk *rcp);
49 49
50static long long rcu_dynticks_nesting = DYNTICK_TASK_EXIT_IDLE;
51
52#include "tiny_plugin.h" 50#include "tiny_plugin.h"
53 51
54/* Common code for rcu_idle_enter() and rcu_irq_exit(), see kernel/rcu/tree.c. */
55static void rcu_idle_enter_common(long long newval)
56{
57 if (newval) {
58 RCU_TRACE(trace_rcu_dyntick(TPS("--="),
59 rcu_dynticks_nesting, newval));
60 rcu_dynticks_nesting = newval;
61 return;
62 }
63 RCU_TRACE(trace_rcu_dyntick(TPS("Start"),
64 rcu_dynticks_nesting, newval));
65 if (IS_ENABLED(CONFIG_RCU_TRACE) && !is_idle_task(current)) {
66 struct task_struct *idle __maybe_unused = idle_task(smp_processor_id());
67
68 RCU_TRACE(trace_rcu_dyntick(TPS("Entry error: not idle task"),
69 rcu_dynticks_nesting, newval));
70 ftrace_dump(DUMP_ALL);
71 WARN_ONCE(1, "Current pid: %d comm: %s / Idle pid: %d comm: %s",
72 current->pid, current->comm,
73 idle->pid, idle->comm); /* must be idle task! */
74 }
75 rcu_sched_qs(); /* implies rcu_bh_inc() */
76 barrier();
77 rcu_dynticks_nesting = newval;
78}
79
80/* 52/*
81 * Enter idle, which is an extended quiescent state if we have fully 53 * Enter idle, which is an extended quiescent state if we have fully
82 * entered that mode (i.e., if the new value of dynticks_nesting is zero). 54 * entered that mode.
83 */ 55 */
84void rcu_idle_enter(void) 56void rcu_idle_enter(void)
85{ 57{
86 unsigned long flags;
87 long long newval;
88
89 local_irq_save(flags);
90 WARN_ON_ONCE((rcu_dynticks_nesting & DYNTICK_TASK_NEST_MASK) == 0);
91 if ((rcu_dynticks_nesting & DYNTICK_TASK_NEST_MASK) ==
92 DYNTICK_TASK_NEST_VALUE)
93 newval = 0;
94 else
95 newval = rcu_dynticks_nesting - DYNTICK_TASK_NEST_VALUE;
96 rcu_idle_enter_common(newval);
97 local_irq_restore(flags);
98} 58}
99EXPORT_SYMBOL_GPL(rcu_idle_enter); 59EXPORT_SYMBOL_GPL(rcu_idle_enter);
100 60
@@ -103,55 +63,14 @@ EXPORT_SYMBOL_GPL(rcu_idle_enter);
103 */ 63 */
104void rcu_irq_exit(void) 64void rcu_irq_exit(void)
105{ 65{
106 unsigned long flags;
107 long long newval;
108
109 local_irq_save(flags);
110 newval = rcu_dynticks_nesting - 1;
111 WARN_ON_ONCE(newval < 0);
112 rcu_idle_enter_common(newval);
113 local_irq_restore(flags);
114} 66}
115EXPORT_SYMBOL_GPL(rcu_irq_exit); 67EXPORT_SYMBOL_GPL(rcu_irq_exit);
116 68
117/* Common code for rcu_idle_exit() and rcu_irq_enter(), see kernel/rcu/tree.c. */
118static void rcu_idle_exit_common(long long oldval)
119{
120 if (oldval) {
121 RCU_TRACE(trace_rcu_dyntick(TPS("++="),
122 oldval, rcu_dynticks_nesting));
123 return;
124 }
125 RCU_TRACE(trace_rcu_dyntick(TPS("End"), oldval, rcu_dynticks_nesting));
126 if (IS_ENABLED(CONFIG_RCU_TRACE) && !is_idle_task(current)) {
127 struct task_struct *idle __maybe_unused = idle_task(smp_processor_id());
128
129 RCU_TRACE(trace_rcu_dyntick(TPS("Exit error: not idle task"),
130 oldval, rcu_dynticks_nesting));
131 ftrace_dump(DUMP_ALL);
132 WARN_ONCE(1, "Current pid: %d comm: %s / Idle pid: %d comm: %s",
133 current->pid, current->comm,
134 idle->pid, idle->comm); /* must be idle task! */
135 }
136}
137
138/* 69/*
139 * Exit idle, so that we are no longer in an extended quiescent state. 70 * Exit idle, so that we are no longer in an extended quiescent state.
140 */ 71 */
141void rcu_idle_exit(void) 72void rcu_idle_exit(void)
142{ 73{
143 unsigned long flags;
144 long long oldval;
145
146 local_irq_save(flags);
147 oldval = rcu_dynticks_nesting;
148 WARN_ON_ONCE(rcu_dynticks_nesting < 0);
149 if (rcu_dynticks_nesting & DYNTICK_TASK_NEST_MASK)
150 rcu_dynticks_nesting += DYNTICK_TASK_NEST_VALUE;
151 else
152 rcu_dynticks_nesting = DYNTICK_TASK_EXIT_IDLE;
153 rcu_idle_exit_common(oldval);
154 local_irq_restore(flags);
155} 74}
156EXPORT_SYMBOL_GPL(rcu_idle_exit); 75EXPORT_SYMBOL_GPL(rcu_idle_exit);
157 76
@@ -160,15 +79,6 @@ EXPORT_SYMBOL_GPL(rcu_idle_exit);
160 */ 79 */
161void rcu_irq_enter(void) 80void rcu_irq_enter(void)
162{ 81{
163 unsigned long flags;
164 long long oldval;
165
166 local_irq_save(flags);
167 oldval = rcu_dynticks_nesting;
168 rcu_dynticks_nesting++;
169 WARN_ON_ONCE(rcu_dynticks_nesting == 0);
170 rcu_idle_exit_common(oldval);
171 local_irq_restore(flags);
172} 82}
173EXPORT_SYMBOL_GPL(rcu_irq_enter); 83EXPORT_SYMBOL_GPL(rcu_irq_enter);
174 84
@@ -179,23 +89,13 @@ EXPORT_SYMBOL_GPL(rcu_irq_enter);
179 */ 89 */
180bool notrace __rcu_is_watching(void) 90bool notrace __rcu_is_watching(void)
181{ 91{
182 return rcu_dynticks_nesting; 92 return true;
183} 93}
184EXPORT_SYMBOL(__rcu_is_watching); 94EXPORT_SYMBOL(__rcu_is_watching);
185 95
186#endif /* defined(CONFIG_DEBUG_LOCK_ALLOC) || defined(CONFIG_RCU_TRACE) */ 96#endif /* defined(CONFIG_DEBUG_LOCK_ALLOC) || defined(CONFIG_RCU_TRACE) */
187 97
188/* 98/*
189 * Test whether the current CPU was interrupted from idle. Nested
190 * interrupts don't count, we must be running at the first interrupt
191 * level.
192 */
193static int rcu_is_cpu_rrupt_from_idle(void)
194{
195 return rcu_dynticks_nesting <= 1;
196}
197
198/*
199 * Helper function for rcu_sched_qs() and rcu_bh_qs(). 99 * Helper function for rcu_sched_qs() and rcu_bh_qs().
200 * Also irqs are disabled to avoid confusion due to interrupt handlers 100 * Also irqs are disabled to avoid confusion due to interrupt handlers
201 * invoking call_rcu(). 101 * invoking call_rcu().
@@ -250,7 +150,7 @@ void rcu_bh_qs(void)
250void rcu_check_callbacks(int user) 150void rcu_check_callbacks(int user)
251{ 151{
252 RCU_TRACE(check_cpu_stalls()); 152 RCU_TRACE(check_cpu_stalls());
253 if (user || rcu_is_cpu_rrupt_from_idle()) 153 if (user)
254 rcu_sched_qs(); 154 rcu_sched_qs();
255 else if (!in_softirq()) 155 else if (!in_softirq())
256 rcu_bh_qs(); 156 rcu_bh_qs();
@@ -357,6 +257,11 @@ static void __call_rcu(struct rcu_head *head,
357 rcp->curtail = &head->next; 257 rcp->curtail = &head->next;
358 RCU_TRACE(rcp->qlen++); 258 RCU_TRACE(rcp->qlen++);
359 local_irq_restore(flags); 259 local_irq_restore(flags);
260
261 if (unlikely(is_idle_task(current))) {
262 /* force scheduling for rcu_sched_qs() */
263 resched_cpu(0);
264 }
360} 265}
361 266
362/* 267/*
@@ -383,6 +288,8 @@ EXPORT_SYMBOL_GPL(call_rcu_bh);
383void __init rcu_init(void) 288void __init rcu_init(void)
384{ 289{
385 open_softirq(RCU_SOFTIRQ, rcu_process_callbacks); 290 open_softirq(RCU_SOFTIRQ, rcu_process_callbacks);
291 RCU_TRACE(reset_cpu_stall_ticks(&rcu_sched_ctrlblk));
292 RCU_TRACE(reset_cpu_stall_ticks(&rcu_bh_ctrlblk));
386 293
387 rcu_early_boot_tests(); 294 rcu_early_boot_tests();
388} 295}
diff --git a/kernel/rcu/tiny_plugin.h b/kernel/rcu/tiny_plugin.h
index 858c56569127..f94e209a10d6 100644
--- a/kernel/rcu/tiny_plugin.h
+++ b/kernel/rcu/tiny_plugin.h
@@ -145,17 +145,16 @@ static void check_cpu_stall(struct rcu_ctrlblk *rcp)
145 rcp->ticks_this_gp++; 145 rcp->ticks_this_gp++;
146 j = jiffies; 146 j = jiffies;
147 js = ACCESS_ONCE(rcp->jiffies_stall); 147 js = ACCESS_ONCE(rcp->jiffies_stall);
148 if (*rcp->curtail && ULONG_CMP_GE(j, js)) { 148 if (rcp->rcucblist && ULONG_CMP_GE(j, js)) {
149 pr_err("INFO: %s stall on CPU (%lu ticks this GP) idle=%llx (t=%lu jiffies q=%ld)\n", 149 pr_err("INFO: %s stall on CPU (%lu ticks this GP) idle=%llx (t=%lu jiffies q=%ld)\n",
150 rcp->name, rcp->ticks_this_gp, rcu_dynticks_nesting, 150 rcp->name, rcp->ticks_this_gp, DYNTICK_TASK_EXIT_IDLE,
151 jiffies - rcp->gp_start, rcp->qlen); 151 jiffies - rcp->gp_start, rcp->qlen);
152 dump_stack(); 152 dump_stack();
153 }
154 if (*rcp->curtail && ULONG_CMP_GE(j, js))
155 ACCESS_ONCE(rcp->jiffies_stall) = jiffies + 153 ACCESS_ONCE(rcp->jiffies_stall) = jiffies +
156 3 * rcu_jiffies_till_stall_check() + 3; 154 3 * rcu_jiffies_till_stall_check() + 3;
157 else if (ULONG_CMP_GE(j, js)) 155 } else if (ULONG_CMP_GE(j, js)) {
158 ACCESS_ONCE(rcp->jiffies_stall) = jiffies + rcu_jiffies_till_stall_check(); 156 ACCESS_ONCE(rcp->jiffies_stall) = jiffies + rcu_jiffies_till_stall_check();
157 }
159} 158}
160 159
161static void reset_cpu_stall_ticks(struct rcu_ctrlblk *rcp) 160static void reset_cpu_stall_ticks(struct rcu_ctrlblk *rcp)
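The stall test above switches from *rcp->curtail to rcp->rcucblist. In Tiny RCU the ->curtail field points at the ->next pointer of the last queued callback (or at &->rcucblist when the list is empty), so *->curtail is always NULL and the old test presumably could never fire; testing ->rcucblist asks the intended question, namely whether any callbacks are queued at all. A standalone sketch of that head/tail-pointer list representation (struct cb and the helper below are illustrative, not the kernel's definitions):

#include <stdio.h>
#include <stddef.h>

struct cb {
	struct cb *next;
};

static struct cb *cblist;                  /* head of queued callbacks        */
static struct cb **curtail = &cblist;      /* points at last element's ->next */

static void enqueue(struct cb *head)
{
	head->next = NULL;
	*curtail = head;                   /* link after the current tail     */
	curtail = &head->next;             /* tail pointer moves to new ->next */
}

int main(void)
{
	struct cb a, b;

	enqueue(&a);
	enqueue(&b);
	/* *curtail is the last callback's ->next and is therefore always NULL,
	 * so it cannot serve as a "list is non-empty" test; cblist can. */
	printf("*curtail = %p, cblist = %p\n", (void *)*curtail, (void *)cblist);
	return 0;
}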
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 7680fc275036..48d640ca1a05 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -156,6 +156,10 @@ static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu)
156static void invoke_rcu_core(void); 156static void invoke_rcu_core(void);
157static void invoke_rcu_callbacks(struct rcu_state *rsp, struct rcu_data *rdp); 157static void invoke_rcu_callbacks(struct rcu_state *rsp, struct rcu_data *rdp);
158 158
159/* rcuc/rcub kthread realtime priority */
160static int kthread_prio = CONFIG_RCU_KTHREAD_PRIO;
161module_param(kthread_prio, int, 0644);
162
159/* 163/*
160 * Track the rcutorture test sequence number and the update version 164 * Track the rcutorture test sequence number and the update version
161 * number within a given test. The rcutorture_testseq is incremented 165 * number within a given test. The rcutorture_testseq is incremented
@@ -215,6 +219,9 @@ static DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks) = {
215#endif /* #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */ 219#endif /* #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */
216}; 220};
217 221
222DEFINE_PER_CPU_SHARED_ALIGNED(unsigned long, rcu_qs_ctr);
223EXPORT_PER_CPU_SYMBOL_GPL(rcu_qs_ctr);
224
218/* 225/*
219 * Let the RCU core know that this CPU has gone through the scheduler, 226 * Let the RCU core know that this CPU has gone through the scheduler,
220 * which is a quiescent state. This is called when the need for a 227 * which is a quiescent state. This is called when the need for a
@@ -284,6 +291,22 @@ void rcu_note_context_switch(void)
284} 291}
285EXPORT_SYMBOL_GPL(rcu_note_context_switch); 292EXPORT_SYMBOL_GPL(rcu_note_context_switch);
286 293
294/*
 295 * Register a quiescent state for all RCU flavors. If there is an
296 * emergency, invoke rcu_momentary_dyntick_idle() to do a heavy-weight
297 * dyntick-idle quiescent state visible to other CPUs (but only for those
 298 * RCU flavors in desperate need of a quiescent state, which will normally
299 * be none of them). Either way, do a lightweight quiescent state for
300 * all RCU flavors.
301 */
302void rcu_all_qs(void)
303{
304 if (unlikely(raw_cpu_read(rcu_sched_qs_mask)))
305 rcu_momentary_dyntick_idle();
306 this_cpu_inc(rcu_qs_ctr);
307}
308EXPORT_SYMBOL_GPL(rcu_all_qs);
309
287static long blimit = 10; /* Maximum callbacks per rcu_do_batch. */ 310static long blimit = 10; /* Maximum callbacks per rcu_do_batch. */
288static long qhimark = 10000; /* If this many pending, ignore blimit. */ 311static long qhimark = 10000; /* If this many pending, ignore blimit. */
289static long qlowmark = 100; /* Once only this many pending, use blimit. */ 312static long qlowmark = 100; /* Once only this many pending, use blimit. */
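rcu_all_qs() above bumps the new per-CPU rcu_qs_ctr; later hunks in this file snapshot that counter into rdp->rcu_qs_ctr_snap when a grace period begins and treat any subsequent change as evidence that the CPU passed through a quiescent state (see the rcu_report_qs_rdp(), rcu_check_quiescent_state() and __rcu_pending() changes below). A minimal single-CPU model of the snapshot-and-compare pattern, using made-up names rather than the kernel's per-CPU machinery:

#include <stdio.h>
#include <stdbool.h>

static unsigned long qs_ctr;          /* model of this CPU's rcu_qs_ctr       */
static unsigned long qs_ctr_snap;     /* model of rdp->rcu_qs_ctr_snap        */

static void all_qs(void)              /* model of rcu_all_qs()                */
{
	qs_ctr++;                     /* lightweight quiescent-state report   */
}

static void gp_start(void)            /* grace period begins: take a snapshot */
{
	qs_ctr_snap = qs_ctr;
}

static bool cpu_passed_qs(void)       /* model of the new "did it move?" test */
{
	return qs_ctr != qs_ctr_snap;
}

int main(void)
{
	gp_start();
	printf("before rcu_all_qs(): passed=%d\n", cpu_passed_qs());
	all_qs();                     /* e.g. from a cond_resched_rcu_qs() site */
	printf("after  rcu_all_qs(): passed=%d\n", cpu_passed_qs());
	return 0;
}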
@@ -315,18 +338,54 @@ static void force_quiescent_state(struct rcu_state *rsp);
315static int rcu_pending(void); 338static int rcu_pending(void);
316 339
317/* 340/*
318 * Return the number of RCU-sched batches processed thus far for debug & stats. 341 * Return the number of RCU batches started thus far for debug & stats.
342 */
343unsigned long rcu_batches_started(void)
344{
345 return rcu_state_p->gpnum;
346}
347EXPORT_SYMBOL_GPL(rcu_batches_started);
348
349/*
350 * Return the number of RCU-sched batches started thus far for debug & stats.
351 */
352unsigned long rcu_batches_started_sched(void)
353{
354 return rcu_sched_state.gpnum;
355}
356EXPORT_SYMBOL_GPL(rcu_batches_started_sched);
357
358/*
359 * Return the number of RCU BH batches started thus far for debug & stats.
319 */ 360 */
320long rcu_batches_completed_sched(void) 361unsigned long rcu_batches_started_bh(void)
362{
363 return rcu_bh_state.gpnum;
364}
365EXPORT_SYMBOL_GPL(rcu_batches_started_bh);
366
367/*
368 * Return the number of RCU batches completed thus far for debug & stats.
369 */
370unsigned long rcu_batches_completed(void)
371{
372 return rcu_state_p->completed;
373}
374EXPORT_SYMBOL_GPL(rcu_batches_completed);
375
376/*
377 * Return the number of RCU-sched batches completed thus far for debug & stats.
378 */
379unsigned long rcu_batches_completed_sched(void)
321{ 380{
322 return rcu_sched_state.completed; 381 return rcu_sched_state.completed;
323} 382}
324EXPORT_SYMBOL_GPL(rcu_batches_completed_sched); 383EXPORT_SYMBOL_GPL(rcu_batches_completed_sched);
325 384
326/* 385/*
327 * Return the number of RCU BH batches processed thus far for debug & stats. 386 * Return the number of RCU BH batches completed thus far for debug & stats.
328 */ 387 */
329long rcu_batches_completed_bh(void) 388unsigned long rcu_batches_completed_bh(void)
330{ 389{
331 return rcu_bh_state.completed; 390 return rcu_bh_state.completed;
332} 391}
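The new rcu_batches_started*() accessors pair with the rcu_batches_completed*() ones, which now return unsigned long so that callers can compare free-running counters in a wraparound-safe way. A hedged usage sketch of the "has at least one grace period completed since my snapshot?" pattern; batches_completed() is a stand-in accessor and the WRAP_GE() macro body is an assumption written in the spirit of the kernel's ULONG_CMP_GE(), not a copy of it:

#include <stdio.h>
#include <limits.h>
#include <stdbool.h>

/* Wrap-safe "a >= b" for free-running unsigned counters (assumed form). */
#define WRAP_GE(a, b)  (ULONG_MAX / 2 >= (a) - (b))

static unsigned long completed_ctr = ULONG_MAX - 1;   /* about to wrap */

static unsigned long batches_completed(void)          /* stand-in accessor */
{
	return completed_ctr;
}

int main(void)
{
	unsigned long snap = batches_completed();     /* snapshot at start */
	bool gp_elapsed;

	completed_ctr += 3;       /* three GPs complete; counter wraps past 0 */
	gp_elapsed = WRAP_GE(batches_completed(), snap + 1);
	printf("at least one grace period elapsed: %d\n", gp_elapsed);
	return 0;
}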
@@ -759,39 +818,71 @@ void rcu_irq_enter(void)
759/** 818/**
760 * rcu_nmi_enter - inform RCU of entry to NMI context 819 * rcu_nmi_enter - inform RCU of entry to NMI context
761 * 820 *
762 * If the CPU was idle with dynamic ticks active, and there is no 821 * If the CPU was idle from RCU's viewpoint, update rdtp->dynticks and
763 * irq handler running, this updates rdtp->dynticks_nmi to let the 822 * rdtp->dynticks_nmi_nesting to let the RCU grace-period handling know
764 * RCU grace-period handling know that the CPU is active. 823 * that the CPU is active. This implementation permits nested NMIs, as
824 * long as the nesting level does not overflow an int. (You will probably
825 * run out of stack space first.)
765 */ 826 */
766void rcu_nmi_enter(void) 827void rcu_nmi_enter(void)
767{ 828{
768 struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks); 829 struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks);
830 int incby = 2;
769 831
770 if (rdtp->dynticks_nmi_nesting == 0 && 832 /* Complain about underflow. */
771 (atomic_read(&rdtp->dynticks) & 0x1)) 833 WARN_ON_ONCE(rdtp->dynticks_nmi_nesting < 0);
772 return; 834
773 rdtp->dynticks_nmi_nesting++; 835 /*
774 smp_mb__before_atomic(); /* Force delay from prior write. */ 836 * If idle from RCU viewpoint, atomically increment ->dynticks
775 atomic_inc(&rdtp->dynticks); 837 * to mark non-idle and increment ->dynticks_nmi_nesting by one.
776 /* CPUs seeing atomic_inc() must see later RCU read-side crit sects */ 838 * Otherwise, increment ->dynticks_nmi_nesting by two. This means
777 smp_mb__after_atomic(); /* See above. */ 839 * if ->dynticks_nmi_nesting is equal to one, we are guaranteed
778 WARN_ON_ONCE(!(atomic_read(&rdtp->dynticks) & 0x1)); 840 * to be in the outermost NMI handler that interrupted an RCU-idle
841 * period (observation due to Andy Lutomirski).
842 */
843 if (!(atomic_read(&rdtp->dynticks) & 0x1)) {
844 smp_mb__before_atomic(); /* Force delay from prior write. */
845 atomic_inc(&rdtp->dynticks);
846 /* atomic_inc() before later RCU read-side crit sects */
847 smp_mb__after_atomic(); /* See above. */
848 WARN_ON_ONCE(!(atomic_read(&rdtp->dynticks) & 0x1));
849 incby = 1;
850 }
851 rdtp->dynticks_nmi_nesting += incby;
852 barrier();
779} 853}
780 854
781/** 855/**
782 * rcu_nmi_exit - inform RCU of exit from NMI context 856 * rcu_nmi_exit - inform RCU of exit from NMI context
783 * 857 *
784 * If the CPU was idle with dynamic ticks active, and there is no 858 * If we are returning from the outermost NMI handler that interrupted an
785 * irq handler running, this updates rdtp->dynticks_nmi to let the 859 * RCU-idle period, update rdtp->dynticks and rdtp->dynticks_nmi_nesting
786 * RCU grace-period handling know that the CPU is no longer active. 860 * to let the RCU grace-period handling know that the CPU is back to
861 * being RCU-idle.
787 */ 862 */
788void rcu_nmi_exit(void) 863void rcu_nmi_exit(void)
789{ 864{
790 struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks); 865 struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks);
791 866
792 if (rdtp->dynticks_nmi_nesting == 0 || 867 /*
793 --rdtp->dynticks_nmi_nesting != 0) 868 * Check for ->dynticks_nmi_nesting underflow and bad ->dynticks.
869 * (We are exiting an NMI handler, so RCU better be paying attention
870 * to us!)
871 */
872 WARN_ON_ONCE(rdtp->dynticks_nmi_nesting <= 0);
873 WARN_ON_ONCE(!(atomic_read(&rdtp->dynticks) & 0x1));
874
875 /*
876 * If the nesting level is not 1, the CPU wasn't RCU-idle, so
877 * leave it in non-RCU-idle state.
878 */
879 if (rdtp->dynticks_nmi_nesting != 1) {
880 rdtp->dynticks_nmi_nesting -= 2;
794 return; 881 return;
882 }
883
884 /* This NMI interrupted an RCU-idle CPU, restore RCU-idleness. */
885 rdtp->dynticks_nmi_nesting = 0;
795 /* CPUs seeing atomic_inc() must see prior RCU read-side crit sects */ 886 /* CPUs seeing atomic_inc() must see prior RCU read-side crit sects */
796 smp_mb__before_atomic(); /* See above. */ 887 smp_mb__before_atomic(); /* See above. */
797 atomic_inc(&rdtp->dynticks); 888 atomic_inc(&rdtp->dynticks);
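The rewritten rcu_nmi_enter()/rcu_nmi_exit() above encode "this NMI interrupted an RCU-idle period" by bumping ->dynticks_nmi_nesting by one in that case and by two otherwise, so a nesting value of exactly one identifies the outermost such handler at exit time. A small user-space model of just that counter arithmetic; the atomics, memory barriers and ->dynticks manipulation are deliberately omitted:

#include <stdio.h>
#include <stdbool.h>

static int nmi_nesting;        /* model of rdtp->dynticks_nmi_nesting */
static bool cpu_nonidle;       /* model of "->dynticks is odd"        */

static void nmi_enter(void)
{
	int incby = 2;

	if (!cpu_nonidle) {    /* interrupted an RCU-idle period       */
		cpu_nonidle = true;
		incby = 1;     /* mark: outermost handler over idle    */
	}
	nmi_nesting += incby;
}

static void nmi_exit(void)
{
	if (nmi_nesting != 1) {        /* nested, or idle was never entered */
		nmi_nesting -= 2;
		return;
	}
	nmi_nesting = 0;               /* outermost NMI over idle:          */
	cpu_nonidle = false;           /* restore RCU-idleness              */
}

int main(void)
{
	nmi_enter();                   /* outermost NMI over idle -> 1 */
	nmi_enter();                   /* nested NMI              -> 3 */
	printf("nesting after two enters: %d\n", nmi_nesting);
	nmi_exit();                    /* back to 1                    */
	nmi_exit();                    /* back to 0, idle again        */
	printf("nesting after two exits:  %d, idle=%d\n",
	       nmi_nesting, !cpu_nonidle);
	return 0;
}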
@@ -898,17 +989,14 @@ static int dyntick_save_progress_counter(struct rcu_data *rdp,
898 trace_rcu_fqs(rdp->rsp->name, rdp->gpnum, rdp->cpu, TPS("dti")); 989 trace_rcu_fqs(rdp->rsp->name, rdp->gpnum, rdp->cpu, TPS("dti"));
899 return 1; 990 return 1;
900 } else { 991 } else {
992 if (ULONG_CMP_LT(ACCESS_ONCE(rdp->gpnum) + ULONG_MAX / 4,
993 rdp->mynode->gpnum))
994 ACCESS_ONCE(rdp->gpwrap) = true;
901 return 0; 995 return 0;
902 } 996 }
903} 997}
904 998
905/* 999/*
906 * This function really isn't for public consumption, but RCU is special in
907 * that context switches can allow the state machine to make progress.
908 */
909extern void resched_cpu(int cpu);
910
911/*
912 * Return true if the specified CPU has passed through a quiescent 1000 * Return true if the specified CPU has passed through a quiescent
913 * state by virtue of being in or having passed through a dynticks 1001 * state by virtue of being in or having passed through a dynticks
914 * idle state since the last call to dyntick_save_progress_counter() 1002 * idle state since the last call to dyntick_save_progress_counter()
@@ -1011,6 +1099,22 @@ static void record_gp_stall_check_time(struct rcu_state *rsp)
1011 j1 = rcu_jiffies_till_stall_check(); 1099 j1 = rcu_jiffies_till_stall_check();
1012 ACCESS_ONCE(rsp->jiffies_stall) = j + j1; 1100 ACCESS_ONCE(rsp->jiffies_stall) = j + j1;
1013 rsp->jiffies_resched = j + j1 / 2; 1101 rsp->jiffies_resched = j + j1 / 2;
1102 rsp->n_force_qs_gpstart = ACCESS_ONCE(rsp->n_force_qs);
1103}
1104
1105/*
1106 * Complain about starvation of grace-period kthread.
1107 */
1108static void rcu_check_gp_kthread_starvation(struct rcu_state *rsp)
1109{
1110 unsigned long gpa;
1111 unsigned long j;
1112
1113 j = jiffies;
1114 gpa = ACCESS_ONCE(rsp->gp_activity);
1115 if (j - gpa > 2 * HZ)
1116 pr_err("%s kthread starved for %ld jiffies!\n",
1117 rsp->name, j - gpa);
1014} 1118}
1015 1119
1016/* 1120/*
@@ -1033,11 +1137,13 @@ static void rcu_dump_cpu_stacks(struct rcu_state *rsp)
1033 } 1137 }
1034} 1138}
1035 1139
1036static void print_other_cpu_stall(struct rcu_state *rsp) 1140static void print_other_cpu_stall(struct rcu_state *rsp, unsigned long gpnum)
1037{ 1141{
1038 int cpu; 1142 int cpu;
1039 long delta; 1143 long delta;
1040 unsigned long flags; 1144 unsigned long flags;
1145 unsigned long gpa;
1146 unsigned long j;
1041 int ndetected = 0; 1147 int ndetected = 0;
1042 struct rcu_node *rnp = rcu_get_root(rsp); 1148 struct rcu_node *rnp = rcu_get_root(rsp);
1043 long totqlen = 0; 1149 long totqlen = 0;
@@ -1075,30 +1181,34 @@ static void print_other_cpu_stall(struct rcu_state *rsp)
1075 raw_spin_unlock_irqrestore(&rnp->lock, flags); 1181 raw_spin_unlock_irqrestore(&rnp->lock, flags);
1076 } 1182 }
1077 1183
1078 /*
1079 * Now rat on any tasks that got kicked up to the root rcu_node
1080 * due to CPU offlining.
1081 */
1082 rnp = rcu_get_root(rsp);
1083 raw_spin_lock_irqsave(&rnp->lock, flags);
1084 ndetected += rcu_print_task_stall(rnp);
1085 raw_spin_unlock_irqrestore(&rnp->lock, flags);
1086
1087 print_cpu_stall_info_end(); 1184 print_cpu_stall_info_end();
1088 for_each_possible_cpu(cpu) 1185 for_each_possible_cpu(cpu)
1089 totqlen += per_cpu_ptr(rsp->rda, cpu)->qlen; 1186 totqlen += per_cpu_ptr(rsp->rda, cpu)->qlen;
1090 pr_cont("(detected by %d, t=%ld jiffies, g=%ld, c=%ld, q=%lu)\n", 1187 pr_cont("(detected by %d, t=%ld jiffies, g=%ld, c=%ld, q=%lu)\n",
1091 smp_processor_id(), (long)(jiffies - rsp->gp_start), 1188 smp_processor_id(), (long)(jiffies - rsp->gp_start),
1092 (long)rsp->gpnum, (long)rsp->completed, totqlen); 1189 (long)rsp->gpnum, (long)rsp->completed, totqlen);
1093 if (ndetected == 0) 1190 if (ndetected) {
1094 pr_err("INFO: Stall ended before state dump start\n");
1095 else
1096 rcu_dump_cpu_stacks(rsp); 1191 rcu_dump_cpu_stacks(rsp);
1192 } else {
1193 if (ACCESS_ONCE(rsp->gpnum) != gpnum ||
1194 ACCESS_ONCE(rsp->completed) == gpnum) {
1195 pr_err("INFO: Stall ended before state dump start\n");
1196 } else {
1197 j = jiffies;
1198 gpa = ACCESS_ONCE(rsp->gp_activity);
1199 pr_err("All QSes seen, last %s kthread activity %ld (%ld-%ld), jiffies_till_next_fqs=%ld\n",
1200 rsp->name, j - gpa, j, gpa,
1201 jiffies_till_next_fqs);
1202 /* In this case, the current CPU might be at fault. */
1203 sched_show_task(current);
1204 }
1205 }
1097 1206
1098 /* Complain about tasks blocking the grace period. */ 1207 /* Complain about tasks blocking the grace period. */
1099
1100 rcu_print_detail_task_stall(rsp); 1208 rcu_print_detail_task_stall(rsp);
1101 1209
1210 rcu_check_gp_kthread_starvation(rsp);
1211
1102 force_quiescent_state(rsp); /* Kick them all. */ 1212 force_quiescent_state(rsp); /* Kick them all. */
1103} 1213}
1104 1214
@@ -1123,6 +1233,9 @@ static void print_cpu_stall(struct rcu_state *rsp)
1123 pr_cont(" (t=%lu jiffies g=%ld c=%ld q=%lu)\n", 1233 pr_cont(" (t=%lu jiffies g=%ld c=%ld q=%lu)\n",
1124 jiffies - rsp->gp_start, 1234 jiffies - rsp->gp_start,
1125 (long)rsp->gpnum, (long)rsp->completed, totqlen); 1235 (long)rsp->gpnum, (long)rsp->completed, totqlen);
1236
1237 rcu_check_gp_kthread_starvation(rsp);
1238
1126 rcu_dump_cpu_stacks(rsp); 1239 rcu_dump_cpu_stacks(rsp);
1127 1240
1128 raw_spin_lock_irqsave(&rnp->lock, flags); 1241 raw_spin_lock_irqsave(&rnp->lock, flags);
@@ -1193,7 +1306,7 @@ static void check_cpu_stall(struct rcu_state *rsp, struct rcu_data *rdp)
1193 ULONG_CMP_GE(j, js + RCU_STALL_RAT_DELAY)) { 1306 ULONG_CMP_GE(j, js + RCU_STALL_RAT_DELAY)) {
1194 1307
1195 /* They had a few time units to dump stack, so complain. */ 1308 /* They had a few time units to dump stack, so complain. */
1196 print_other_cpu_stall(rsp); 1309 print_other_cpu_stall(rsp, gpnum);
1197 } 1310 }
1198} 1311}
1199 1312
@@ -1530,7 +1643,8 @@ static bool __note_gp_changes(struct rcu_state *rsp, struct rcu_node *rnp,
1530 bool ret; 1643 bool ret;
1531 1644
1532 /* Handle the ends of any preceding grace periods first. */ 1645 /* Handle the ends of any preceding grace periods first. */
1533 if (rdp->completed == rnp->completed) { 1646 if (rdp->completed == rnp->completed &&
1647 !unlikely(ACCESS_ONCE(rdp->gpwrap))) {
1534 1648
1535 /* No grace period end, so just accelerate recent callbacks. */ 1649 /* No grace period end, so just accelerate recent callbacks. */
1536 ret = rcu_accelerate_cbs(rsp, rnp, rdp); 1650 ret = rcu_accelerate_cbs(rsp, rnp, rdp);
@@ -1545,7 +1659,7 @@ static bool __note_gp_changes(struct rcu_state *rsp, struct rcu_node *rnp,
1545 trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("cpuend")); 1659 trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("cpuend"));
1546 } 1660 }
1547 1661
1548 if (rdp->gpnum != rnp->gpnum) { 1662 if (rdp->gpnum != rnp->gpnum || unlikely(ACCESS_ONCE(rdp->gpwrap))) {
1549 /* 1663 /*
1550 * If the current grace period is waiting for this CPU, 1664 * If the current grace period is waiting for this CPU,
1551 * set up to detect a quiescent state, otherwise don't 1665 * set up to detect a quiescent state, otherwise don't
@@ -1554,8 +1668,10 @@ static bool __note_gp_changes(struct rcu_state *rsp, struct rcu_node *rnp,
1554 rdp->gpnum = rnp->gpnum; 1668 rdp->gpnum = rnp->gpnum;
1555 trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("cpustart")); 1669 trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("cpustart"));
1556 rdp->passed_quiesce = 0; 1670 rdp->passed_quiesce = 0;
1671 rdp->rcu_qs_ctr_snap = __this_cpu_read(rcu_qs_ctr);
1557 rdp->qs_pending = !!(rnp->qsmask & rdp->grpmask); 1672 rdp->qs_pending = !!(rnp->qsmask & rdp->grpmask);
1558 zero_cpu_stall_ticks(rdp); 1673 zero_cpu_stall_ticks(rdp);
1674 ACCESS_ONCE(rdp->gpwrap) = false;
1559 } 1675 }
1560 return ret; 1676 return ret;
1561} 1677}
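The dyntick_save_progress_counter() change earlier in this file sets ->gpwrap once a CPU's recorded ->gpnum has fallen more than a quarter of the unsigned-long space behind its rcu_node's ->gpnum, and the __note_gp_changes() hunks above then force a resynchronization and clear the flag. A toy model of that quarter-range staleness test; the function name and the idle-CPU scenario are illustrative, and WRAP_LT() is written in the spirit of the kernel's ULONG_CMP_LT():

#include <stdio.h>
#include <limits.h>
#include <stdbool.h>

/* Wrap-safe "a < b" for free-running counters (assumed form). */
#define WRAP_LT(a, b)  (ULONG_MAX / 2 < (a) - (b))

/* Has this CPU slept through so many grace periods that its ->gpnum
 * snapshot is more than a quarter of the counter space behind the
 * rcu_node's value and can no longer be trusted? */
static bool gpnum_wrapped(unsigned long cpu_gpnum, unsigned long node_gpnum)
{
	return WRAP_LT(cpu_gpnum + ULONG_MAX / 4, node_gpnum);
}

int main(void)
{
	unsigned long node = 0;

	node += ULONG_MAX / 4 + 2;     /* node races far ahead of an idle CPU */
	printf("fresh CPU:     gpwrap=%d\n", gpnum_wrapped(node - 1, node));
	printf("long-idle CPU: gpwrap=%d\n", gpnum_wrapped(0, node));
	return 0;
}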
@@ -1569,7 +1685,8 @@ static void note_gp_changes(struct rcu_state *rsp, struct rcu_data *rdp)
1569 local_irq_save(flags); 1685 local_irq_save(flags);
1570 rnp = rdp->mynode; 1686 rnp = rdp->mynode;
1571 if ((rdp->gpnum == ACCESS_ONCE(rnp->gpnum) && 1687 if ((rdp->gpnum == ACCESS_ONCE(rnp->gpnum) &&
1572 rdp->completed == ACCESS_ONCE(rnp->completed)) || /* w/out lock. */ 1688 rdp->completed == ACCESS_ONCE(rnp->completed) &&
1689 !unlikely(ACCESS_ONCE(rdp->gpwrap))) || /* w/out lock. */
1573 !raw_spin_trylock(&rnp->lock)) { /* irqs already off, so later. */ 1690 !raw_spin_trylock(&rnp->lock)) { /* irqs already off, so later. */
1574 local_irq_restore(flags); 1691 local_irq_restore(flags);
1575 return; 1692 return;
@@ -1589,6 +1706,7 @@ static int rcu_gp_init(struct rcu_state *rsp)
1589 struct rcu_data *rdp; 1706 struct rcu_data *rdp;
1590 struct rcu_node *rnp = rcu_get_root(rsp); 1707 struct rcu_node *rnp = rcu_get_root(rsp);
1591 1708
1709 ACCESS_ONCE(rsp->gp_activity) = jiffies;
1592 rcu_bind_gp_kthread(); 1710 rcu_bind_gp_kthread();
1593 raw_spin_lock_irq(&rnp->lock); 1711 raw_spin_lock_irq(&rnp->lock);
1594 smp_mb__after_unlock_lock(); 1712 smp_mb__after_unlock_lock();
@@ -1649,6 +1767,7 @@ static int rcu_gp_init(struct rcu_state *rsp)
1649 rnp->grphi, rnp->qsmask); 1767 rnp->grphi, rnp->qsmask);
1650 raw_spin_unlock_irq(&rnp->lock); 1768 raw_spin_unlock_irq(&rnp->lock);
1651 cond_resched_rcu_qs(); 1769 cond_resched_rcu_qs();
1770 ACCESS_ONCE(rsp->gp_activity) = jiffies;
1652 } 1771 }
1653 1772
1654 mutex_unlock(&rsp->onoff_mutex); 1773 mutex_unlock(&rsp->onoff_mutex);
@@ -1665,6 +1784,7 @@ static int rcu_gp_fqs(struct rcu_state *rsp, int fqs_state_in)
1665 unsigned long maxj; 1784 unsigned long maxj;
1666 struct rcu_node *rnp = rcu_get_root(rsp); 1785 struct rcu_node *rnp = rcu_get_root(rsp);
1667 1786
1787 ACCESS_ONCE(rsp->gp_activity) = jiffies;
1668 rsp->n_force_qs++; 1788 rsp->n_force_qs++;
1669 if (fqs_state == RCU_SAVE_DYNTICK) { 1789 if (fqs_state == RCU_SAVE_DYNTICK) {
1670 /* Collect dyntick-idle snapshots. */ 1790 /* Collect dyntick-idle snapshots. */
@@ -1703,6 +1823,7 @@ static void rcu_gp_cleanup(struct rcu_state *rsp)
1703 struct rcu_data *rdp; 1823 struct rcu_data *rdp;
1704 struct rcu_node *rnp = rcu_get_root(rsp); 1824 struct rcu_node *rnp = rcu_get_root(rsp);
1705 1825
1826 ACCESS_ONCE(rsp->gp_activity) = jiffies;
1706 raw_spin_lock_irq(&rnp->lock); 1827 raw_spin_lock_irq(&rnp->lock);
1707 smp_mb__after_unlock_lock(); 1828 smp_mb__after_unlock_lock();
1708 gp_duration = jiffies - rsp->gp_start; 1829 gp_duration = jiffies - rsp->gp_start;
@@ -1739,6 +1860,7 @@ static void rcu_gp_cleanup(struct rcu_state *rsp)
1739 nocb += rcu_future_gp_cleanup(rsp, rnp); 1860 nocb += rcu_future_gp_cleanup(rsp, rnp);
1740 raw_spin_unlock_irq(&rnp->lock); 1861 raw_spin_unlock_irq(&rnp->lock);
1741 cond_resched_rcu_qs(); 1862 cond_resched_rcu_qs();
1863 ACCESS_ONCE(rsp->gp_activity) = jiffies;
1742 } 1864 }
1743 rnp = rcu_get_root(rsp); 1865 rnp = rcu_get_root(rsp);
1744 raw_spin_lock_irq(&rnp->lock); 1866 raw_spin_lock_irq(&rnp->lock);
@@ -1788,6 +1910,7 @@ static int __noreturn rcu_gp_kthread(void *arg)
1788 if (rcu_gp_init(rsp)) 1910 if (rcu_gp_init(rsp))
1789 break; 1911 break;
1790 cond_resched_rcu_qs(); 1912 cond_resched_rcu_qs();
1913 ACCESS_ONCE(rsp->gp_activity) = jiffies;
1791 WARN_ON(signal_pending(current)); 1914 WARN_ON(signal_pending(current));
1792 trace_rcu_grace_period(rsp->name, 1915 trace_rcu_grace_period(rsp->name,
1793 ACCESS_ONCE(rsp->gpnum), 1916 ACCESS_ONCE(rsp->gpnum),
@@ -1831,9 +1954,11 @@ static int __noreturn rcu_gp_kthread(void *arg)
1831 ACCESS_ONCE(rsp->gpnum), 1954 ACCESS_ONCE(rsp->gpnum),
1832 TPS("fqsend")); 1955 TPS("fqsend"));
1833 cond_resched_rcu_qs(); 1956 cond_resched_rcu_qs();
1957 ACCESS_ONCE(rsp->gp_activity) = jiffies;
1834 } else { 1958 } else {
1835 /* Deal with stray signal. */ 1959 /* Deal with stray signal. */
1836 cond_resched_rcu_qs(); 1960 cond_resched_rcu_qs();
1961 ACCESS_ONCE(rsp->gp_activity) = jiffies;
1837 WARN_ON(signal_pending(current)); 1962 WARN_ON(signal_pending(current));
1838 trace_rcu_grace_period(rsp->name, 1963 trace_rcu_grace_period(rsp->name,
1839 ACCESS_ONCE(rsp->gpnum), 1964 ACCESS_ONCE(rsp->gpnum),
@@ -2010,8 +2135,10 @@ rcu_report_qs_rdp(int cpu, struct rcu_state *rsp, struct rcu_data *rdp)
2010 rnp = rdp->mynode; 2135 rnp = rdp->mynode;
2011 raw_spin_lock_irqsave(&rnp->lock, flags); 2136 raw_spin_lock_irqsave(&rnp->lock, flags);
2012 smp_mb__after_unlock_lock(); 2137 smp_mb__after_unlock_lock();
2013 if (rdp->passed_quiesce == 0 || rdp->gpnum != rnp->gpnum || 2138 if ((rdp->passed_quiesce == 0 &&
2014 rnp->completed == rnp->gpnum) { 2139 rdp->rcu_qs_ctr_snap == __this_cpu_read(rcu_qs_ctr)) ||
2140 rdp->gpnum != rnp->gpnum || rnp->completed == rnp->gpnum ||
2141 rdp->gpwrap) {
2015 2142
2016 /* 2143 /*
2017 * The grace period in which this quiescent state was 2144 * The grace period in which this quiescent state was
@@ -2020,6 +2147,7 @@ rcu_report_qs_rdp(int cpu, struct rcu_state *rsp, struct rcu_data *rdp)
2020 * within the current grace period. 2147 * within the current grace period.
2021 */ 2148 */
2022 rdp->passed_quiesce = 0; /* need qs for new gp. */ 2149 rdp->passed_quiesce = 0; /* need qs for new gp. */
2150 rdp->rcu_qs_ctr_snap = __this_cpu_read(rcu_qs_ctr);
2023 raw_spin_unlock_irqrestore(&rnp->lock, flags); 2151 raw_spin_unlock_irqrestore(&rnp->lock, flags);
2024 return; 2152 return;
2025 } 2153 }
@@ -2064,7 +2192,8 @@ rcu_check_quiescent_state(struct rcu_state *rsp, struct rcu_data *rdp)
2064 * Was there a quiescent state since the beginning of the grace 2192 * Was there a quiescent state since the beginning of the grace
2065 * period? If no, then exit and wait for the next call. 2193 * period? If no, then exit and wait for the next call.
2066 */ 2194 */
2067 if (!rdp->passed_quiesce) 2195 if (!rdp->passed_quiesce &&
2196 rdp->rcu_qs_ctr_snap == __this_cpu_read(rcu_qs_ctr))
2068 return; 2197 return;
2069 2198
2070 /* 2199 /*
@@ -2195,6 +2324,46 @@ static void rcu_cleanup_dying_cpu(struct rcu_state *rsp)
2195} 2324}
2196 2325
2197/* 2326/*
2327 * All CPUs for the specified rcu_node structure have gone offline,
2328 * and all tasks that were preempted within an RCU read-side critical
2329 * section while running on one of those CPUs have since exited their RCU
2330 * read-side critical section. Some other CPU is reporting this fact with
2331 * the specified rcu_node structure's ->lock held and interrupts disabled.
2332 * This function therefore goes up the tree of rcu_node structures,
2333 * clearing the corresponding bits in the ->qsmaskinit fields. Note that
2334 * the leaf rcu_node structure's ->qsmaskinit field has already been
 2335 * updated.
2336 *
2337 * This function does check that the specified rcu_node structure has
2338 * all CPUs offline and no blocked tasks, so it is OK to invoke it
2339 * prematurely. That said, invoking it after the fact will cost you
2340 * a needless lock acquisition. So once it has done its work, don't
2341 * invoke it again.
2342 */
2343static void rcu_cleanup_dead_rnp(struct rcu_node *rnp_leaf)
2344{
2345 long mask;
2346 struct rcu_node *rnp = rnp_leaf;
2347
2348 if (rnp->qsmaskinit || rcu_preempt_has_tasks(rnp))
2349 return;
2350 for (;;) {
2351 mask = rnp->grpmask;
2352 rnp = rnp->parent;
2353 if (!rnp)
2354 break;
2355 raw_spin_lock(&rnp->lock); /* irqs already disabled. */
2356 smp_mb__after_unlock_lock(); /* GP memory ordering. */
2357 rnp->qsmaskinit &= ~mask;
2358 if (rnp->qsmaskinit) {
2359 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
2360 return;
2361 }
2362 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
2363 }
2364}
2365
2366/*
2198 * The CPU has been completely removed, and some other CPU is reporting 2367 * The CPU has been completely removed, and some other CPU is reporting
2199 * this fact from process context. Do the remainder of the cleanup, 2368 * this fact from process context. Do the remainder of the cleanup,
2200 * including orphaning the outgoing CPU's RCU callbacks, and also 2369 * including orphaning the outgoing CPU's RCU callbacks, and also
@@ -2204,8 +2373,6 @@ static void rcu_cleanup_dying_cpu(struct rcu_state *rsp)
2204static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp) 2373static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp)
2205{ 2374{
2206 unsigned long flags; 2375 unsigned long flags;
2207 unsigned long mask;
2208 int need_report = 0;
2209 struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); 2376 struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu);
2210 struct rcu_node *rnp = rdp->mynode; /* Outgoing CPU's rdp & rnp. */ 2377 struct rcu_node *rnp = rdp->mynode; /* Outgoing CPU's rdp & rnp. */
2211 2378
@@ -2219,40 +2386,15 @@ static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp)
2219 /* Orphan the dead CPU's callbacks, and adopt them if appropriate. */ 2386 /* Orphan the dead CPU's callbacks, and adopt them if appropriate. */
2220 rcu_send_cbs_to_orphanage(cpu, rsp, rnp, rdp); 2387 rcu_send_cbs_to_orphanage(cpu, rsp, rnp, rdp);
2221 rcu_adopt_orphan_cbs(rsp, flags); 2388 rcu_adopt_orphan_cbs(rsp, flags);
2389 raw_spin_unlock_irqrestore(&rsp->orphan_lock, flags);
2222 2390
2223 /* Remove the outgoing CPU from the masks in the rcu_node hierarchy. */ 2391 /* Remove outgoing CPU from mask in the leaf rcu_node structure. */
2224 mask = rdp->grpmask; /* rnp->grplo is constant. */ 2392 raw_spin_lock_irqsave(&rnp->lock, flags);
2225 do { 2393 smp_mb__after_unlock_lock(); /* Enforce GP memory-order guarantee. */
2226 raw_spin_lock(&rnp->lock); /* irqs already disabled. */ 2394 rnp->qsmaskinit &= ~rdp->grpmask;
2227 smp_mb__after_unlock_lock(); 2395 if (rnp->qsmaskinit == 0 && !rcu_preempt_has_tasks(rnp))
2228 rnp->qsmaskinit &= ~mask; 2396 rcu_cleanup_dead_rnp(rnp);
2229 if (rnp->qsmaskinit != 0) { 2397 rcu_report_qs_rnp(rdp->grpmask, rsp, rnp, flags); /* Rlses rnp->lock. */
2230 if (rnp != rdp->mynode)
2231 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
2232 break;
2233 }
2234 if (rnp == rdp->mynode)
2235 need_report = rcu_preempt_offline_tasks(rsp, rnp, rdp);
2236 else
2237 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
2238 mask = rnp->grpmask;
2239 rnp = rnp->parent;
2240 } while (rnp != NULL);
2241
2242 /*
2243 * We still hold the leaf rcu_node structure lock here, and
2244 * irqs are still disabled. The reason for this subterfuge is
2245 * because invoking rcu_report_unblock_qs_rnp() with ->orphan_lock
2246 * held leads to deadlock.
2247 */
2248 raw_spin_unlock(&rsp->orphan_lock); /* irqs remain disabled. */
2249 rnp = rdp->mynode;
2250 if (need_report & RCU_OFL_TASKS_NORM_GP)
2251 rcu_report_unblock_qs_rnp(rnp, flags);
2252 else
2253 raw_spin_unlock_irqrestore(&rnp->lock, flags);
2254 if (need_report & RCU_OFL_TASKS_EXP_GP)
2255 rcu_report_exp_rnp(rsp, rnp, true);
2256 WARN_ONCE(rdp->qlen != 0 || rdp->nxtlist != NULL, 2398 WARN_ONCE(rdp->qlen != 0 || rdp->nxtlist != NULL,
2257 "rcu_cleanup_dead_cpu: Callbacks on offline CPU %d: qlen=%lu, nxtlist=%p\n", 2399 "rcu_cleanup_dead_cpu: Callbacks on offline CPU %d: qlen=%lu, nxtlist=%p\n",
2258 cpu, rdp->qlen, rdp->nxtlist); 2400 cpu, rdp->qlen, rdp->nxtlist);
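rcu_cleanup_dead_rnp(), added earlier in this file, replaces the open-coded loop that rcu_cleanup_dead_cpu() used to run: once a leaf rcu_node has no online CPUs and no blocked tasks, its bit is cleared from each ancestor's ->qsmaskinit, stopping at the first ancestor that still has other non-empty children. A simplified, lock-free model of that upward propagation over a hypothetical two-level tree (struct node here is not the kernel's rcu_node):

#include <stdio.h>

struct node {
	unsigned long qsmaskinit;   /* one bit per child still "online" */
	unsigned long grpmask;      /* this node's bit in its parent    */
	struct node *parent;
};

/* Clear this (already empty) leaf's bit upward until an ancestor still
 * has other non-empty children; mirrors the shape of rcu_cleanup_dead_rnp(). */
static void cleanup_dead_node(struct node *leaf)
{
	struct node *np = leaf;

	if (np->qsmaskinit)          /* leaf not actually empty: nothing to do */
		return;
	for (;;) {
		unsigned long mask = np->grpmask;

		np = np->parent;
		if (!np)
			break;
		np->qsmaskinit &= ~mask;
		if (np->qsmaskinit)  /* other children remain: stop here */
			return;
	}
}

int main(void)
{
	struct node root  = { .qsmaskinit = 0x3 };            /* two children */
	struct node leaf0 = { .qsmaskinit = 0x0, .grpmask = 0x1, .parent = &root };
	struct node leaf1 = { .qsmaskinit = 0x1, .grpmask = 0x2, .parent = &root };

	cleanup_dead_node(&leaf0);   /* leaf0 empty: clears bit 0x1 in root */
	cleanup_dead_node(&leaf1);   /* leaf1 still has a CPU: no effect    */
	printf("root qsmaskinit = 0x%lx\n", root.qsmaskinit);
	return 0;
}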
@@ -2268,6 +2410,10 @@ static void rcu_cleanup_dying_cpu(struct rcu_state *rsp)
2268{ 2410{
2269} 2411}
2270 2412
2413static void __maybe_unused rcu_cleanup_dead_rnp(struct rcu_node *rnp_leaf)
2414{
2415}
2416
2271static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp) 2417static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp)
2272{ 2418{
2273} 2419}
@@ -2464,12 +2610,6 @@ static void force_qs_rnp(struct rcu_state *rsp,
2464 } 2610 }
2465 raw_spin_unlock_irqrestore(&rnp->lock, flags); 2611 raw_spin_unlock_irqrestore(&rnp->lock, flags);
2466 } 2612 }
2467 rnp = rcu_get_root(rsp);
2468 if (rnp->qsmask == 0) {
2469 raw_spin_lock_irqsave(&rnp->lock, flags);
2470 smp_mb__after_unlock_lock();
2471 rcu_initiate_boost(rnp, flags); /* releases rnp->lock. */
2472 }
2473} 2613}
2474 2614
2475/* 2615/*
@@ -2569,7 +2709,7 @@ static void rcu_process_callbacks(struct softirq_action *unused)
2569 * Schedule RCU callback invocation. If the specified type of RCU 2709 * Schedule RCU callback invocation. If the specified type of RCU
2570 * does not support RCU priority boosting, just do a direct call, 2710 * does not support RCU priority boosting, just do a direct call,
2571 * otherwise wake up the per-CPU kernel kthread. Note that because we 2711 * otherwise wake up the per-CPU kernel kthread. Note that because we
2572 * are running on the current CPU with interrupts disabled, the 2712 * are running on the current CPU with softirqs disabled, the
2573 * rcu_cpu_kthread_task cannot disappear out from under us. 2713 * rcu_cpu_kthread_task cannot disappear out from under us.
2574 */ 2714 */
2575static void invoke_rcu_callbacks(struct rcu_state *rsp, struct rcu_data *rdp) 2715static void invoke_rcu_callbacks(struct rcu_state *rsp, struct rcu_data *rdp)
@@ -3109,9 +3249,12 @@ static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp)
3109 3249
3110 /* Is the RCU core waiting for a quiescent state from this CPU? */ 3250 /* Is the RCU core waiting for a quiescent state from this CPU? */
3111 if (rcu_scheduler_fully_active && 3251 if (rcu_scheduler_fully_active &&
3112 rdp->qs_pending && !rdp->passed_quiesce) { 3252 rdp->qs_pending && !rdp->passed_quiesce &&
3253 rdp->rcu_qs_ctr_snap == __this_cpu_read(rcu_qs_ctr)) {
3113 rdp->n_rp_qs_pending++; 3254 rdp->n_rp_qs_pending++;
3114 } else if (rdp->qs_pending && rdp->passed_quiesce) { 3255 } else if (rdp->qs_pending &&
3256 (rdp->passed_quiesce ||
3257 rdp->rcu_qs_ctr_snap != __this_cpu_read(rcu_qs_ctr))) {
3115 rdp->n_rp_report_qs++; 3258 rdp->n_rp_report_qs++;
3116 return 1; 3259 return 1;
3117 } 3260 }
@@ -3135,7 +3278,8 @@ static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp)
3135 } 3278 }
3136 3279
3137 /* Has a new RCU grace period started? */ 3280 /* Has a new RCU grace period started? */
3138 if (ACCESS_ONCE(rnp->gpnum) != rdp->gpnum) { /* outside lock */ 3281 if (ACCESS_ONCE(rnp->gpnum) != rdp->gpnum ||
3282 unlikely(ACCESS_ONCE(rdp->gpwrap))) { /* outside lock */
3139 rdp->n_rp_gp_started++; 3283 rdp->n_rp_gp_started++;
3140 return 1; 3284 return 1;
3141 } 3285 }
@@ -3318,6 +3462,7 @@ static void _rcu_barrier(struct rcu_state *rsp)
3318 } else { 3462 } else {
3319 _rcu_barrier_trace(rsp, "OnlineNoCB", cpu, 3463 _rcu_barrier_trace(rsp, "OnlineNoCB", cpu,
3320 rsp->n_barrier_done); 3464 rsp->n_barrier_done);
3465 smp_mb__before_atomic();
3321 atomic_inc(&rsp->barrier_cpu_count); 3466 atomic_inc(&rsp->barrier_cpu_count);
3322 __call_rcu(&rdp->barrier_head, 3467 __call_rcu(&rdp->barrier_head,
3323 rcu_barrier_callback, rsp, cpu, 0); 3468 rcu_barrier_callback, rsp, cpu, 0);
@@ -3385,9 +3530,6 @@ rcu_boot_init_percpu_data(int cpu, struct rcu_state *rsp)
3385 /* Set up local state, ensuring consistent view of global state. */ 3530 /* Set up local state, ensuring consistent view of global state. */
3386 raw_spin_lock_irqsave(&rnp->lock, flags); 3531 raw_spin_lock_irqsave(&rnp->lock, flags);
3387 rdp->grpmask = 1UL << (cpu - rdp->mynode->grplo); 3532 rdp->grpmask = 1UL << (cpu - rdp->mynode->grplo);
3388 init_callback_list(rdp);
3389 rdp->qlen_lazy = 0;
3390 ACCESS_ONCE(rdp->qlen) = 0;
3391 rdp->dynticks = &per_cpu(rcu_dynticks, cpu); 3533 rdp->dynticks = &per_cpu(rcu_dynticks, cpu);
3392 WARN_ON_ONCE(rdp->dynticks->dynticks_nesting != DYNTICK_TASK_EXIT_IDLE); 3534 WARN_ON_ONCE(rdp->dynticks->dynticks_nesting != DYNTICK_TASK_EXIT_IDLE);
3393 WARN_ON_ONCE(atomic_read(&rdp->dynticks->dynticks) != 1); 3535 WARN_ON_ONCE(atomic_read(&rdp->dynticks->dynticks) != 1);
@@ -3444,6 +3586,7 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp)
3444 rdp->gpnum = rnp->completed; 3586 rdp->gpnum = rnp->completed;
3445 rdp->completed = rnp->completed; 3587 rdp->completed = rnp->completed;
3446 rdp->passed_quiesce = 0; 3588 rdp->passed_quiesce = 0;
3589 rdp->rcu_qs_ctr_snap = __this_cpu_read(rcu_qs_ctr);
3447 rdp->qs_pending = 0; 3590 rdp->qs_pending = 0;
3448 trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("cpuonl")); 3591 trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("cpuonl"));
3449 } 3592 }
@@ -3535,17 +3678,35 @@ static int rcu_pm_notify(struct notifier_block *self,
3535static int __init rcu_spawn_gp_kthread(void) 3678static int __init rcu_spawn_gp_kthread(void)
3536{ 3679{
3537 unsigned long flags; 3680 unsigned long flags;
3681 int kthread_prio_in = kthread_prio;
3538 struct rcu_node *rnp; 3682 struct rcu_node *rnp;
3539 struct rcu_state *rsp; 3683 struct rcu_state *rsp;
3684 struct sched_param sp;
3540 struct task_struct *t; 3685 struct task_struct *t;
3541 3686
3687 /* Force priority into range. */
3688 if (IS_ENABLED(CONFIG_RCU_BOOST) && kthread_prio < 1)
3689 kthread_prio = 1;
3690 else if (kthread_prio < 0)
3691 kthread_prio = 0;
3692 else if (kthread_prio > 99)
3693 kthread_prio = 99;
3694 if (kthread_prio != kthread_prio_in)
3695 pr_alert("rcu_spawn_gp_kthread(): Limited prio to %d from %d\n",
3696 kthread_prio, kthread_prio_in);
3697
3542 rcu_scheduler_fully_active = 1; 3698 rcu_scheduler_fully_active = 1;
3543 for_each_rcu_flavor(rsp) { 3699 for_each_rcu_flavor(rsp) {
3544 t = kthread_run(rcu_gp_kthread, rsp, "%s", rsp->name); 3700 t = kthread_create(rcu_gp_kthread, rsp, "%s", rsp->name);
3545 BUG_ON(IS_ERR(t)); 3701 BUG_ON(IS_ERR(t));
3546 rnp = rcu_get_root(rsp); 3702 rnp = rcu_get_root(rsp);
3547 raw_spin_lock_irqsave(&rnp->lock, flags); 3703 raw_spin_lock_irqsave(&rnp->lock, flags);
3548 rsp->gp_kthread = t; 3704 rsp->gp_kthread = t;
3705 if (kthread_prio) {
3706 sp.sched_priority = kthread_prio;
3707 sched_setscheduler_nocheck(t, SCHED_FIFO, &sp);
3708 }
3709 wake_up_process(t);
3549 raw_spin_unlock_irqrestore(&rnp->lock, flags); 3710 raw_spin_unlock_irqrestore(&rnp->lock, flags);
3550 } 3711 }
3551 rcu_spawn_nocb_kthreads(); 3712 rcu_spawn_nocb_kthreads();
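rcu_spawn_gp_kthread() above now clamps the kthread_prio module parameter into a valid SCHED_FIFO range and switches from kthread_run() to kthread_create() plus an explicit wake_up_process(), which lets sched_setscheduler_nocheck() apply the priority before the grace-period kthread ever runs. A minimal sketch of just the clamping step; the bounds mirror the hunk above, everything else is a stand-alone model:

#include <stdio.h>
#include <stdbool.h>

/* Clamp a requested RT priority the way the hunk above does: at least 1
 * when priority boosting is configured, otherwise within [0, 99]. */
static int clamp_kthread_prio(int requested, bool rcu_boost)
{
	int prio = requested;

	if (rcu_boost && prio < 1)
		prio = 1;
	else if (prio < 0)
		prio = 0;
	else if (prio > 99)
		prio = 99;
	if (prio != requested)
		printf("limited prio to %d from %d\n", prio, requested);
	return prio;
}

int main(void)
{
	clamp_kthread_prio(-5, false);   /* -> 0                    */
	clamp_kthread_prio(150, true);   /* -> 99                   */
	clamp_kthread_prio(2, true);     /* unchanged, no message   */
	return 0;
}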
diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h
index 8e7b1843896e..119de399eb2f 100644
--- a/kernel/rcu/tree.h
+++ b/kernel/rcu/tree.h
@@ -27,7 +27,6 @@
27#include <linux/threads.h> 27#include <linux/threads.h>
28#include <linux/cpumask.h> 28#include <linux/cpumask.h>
29#include <linux/seqlock.h> 29#include <linux/seqlock.h>
30#include <linux/irq_work.h>
31 30
32/* 31/*
33 * Define shape of hierarchy based on NR_CPUS, CONFIG_RCU_FANOUT, and 32 * Define shape of hierarchy based on NR_CPUS, CONFIG_RCU_FANOUT, and
@@ -172,11 +171,6 @@ struct rcu_node {
172 /* queued on this rcu_node structure that */ 171 /* queued on this rcu_node structure that */
173 /* are blocking the current grace period, */ 172 /* are blocking the current grace period, */
174 /* there can be no such task. */ 173 /* there can be no such task. */
175 struct completion boost_completion;
176 /* Used to ensure that the rt_mutex used */
177 /* to carry out the boosting is fully */
178 /* released with no future boostee accesses */
179 /* before that rt_mutex is re-initialized. */
180 struct rt_mutex boost_mtx; 174 struct rt_mutex boost_mtx;
181 /* Used only for the priority-boosting */ 175 /* Used only for the priority-boosting */
182 /* side effect, not as a lock. */ 176 /* side effect, not as a lock. */
@@ -257,9 +251,12 @@ struct rcu_data {
257 /* in order to detect GP end. */ 251 /* in order to detect GP end. */
258 unsigned long gpnum; /* Highest gp number that this CPU */ 252 unsigned long gpnum; /* Highest gp number that this CPU */
259 /* is aware of having started. */ 253 /* is aware of having started. */
254 unsigned long rcu_qs_ctr_snap;/* Snapshot of rcu_qs_ctr to check */
255 /* for rcu_all_qs() invocations. */
260 bool passed_quiesce; /* User-mode/idle loop etc. */ 256 bool passed_quiesce; /* User-mode/idle loop etc. */
261 bool qs_pending; /* Core waits for quiesc state. */ 257 bool qs_pending; /* Core waits for quiesc state. */
262 bool beenonline; /* CPU online at least once. */ 258 bool beenonline; /* CPU online at least once. */
259 bool gpwrap; /* Possible gpnum/completed wrap. */
263 struct rcu_node *mynode; /* This CPU's leaf of hierarchy */ 260 struct rcu_node *mynode; /* This CPU's leaf of hierarchy */
264 unsigned long grpmask; /* Mask to apply to leaf qsmask. */ 261 unsigned long grpmask; /* Mask to apply to leaf qsmask. */
265#ifdef CONFIG_RCU_CPU_STALL_INFO 262#ifdef CONFIG_RCU_CPU_STALL_INFO
@@ -340,14 +337,10 @@ struct rcu_data {
340#ifdef CONFIG_RCU_NOCB_CPU 337#ifdef CONFIG_RCU_NOCB_CPU
341 struct rcu_head *nocb_head; /* CBs waiting for kthread. */ 338 struct rcu_head *nocb_head; /* CBs waiting for kthread. */
342 struct rcu_head **nocb_tail; 339 struct rcu_head **nocb_tail;
343 atomic_long_t nocb_q_count; /* # CBs waiting for kthread */ 340 atomic_long_t nocb_q_count; /* # CBs waiting for nocb */
344 atomic_long_t nocb_q_count_lazy; /* (approximate). */ 341 atomic_long_t nocb_q_count_lazy; /* invocation (all stages). */
345 struct rcu_head *nocb_follower_head; /* CBs ready to invoke. */ 342 struct rcu_head *nocb_follower_head; /* CBs ready to invoke. */
346 struct rcu_head **nocb_follower_tail; 343 struct rcu_head **nocb_follower_tail;
347 atomic_long_t nocb_follower_count; /* # CBs ready to invoke. */
348 atomic_long_t nocb_follower_count_lazy; /* (approximate). */
349 int nocb_p_count; /* # CBs being invoked by kthread */
350 int nocb_p_count_lazy; /* (approximate). */
351 wait_queue_head_t nocb_wq; /* For nocb kthreads to sleep on. */ 344 wait_queue_head_t nocb_wq; /* For nocb kthreads to sleep on. */
352 struct task_struct *nocb_kthread; 345 struct task_struct *nocb_kthread;
353 int nocb_defer_wakeup; /* Defer wakeup of nocb_kthread. */ 346 int nocb_defer_wakeup; /* Defer wakeup of nocb_kthread. */
@@ -356,8 +349,6 @@ struct rcu_data {
356 struct rcu_head *nocb_gp_head ____cacheline_internodealigned_in_smp; 349 struct rcu_head *nocb_gp_head ____cacheline_internodealigned_in_smp;
357 /* CBs waiting for GP. */ 350 /* CBs waiting for GP. */
358 struct rcu_head **nocb_gp_tail; 351 struct rcu_head **nocb_gp_tail;
359 long nocb_gp_count;
360 long nocb_gp_count_lazy;
361 bool nocb_leader_sleep; /* Is the nocb leader thread asleep? */ 352 bool nocb_leader_sleep; /* Is the nocb leader thread asleep? */
362 struct rcu_data *nocb_next_follower; 353 struct rcu_data *nocb_next_follower;
363 /* Next follower in wakeup chain. */ 354 /* Next follower in wakeup chain. */
@@ -488,10 +479,14 @@ struct rcu_state {
488 /* due to no GP active. */ 479 /* due to no GP active. */
489 unsigned long gp_start; /* Time at which GP started, */ 480 unsigned long gp_start; /* Time at which GP started, */
490 /* but in jiffies. */ 481 /* but in jiffies. */
482 unsigned long gp_activity; /* Time of last GP kthread */
483 /* activity in jiffies. */
491 unsigned long jiffies_stall; /* Time at which to check */ 484 unsigned long jiffies_stall; /* Time at which to check */
492 /* for CPU stalls. */ 485 /* for CPU stalls. */
493 unsigned long jiffies_resched; /* Time at which to resched */ 486 unsigned long jiffies_resched; /* Time at which to resched */
494 /* a reluctant CPU. */ 487 /* a reluctant CPU. */
488 unsigned long n_force_qs_gpstart; /* Snapshot of n_force_qs at */
489 /* GP start. */
495 unsigned long gp_max; /* Maximum GP duration in */ 490 unsigned long gp_max; /* Maximum GP duration in */
496 /* jiffies. */ 491 /* jiffies. */
497 const char *name; /* Name of structure. */ 492 const char *name; /* Name of structure. */
@@ -514,13 +509,6 @@ extern struct list_head rcu_struct_flavors;
514#define for_each_rcu_flavor(rsp) \ 509#define for_each_rcu_flavor(rsp) \
515 list_for_each_entry((rsp), &rcu_struct_flavors, flavors) 510 list_for_each_entry((rsp), &rcu_struct_flavors, flavors)
516 511
517/* Return values for rcu_preempt_offline_tasks(). */
518
519#define RCU_OFL_TASKS_NORM_GP 0x1 /* Tasks blocking normal */
520 /* GP were moved to root. */
521#define RCU_OFL_TASKS_EXP_GP 0x2 /* Tasks blocking expedited */
522 /* GP were moved to root. */
523
524/* 512/*
525 * RCU implementation internal declarations: 513 * RCU implementation internal declarations:
526 */ 514 */
@@ -546,27 +534,16 @@ DECLARE_PER_CPU(char, rcu_cpu_has_work);
546 534
547/* Forward declarations for rcutree_plugin.h */ 535/* Forward declarations for rcutree_plugin.h */
548static void rcu_bootup_announce(void); 536static void rcu_bootup_announce(void);
549long rcu_batches_completed(void);
550static void rcu_preempt_note_context_switch(void); 537static void rcu_preempt_note_context_switch(void);
551static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp); 538static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp);
552#ifdef CONFIG_HOTPLUG_CPU 539#ifdef CONFIG_HOTPLUG_CPU
553static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp, 540static bool rcu_preempt_has_tasks(struct rcu_node *rnp);
554 unsigned long flags);
555#endif /* #ifdef CONFIG_HOTPLUG_CPU */ 541#endif /* #ifdef CONFIG_HOTPLUG_CPU */
556static void rcu_print_detail_task_stall(struct rcu_state *rsp); 542static void rcu_print_detail_task_stall(struct rcu_state *rsp);
557static int rcu_print_task_stall(struct rcu_node *rnp); 543static int rcu_print_task_stall(struct rcu_node *rnp);
558static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp); 544static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp);
559#ifdef CONFIG_HOTPLUG_CPU
560static int rcu_preempt_offline_tasks(struct rcu_state *rsp,
561 struct rcu_node *rnp,
562 struct rcu_data *rdp);
563#endif /* #ifdef CONFIG_HOTPLUG_CPU */
564static void rcu_preempt_check_callbacks(void); 545static void rcu_preempt_check_callbacks(void);
565void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu)); 546void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu));
566#if defined(CONFIG_HOTPLUG_CPU) || defined(CONFIG_PREEMPT_RCU)
567static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp,
568 bool wake);
569#endif /* #if defined(CONFIG_HOTPLUG_CPU) || defined(CONFIG_PREEMPT_RCU) */
570static void __init __rcu_init_preempt(void); 547static void __init __rcu_init_preempt(void);
571static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags); 548static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags);
572static void rcu_preempt_boost_start_gp(struct rcu_node *rnp); 549static void rcu_preempt_boost_start_gp(struct rcu_node *rnp);
@@ -622,24 +599,15 @@ static void rcu_dynticks_task_exit(void);
622#endif /* #ifndef RCU_TREE_NONCORE */ 599#endif /* #ifndef RCU_TREE_NONCORE */
623 600
624#ifdef CONFIG_RCU_TRACE 601#ifdef CONFIG_RCU_TRACE
625#ifdef CONFIG_RCU_NOCB_CPU 602/* Read out queue lengths for tracing. */
626/* Sum up queue lengths for tracing. */
627static inline void rcu_nocb_q_lengths(struct rcu_data *rdp, long *ql, long *qll) 603static inline void rcu_nocb_q_lengths(struct rcu_data *rdp, long *ql, long *qll)
628{ 604{
629 *ql = atomic_long_read(&rdp->nocb_q_count) + 605#ifdef CONFIG_RCU_NOCB_CPU
630 rdp->nocb_p_count + 606 *ql = atomic_long_read(&rdp->nocb_q_count);
631 atomic_long_read(&rdp->nocb_follower_count) + 607 *qll = atomic_long_read(&rdp->nocb_q_count_lazy);
632 rdp->nocb_p_count + rdp->nocb_gp_count;
633 *qll = atomic_long_read(&rdp->nocb_q_count_lazy) +
634 rdp->nocb_p_count_lazy +
635 atomic_long_read(&rdp->nocb_follower_count_lazy) +
636 rdp->nocb_p_count_lazy + rdp->nocb_gp_count_lazy;
637}
638#else /* #ifdef CONFIG_RCU_NOCB_CPU */ 608#else /* #ifdef CONFIG_RCU_NOCB_CPU */
639static inline void rcu_nocb_q_lengths(struct rcu_data *rdp, long *ql, long *qll)
640{
641 *ql = 0; 609 *ql = 0;
642 *qll = 0; 610 *qll = 0;
643}
644#endif /* #else #ifdef CONFIG_RCU_NOCB_CPU */ 611#endif /* #else #ifdef CONFIG_RCU_NOCB_CPU */
612}
645#endif /* #ifdef CONFIG_RCU_TRACE */ 613#endif /* #ifdef CONFIG_RCU_TRACE */
diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
index 3ec85cb5d544..0d7bbe3095ad 100644
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -34,10 +34,6 @@
34 34
35#include "../locking/rtmutex_common.h" 35#include "../locking/rtmutex_common.h"
36 36
37/* rcuc/rcub kthread realtime priority */
38static int kthread_prio = CONFIG_RCU_KTHREAD_PRIO;
39module_param(kthread_prio, int, 0644);
40
41/* 37/*
42 * Control variables for per-CPU and per-rcu_node kthreads. These 38 * Control variables for per-CPU and per-rcu_node kthreads. These
43 * handle all flavors of RCU. 39 * handle all flavors of RCU.
@@ -53,7 +49,6 @@ DEFINE_PER_CPU(char, rcu_cpu_has_work);
53static cpumask_var_t rcu_nocb_mask; /* CPUs to have callbacks offloaded. */ 49static cpumask_var_t rcu_nocb_mask; /* CPUs to have callbacks offloaded. */
54static bool have_rcu_nocb_mask; /* Was rcu_nocb_mask allocated? */ 50static bool have_rcu_nocb_mask; /* Was rcu_nocb_mask allocated? */
55static bool __read_mostly rcu_nocb_poll; /* Offload kthread are to poll. */ 51static bool __read_mostly rcu_nocb_poll; /* Offload kthread are to poll. */
56static char __initdata nocb_buf[NR_CPUS * 5];
57#endif /* #ifdef CONFIG_RCU_NOCB_CPU */ 52#endif /* #ifdef CONFIG_RCU_NOCB_CPU */
58 53
59/* 54/*
@@ -103,6 +98,8 @@ RCU_STATE_INITIALIZER(rcu_preempt, 'p', call_rcu);
103static struct rcu_state *rcu_state_p = &rcu_preempt_state; 98static struct rcu_state *rcu_state_p = &rcu_preempt_state;
104 99
105static int rcu_preempted_readers_exp(struct rcu_node *rnp); 100static int rcu_preempted_readers_exp(struct rcu_node *rnp);
101static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp,
102 bool wake);
106 103
107/* 104/*
108 * Tell them what RCU they are running. 105 * Tell them what RCU they are running.
@@ -114,25 +111,6 @@ static void __init rcu_bootup_announce(void)
114} 111}
115 112
116/* 113/*
117 * Return the number of RCU-preempt batches processed thus far
118 * for debug and statistics.
119 */
120static long rcu_batches_completed_preempt(void)
121{
122 return rcu_preempt_state.completed;
123}
124EXPORT_SYMBOL_GPL(rcu_batches_completed_preempt);
125
126/*
127 * Return the number of RCU batches processed thus far for debug & stats.
128 */
129long rcu_batches_completed(void)
130{
131 return rcu_batches_completed_preempt();
132}
133EXPORT_SYMBOL_GPL(rcu_batches_completed);
134
135/*
136 * Record a preemptible-RCU quiescent state for the specified CPU. Note 114 * Record a preemptible-RCU quiescent state for the specified CPU. Note
137 * that this just means that the task currently running on the CPU is 115 * that this just means that the task currently running on the CPU is
138 * not in a quiescent state. There might be any number of tasks blocked 116 * not in a quiescent state. There might be any number of tasks blocked
@@ -307,15 +285,25 @@ static struct list_head *rcu_next_node_entry(struct task_struct *t,
307} 285}
308 286
309/* 287/*
288 * Return true if the specified rcu_node structure has tasks that were
289 * preempted within an RCU read-side critical section.
290 */
291static bool rcu_preempt_has_tasks(struct rcu_node *rnp)
292{
293 return !list_empty(&rnp->blkd_tasks);
294}
295
296/*
310 * Handle special cases during rcu_read_unlock(), such as needing to 297 * Handle special cases during rcu_read_unlock(), such as needing to
311 * notify RCU core processing or task having blocked during the RCU 298 * notify RCU core processing or task having blocked during the RCU
312 * read-side critical section. 299 * read-side critical section.
313 */ 300 */
314void rcu_read_unlock_special(struct task_struct *t) 301void rcu_read_unlock_special(struct task_struct *t)
315{ 302{
316 int empty; 303 bool empty;
317 int empty_exp; 304 bool empty_exp;
318 int empty_exp_now; 305 bool empty_norm;
306 bool empty_exp_now;
319 unsigned long flags; 307 unsigned long flags;
320 struct list_head *np; 308 struct list_head *np;
321#ifdef CONFIG_RCU_BOOST 309#ifdef CONFIG_RCU_BOOST
@@ -367,7 +355,8 @@ void rcu_read_unlock_special(struct task_struct *t)
367 break; 355 break;
368 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ 356 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
369 } 357 }
370 empty = !rcu_preempt_blocked_readers_cgp(rnp); 358 empty = !rcu_preempt_has_tasks(rnp);
359 empty_norm = !rcu_preempt_blocked_readers_cgp(rnp);
371 empty_exp = !rcu_preempted_readers_exp(rnp); 360 empty_exp = !rcu_preempted_readers_exp(rnp);
372 smp_mb(); /* ensure expedited fastpath sees end of RCU c-s. */ 361 smp_mb(); /* ensure expedited fastpath sees end of RCU c-s. */
373 np = rcu_next_node_entry(t, rnp); 362 np = rcu_next_node_entry(t, rnp);
@@ -387,13 +376,21 @@ void rcu_read_unlock_special(struct task_struct *t)
387#endif /* #ifdef CONFIG_RCU_BOOST */ 376#endif /* #ifdef CONFIG_RCU_BOOST */
388 377
389 /* 378 /*
379 * If this was the last task on the list, go see if we
380 * need to propagate ->qsmaskinit bit clearing up the
381 * rcu_node tree.
382 */
383 if (!empty && !rcu_preempt_has_tasks(rnp))
384 rcu_cleanup_dead_rnp(rnp);
385
386 /*
390 * If this was the last task on the current list, and if 387 * If this was the last task on the current list, and if
391 * we aren't waiting on any CPUs, report the quiescent state. 388 * we aren't waiting on any CPUs, report the quiescent state.
392 * Note that rcu_report_unblock_qs_rnp() releases rnp->lock, 389 * Note that rcu_report_unblock_qs_rnp() releases rnp->lock,
393 * so we must take a snapshot of the expedited state. 390 * so we must take a snapshot of the expedited state.
394 */ 391 */
395 empty_exp_now = !rcu_preempted_readers_exp(rnp); 392 empty_exp_now = !rcu_preempted_readers_exp(rnp);
396 if (!empty && !rcu_preempt_blocked_readers_cgp(rnp)) { 393 if (!empty_norm && !rcu_preempt_blocked_readers_cgp(rnp)) {
397 trace_rcu_quiescent_state_report(TPS("preempt_rcu"), 394 trace_rcu_quiescent_state_report(TPS("preempt_rcu"),
398 rnp->gpnum, 395 rnp->gpnum,
399 0, rnp->qsmask, 396 0, rnp->qsmask,
@@ -408,10 +405,8 @@ void rcu_read_unlock_special(struct task_struct *t)
408 405
409#ifdef CONFIG_RCU_BOOST 406#ifdef CONFIG_RCU_BOOST
410 /* Unboost if we were boosted. */ 407 /* Unboost if we were boosted. */
411 if (drop_boost_mutex) { 408 if (drop_boost_mutex)
412 rt_mutex_unlock(&rnp->boost_mtx); 409 rt_mutex_unlock(&rnp->boost_mtx);
413 complete(&rnp->boost_completion);
414 }
415#endif /* #ifdef CONFIG_RCU_BOOST */ 410#endif /* #ifdef CONFIG_RCU_BOOST */
416 411
417 /* 412 /*
@@ -519,99 +514,13 @@ static int rcu_print_task_stall(struct rcu_node *rnp)
519static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp) 514static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp)
520{ 515{
521 WARN_ON_ONCE(rcu_preempt_blocked_readers_cgp(rnp)); 516 WARN_ON_ONCE(rcu_preempt_blocked_readers_cgp(rnp));
522 if (!list_empty(&rnp->blkd_tasks)) 517 if (rcu_preempt_has_tasks(rnp))
523 rnp->gp_tasks = rnp->blkd_tasks.next; 518 rnp->gp_tasks = rnp->blkd_tasks.next;
524 WARN_ON_ONCE(rnp->qsmask); 519 WARN_ON_ONCE(rnp->qsmask);
525} 520}
526 521
527#ifdef CONFIG_HOTPLUG_CPU 522#ifdef CONFIG_HOTPLUG_CPU
528 523
529/*
530 * Handle tasklist migration for case in which all CPUs covered by the
531 * specified rcu_node have gone offline. Move them up to the root
532 * rcu_node. The reason for not just moving them to the immediate
533 * parent is to remove the need for rcu_read_unlock_special() to
534 * make more than two attempts to acquire the target rcu_node's lock.
535 * Returns true if there were tasks blocking the current RCU grace
536 * period.
537 *
538 * Returns 1 if there was previously a task blocking the current grace
539 * period on the specified rcu_node structure.
540 *
541 * The caller must hold rnp->lock with irqs disabled.
542 */
543static int rcu_preempt_offline_tasks(struct rcu_state *rsp,
544 struct rcu_node *rnp,
545 struct rcu_data *rdp)
546{
547 struct list_head *lp;
548 struct list_head *lp_root;
549 int retval = 0;
550 struct rcu_node *rnp_root = rcu_get_root(rsp);
551 struct task_struct *t;
552
553 if (rnp == rnp_root) {
554 WARN_ONCE(1, "Last CPU thought to be offlined?");
555 return 0; /* Shouldn't happen: at least one CPU online. */
556 }
557
558 /* If we are on an internal node, complain bitterly. */
559 WARN_ON_ONCE(rnp != rdp->mynode);
560
561 /*
562 * Move tasks up to root rcu_node. Don't try to get fancy for
563 * this corner-case operation -- just put this node's tasks
564 * at the head of the root node's list, and update the root node's
565 * ->gp_tasks and ->exp_tasks pointers to those of this node's,
566 * if non-NULL. This might result in waiting for more tasks than
567 * absolutely necessary, but this is a good performance/complexity
568 * tradeoff.
569 */
570 if (rcu_preempt_blocked_readers_cgp(rnp) && rnp->qsmask == 0)
571 retval |= RCU_OFL_TASKS_NORM_GP;
572 if (rcu_preempted_readers_exp(rnp))
573 retval |= RCU_OFL_TASKS_EXP_GP;
574 lp = &rnp->blkd_tasks;
575 lp_root = &rnp_root->blkd_tasks;
576 while (!list_empty(lp)) {
577 t = list_entry(lp->next, typeof(*t), rcu_node_entry);
578 raw_spin_lock(&rnp_root->lock); /* irqs already disabled */
579 smp_mb__after_unlock_lock();
580 list_del(&t->rcu_node_entry);
581 t->rcu_blocked_node = rnp_root;
582 list_add(&t->rcu_node_entry, lp_root);
583 if (&t->rcu_node_entry == rnp->gp_tasks)
584 rnp_root->gp_tasks = rnp->gp_tasks;
585 if (&t->rcu_node_entry == rnp->exp_tasks)
586 rnp_root->exp_tasks = rnp->exp_tasks;
587#ifdef CONFIG_RCU_BOOST
588 if (&t->rcu_node_entry == rnp->boost_tasks)
589 rnp_root->boost_tasks = rnp->boost_tasks;
590#endif /* #ifdef CONFIG_RCU_BOOST */
591 raw_spin_unlock(&rnp_root->lock); /* irqs still disabled */
592 }
593
594 rnp->gp_tasks = NULL;
595 rnp->exp_tasks = NULL;
596#ifdef CONFIG_RCU_BOOST
597 rnp->boost_tasks = NULL;
598 /*
599 * In case root is being boosted and leaf was not. Make sure
600 * that we boost the tasks blocking the current grace period
601 * in this case.
602 */
603 raw_spin_lock(&rnp_root->lock); /* irqs already disabled */
604 smp_mb__after_unlock_lock();
605 if (rnp_root->boost_tasks != NULL &&
606 rnp_root->boost_tasks != rnp_root->gp_tasks &&
607 rnp_root->boost_tasks != rnp_root->exp_tasks)
608 rnp_root->boost_tasks = rnp_root->gp_tasks;
609 raw_spin_unlock(&rnp_root->lock); /* irqs still disabled */
610#endif /* #ifdef CONFIG_RCU_BOOST */
611
612 return retval;
613}
614
615#endif /* #ifdef CONFIG_HOTPLUG_CPU */ 524#endif /* #ifdef CONFIG_HOTPLUG_CPU */
616 525
617/* 526/*
@@ -771,7 +680,7 @@ sync_rcu_preempt_exp_init(struct rcu_state *rsp, struct rcu_node *rnp)
771 680
772 raw_spin_lock_irqsave(&rnp->lock, flags); 681 raw_spin_lock_irqsave(&rnp->lock, flags);
773 smp_mb__after_unlock_lock(); 682 smp_mb__after_unlock_lock();
774 if (list_empty(&rnp->blkd_tasks)) { 683 if (!rcu_preempt_has_tasks(rnp)) {
775 raw_spin_unlock_irqrestore(&rnp->lock, flags); 684 raw_spin_unlock_irqrestore(&rnp->lock, flags);
776 } else { 685 } else {
777 rnp->exp_tasks = rnp->blkd_tasks.next; 686 rnp->exp_tasks = rnp->blkd_tasks.next;
@@ -933,15 +842,6 @@ static void __init rcu_bootup_announce(void)
933} 842}
934 843
935/* 844/*
936 * Return the number of RCU batches processed thus far for debug & stats.
937 */
938long rcu_batches_completed(void)
939{
940 return rcu_batches_completed_sched();
941}
942EXPORT_SYMBOL_GPL(rcu_batches_completed);
943
944/*
945 * Because preemptible RCU does not exist, we never have to check for 845 * Because preemptible RCU does not exist, we never have to check for
946 * CPUs being in quiescent states. 846 * CPUs being in quiescent states.
947 */ 847 */
@@ -960,11 +860,12 @@ static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp)
960 860
961#ifdef CONFIG_HOTPLUG_CPU 861#ifdef CONFIG_HOTPLUG_CPU
962 862
963/* Because preemptible RCU does not exist, no quieting of tasks. */ 863/*
964static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp, unsigned long flags) 864 * Because there is no preemptible RCU, there can be no readers blocked.
965 __releases(rnp->lock) 865 */
866static bool rcu_preempt_has_tasks(struct rcu_node *rnp)
966{ 867{
967 raw_spin_unlock_irqrestore(&rnp->lock, flags); 868 return false;
968} 869}
969 870
970#endif /* #ifdef CONFIG_HOTPLUG_CPU */ 871#endif /* #ifdef CONFIG_HOTPLUG_CPU */
@@ -996,23 +897,6 @@ static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp)
996 WARN_ON_ONCE(rnp->qsmask); 897 WARN_ON_ONCE(rnp->qsmask);
997} 898}
998 899
999#ifdef CONFIG_HOTPLUG_CPU
1000
1001/*
1002 * Because preemptible RCU does not exist, it never needs to migrate
1003 * tasks that were blocked within RCU read-side critical sections, and
1004 * such non-existent tasks cannot possibly have been blocking the current
1005 * grace period.
1006 */
1007static int rcu_preempt_offline_tasks(struct rcu_state *rsp,
1008 struct rcu_node *rnp,
1009 struct rcu_data *rdp)
1010{
1011 return 0;
1012}
1013
1014#endif /* #ifdef CONFIG_HOTPLUG_CPU */
1015
1016/* 900/*
1017 * Because preemptible RCU does not exist, it never has any callbacks 901 * Because preemptible RCU does not exist, it never has any callbacks
1018 * to check. 902 * to check.
@@ -1031,20 +915,6 @@ void synchronize_rcu_expedited(void)
1031} 915}
1032EXPORT_SYMBOL_GPL(synchronize_rcu_expedited); 916EXPORT_SYMBOL_GPL(synchronize_rcu_expedited);
1033 917
1034#ifdef CONFIG_HOTPLUG_CPU
1035
1036/*
1037 * Because preemptible RCU does not exist, there is never any need to
1038 * report on tasks preempted in RCU read-side critical sections during
1039 * expedited RCU grace periods.
1040 */
1041static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp,
1042 bool wake)
1043{
1044}
1045
1046#endif /* #ifdef CONFIG_HOTPLUG_CPU */
1047
1048/* 918/*
1049 * Because preemptible RCU does not exist, rcu_barrier() is just 919 * Because preemptible RCU does not exist, rcu_barrier() is just
1050 * another name for rcu_barrier_sched(). 920 * another name for rcu_barrier_sched().
@@ -1080,7 +950,7 @@ void exit_rcu(void)
1080 950
1081static void rcu_initiate_boost_trace(struct rcu_node *rnp) 951static void rcu_initiate_boost_trace(struct rcu_node *rnp)
1082{ 952{
1083 if (list_empty(&rnp->blkd_tasks)) 953 if (!rcu_preempt_has_tasks(rnp))
1084 rnp->n_balk_blkd_tasks++; 954 rnp->n_balk_blkd_tasks++;
1085 else if (rnp->exp_tasks == NULL && rnp->gp_tasks == NULL) 955 else if (rnp->exp_tasks == NULL && rnp->gp_tasks == NULL)
1086 rnp->n_balk_exp_gp_tasks++; 956 rnp->n_balk_exp_gp_tasks++;
@@ -1127,7 +997,8 @@ static int rcu_boost(struct rcu_node *rnp)
1127 struct task_struct *t; 997 struct task_struct *t;
1128 struct list_head *tb; 998 struct list_head *tb;
1129 999
1130 if (rnp->exp_tasks == NULL && rnp->boost_tasks == NULL) 1000 if (ACCESS_ONCE(rnp->exp_tasks) == NULL &&
1001 ACCESS_ONCE(rnp->boost_tasks) == NULL)
1131 return 0; /* Nothing left to boost. */ 1002 return 0; /* Nothing left to boost. */
1132 1003
1133 raw_spin_lock_irqsave(&rnp->lock, flags); 1004 raw_spin_lock_irqsave(&rnp->lock, flags);
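The lockless peek above only works if the compiler cannot tear or refetch the loads, which is why the check moves to ACCESS_ONCE(). A rough userspace analogue of the same check-without-lock, re-check-under-lock pattern, with C11 atomics standing in for ACCESS_ONCE() and a pthread mutex standing in for rnp->lock (all names here are illustrative only):

#include <stdatomic.h>
#include <pthread.h>
#include <stdio.h>
#include <stddef.h>

static pthread_mutex_t node_lock = PTHREAD_MUTEX_INITIALIZER;
static _Atomic(void *) exp_tasks;    /* analogue of rnp->exp_tasks   */
static _Atomic(void *) boost_tasks;  /* analogue of rnp->boost_tasks */

static int try_boost(void)
{
        /* Lockless peek: cheap early exit when there is clearly
         * nothing to boost, as rcu_boost() now does via ACCESS_ONCE(). */
        if (atomic_load(&exp_tasks) == NULL &&
            atomic_load(&boost_tasks) == NULL)
                return 0;

        pthread_mutex_lock(&node_lock);
        /* Re-check under the lock before acting on the lists. */
        if (atomic_load(&exp_tasks) == NULL &&
            atomic_load(&boost_tasks) == NULL) {
                pthread_mutex_unlock(&node_lock);
                return 0;
        }
        /* ...priority-boosting work would go here... */
        pthread_mutex_unlock(&node_lock);
        return 1;
}

int main(void)
{
        printf("boosted: %d\n", try_boost());
        return 0;
}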
@@ -1175,15 +1046,11 @@ static int rcu_boost(struct rcu_node *rnp)
1175 */ 1046 */
1176 t = container_of(tb, struct task_struct, rcu_node_entry); 1047 t = container_of(tb, struct task_struct, rcu_node_entry);
1177 rt_mutex_init_proxy_locked(&rnp->boost_mtx, t); 1048 rt_mutex_init_proxy_locked(&rnp->boost_mtx, t);
1178 init_completion(&rnp->boost_completion);
1179 raw_spin_unlock_irqrestore(&rnp->lock, flags); 1049 raw_spin_unlock_irqrestore(&rnp->lock, flags);
1180 /* Lock only for side effect: boosts task t's priority. */ 1050 /* Lock only for side effect: boosts task t's priority. */
1181 rt_mutex_lock(&rnp->boost_mtx); 1051 rt_mutex_lock(&rnp->boost_mtx);
1182 rt_mutex_unlock(&rnp->boost_mtx); /* Then keep lockdep happy. */ 1052 rt_mutex_unlock(&rnp->boost_mtx); /* Then keep lockdep happy. */
1183 1053
1184 /* Wait for boostee to be done w/boost_mtx before reinitializing. */
1185 wait_for_completion(&rnp->boost_completion);
1186
1187 return ACCESS_ONCE(rnp->exp_tasks) != NULL || 1054 return ACCESS_ONCE(rnp->exp_tasks) != NULL ||
1188 ACCESS_ONCE(rnp->boost_tasks) != NULL; 1055 ACCESS_ONCE(rnp->boost_tasks) != NULL;
1189} 1056}
@@ -1416,12 +1283,8 @@ static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu)
1416 for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++, mask >>= 1) 1283 for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++, mask >>= 1)
1417 if ((mask & 0x1) && cpu != outgoingcpu) 1284 if ((mask & 0x1) && cpu != outgoingcpu)
1418 cpumask_set_cpu(cpu, cm); 1285 cpumask_set_cpu(cpu, cm);
1419 if (cpumask_weight(cm) == 0) { 1286 if (cpumask_weight(cm) == 0)
1420 cpumask_setall(cm); 1287 cpumask_setall(cm);
1421 for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++)
1422 cpumask_clear_cpu(cpu, cm);
1423 WARN_ON_ONCE(cpumask_weight(cm) == 0);
1424 }
1425 set_cpus_allowed_ptr(t, cm); 1288 set_cpus_allowed_ptr(t, cm);
1426 free_cpumask_var(cm); 1289 free_cpumask_var(cm);
1427} 1290}
@@ -1446,12 +1309,8 @@ static void __init rcu_spawn_boost_kthreads(void)
1446 for_each_possible_cpu(cpu) 1309 for_each_possible_cpu(cpu)
1447 per_cpu(rcu_cpu_has_work, cpu) = 0; 1310 per_cpu(rcu_cpu_has_work, cpu) = 0;
1448 BUG_ON(smpboot_register_percpu_thread(&rcu_cpu_thread_spec)); 1311 BUG_ON(smpboot_register_percpu_thread(&rcu_cpu_thread_spec));
1449 rnp = rcu_get_root(rcu_state_p); 1312 rcu_for_each_leaf_node(rcu_state_p, rnp)
1450 (void)rcu_spawn_one_boost_kthread(rcu_state_p, rnp); 1313 (void)rcu_spawn_one_boost_kthread(rcu_state_p, rnp);
1451 if (NUM_RCU_NODES > 1) {
1452 rcu_for_each_leaf_node(rcu_state_p, rnp)
1453 (void)rcu_spawn_one_boost_kthread(rcu_state_p, rnp);
1454 }
1455} 1314}
1456 1315
1457static void rcu_prepare_kthreads(int cpu) 1316static void rcu_prepare_kthreads(int cpu)
@@ -1605,7 +1464,8 @@ static bool __maybe_unused rcu_try_advance_all_cbs(void)
1605 * completed since we last checked and there are 1464 * completed since we last checked and there are
1606 * callbacks not yet ready to invoke. 1465 * callbacks not yet ready to invoke.
1607 */ 1466 */
1608 if (rdp->completed != rnp->completed && 1467 if ((rdp->completed != rnp->completed ||
1468 unlikely(ACCESS_ONCE(rdp->gpwrap))) &&
1609 rdp->nxttail[RCU_DONE_TAIL] != rdp->nxttail[RCU_NEXT_TAIL]) 1469 rdp->nxttail[RCU_DONE_TAIL] != rdp->nxttail[RCU_NEXT_TAIL])
1610 note_gp_changes(rsp, rdp); 1470 note_gp_changes(rsp, rdp);
1611 1471
@@ -1898,11 +1758,12 @@ static void print_cpu_stall_info(struct rcu_state *rsp, int cpu)
1898 ticks_value = rsp->gpnum - rdp->gpnum; 1758 ticks_value = rsp->gpnum - rdp->gpnum;
1899 } 1759 }
1900 print_cpu_stall_fast_no_hz(fast_no_hz, cpu); 1760 print_cpu_stall_fast_no_hz(fast_no_hz, cpu);
1901 pr_err("\t%d: (%lu %s) idle=%03x/%llx/%d softirq=%u/%u %s\n", 1761 pr_err("\t%d: (%lu %s) idle=%03x/%llx/%d softirq=%u/%u fqs=%ld %s\n",
1902 cpu, ticks_value, ticks_title, 1762 cpu, ticks_value, ticks_title,
1903 atomic_read(&rdtp->dynticks) & 0xfff, 1763 atomic_read(&rdtp->dynticks) & 0xfff,
1904 rdtp->dynticks_nesting, rdtp->dynticks_nmi_nesting, 1764 rdtp->dynticks_nesting, rdtp->dynticks_nmi_nesting,
1905 rdp->softirq_snap, kstat_softirqs_cpu(RCU_SOFTIRQ, cpu), 1765 rdp->softirq_snap, kstat_softirqs_cpu(RCU_SOFTIRQ, cpu),
1766 ACCESS_ONCE(rsp->n_force_qs) - rsp->n_force_qs_gpstart,
1906 fast_no_hz); 1767 fast_no_hz);
1907} 1768}
1908 1769
@@ -2056,9 +1917,26 @@ static void wake_nocb_leader(struct rcu_data *rdp, bool force)
2056static bool rcu_nocb_cpu_needs_barrier(struct rcu_state *rsp, int cpu) 1917static bool rcu_nocb_cpu_needs_barrier(struct rcu_state *rsp, int cpu)
2057{ 1918{
2058 struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); 1919 struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu);
1920 unsigned long ret;
1921#ifdef CONFIG_PROVE_RCU
2059 struct rcu_head *rhp; 1922 struct rcu_head *rhp;
1923#endif /* #ifdef CONFIG_PROVE_RCU */
2060 1924
2061 /* No-CBs CPUs might have callbacks on any of three lists. */ 1925 /*
1926 * Check count of all no-CBs callbacks awaiting invocation.
1927 * There needs to be a barrier before this function is called,
1928 * but associated with a prior determination that no more
1929 * callbacks would be posted. In the worst case, the first
1930 * barrier in _rcu_barrier() suffices (but the caller cannot
1931 * necessarily rely on this, not a substitute for the caller
1932 * getting the concurrency design right!). There must also be
1933 * a barrier between the following load and posting of a callback
1934 * (if a callback is in fact needed). This is associated with an
1935 * atomic_inc() in the caller.
1936 */
1937 ret = atomic_long_read(&rdp->nocb_q_count);
1938
1939#ifdef CONFIG_PROVE_RCU
2062 rhp = ACCESS_ONCE(rdp->nocb_head); 1940 rhp = ACCESS_ONCE(rdp->nocb_head);
2063 if (!rhp) 1941 if (!rhp)
2064 rhp = ACCESS_ONCE(rdp->nocb_gp_head); 1942 rhp = ACCESS_ONCE(rdp->nocb_gp_head);
@@ -2072,8 +1950,9 @@ static bool rcu_nocb_cpu_needs_barrier(struct rcu_state *rsp, int cpu)
2072 cpu, rhp->func); 1950 cpu, rhp->func);
2073 WARN_ON_ONCE(1); 1951 WARN_ON_ONCE(1);
2074 } 1952 }
1953#endif /* #ifdef CONFIG_PROVE_RCU */
2075 1954
2076 return !!rhp; 1955 return !!ret;
2077} 1956}
2078 1957
2079/* 1958/*
@@ -2095,9 +1974,10 @@ static void __call_rcu_nocb_enqueue(struct rcu_data *rdp,
2095 struct task_struct *t; 1974 struct task_struct *t;
2096 1975
2097 /* Enqueue the callback on the nocb list and update counts. */ 1976 /* Enqueue the callback on the nocb list and update counts. */
1977 atomic_long_add(rhcount, &rdp->nocb_q_count);
1978 /* rcu_barrier() relies on ->nocb_q_count add before xchg. */
2098 old_rhpp = xchg(&rdp->nocb_tail, rhtp); 1979 old_rhpp = xchg(&rdp->nocb_tail, rhtp);
2099 ACCESS_ONCE(*old_rhpp) = rhp; 1980 ACCESS_ONCE(*old_rhpp) = rhp;
2100 atomic_long_add(rhcount, &rdp->nocb_q_count);
2101 atomic_long_add(rhcount_lazy, &rdp->nocb_q_count_lazy); 1981 atomic_long_add(rhcount_lazy, &rdp->nocb_q_count_lazy);
2102 smp_mb__after_atomic(); /* Store *old_rhpp before _wake test. */ 1982 smp_mb__after_atomic(); /* Store *old_rhpp before _wake test. */
2103 1983
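The reordering above, adding to the count before the xchg() that publishes the callback, is what lets rcu_nocb_cpu_needs_barrier() trust a bare read of ->nocb_q_count. A minimal userspace analogue of that publish ordering, with C11 atomics standing in for the kernel primitives (all names here are illustrative):

#include <stdatomic.h>
#include <stdio.h>

static atomic_long q_count;           /* analogue of rdp->nocb_q_count */
static _Atomic(void *) q_tail;        /* analogue of rdp->nocb_tail    */

static void enqueue(void *cb)
{
        /* Bump the count first... */
        atomic_fetch_add(&q_count, 1);
        /* ...then publish the element, mirroring the add-before-xchg()
         * ordering above, so anyone who can observe the callback also
         * observes a nonzero count. */
        atomic_exchange(&q_tail, cb);
}

static int needs_barrier(void)
{
        /* A bare read of the count now suffices, as in
         * rcu_nocb_cpu_needs_barrier(). */
        return atomic_load(&q_count) != 0;
}

int main(void)
{
        static int cb;

        enqueue(&cb);
        printf("needs barrier: %d\n", needs_barrier());
        return 0;
}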
@@ -2288,9 +2168,6 @@ wait_again:
2288 /* Move callbacks to wait-for-GP list, which is empty. */ 2168 /* Move callbacks to wait-for-GP list, which is empty. */
2289 ACCESS_ONCE(rdp->nocb_head) = NULL; 2169 ACCESS_ONCE(rdp->nocb_head) = NULL;
2290 rdp->nocb_gp_tail = xchg(&rdp->nocb_tail, &rdp->nocb_head); 2170 rdp->nocb_gp_tail = xchg(&rdp->nocb_tail, &rdp->nocb_head);
2291 rdp->nocb_gp_count = atomic_long_xchg(&rdp->nocb_q_count, 0);
2292 rdp->nocb_gp_count_lazy =
2293 atomic_long_xchg(&rdp->nocb_q_count_lazy, 0);
2294 gotcbs = true; 2171 gotcbs = true;
2295 } 2172 }
2296 2173
@@ -2338,9 +2215,6 @@ wait_again:
2338 /* Append callbacks to follower's "done" list. */ 2215 /* Append callbacks to follower's "done" list. */
2339 tail = xchg(&rdp->nocb_follower_tail, rdp->nocb_gp_tail); 2216 tail = xchg(&rdp->nocb_follower_tail, rdp->nocb_gp_tail);
2340 *tail = rdp->nocb_gp_head; 2217 *tail = rdp->nocb_gp_head;
2341 atomic_long_add(rdp->nocb_gp_count, &rdp->nocb_follower_count);
2342 atomic_long_add(rdp->nocb_gp_count_lazy,
2343 &rdp->nocb_follower_count_lazy);
2344 smp_mb__after_atomic(); /* Store *tail before wakeup. */ 2218 smp_mb__after_atomic(); /* Store *tail before wakeup. */
2345 if (rdp != my_rdp && tail == &rdp->nocb_follower_head) { 2219 if (rdp != my_rdp && tail == &rdp->nocb_follower_head) {
2346 /* 2220 /*
@@ -2415,13 +2289,11 @@ static int rcu_nocb_kthread(void *arg)
2415 trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, "WokeNonEmpty"); 2289 trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, "WokeNonEmpty");
2416 ACCESS_ONCE(rdp->nocb_follower_head) = NULL; 2290 ACCESS_ONCE(rdp->nocb_follower_head) = NULL;
2417 tail = xchg(&rdp->nocb_follower_tail, &rdp->nocb_follower_head); 2291 tail = xchg(&rdp->nocb_follower_tail, &rdp->nocb_follower_head);
2418 c = atomic_long_xchg(&rdp->nocb_follower_count, 0);
2419 cl = atomic_long_xchg(&rdp->nocb_follower_count_lazy, 0);
2420 rdp->nocb_p_count += c;
2421 rdp->nocb_p_count_lazy += cl;
2422 2292
2423 /* Each pass through the following loop invokes a callback. */ 2293 /* Each pass through the following loop invokes a callback. */
2424 trace_rcu_batch_start(rdp->rsp->name, cl, c, -1); 2294 trace_rcu_batch_start(rdp->rsp->name,
2295 atomic_long_read(&rdp->nocb_q_count_lazy),
2296 atomic_long_read(&rdp->nocb_q_count), -1);
2425 c = cl = 0; 2297 c = cl = 0;
2426 while (list) { 2298 while (list) {
2427 next = list->next; 2299 next = list->next;
@@ -2443,9 +2315,9 @@ static int rcu_nocb_kthread(void *arg)
2443 list = next; 2315 list = next;
2444 } 2316 }
2445 trace_rcu_batch_end(rdp->rsp->name, c, !!list, 0, 0, 1); 2317 trace_rcu_batch_end(rdp->rsp->name, c, !!list, 0, 0, 1);
2446 ACCESS_ONCE(rdp->nocb_p_count) = rdp->nocb_p_count - c; 2318 smp_mb__before_atomic(); /* _add after CB invocation. */
2447 ACCESS_ONCE(rdp->nocb_p_count_lazy) = 2319 atomic_long_add(-c, &rdp->nocb_q_count);
2448 rdp->nocb_p_count_lazy - cl; 2320 atomic_long_add(-cl, &rdp->nocb_q_count_lazy);
2449 rdp->n_nocbs_invoked += c; 2321 rdp->n_nocbs_invoked += c;
2450 } 2322 }
2451 return 0; 2323 return 0;
@@ -2513,8 +2385,8 @@ void __init rcu_init_nohz(void)
2513 cpumask_and(rcu_nocb_mask, cpu_possible_mask, 2385 cpumask_and(rcu_nocb_mask, cpu_possible_mask,
2514 rcu_nocb_mask); 2386 rcu_nocb_mask);
2515 } 2387 }
2516 cpulist_scnprintf(nocb_buf, sizeof(nocb_buf), rcu_nocb_mask); 2388 pr_info("\tOffload RCU callbacks from CPUs: %*pbl.\n",
2517 pr_info("\tOffload RCU callbacks from CPUs: %s.\n", nocb_buf); 2389 cpumask_pr_args(rcu_nocb_mask));
2518 if (rcu_nocb_poll) 2390 if (rcu_nocb_poll)
2519 pr_info("\tPoll for callbacks from no-CBs CPUs.\n"); 2391 pr_info("\tPoll for callbacks from no-CBs CPUs.\n");
2520 2392
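For reference, the %*pbl conversion used above prints a cpumask as a CPU list directly, removing the need for the nocb_buf staging buffer. A small kernel-style sketch of the idiom (assumes a kernel build context, not a standalone program; the function name is made up):

#include <linux/cpumask.h>
#include <linux/printk.h>

static void report_offloaded_cpus(const struct cpumask *mask)
{
	/* cpumask_pr_args(mask) expands to "nr_cpu_ids, cpumask_bits(mask)",
	 * supplying %*pbl with both the field width and the bitmap. */
	pr_info("Offload RCU callbacks from CPUs: %*pbl.\n",
		cpumask_pr_args(mask));
}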
diff --git a/kernel/rcu/tree_trace.c b/kernel/rcu/tree_trace.c
index 5cdc62e1beeb..fbb6240509ea 100644
--- a/kernel/rcu/tree_trace.c
+++ b/kernel/rcu/tree_trace.c
@@ -46,6 +46,8 @@
46#define RCU_TREE_NONCORE 46#define RCU_TREE_NONCORE
47#include "tree.h" 47#include "tree.h"
48 48
49DECLARE_PER_CPU_SHARED_ALIGNED(unsigned long, rcu_qs_ctr);
50
49static int r_open(struct inode *inode, struct file *file, 51static int r_open(struct inode *inode, struct file *file,
50 const struct seq_operations *op) 52 const struct seq_operations *op)
51{ 53{
@@ -115,11 +117,13 @@ static void print_one_rcu_data(struct seq_file *m, struct rcu_data *rdp)
115 117
116 if (!rdp->beenonline) 118 if (!rdp->beenonline)
117 return; 119 return;
118 seq_printf(m, "%3d%cc=%ld g=%ld pq=%d qp=%d", 120 seq_printf(m, "%3d%cc=%ld g=%ld pq=%d/%d qp=%d",
119 rdp->cpu, 121 rdp->cpu,
120 cpu_is_offline(rdp->cpu) ? '!' : ' ', 122 cpu_is_offline(rdp->cpu) ? '!' : ' ',
121 ulong2long(rdp->completed), ulong2long(rdp->gpnum), 123 ulong2long(rdp->completed), ulong2long(rdp->gpnum),
122 rdp->passed_quiesce, rdp->qs_pending); 124 rdp->passed_quiesce,
125 rdp->rcu_qs_ctr_snap == per_cpu(rcu_qs_ctr, rdp->cpu),
126 rdp->qs_pending);
123 seq_printf(m, " dt=%d/%llx/%d df=%lu", 127 seq_printf(m, " dt=%d/%llx/%d df=%lu",
124 atomic_read(&rdp->dynticks->dynticks), 128 atomic_read(&rdp->dynticks->dynticks),
125 rdp->dynticks->dynticks_nesting, 129 rdp->dynticks->dynticks_nesting,
diff --git a/kernel/resource.c b/kernel/resource.c
index 0bcebffc4e77..19f2357dfda3 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -22,6 +22,7 @@
22#include <linux/device.h> 22#include <linux/device.h>
23#include <linux/pfn.h> 23#include <linux/pfn.h>
24#include <linux/mm.h> 24#include <linux/mm.h>
25#include <linux/resource_ext.h>
25#include <asm/io.h> 26#include <asm/io.h>
26 27
27 28
@@ -1529,6 +1530,30 @@ int iomem_is_exclusive(u64 addr)
1529 return err; 1530 return err;
1530} 1531}
1531 1532
1533struct resource_entry *resource_list_create_entry(struct resource *res,
1534 size_t extra_size)
1535{
1536 struct resource_entry *entry;
1537
1538 entry = kzalloc(sizeof(*entry) + extra_size, GFP_KERNEL);
1539 if (entry) {
1540 INIT_LIST_HEAD(&entry->node);
1541 entry->res = res ? res : &entry->__res;
1542 }
1543
1544 return entry;
1545}
1546EXPORT_SYMBOL(resource_list_create_entry);
1547
1548void resource_list_free(struct list_head *head)
1549{
1550 struct resource_entry *entry, *tmp;
1551
1552 list_for_each_entry_safe(entry, tmp, head, node)
1553 resource_list_destroy_entry(entry);
1554}
1555EXPORT_SYMBOL(resource_list_free);
1556
1532static int __init strict_iomem(char *str) 1557static int __init strict_iomem(char *str)
1533{ 1558{
1534 if (strstr(str, "relaxed")) 1559 if (strstr(str, "relaxed"))
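A hedged sketch of how the two helpers added above are meant to be used together; it assumes the companion resource_list_add_tail() helper from <linux/resource_ext.h> introduced by the same series, and the window addresses are hypothetical:

#include <linux/resource_ext.h>
#include <linux/ioport.h>
#include <linux/errno.h>

static int build_window_list(struct list_head *head)
{
	struct resource_entry *entry;

	/* NULL @res means the entry's embedded __res is used; no extra
	 * per-entry payload is requested here. */
	entry = resource_list_create_entry(NULL, 0);
	if (!entry)
		return -ENOMEM;

	entry->res->start = 0x1000;		/* hypothetical window */
	entry->res->end   = 0x1fff;
	entry->res->flags = IORESOURCE_MEM;
	resource_list_add_tail(entry, head);
	return 0;
}

static void drop_window_list(struct list_head *head)
{
	/* Destroys every entry on the list in one call. */
	resource_list_free(head);
}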
diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile
index ab32b7b0db5c..46be87024875 100644
--- a/kernel/sched/Makefile
+++ b/kernel/sched/Makefile
@@ -1,5 +1,5 @@
1ifdef CONFIG_FUNCTION_TRACER 1ifdef CONFIG_FUNCTION_TRACER
2CFLAGS_REMOVE_clock.o = -pg 2CFLAGS_REMOVE_clock.o = $(CC_FLAGS_FTRACE)
3endif 3endif
4 4
5ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y) 5ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y)
diff --git a/kernel/sched/clock.c b/kernel/sched/clock.c
index c27e4f8f4879..c0a205101c23 100644
--- a/kernel/sched/clock.c
+++ b/kernel/sched/clock.c
@@ -420,3 +420,16 @@ u64 local_clock(void)
420 420
421EXPORT_SYMBOL_GPL(cpu_clock); 421EXPORT_SYMBOL_GPL(cpu_clock);
422EXPORT_SYMBOL_GPL(local_clock); 422EXPORT_SYMBOL_GPL(local_clock);
423
424/*
425 * Running clock - returns the time that has elapsed while a guest has been
426 * running.
427 * On a guest this value should be local_clock minus the time the guest was
428 * suspended by the hypervisor (for any reason).
429 * On bare metal this function should return the same as local_clock.
430 * Architectures and sub-architectures can override this.
431 */
432u64 __weak running_clock(void)
433{
434 return local_clock();
435}
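As the comment notes, architectures may override the weak default. A purely hypothetical override sketch; my_hv_suspended_ns() is an invented hook, and a real port would use whatever accounting its hypervisor exposes:

#include <linux/sched.h>

/* Invented hook: total time the hypervisor kept this guest suspended. */
extern u64 my_hv_suspended_ns(void);

u64 running_clock(void)
{
	/* Advance only while the guest is actually running. */
	return local_clock() - my_hv_suspended_ns();
}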
diff --git a/kernel/sched/completion.c b/kernel/sched/completion.c
index 607f852b4d04..7052d3fd4e7b 100644
--- a/kernel/sched/completion.c
+++ b/kernel/sched/completion.c
@@ -268,6 +268,15 @@ bool try_wait_for_completion(struct completion *x)
268 unsigned long flags; 268 unsigned long flags;
269 int ret = 1; 269 int ret = 1;
270 270
271 /*
272 * Since x->done will need to be locked only
273 * in the non-blocking case, we check x->done
274 * first without taking the lock so we can
275 * return early in the blocking case.
276 */
277 if (!ACCESS_ONCE(x->done))
278 return 0;
279
271 spin_lock_irqsave(&x->wait.lock, flags); 280 spin_lock_irqsave(&x->wait.lock, flags);
272 if (!x->done) 281 if (!x->done)
273 ret = 0; 282 ret = 0;
@@ -288,13 +297,6 @@ EXPORT_SYMBOL(try_wait_for_completion);
288 */ 297 */
289bool completion_done(struct completion *x) 298bool completion_done(struct completion *x)
290{ 299{
291 unsigned long flags; 300 return !!ACCESS_ONCE(x->done);
292 int ret = 1;
293
294 spin_lock_irqsave(&x->wait.lock, flags);
295 if (!x->done)
296 ret = 0;
297 spin_unlock_irqrestore(&x->wait.lock, flags);
298 return ret;
299} 301}
300EXPORT_SYMBOL(completion_done); 302EXPORT_SYMBOL(completion_done);
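With completion_done() reduced to an ACCESS_ONCE() read and try_wait_for_completion() gaining a lock-free early return, polling a completion no longer bounces the wait-queue spinlock. An illustrative kernel-style caller (the completion and delay values are hypothetical):

#include <linux/completion.h>
#include <linux/delay.h>
#include <linux/types.h>

static DECLARE_COMPLETION(io_done);	/* hypothetical event */

static bool poll_for_io(unsigned int tries)
{
	while (tries--) {
		/* Cheap read-only check: no lock taken while not done. */
		if (completion_done(&io_done))
			return try_wait_for_completion(&io_done);
		udelay(10);
	}
	return false;
}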
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index b5797b78add6..13049aac05a6 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -119,7 +119,9 @@ void update_rq_clock(struct rq *rq)
119{ 119{
120 s64 delta; 120 s64 delta;
121 121
122 if (rq->skip_clock_update > 0) 122 lockdep_assert_held(&rq->lock);
123
124 if (rq->clock_skip_update & RQCF_ACT_SKIP)
123 return; 125 return;
124 126
125 delta = sched_clock_cpu(cpu_of(rq)) - rq->clock; 127 delta = sched_clock_cpu(cpu_of(rq)) - rq->clock;
@@ -490,6 +492,11 @@ static __init void init_hrtick(void)
490 */ 492 */
491void hrtick_start(struct rq *rq, u64 delay) 493void hrtick_start(struct rq *rq, u64 delay)
492{ 494{
495 /*
496 * Don't schedule slices shorter than 10000ns, that just
497 * doesn't make sense. Rely on vruntime for fairness.
498 */
499 delay = max_t(u64, delay, 10000LL);
493 __hrtimer_start_range_ns(&rq->hrtick_timer, ns_to_ktime(delay), 0, 500 __hrtimer_start_range_ns(&rq->hrtick_timer, ns_to_ktime(delay), 0,
494 HRTIMER_MODE_REL_PINNED, 0); 501 HRTIMER_MODE_REL_PINNED, 0);
495} 502}
@@ -1046,7 +1053,7 @@ void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
1046 * this case, we can save a useless back to back clock update. 1053 * this case, we can save a useless back to back clock update.
1047 */ 1054 */
1048 if (task_on_rq_queued(rq->curr) && test_tsk_need_resched(rq->curr)) 1055 if (task_on_rq_queued(rq->curr) && test_tsk_need_resched(rq->curr))
1049 rq->skip_clock_update = 1; 1056 rq_clock_skip_update(rq, true);
1050} 1057}
1051 1058
1052#ifdef CONFIG_SMP 1059#ifdef CONFIG_SMP
@@ -1082,7 +1089,7 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
1082 if (p->sched_class->migrate_task_rq) 1089 if (p->sched_class->migrate_task_rq)
1083 p->sched_class->migrate_task_rq(p, new_cpu); 1090 p->sched_class->migrate_task_rq(p, new_cpu);
1084 p->se.nr_migrations++; 1091 p->se.nr_migrations++;
1085 perf_sw_event(PERF_COUNT_SW_CPU_MIGRATIONS, 1, NULL, 0); 1092 perf_sw_event_sched(PERF_COUNT_SW_CPU_MIGRATIONS, 1, 0);
1086 } 1093 }
1087 1094
1088 __set_task_cpu(p, new_cpu); 1095 __set_task_cpu(p, new_cpu);
@@ -1814,6 +1821,10 @@ void __dl_clear_params(struct task_struct *p)
1814 dl_se->dl_period = 0; 1821 dl_se->dl_period = 0;
1815 dl_se->flags = 0; 1822 dl_se->flags = 0;
1816 dl_se->dl_bw = 0; 1823 dl_se->dl_bw = 0;
1824
1825 dl_se->dl_throttled = 0;
1826 dl_se->dl_new = 1;
1827 dl_se->dl_yielded = 0;
1817} 1828}
1818 1829
1819/* 1830/*
@@ -1832,6 +1843,9 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
1832 p->se.prev_sum_exec_runtime = 0; 1843 p->se.prev_sum_exec_runtime = 0;
1833 p->se.nr_migrations = 0; 1844 p->se.nr_migrations = 0;
1834 p->se.vruntime = 0; 1845 p->se.vruntime = 0;
1846#ifdef CONFIG_SMP
1847 p->se.avg.decay_count = 0;
1848#endif
1835 INIT_LIST_HEAD(&p->se.group_node); 1849 INIT_LIST_HEAD(&p->se.group_node);
1836 1850
1837#ifdef CONFIG_SCHEDSTATS 1851#ifdef CONFIG_SCHEDSTATS
@@ -1839,7 +1853,7 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
1839#endif 1853#endif
1840 1854
1841 RB_CLEAR_NODE(&p->dl.rb_node); 1855 RB_CLEAR_NODE(&p->dl.rb_node);
1842 hrtimer_init(&p->dl.dl_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); 1856 init_dl_task_timer(&p->dl);
1843 __dl_clear_params(p); 1857 __dl_clear_params(p);
1844 1858
1845 INIT_LIST_HEAD(&p->rt.run_list); 1859 INIT_LIST_HEAD(&p->rt.run_list);
@@ -2049,6 +2063,9 @@ static inline int dl_bw_cpus(int i)
2049 * allocated bandwidth to reflect the new situation. 2063 * allocated bandwidth to reflect the new situation.
2050 * 2064 *
2051 * This function is called while holding p's rq->lock. 2065 * This function is called while holding p's rq->lock.
2066 *
2067 * XXX we should delay bw change until the task's 0-lag point, see
2068 * __setparam_dl().
2052 */ 2069 */
2053static int dl_overflow(struct task_struct *p, int policy, 2070static int dl_overflow(struct task_struct *p, int policy,
2054 const struct sched_attr *attr) 2071 const struct sched_attr *attr)
@@ -2748,6 +2765,10 @@ again:
2748 * - explicit schedule() call 2765 * - explicit schedule() call
2749 * - return from syscall or exception to user-space 2766 * - return from syscall or exception to user-space
2750 * - return from interrupt-handler to user-space 2767 * - return from interrupt-handler to user-space
2768 *
2769 * WARNING: all callers must re-check need_resched() afterward and reschedule
2770 * accordingly in case an event triggered the need for rescheduling (such as
2771 * an interrupt waking up a task) while preemption was disabled in __schedule().
2751 */ 2772 */
2752static void __sched __schedule(void) 2773static void __sched __schedule(void)
2753{ 2774{
@@ -2756,7 +2777,6 @@ static void __sched __schedule(void)
2756 struct rq *rq; 2777 struct rq *rq;
2757 int cpu; 2778 int cpu;
2758 2779
2759need_resched:
2760 preempt_disable(); 2780 preempt_disable();
2761 cpu = smp_processor_id(); 2781 cpu = smp_processor_id();
2762 rq = cpu_rq(cpu); 2782 rq = cpu_rq(cpu);
@@ -2776,6 +2796,8 @@ need_resched:
2776 smp_mb__before_spinlock(); 2796 smp_mb__before_spinlock();
2777 raw_spin_lock_irq(&rq->lock); 2797 raw_spin_lock_irq(&rq->lock);
2778 2798
2799 rq->clock_skip_update <<= 1; /* promote REQ to ACT */
2800
2779 switch_count = &prev->nivcsw; 2801 switch_count = &prev->nivcsw;
2780 if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) { 2802 if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) {
2781 if (unlikely(signal_pending_state(prev->state, prev))) { 2803 if (unlikely(signal_pending_state(prev->state, prev))) {
@@ -2800,13 +2822,13 @@ need_resched:
2800 switch_count = &prev->nvcsw; 2822 switch_count = &prev->nvcsw;
2801 } 2823 }
2802 2824
2803 if (task_on_rq_queued(prev) || rq->skip_clock_update < 0) 2825 if (task_on_rq_queued(prev))
2804 update_rq_clock(rq); 2826 update_rq_clock(rq);
2805 2827
2806 next = pick_next_task(rq, prev); 2828 next = pick_next_task(rq, prev);
2807 clear_tsk_need_resched(prev); 2829 clear_tsk_need_resched(prev);
2808 clear_preempt_need_resched(); 2830 clear_preempt_need_resched();
2809 rq->skip_clock_update = 0; 2831 rq->clock_skip_update = 0;
2810 2832
2811 if (likely(prev != next)) { 2833 if (likely(prev != next)) {
2812 rq->nr_switches++; 2834 rq->nr_switches++;
@@ -2821,8 +2843,6 @@ need_resched:
2821 post_schedule(rq); 2843 post_schedule(rq);
2822 2844
2823 sched_preempt_enable_no_resched(); 2845 sched_preempt_enable_no_resched();
2824 if (need_resched())
2825 goto need_resched;
2826} 2846}
2827 2847
2828static inline void sched_submit_work(struct task_struct *tsk) 2848static inline void sched_submit_work(struct task_struct *tsk)
@@ -2842,7 +2862,9 @@ asmlinkage __visible void __sched schedule(void)
2842 struct task_struct *tsk = current; 2862 struct task_struct *tsk = current;
2843 2863
2844 sched_submit_work(tsk); 2864 sched_submit_work(tsk);
2845 __schedule(); 2865 do {
2866 __schedule();
2867 } while (need_resched());
2846} 2868}
2847EXPORT_SYMBOL(schedule); 2869EXPORT_SYMBOL(schedule);
2848 2870
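The WARNING comment added above is why schedule() now loops: every path that ran __schedule() must re-check need_resched() before returning, since an interrupt may have requested a reschedule while preemption was disabled. A small userspace analogue of that re-check-after-each-pass pattern (illustrative only, with a C11 atomic standing in for the resched flag):

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

static atomic_bool need_resched_flag;	/* analogue of TIF_NEED_RESCHED */

static void one_schedule_pass(void)
{
        /* One pass of "__schedule()": consume the current request. */
        atomic_store(&need_resched_flag, false);
        puts("switched context");
        /* An interrupt could set the flag again right here... */
}

static void my_schedule(void)
{
        do {
                one_schedule_pass();
                /* ...so re-check before returning, as schedule() and
                 * preempt_schedule_common() now do. */
        } while (atomic_load(&need_resched_flag));
}

int main(void)
{
        atomic_store(&need_resched_flag, true);
        my_schedule();
        return 0;
}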
@@ -2877,6 +2899,21 @@ void __sched schedule_preempt_disabled(void)
2877 preempt_disable(); 2899 preempt_disable();
2878} 2900}
2879 2901
2902static void preempt_schedule_common(void)
2903{
2904 do {
2905 __preempt_count_add(PREEMPT_ACTIVE);
2906 __schedule();
2907 __preempt_count_sub(PREEMPT_ACTIVE);
2908
2909 /*
2910 * Check again in case we missed a preemption opportunity
2911 * between schedule and now.
2912 */
2913 barrier();
2914 } while (need_resched());
2915}
2916
2880#ifdef CONFIG_PREEMPT 2917#ifdef CONFIG_PREEMPT
2881/* 2918/*
2882 * this is the entry point to schedule() from in-kernel preemption 2919 * this is the entry point to schedule() from in-kernel preemption
@@ -2892,17 +2929,7 @@ asmlinkage __visible void __sched notrace preempt_schedule(void)
2892 if (likely(!preemptible())) 2929 if (likely(!preemptible()))
2893 return; 2930 return;
2894 2931
2895 do { 2932 preempt_schedule_common();
2896 __preempt_count_add(PREEMPT_ACTIVE);
2897 __schedule();
2898 __preempt_count_sub(PREEMPT_ACTIVE);
2899
2900 /*
2901 * Check again in case we missed a preemption opportunity
2902 * between schedule and now.
2903 */
2904 barrier();
2905 } while (need_resched());
2906} 2933}
2907NOKPROBE_SYMBOL(preempt_schedule); 2934NOKPROBE_SYMBOL(preempt_schedule);
2908EXPORT_SYMBOL(preempt_schedule); 2935EXPORT_SYMBOL(preempt_schedule);
@@ -3251,15 +3278,31 @@ __setparam_dl(struct task_struct *p, const struct sched_attr *attr)
3251{ 3278{
3252 struct sched_dl_entity *dl_se = &p->dl; 3279 struct sched_dl_entity *dl_se = &p->dl;
3253 3280
3254 init_dl_task_timer(dl_se);
3255 dl_se->dl_runtime = attr->sched_runtime; 3281 dl_se->dl_runtime = attr->sched_runtime;
3256 dl_se->dl_deadline = attr->sched_deadline; 3282 dl_se->dl_deadline = attr->sched_deadline;
3257 dl_se->dl_period = attr->sched_period ?: dl_se->dl_deadline; 3283 dl_se->dl_period = attr->sched_period ?: dl_se->dl_deadline;
3258 dl_se->flags = attr->sched_flags; 3284 dl_se->flags = attr->sched_flags;
3259 dl_se->dl_bw = to_ratio(dl_se->dl_period, dl_se->dl_runtime); 3285 dl_se->dl_bw = to_ratio(dl_se->dl_period, dl_se->dl_runtime);
3260 dl_se->dl_throttled = 0; 3286
3261 dl_se->dl_new = 1; 3287 /*
3262 dl_se->dl_yielded = 0; 3288 * Changing the parameters of a task is 'tricky' and we're not doing
3289 * the correct thing -- also see task_dead_dl() and switched_from_dl().
3290 *
3291 * What we SHOULD do is delay the bandwidth release until the 0-lag
3292 * point. This would include retaining the task_struct until that time
3293 * and change dl_overflow() to not immediately decrement the current
3294 * amount.
3295 *
3296 * Instead we retain the current runtime/deadline and let the new
3297 * parameters take effect after the current reservation period lapses.
3298 * This is safe (albeit pessimistic) because the 0-lag point is always
3299 * before the current scheduling deadline.
3300 *
3301 * We can still have temporary overloads because we do not delay the
3302 * change in bandwidth until that time; so admission control is
3303 * not on the safe side. It does however guarantee tasks will never
3304 * consume more than promised.
3305 */
3263} 3306}
3264 3307
3265/* 3308/*
@@ -3382,6 +3425,20 @@ static bool check_same_owner(struct task_struct *p)
3382 return match; 3425 return match;
3383} 3426}
3384 3427
3428static bool dl_param_changed(struct task_struct *p,
3429 const struct sched_attr *attr)
3430{
3431 struct sched_dl_entity *dl_se = &p->dl;
3432
3433 if (dl_se->dl_runtime != attr->sched_runtime ||
3434 dl_se->dl_deadline != attr->sched_deadline ||
3435 dl_se->dl_period != attr->sched_period ||
3436 dl_se->flags != attr->sched_flags)
3437 return true;
3438
3439 return false;
3440}
3441
3385static int __sched_setscheduler(struct task_struct *p, 3442static int __sched_setscheduler(struct task_struct *p,
3386 const struct sched_attr *attr, 3443 const struct sched_attr *attr,
3387 bool user) 3444 bool user)
@@ -3510,7 +3567,7 @@ recheck:
3510 goto change; 3567 goto change;
3511 if (rt_policy(policy) && attr->sched_priority != p->rt_priority) 3568 if (rt_policy(policy) && attr->sched_priority != p->rt_priority)
3512 goto change; 3569 goto change;
3513 if (dl_policy(policy)) 3570 if (dl_policy(policy) && dl_param_changed(p, attr))
3514 goto change; 3571 goto change;
3515 3572
3516 p->sched_reset_on_fork = reset_on_fork; 3573 p->sched_reset_on_fork = reset_on_fork;
@@ -4202,17 +4259,10 @@ SYSCALL_DEFINE0(sched_yield)
4202 return 0; 4259 return 0;
4203} 4260}
4204 4261
4205static void __cond_resched(void)
4206{
4207 __preempt_count_add(PREEMPT_ACTIVE);
4208 __schedule();
4209 __preempt_count_sub(PREEMPT_ACTIVE);
4210}
4211
4212int __sched _cond_resched(void) 4262int __sched _cond_resched(void)
4213{ 4263{
4214 if (should_resched()) { 4264 if (should_resched()) {
4215 __cond_resched(); 4265 preempt_schedule_common();
4216 return 1; 4266 return 1;
4217 } 4267 }
4218 return 0; 4268 return 0;
@@ -4237,7 +4287,7 @@ int __cond_resched_lock(spinlock_t *lock)
4237 if (spin_needbreak(lock) || resched) { 4287 if (spin_needbreak(lock) || resched) {
4238 spin_unlock(lock); 4288 spin_unlock(lock);
4239 if (resched) 4289 if (resched)
4240 __cond_resched(); 4290 preempt_schedule_common();
4241 else 4291 else
4242 cpu_relax(); 4292 cpu_relax();
4243 ret = 1; 4293 ret = 1;
@@ -4253,7 +4303,7 @@ int __sched __cond_resched_softirq(void)
4253 4303
4254 if (should_resched()) { 4304 if (should_resched()) {
4255 local_bh_enable(); 4305 local_bh_enable();
4256 __cond_resched(); 4306 preempt_schedule_common();
4257 local_bh_disable(); 4307 local_bh_disable();
4258 return 1; 4308 return 1;
4259 } 4309 }
@@ -4508,9 +4558,10 @@ void sched_show_task(struct task_struct *p)
4508{ 4558{
4509 unsigned long free = 0; 4559 unsigned long free = 0;
4510 int ppid; 4560 int ppid;
4511 unsigned state; 4561 unsigned long state = p->state;
4512 4562
4513 state = p->state ? __ffs(p->state) + 1 : 0; 4563 if (state)
4564 state = __ffs(state) + 1;
4514 printk(KERN_INFO "%-15.15s %c", p->comm, 4565 printk(KERN_INFO "%-15.15s %c", p->comm,
4515 state < sizeof(stat_nam) - 1 ? stat_nam[state] : '?'); 4566 state < sizeof(stat_nam) - 1 ? stat_nam[state] : '?');
4516#if BITS_PER_LONG == 32 4567#if BITS_PER_LONG == 32
@@ -4642,6 +4693,9 @@ int cpuset_cpumask_can_shrink(const struct cpumask *cur,
4642 struct dl_bw *cur_dl_b; 4693 struct dl_bw *cur_dl_b;
4643 unsigned long flags; 4694 unsigned long flags;
4644 4695
4696 if (!cpumask_weight(cur))
4697 return ret;
4698
4645 rcu_read_lock_sched(); 4699 rcu_read_lock_sched();
4646 cur_dl_b = dl_bw_of(cpumask_any(cur)); 4700 cur_dl_b = dl_bw_of(cpumask_any(cur));
4647 trial_cpus = cpumask_weight(trial); 4701 trial_cpus = cpumask_weight(trial);
@@ -4740,7 +4794,7 @@ static struct rq *move_queued_task(struct task_struct *p, int new_cpu)
4740 4794
4741void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask) 4795void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
4742{ 4796{
4743 if (p->sched_class && p->sched_class->set_cpus_allowed) 4797 if (p->sched_class->set_cpus_allowed)
4744 p->sched_class->set_cpus_allowed(p, new_mask); 4798 p->sched_class->set_cpus_allowed(p, new_mask);
4745 4799
4746 cpumask_copy(&p->cpus_allowed, new_mask); 4800 cpumask_copy(&p->cpus_allowed, new_mask);
@@ -5408,9 +5462,7 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
5408 struct cpumask *groupmask) 5462 struct cpumask *groupmask)
5409{ 5463{
5410 struct sched_group *group = sd->groups; 5464 struct sched_group *group = sd->groups;
5411 char str[256];
5412 5465
5413 cpulist_scnprintf(str, sizeof(str), sched_domain_span(sd));
5414 cpumask_clear(groupmask); 5466 cpumask_clear(groupmask);
5415 5467
5416 printk(KERN_DEBUG "%*s domain %d: ", level, "", level); 5468 printk(KERN_DEBUG "%*s domain %d: ", level, "", level);
@@ -5423,7 +5475,8 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
5423 return -1; 5475 return -1;
5424 } 5476 }
5425 5477
5426 printk(KERN_CONT "span %s level %s\n", str, sd->name); 5478 printk(KERN_CONT "span %*pbl level %s\n",
5479 cpumask_pr_args(sched_domain_span(sd)), sd->name);
5427 5480
5428 if (!cpumask_test_cpu(cpu, sched_domain_span(sd))) { 5481 if (!cpumask_test_cpu(cpu, sched_domain_span(sd))) {
5429 printk(KERN_ERR "ERROR: domain->span does not contain " 5482 printk(KERN_ERR "ERROR: domain->span does not contain "
@@ -5468,9 +5521,8 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
5468 5521
5469 cpumask_or(groupmask, groupmask, sched_group_cpus(group)); 5522 cpumask_or(groupmask, groupmask, sched_group_cpus(group));
5470 5523
5471 cpulist_scnprintf(str, sizeof(str), sched_group_cpus(group)); 5524 printk(KERN_CONT " %*pbl",
5472 5525 cpumask_pr_args(sched_group_cpus(group)));
5473 printk(KERN_CONT " %s", str);
5474 if (group->sgc->capacity != SCHED_CAPACITY_SCALE) { 5526 if (group->sgc->capacity != SCHED_CAPACITY_SCALE) {
5475 printk(KERN_CONT " (cpu_capacity = %d)", 5527 printk(KERN_CONT " (cpu_capacity = %d)",
5476 group->sgc->capacity); 5528 group->sgc->capacity);
@@ -7113,9 +7165,6 @@ void __init sched_init(void)
7113#ifdef CONFIG_RT_GROUP_SCHED 7165#ifdef CONFIG_RT_GROUP_SCHED
7114 alloc_size += 2 * nr_cpu_ids * sizeof(void **); 7166 alloc_size += 2 * nr_cpu_ids * sizeof(void **);
7115#endif 7167#endif
7116#ifdef CONFIG_CPUMASK_OFFSTACK
7117 alloc_size += num_possible_cpus() * cpumask_size();
7118#endif
7119 if (alloc_size) { 7168 if (alloc_size) {
7120 ptr = (unsigned long)kzalloc(alloc_size, GFP_NOWAIT); 7169 ptr = (unsigned long)kzalloc(alloc_size, GFP_NOWAIT);
7121 7170
@@ -7135,13 +7184,13 @@ void __init sched_init(void)
7135 ptr += nr_cpu_ids * sizeof(void **); 7184 ptr += nr_cpu_ids * sizeof(void **);
7136 7185
7137#endif /* CONFIG_RT_GROUP_SCHED */ 7186#endif /* CONFIG_RT_GROUP_SCHED */
7187 }
7138#ifdef CONFIG_CPUMASK_OFFSTACK 7188#ifdef CONFIG_CPUMASK_OFFSTACK
7139 for_each_possible_cpu(i) { 7189 for_each_possible_cpu(i) {
7140 per_cpu(load_balance_mask, i) = (void *)ptr; 7190 per_cpu(load_balance_mask, i) = (cpumask_var_t)kzalloc_node(
7141 ptr += cpumask_size(); 7191 cpumask_size(), GFP_KERNEL, cpu_to_node(i));
7142 }
7143#endif /* CONFIG_CPUMASK_OFFSTACK */
7144 } 7192 }
7193#endif /* CONFIG_CPUMASK_OFFSTACK */
7145 7194
7146 init_rt_bandwidth(&def_rt_bandwidth, 7195 init_rt_bandwidth(&def_rt_bandwidth,
7147 global_rt_period(), global_rt_runtime()); 7196 global_rt_period(), global_rt_runtime());
@@ -7253,6 +7302,11 @@ void __init sched_init(void)
7253 enter_lazy_tlb(&init_mm, current); 7302 enter_lazy_tlb(&init_mm, current);
7254 7303
7255 /* 7304 /*
7305 * During early bootup we pretend to be a normal task:
7306 */
7307 current->sched_class = &fair_sched_class;
7308
7309 /*
7256 * Make us the idle thread. Technically, schedule() should not be 7310 * Make us the idle thread. Technically, schedule() should not be
7257 * called from this thread, however somewhere below it might be, 7311 * called from this thread, however somewhere below it might be,
7258 * but because we are the idle thread, we just pick up running again 7312 * but because we are the idle thread, we just pick up running again
@@ -7262,11 +7316,6 @@ void __init sched_init(void)
7262 7316
7263 calc_load_update = jiffies + LOAD_FREQ; 7317 calc_load_update = jiffies + LOAD_FREQ;
7264 7318
7265 /*
7266 * During early bootup we pretend to be a normal task:
7267 */
7268 current->sched_class = &fair_sched_class;
7269
7270#ifdef CONFIG_SMP 7319#ifdef CONFIG_SMP
7271 zalloc_cpumask_var(&sched_domains_tmpmask, GFP_NOWAIT); 7320 zalloc_cpumask_var(&sched_domains_tmpmask, GFP_NOWAIT);
7272 /* May be allocated at isolcpus cmdline parse time */ 7321 /* May be allocated at isolcpus cmdline parse time */
@@ -7295,13 +7344,12 @@ void __might_sleep(const char *file, int line, int preempt_offset)
7295 * since we will exit with TASK_RUNNING make sure we enter with it, 7344 * since we will exit with TASK_RUNNING make sure we enter with it,
7296 * otherwise we will destroy state. 7345 * otherwise we will destroy state.
7297 */ 7346 */
7298 if (WARN_ONCE(current->state != TASK_RUNNING, 7347 WARN_ONCE(current->state != TASK_RUNNING && current->task_state_change,
7299 "do not call blocking ops when !TASK_RUNNING; " 7348 "do not call blocking ops when !TASK_RUNNING; "
7300 "state=%lx set at [<%p>] %pS\n", 7349 "state=%lx set at [<%p>] %pS\n",
7301 current->state, 7350 current->state,
7302 (void *)current->task_state_change, 7351 (void *)current->task_state_change,
7303 (void *)current->task_state_change)) 7352 (void *)current->task_state_change);
7304 __set_current_state(TASK_RUNNING);
7305 7353
7306 ___might_sleep(file, line, preempt_offset); 7354 ___might_sleep(file, line, preempt_offset);
7307} 7355}
@@ -7328,6 +7376,9 @@ void ___might_sleep(const char *file, int line, int preempt_offset)
7328 in_atomic(), irqs_disabled(), 7376 in_atomic(), irqs_disabled(),
7329 current->pid, current->comm); 7377 current->pid, current->comm);
7330 7378
7379 if (task_stack_end_corrupted(current))
7380 printk(KERN_EMERG "Thread overran stack, or stack corrupted\n");
7381
7331 debug_show_held_locks(current); 7382 debug_show_held_locks(current);
7332 if (irqs_disabled()) 7383 if (irqs_disabled())
7333 print_irqtrace_events(current); 7384 print_irqtrace_events(current);
diff --git a/kernel/sched/cpudeadline.c b/kernel/sched/cpudeadline.c
index 539ca3ce071b..c6acb07466bb 100644
--- a/kernel/sched/cpudeadline.c
+++ b/kernel/sched/cpudeadline.c
@@ -107,7 +107,8 @@ int cpudl_find(struct cpudl *cp, struct task_struct *p,
107 int best_cpu = -1; 107 int best_cpu = -1;
108 const struct sched_dl_entity *dl_se = &p->dl; 108 const struct sched_dl_entity *dl_se = &p->dl;
109 109
110 if (later_mask && cpumask_and(later_mask, later_mask, cp->free_cpus)) { 110 if (later_mask &&
111 cpumask_and(later_mask, cp->free_cpus, &p->cpus_allowed)) {
111 best_cpu = cpumask_any(later_mask); 112 best_cpu = cpumask_any(later_mask);
112 goto out; 113 goto out;
113 } else if (cpumask_test_cpu(cpudl_maximum(cp), &p->cpus_allowed) && 114 } else if (cpumask_test_cpu(cpudl_maximum(cp), &p->cpus_allowed) &&
@@ -186,6 +187,26 @@ out:
186} 187}
187 188
188/* 189/*
190 * cpudl_set_freecpu - Set the cpudl.free_cpus
191 * @cp: the cpudl max-heap context
192 * @cpu: rd attached cpu
193 */
194void cpudl_set_freecpu(struct cpudl *cp, int cpu)
195{
196 cpumask_set_cpu(cpu, cp->free_cpus);
197}
198
199/*
200 * cpudl_clear_freecpu - Clear the cpudl.free_cpus
201 * @cp: the cpudl max-heap context
202 * @cpu: rd attached cpu
203 */
204void cpudl_clear_freecpu(struct cpudl *cp, int cpu)
205{
206 cpumask_clear_cpu(cpu, cp->free_cpus);
207}
208
209/*
189 * cpudl_init - initialize the cpudl structure 210 * cpudl_init - initialize the cpudl structure
190 * @cp: the cpudl max-heap context 211 * @cp: the cpudl max-heap context
191 */ 212 */
@@ -203,7 +224,7 @@ int cpudl_init(struct cpudl *cp)
203 if (!cp->elements) 224 if (!cp->elements)
204 return -ENOMEM; 225 return -ENOMEM;
205 226
206 if (!alloc_cpumask_var(&cp->free_cpus, GFP_KERNEL)) { 227 if (!zalloc_cpumask_var(&cp->free_cpus, GFP_KERNEL)) {
207 kfree(cp->elements); 228 kfree(cp->elements);
208 return -ENOMEM; 229 return -ENOMEM;
209 } 230 }
@@ -211,8 +232,6 @@ int cpudl_init(struct cpudl *cp)
211 for_each_possible_cpu(i) 232 for_each_possible_cpu(i)
212 cp->elements[i].idx = IDX_INVALID; 233 cp->elements[i].idx = IDX_INVALID;
213 234
214 cpumask_setall(cp->free_cpus);
215
216 return 0; 235 return 0;
217} 236}
218 237
diff --git a/kernel/sched/cpudeadline.h b/kernel/sched/cpudeadline.h
index 020039bd1326..1a0a6ef2fbe1 100644
--- a/kernel/sched/cpudeadline.h
+++ b/kernel/sched/cpudeadline.h
@@ -24,6 +24,8 @@ int cpudl_find(struct cpudl *cp, struct task_struct *p,
24 struct cpumask *later_mask); 24 struct cpumask *later_mask);
25void cpudl_set(struct cpudl *cp, int cpu, u64 dl, int is_valid); 25void cpudl_set(struct cpudl *cp, int cpu, u64 dl, int is_valid);
26int cpudl_init(struct cpudl *cp); 26int cpudl_init(struct cpudl *cp);
27void cpudl_set_freecpu(struct cpudl *cp, int cpu);
28void cpudl_clear_freecpu(struct cpudl *cp, int cpu);
27void cpudl_cleanup(struct cpudl *cp); 29void cpudl_cleanup(struct cpudl *cp);
28#endif /* CONFIG_SMP */ 30#endif /* CONFIG_SMP */
29 31
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index e5db8c6feebd..a027799ae130 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -350,6 +350,11 @@ static void replenish_dl_entity(struct sched_dl_entity *dl_se,
350 dl_se->deadline = rq_clock(rq) + pi_se->dl_deadline; 350 dl_se->deadline = rq_clock(rq) + pi_se->dl_deadline;
351 dl_se->runtime = pi_se->dl_runtime; 351 dl_se->runtime = pi_se->dl_runtime;
352 } 352 }
353
354 if (dl_se->dl_yielded)
355 dl_se->dl_yielded = 0;
356 if (dl_se->dl_throttled)
357 dl_se->dl_throttled = 0;
353} 358}
354 359
355/* 360/*
@@ -536,23 +541,19 @@ again:
536 541
537 sched_clock_tick(); 542 sched_clock_tick();
538 update_rq_clock(rq); 543 update_rq_clock(rq);
539 dl_se->dl_throttled = 0; 544 enqueue_task_dl(rq, p, ENQUEUE_REPLENISH);
540 dl_se->dl_yielded = 0; 545 if (dl_task(rq->curr))
541 if (task_on_rq_queued(p)) { 546 check_preempt_curr_dl(rq, p, 0);
542 enqueue_task_dl(rq, p, ENQUEUE_REPLENISH); 547 else
543 if (dl_task(rq->curr)) 548 resched_curr(rq);
544 check_preempt_curr_dl(rq, p, 0);
545 else
546 resched_curr(rq);
547#ifdef CONFIG_SMP 549#ifdef CONFIG_SMP
548 /* 550 /*
549 * Queueing this task back might have overloaded rq, 551 * Queueing this task back might have overloaded rq,
550 * check if we need to kick someone away. 552 * check if we need to kick someone away.
551 */ 553 */
552 if (has_pushable_dl_tasks(rq)) 554 if (has_pushable_dl_tasks(rq))
553 push_dl_task(rq); 555 push_dl_task(rq);
554#endif 556#endif
555 }
556unlock: 557unlock:
557 raw_spin_unlock(&rq->lock); 558 raw_spin_unlock(&rq->lock);
558 559
@@ -570,24 +571,7 @@ void init_dl_task_timer(struct sched_dl_entity *dl_se)
570static 571static
571int dl_runtime_exceeded(struct rq *rq, struct sched_dl_entity *dl_se) 572int dl_runtime_exceeded(struct rq *rq, struct sched_dl_entity *dl_se)
572{ 573{
573 int dmiss = dl_time_before(dl_se->deadline, rq_clock(rq)); 574 return (dl_se->runtime <= 0);
574 int rorun = dl_se->runtime <= 0;
575
576 if (!rorun && !dmiss)
577 return 0;
578
579 /*
580 * If we are beyond our current deadline and we are still
581 * executing, then we have already used some of the runtime of
582 * the next instance. Thus, if we do not account that, we are
583 * stealing bandwidth from the system at each deadline miss!
584 */
585 if (dmiss) {
586 dl_se->runtime = rorun ? dl_se->runtime : 0;
587 dl_se->runtime -= rq_clock(rq) - dl_se->deadline;
588 }
589
590 return 1;
591} 575}
592 576
593extern bool sched_rt_bandwidth_account(struct rt_rq *rt_rq); 577extern bool sched_rt_bandwidth_account(struct rt_rq *rt_rq);
@@ -630,10 +614,9 @@ static void update_curr_dl(struct rq *rq)
630 614
631 dl_se->runtime -= dl_se->dl_yielded ? 0 : delta_exec; 615 dl_se->runtime -= dl_se->dl_yielded ? 0 : delta_exec;
632 if (dl_runtime_exceeded(rq, dl_se)) { 616 if (dl_runtime_exceeded(rq, dl_se)) {
617 dl_se->dl_throttled = 1;
633 __dequeue_task_dl(rq, curr, 0); 618 __dequeue_task_dl(rq, curr, 0);
634 if (likely(start_dl_timer(dl_se, curr->dl.dl_boosted))) 619 if (unlikely(!start_dl_timer(dl_se, curr->dl.dl_boosted)))
635 dl_se->dl_throttled = 1;
636 else
637 enqueue_task_dl(rq, curr, ENQUEUE_REPLENISH); 620 enqueue_task_dl(rq, curr, ENQUEUE_REPLENISH);
638 621
639 if (!is_leftmost(curr, &rq->dl)) 622 if (!is_leftmost(curr, &rq->dl))
@@ -826,10 +809,10 @@ enqueue_dl_entity(struct sched_dl_entity *dl_se,
826 * parameters of the task might need updating. Otherwise, 809 * parameters of the task might need updating. Otherwise,
827 * we want a replenishment of its runtime. 810 * we want a replenishment of its runtime.
828 */ 811 */
829 if (!dl_se->dl_new && flags & ENQUEUE_REPLENISH) 812 if (dl_se->dl_new || flags & ENQUEUE_WAKEUP)
830 replenish_dl_entity(dl_se, pi_se);
831 else
832 update_dl_entity(dl_se, pi_se); 813 update_dl_entity(dl_se, pi_se);
814 else if (flags & ENQUEUE_REPLENISH)
815 replenish_dl_entity(dl_se, pi_se);
833 816
834 __enqueue_dl_entity(dl_se); 817 __enqueue_dl_entity(dl_se);
835} 818}
@@ -870,7 +853,7 @@ static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags)
870 * its rq, the bandwidth timer callback (which clearly has not 853 * its rq, the bandwidth timer callback (which clearly has not
871 * run yet) will take care of this. 854 * run yet) will take care of this.
872 */ 855 */
873 if (p->dl.dl_throttled) 856 if (p->dl.dl_throttled && !(flags & ENQUEUE_REPLENISH))
874 return; 857 return;
875 858
876 enqueue_dl_entity(&p->dl, pi_se, flags); 859 enqueue_dl_entity(&p->dl, pi_se, flags);
@@ -1090,7 +1073,13 @@ static void task_tick_dl(struct rq *rq, struct task_struct *p, int queued)
1090{ 1073{
1091 update_curr_dl(rq); 1074 update_curr_dl(rq);
1092 1075
1093 if (hrtick_enabled(rq) && queued && p->dl.runtime > 0) 1076 /*
1077 * Even when we have runtime, update_curr_dl() might have resulted in us
1078 * not being the leftmost task anymore. In that case NEED_RESCHED will
1079 * be set and schedule() will start a new hrtick for the next task.
1080 */
1081 if (hrtick_enabled(rq) && queued && p->dl.runtime > 0 &&
1082 is_leftmost(p, &rq->dl))
1094 start_hrtick_dl(rq, p); 1083 start_hrtick_dl(rq, p);
1095} 1084}
1096 1085
@@ -1111,6 +1100,7 @@ static void task_dead_dl(struct task_struct *p)
1111 * Since we are TASK_DEAD we won't slip out of the domain! 1100 * Since we are TASK_DEAD we won't slip out of the domain!
1112 */ 1101 */
1113 raw_spin_lock_irq(&dl_b->lock); 1102 raw_spin_lock_irq(&dl_b->lock);
1103 /* XXX we should retain the bw until 0-lag */
1114 dl_b->total_bw -= p->dl.dl_bw; 1104 dl_b->total_bw -= p->dl.dl_bw;
1115 raw_spin_unlock_irq(&dl_b->lock); 1105 raw_spin_unlock_irq(&dl_b->lock);
1116 1106
@@ -1182,9 +1172,6 @@ static int find_later_rq(struct task_struct *task)
1182 * We have to consider system topology and task affinity 1172 * We have to consider system topology and task affinity
1183 * first, then we can look for a suitable cpu. 1173 * first, then we can look for a suitable cpu.
1184 */ 1174 */
1185 cpumask_copy(later_mask, task_rq(task)->rd->span);
1186 cpumask_and(later_mask, later_mask, cpu_active_mask);
1187 cpumask_and(later_mask, later_mask, &task->cpus_allowed);
1188 best_cpu = cpudl_find(&task_rq(task)->rd->cpudl, 1175 best_cpu = cpudl_find(&task_rq(task)->rd->cpudl,
1189 task, later_mask); 1176 task, later_mask);
1190 if (best_cpu == -1) 1177 if (best_cpu == -1)
@@ -1579,6 +1566,7 @@ static void rq_online_dl(struct rq *rq)
1579 if (rq->dl.overloaded) 1566 if (rq->dl.overloaded)
1580 dl_set_overload(rq); 1567 dl_set_overload(rq);
1581 1568
1569 cpudl_set_freecpu(&rq->rd->cpudl, rq->cpu);
1582 if (rq->dl.dl_nr_running > 0) 1570 if (rq->dl.dl_nr_running > 0)
1583 cpudl_set(&rq->rd->cpudl, rq->cpu, rq->dl.earliest_dl.curr, 1); 1571 cpudl_set(&rq->rd->cpudl, rq->cpu, rq->dl.earliest_dl.curr, 1);
1584} 1572}
@@ -1590,6 +1578,7 @@ static void rq_offline_dl(struct rq *rq)
1590 dl_clear_overload(rq); 1578 dl_clear_overload(rq);
1591 1579
1592 cpudl_set(&rq->rd->cpudl, rq->cpu, 0, 0); 1580 cpudl_set(&rq->rd->cpudl, rq->cpu, 0, 0);
1581 cpudl_clear_freecpu(&rq->rd->cpudl, rq->cpu);
1593} 1582}
1594 1583
1595void init_sched_dl_class(void) 1584void init_sched_dl_class(void)
@@ -1631,8 +1620,8 @@ static void cancel_dl_timer(struct rq *rq, struct task_struct *p)
1631 1620
1632static void switched_from_dl(struct rq *rq, struct task_struct *p) 1621static void switched_from_dl(struct rq *rq, struct task_struct *p)
1633{ 1622{
1623 /* XXX we should retain the bw until 0-lag */
1634 cancel_dl_timer(rq, p); 1624 cancel_dl_timer(rq, p);
1635
1636 __dl_clear_params(p); 1625 __dl_clear_params(p);
1637 1626
1638 /* 1627 /*
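For reference, the update_curr_dl() and enqueue_task_dl()/enqueue_dl_entity() hunks above combine into the following throttle path; this is only a condensed sketch of the new flow, not the complete function body:

	if (dl_runtime_exceeded(rq, dl_se)) {
		dl_se->dl_throttled = 1;	/* mark throttled before dequeueing */
		__dequeue_task_dl(rq, curr, 0);
		/*
		 * Normally the bandwidth timer unthrottles the task later; only
		 * if the timer cannot be armed do we replenish and re-queue now.
		 * enqueue_task_dl() now lets a throttled task through when
		 * ENQUEUE_REPLENISH is set, which is what makes this path work.
		 */
		if (unlikely(!start_dl_timer(dl_se, curr->dl.dl_boosted)))
			enqueue_task_dl(rq, curr, ENQUEUE_REPLENISH);
	}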
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 92cc52001e74..8baaf858d25c 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -305,6 +305,7 @@ do { \
305 PN(next_balance); 305 PN(next_balance);
306 SEQ_printf(m, " .%-30s: %ld\n", "curr->pid", (long)(task_pid_nr(rq->curr))); 306 SEQ_printf(m, " .%-30s: %ld\n", "curr->pid", (long)(task_pid_nr(rq->curr)));
307 PN(clock); 307 PN(clock);
308 PN(clock_task);
308 P(cpu_load[0]); 309 P(cpu_load[0]);
309 P(cpu_load[1]); 310 P(cpu_load[1]);
310 P(cpu_load[2]); 311 P(cpu_load[2]);
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index df2cdf77f899..7ce18f3c097a 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -676,7 +676,6 @@ void init_task_runnable_average(struct task_struct *p)
676{ 676{
677 u32 slice; 677 u32 slice;
678 678
679 p->se.avg.decay_count = 0;
680 slice = sched_slice(task_cfs_rq(p), &p->se) >> 10; 679 slice = sched_slice(task_cfs_rq(p), &p->se) >> 10;
681 p->se.avg.runnable_avg_sum = slice; 680 p->se.avg.runnable_avg_sum = slice;
682 p->se.avg.runnable_avg_period = slice; 681 p->se.avg.runnable_avg_period = slice;
@@ -1730,7 +1729,7 @@ static int preferred_group_nid(struct task_struct *p, int nid)
1730 nodes = node_online_map; 1729 nodes = node_online_map;
1731 for (dist = sched_max_numa_distance; dist > LOCAL_DISTANCE; dist--) { 1730 for (dist = sched_max_numa_distance; dist > LOCAL_DISTANCE; dist--) {
1732 unsigned long max_faults = 0; 1731 unsigned long max_faults = 0;
1733 nodemask_t max_group; 1732 nodemask_t max_group = NODE_MASK_NONE;
1734 int a, b; 1733 int a, b;
1735 1734
1736 /* Are there nodes at this distance from each other? */ 1735 /* Are there nodes at this distance from each other? */
@@ -2574,11 +2573,11 @@ static inline u64 __synchronize_entity_decay(struct sched_entity *se)
2574 u64 decays = atomic64_read(&cfs_rq->decay_counter); 2573 u64 decays = atomic64_read(&cfs_rq->decay_counter);
2575 2574
2576 decays -= se->avg.decay_count; 2575 decays -= se->avg.decay_count;
2576 se->avg.decay_count = 0;
2577 if (!decays) 2577 if (!decays)
2578 return 0; 2578 return 0;
2579 2579
2580 se->avg.load_avg_contrib = decay_load(se->avg.load_avg_contrib, decays); 2580 se->avg.load_avg_contrib = decay_load(se->avg.load_avg_contrib, decays);
2581 se->avg.decay_count = 0;
2582 2581
2583 return decays; 2582 return decays;
2584} 2583}
@@ -4005,6 +4004,10 @@ void __start_cfs_bandwidth(struct cfs_bandwidth *cfs_b, bool force)
4005 4004
4006static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b) 4005static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
4007{ 4006{
4007 /* init_cfs_bandwidth() was not called */
4008 if (!cfs_b->throttled_cfs_rq.next)
4009 return;
4010
4008 hrtimer_cancel(&cfs_b->period_timer); 4011 hrtimer_cancel(&cfs_b->period_timer);
4009 hrtimer_cancel(&cfs_b->slack_timer); 4012 hrtimer_cancel(&cfs_b->slack_timer);
4010} 4013}
@@ -4424,7 +4427,7 @@ static long effective_load(struct task_group *tg, int cpu, long wl, long wg)
4424 * wl = S * s'_i; see (2) 4427 * wl = S * s'_i; see (2)
4425 */ 4428 */
4426 if (W > 0 && w < W) 4429 if (W > 0 && w < W)
4427 wl = (w * tg->shares) / W; 4430 wl = (w * (long)tg->shares) / W;
4428 else 4431 else
4429 wl = tg->shares; 4432 wl = tg->shares;
4430 4433
@@ -5153,7 +5156,7 @@ static void yield_task_fair(struct rq *rq)
5153 * so we don't do microscopic update in schedule() 5156 * so we don't do microscopic update in schedule()
5154 * and double the fastpath cost. 5157 * and double the fastpath cost.
5155 */ 5158 */
5156 rq->skip_clock_update = 1; 5159 rq_clock_skip_update(rq, true);
5157 } 5160 }
5158 5161
5159 set_skip_buddy(se); 5162 set_skip_buddy(se);
@@ -5945,8 +5948,8 @@ static unsigned long scale_rt_capacity(int cpu)
5945 */ 5948 */
5946 age_stamp = ACCESS_ONCE(rq->age_stamp); 5949 age_stamp = ACCESS_ONCE(rq->age_stamp);
5947 avg = ACCESS_ONCE(rq->rt_avg); 5950 avg = ACCESS_ONCE(rq->rt_avg);
5951 delta = __rq_clock_broken(rq) - age_stamp;
5948 5952
5949 delta = rq_clock(rq) - age_stamp;
5950 if (unlikely(delta < 0)) 5953 if (unlikely(delta < 0))
5951 delta = 0; 5954 delta = 0;
5952 5955
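The (long) cast added in effective_load() matters because tg->shares is unsigned long while the weight delta w can be negative; without the cast the multiplication is done in unsigned arithmetic. A standalone userspace demonstration with made-up numbers (not the kernel's values):

#include <stdio.h>

int main(void)
{
	long w = -512;			/* a negative weight delta */
	unsigned long shares = 1024;	/* tg->shares is unsigned long */
	long W = 100;			/* some positive total weight */

	long wrong = (w * shares) / W;		/* w is promoted to unsigned long */
	long right = (w * (long)shares) / W;	/* signed arithmetic, as in the fix */

	printf("without cast: %ld\n", wrong);	/* huge positive garbage */
	printf("with cast:    %ld\n", right);	/* -5242, as intended */
	return 0;
}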
diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c
index c47fce75e666..94b2d7b88a27 100644
--- a/kernel/sched/idle.c
+++ b/kernel/sched/idle.c
@@ -7,6 +7,7 @@
7#include <linux/tick.h> 7#include <linux/tick.h>
8#include <linux/mm.h> 8#include <linux/mm.h>
9#include <linux/stackprotector.h> 9#include <linux/stackprotector.h>
10#include <linux/suspend.h>
10 11
11#include <asm/tlb.h> 12#include <asm/tlb.h>
12 13
@@ -47,7 +48,8 @@ static inline int cpu_idle_poll(void)
47 rcu_idle_enter(); 48 rcu_idle_enter();
48 trace_cpu_idle_rcuidle(0, smp_processor_id()); 49 trace_cpu_idle_rcuidle(0, smp_processor_id());
49 local_irq_enable(); 50 local_irq_enable();
50 while (!tif_need_resched()) 51 while (!tif_need_resched() &&
52 (cpu_idle_force_poll || tick_check_broadcast_expired()))
51 cpu_relax(); 53 cpu_relax();
52 trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, smp_processor_id()); 54 trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, smp_processor_id());
53 rcu_idle_exit(); 55 rcu_idle_exit();
@@ -104,6 +106,21 @@ static void cpuidle_idle_call(void)
104 rcu_idle_enter(); 106 rcu_idle_enter();
105 107
106 /* 108 /*
109 * Suspend-to-idle ("freeze") is a system state in which all user space
110 * has been frozen, all I/O devices have been suspended and the only
 111 * activity happens here and in interrupts (if any). In that case bypass
 112 * the cpuidle governor and go straight for the deepest idle state
113 * available. Possibly also suspend the local tick and the entire
114 * timekeeping to prevent timer interrupts from kicking us out of idle
115 * until a proper wakeup interrupt happens.
116 */
117 if (idle_should_freeze()) {
118 cpuidle_enter_freeze();
119 local_irq_enable();
120 goto exit_idle;
121 }
122
123 /*
107 * Ask the cpuidle framework to choose a convenient idle state. 124 * Ask the cpuidle framework to choose a convenient idle state.
108 * Fall back to the default arch idle method on errors. 125 * Fall back to the default arch idle method on errors.
109 */ 126 */
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index ee15f5a0d1c1..f4d4b077eba0 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -831,11 +831,14 @@ static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun)
831 enqueue = 1; 831 enqueue = 1;
832 832
833 /* 833 /*
834 * Force a clock update if the CPU was idle, 834 * When we're idle and a woken (rt) task is
835 * lest wakeup -> unthrottle time accumulate. 835 * throttled check_preempt_curr() will set
836 * skip_update and the time between the wakeup
837 * and this unthrottle will get accounted as
838 * 'runtime'.
836 */ 839 */
837 if (rt_rq->rt_nr_running && rq->curr == rq->idle) 840 if (rt_rq->rt_nr_running && rq->curr == rq->idle)
838 rq->skip_clock_update = -1; 841 rq_clock_skip_update(rq, false);
839 } 842 }
840 if (rt_rq->rt_time || rt_rq->rt_nr_running) 843 if (rt_rq->rt_time || rt_rq->rt_nr_running)
841 idle = 0; 844 idle = 0;
@@ -1337,7 +1340,12 @@ select_task_rq_rt(struct task_struct *p, int cpu, int sd_flag, int flags)
1337 curr->prio <= p->prio)) { 1340 curr->prio <= p->prio)) {
1338 int target = find_lowest_rq(p); 1341 int target = find_lowest_rq(p);
1339 1342
1340 if (target != -1) 1343 /*
1344 * Don't bother moving it if the destination CPU is
1345 * not running a lower priority task.
1346 */
1347 if (target != -1 &&
1348 p->prio < cpu_rq(target)->rt.highest_prio.curr)
1341 cpu = target; 1349 cpu = target;
1342 } 1350 }
1343 rcu_read_unlock(); 1351 rcu_read_unlock();
@@ -1614,6 +1622,16 @@ static struct rq *find_lock_lowest_rq(struct task_struct *task, struct rq *rq)
1614 1622
1615 lowest_rq = cpu_rq(cpu); 1623 lowest_rq = cpu_rq(cpu);
1616 1624
1625 if (lowest_rq->rt.highest_prio.curr <= task->prio) {
1626 /*
1627 * Target rq has tasks of equal or higher priority,
1628 * retrying does not release any lock and is unlikely
1629 * to yield a different result.
1630 */
1631 lowest_rq = NULL;
1632 break;
1633 }
1634
1617 /* if the prio of this runqueue changed, try again */ 1635 /* if the prio of this runqueue changed, try again */
1618 if (double_lock_balance(rq, lowest_rq)) { 1636 if (double_lock_balance(rq, lowest_rq)) {
1619 /* 1637 /*
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 9a2a45c970e7..0870db23d79c 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -558,8 +558,6 @@ struct rq {
558#ifdef CONFIG_NO_HZ_FULL 558#ifdef CONFIG_NO_HZ_FULL
559 unsigned long last_sched_tick; 559 unsigned long last_sched_tick;
560#endif 560#endif
561 int skip_clock_update;
562
563 /* capture load from *all* tasks on this cpu: */ 561 /* capture load from *all* tasks on this cpu: */
564 struct load_weight load; 562 struct load_weight load;
565 unsigned long nr_load_updates; 563 unsigned long nr_load_updates;
@@ -588,6 +586,7 @@ struct rq {
588 unsigned long next_balance; 586 unsigned long next_balance;
589 struct mm_struct *prev_mm; 587 struct mm_struct *prev_mm;
590 588
589 unsigned int clock_skip_update;
591 u64 clock; 590 u64 clock;
592 u64 clock_task; 591 u64 clock_task;
593 592
@@ -687,16 +686,35 @@ DECLARE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
687#define cpu_curr(cpu) (cpu_rq(cpu)->curr) 686#define cpu_curr(cpu) (cpu_rq(cpu)->curr)
688#define raw_rq() raw_cpu_ptr(&runqueues) 687#define raw_rq() raw_cpu_ptr(&runqueues)
689 688
689static inline u64 __rq_clock_broken(struct rq *rq)
690{
691 return ACCESS_ONCE(rq->clock);
692}
693
690static inline u64 rq_clock(struct rq *rq) 694static inline u64 rq_clock(struct rq *rq)
691{ 695{
696 lockdep_assert_held(&rq->lock);
692 return rq->clock; 697 return rq->clock;
693} 698}
694 699
695static inline u64 rq_clock_task(struct rq *rq) 700static inline u64 rq_clock_task(struct rq *rq)
696{ 701{
702 lockdep_assert_held(&rq->lock);
697 return rq->clock_task; 703 return rq->clock_task;
698} 704}
699 705
706#define RQCF_REQ_SKIP 0x01
707#define RQCF_ACT_SKIP 0x02
708
709static inline void rq_clock_skip_update(struct rq *rq, bool skip)
710{
711 lockdep_assert_held(&rq->lock);
712 if (skip)
713 rq->clock_skip_update |= RQCF_REQ_SKIP;
714 else
715 rq->clock_skip_update &= ~RQCF_REQ_SKIP;
716}
717
700#ifdef CONFIG_NUMA 718#ifdef CONFIG_NUMA
701enum numa_topology_type { 719enum numa_topology_type {
702 NUMA_DIRECT, 720 NUMA_DIRECT,
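The new helper replaces direct writes to the removed skip_clock_update field; a minimal sketch of a caller in the style of the yield_task_fair() hunk earlier. example_fast_path() is hypothetical and assumes the kernel/sched/sched.h context above:

static void example_fast_path(struct rq *rq)
{
	lockdep_assert_held(&rq->lock);

	/* request that the next update_rq_clock() be skipped ... */
	rq_clock_skip_update(rq, true);

	/* ... or withdraw the request again if the fast path is abandoned */
	rq_clock_skip_update(rq, false);
}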
diff --git a/kernel/sched/stats.c b/kernel/sched/stats.c
index a476bea17fbc..87e2c9f0c33e 100644
--- a/kernel/sched/stats.c
+++ b/kernel/sched/stats.c
@@ -15,11 +15,6 @@
15static int show_schedstat(struct seq_file *seq, void *v) 15static int show_schedstat(struct seq_file *seq, void *v)
16{ 16{
17 int cpu; 17 int cpu;
18 int mask_len = DIV_ROUND_UP(NR_CPUS, 32) * 9;
19 char *mask_str = kmalloc(mask_len, GFP_KERNEL);
20
21 if (mask_str == NULL)
22 return -ENOMEM;
23 18
24 if (v == (void *)1) { 19 if (v == (void *)1) {
25 seq_printf(seq, "version %d\n", SCHEDSTAT_VERSION); 20 seq_printf(seq, "version %d\n", SCHEDSTAT_VERSION);
@@ -50,9 +45,8 @@ static int show_schedstat(struct seq_file *seq, void *v)
50 for_each_domain(cpu, sd) { 45 for_each_domain(cpu, sd) {
51 enum cpu_idle_type itype; 46 enum cpu_idle_type itype;
52 47
53 cpumask_scnprintf(mask_str, mask_len, 48 seq_printf(seq, "domain%d %*pb", dcount++,
54 sched_domain_span(sd)); 49 cpumask_pr_args(sched_domain_span(sd)));
55 seq_printf(seq, "domain%d %s", dcount++, mask_str);
56 for (itype = CPU_IDLE; itype < CPU_MAX_IDLE_TYPES; 50 for (itype = CPU_IDLE; itype < CPU_MAX_IDLE_TYPES;
57 itype++) { 51 itype++) {
58 seq_printf(seq, " %u %u %u %u %u %u %u %u", 52 seq_printf(seq, " %u %u %u %u %u %u %u %u",
@@ -76,7 +70,6 @@ static int show_schedstat(struct seq_file *seq, void *v)
76 rcu_read_unlock(); 70 rcu_read_unlock();
77#endif 71#endif
78 } 72 }
79 kfree(mask_str);
80 return 0; 73 return 0;
81} 74}
82 75
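The seq_printf() change above relies on the %*pb/%*pbl printk extensions together with cpumask_pr_args(), which removes the temporary buffer and length math entirely; a minimal kernel-side sketch of the idiom:

#include <linux/cpumask.h>
#include <linux/printk.h>

static void print_mask_example(const struct cpumask *mask)
{
	/* bitmap as hex words, e.g. "ff" */
	pr_info("mask: %*pb\n", cpumask_pr_args(mask));
	/* bitmap as a CPU list, e.g. "0-7" */
	pr_info("mask: %*pbl\n", cpumask_pr_args(mask));
}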
diff --git a/kernel/seccomp.c b/kernel/seccomp.c
index 4ef9687ac115..4f44028943e6 100644
--- a/kernel/seccomp.c
+++ b/kernel/seccomp.c
@@ -629,7 +629,9 @@ static u32 __seccomp_phase1_filter(int this_syscall, struct seccomp_data *sd)
629 629
630 switch (action) { 630 switch (action) {
631 case SECCOMP_RET_ERRNO: 631 case SECCOMP_RET_ERRNO:
632 /* Set the low-order 16-bits as a errno. */ 632 /* Set low-order bits as an errno, capped at MAX_ERRNO. */
633 if (data > MAX_ERRNO)
634 data = MAX_ERRNO;
633 syscall_set_return_value(current, task_pt_regs(current), 635 syscall_set_return_value(current, task_pt_regs(current),
634 -data, 0); 636 -data, 0);
635 goto skip; 637 goto skip;
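The cap exists because the syscall ABI only treats return values in [-MAX_ERRNO, -1] (MAX_ERRNO is 4095) as errors; without it, a filter returning a larger data value would hand user space a bogus non-error result. A standalone illustration of the clamped return value:

#include <stdio.h>

#define MAX_ERRNO 4095

static long seccomp_errno_return(unsigned int data)
{
	if (data > MAX_ERRNO)		/* the newly added cap */
		data = MAX_ERRNO;
	return -(long)data;		/* what syscall_set_return_value() stores */
}

int main(void)
{
	printf("%ld\n", seccomp_errno_return(1));	/* -1    (EPERM) */
	printf("%ld\n", seccomp_errno_return(70000));	/* -4095, not -70000 */
	return 0;
}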
diff --git a/kernel/signal.c b/kernel/signal.c
index 16a305295256..a390499943e4 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -2501,7 +2501,7 @@ EXPORT_SYMBOL(unblock_all_signals);
2501 */ 2501 */
2502SYSCALL_DEFINE0(restart_syscall) 2502SYSCALL_DEFINE0(restart_syscall)
2503{ 2503{
2504 struct restart_block *restart = &current_thread_info()->restart_block; 2504 struct restart_block *restart = &current->restart_block;
2505 return restart->fn(restart); 2505 return restart->fn(restart);
2506} 2506}
2507 2507
@@ -3550,7 +3550,7 @@ SYSCALL_DEFINE2(signal, int, sig, __sighandler_t, handler)
3550SYSCALL_DEFINE0(pause) 3550SYSCALL_DEFINE0(pause)
3551{ 3551{
3552 while (!signal_pending(current)) { 3552 while (!signal_pending(current)) {
3553 current->state = TASK_INTERRUPTIBLE; 3553 __set_current_state(TASK_INTERRUPTIBLE);
3554 schedule(); 3554 schedule();
3555 } 3555 }
3556 return -ERESTARTNOHAND; 3556 return -ERESTARTNOHAND;
@@ -3563,7 +3563,7 @@ int sigsuspend(sigset_t *set)
3563 current->saved_sigmask = current->blocked; 3563 current->saved_sigmask = current->blocked;
3564 set_current_blocked(set); 3564 set_current_blocked(set);
3565 3565
3566 current->state = TASK_INTERRUPTIBLE; 3566 __set_current_state(TASK_INTERRUPTIBLE);
3567 schedule(); 3567 schedule();
3568 set_restore_sigmask(); 3568 set_restore_sigmask();
3569 return -ERESTARTNOHAND; 3569 return -ERESTARTNOHAND;
diff --git a/kernel/smpboot.c b/kernel/smpboot.c
index f032fb5284e3..40190f28db35 100644
--- a/kernel/smpboot.c
+++ b/kernel/smpboot.c
@@ -280,6 +280,7 @@ int smpboot_register_percpu_thread(struct smp_hotplug_thread *plug_thread)
280 unsigned int cpu; 280 unsigned int cpu;
281 int ret = 0; 281 int ret = 0;
282 282
283 get_online_cpus();
283 mutex_lock(&smpboot_threads_lock); 284 mutex_lock(&smpboot_threads_lock);
284 for_each_online_cpu(cpu) { 285 for_each_online_cpu(cpu) {
285 ret = __smpboot_create_thread(plug_thread, cpu); 286 ret = __smpboot_create_thread(plug_thread, cpu);
@@ -292,6 +293,7 @@ int smpboot_register_percpu_thread(struct smp_hotplug_thread *plug_thread)
292 list_add(&plug_thread->list, &hotplug_threads); 293 list_add(&plug_thread->list, &hotplug_threads);
293out: 294out:
294 mutex_unlock(&smpboot_threads_lock); 295 mutex_unlock(&smpboot_threads_lock);
296 put_online_cpus();
295 return ret; 297 return ret;
296} 298}
297EXPORT_SYMBOL_GPL(smpboot_register_percpu_thread); 299EXPORT_SYMBOL_GPL(smpboot_register_percpu_thread);
diff --git a/kernel/softirq.c b/kernel/softirq.c
index 501baa9ac1be..479e4436f787 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -114,8 +114,12 @@ void __local_bh_disable_ip(unsigned long ip, unsigned int cnt)
114 trace_softirqs_off(ip); 114 trace_softirqs_off(ip);
115 raw_local_irq_restore(flags); 115 raw_local_irq_restore(flags);
116 116
117 if (preempt_count() == cnt) 117 if (preempt_count() == cnt) {
118#ifdef CONFIG_DEBUG_PREEMPT
119 current->preempt_disable_ip = get_parent_ip(CALLER_ADDR1);
120#endif
118 trace_preempt_off(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1)); 121 trace_preempt_off(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1));
122 }
119} 123}
120EXPORT_SYMBOL(__local_bh_disable_ip); 124EXPORT_SYMBOL(__local_bh_disable_ip);
121#endif /* CONFIG_TRACE_IRQFLAGS */ 125#endif /* CONFIG_TRACE_IRQFLAGS */
@@ -656,9 +660,8 @@ static void run_ksoftirqd(unsigned int cpu)
656 * in the task stack here. 660 * in the task stack here.
657 */ 661 */
658 __do_softirq(); 662 __do_softirq();
659 rcu_note_context_switch();
660 local_irq_enable(); 663 local_irq_enable();
661 cond_resched(); 664 cond_resched_rcu_qs();
662 return; 665 return;
663 } 666 }
664 local_irq_enable(); 667 local_irq_enable();
diff --git a/kernel/sys.c b/kernel/sys.c
index a8c9f5a7dda6..ea9c88109894 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -2210,9 +2210,13 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
2210 up_write(&me->mm->mmap_sem); 2210 up_write(&me->mm->mmap_sem);
2211 break; 2211 break;
2212 case PR_MPX_ENABLE_MANAGEMENT: 2212 case PR_MPX_ENABLE_MANAGEMENT:
2213 if (arg2 || arg3 || arg4 || arg5)
2214 return -EINVAL;
2213 error = MPX_ENABLE_MANAGEMENT(me); 2215 error = MPX_ENABLE_MANAGEMENT(me);
2214 break; 2216 break;
2215 case PR_MPX_DISABLE_MANAGEMENT: 2217 case PR_MPX_DISABLE_MANAGEMENT:
2218 if (arg2 || arg3 || arg4 || arg5)
2219 return -EINVAL;
2216 error = MPX_DISABLE_MANAGEMENT(me); 2220 error = MPX_DISABLE_MANAGEMENT(me);
2217 break; 2221 break;
2218 default: 2222 default:
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 137c7f69b264..88ea2d6e0031 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -1248,7 +1248,6 @@ static struct ctl_table vm_table[] = {
1248 .maxlen = sizeof(unsigned long), 1248 .maxlen = sizeof(unsigned long),
1249 .mode = 0644, 1249 .mode = 0644,
1250 .proc_handler = hugetlb_sysctl_handler, 1250 .proc_handler = hugetlb_sysctl_handler,
1251 .extra1 = &zero,
1252 }, 1251 },
1253#ifdef CONFIG_NUMA 1252#ifdef CONFIG_NUMA
1254 { 1253 {
@@ -1257,7 +1256,6 @@ static struct ctl_table vm_table[] = {
1257 .maxlen = sizeof(unsigned long), 1256 .maxlen = sizeof(unsigned long),
1258 .mode = 0644, 1257 .mode = 0644,
1259 .proc_handler = &hugetlb_mempolicy_sysctl_handler, 1258 .proc_handler = &hugetlb_mempolicy_sysctl_handler,
1260 .extra1 = &zero,
1261 }, 1259 },
1262#endif 1260#endif
1263 { 1261 {
@@ -1280,7 +1278,6 @@ static struct ctl_table vm_table[] = {
1280 .maxlen = sizeof(unsigned long), 1278 .maxlen = sizeof(unsigned long),
1281 .mode = 0644, 1279 .mode = 0644,
1282 .proc_handler = hugetlb_overcommit_handler, 1280 .proc_handler = hugetlb_overcommit_handler,
1283 .extra1 = &zero,
1284 }, 1281 },
1285#endif 1282#endif
1286 { 1283 {
diff --git a/kernel/taskstats.c b/kernel/taskstats.c
index 670fff88a961..21f82c29c914 100644
--- a/kernel/taskstats.c
+++ b/kernel/taskstats.c
@@ -111,13 +111,8 @@ static int send_reply(struct sk_buff *skb, struct genl_info *info)
111{ 111{
112 struct genlmsghdr *genlhdr = nlmsg_data(nlmsg_hdr(skb)); 112 struct genlmsghdr *genlhdr = nlmsg_data(nlmsg_hdr(skb));
113 void *reply = genlmsg_data(genlhdr); 113 void *reply = genlmsg_data(genlhdr);
114 int rc;
115 114
116 rc = genlmsg_end(skb, reply); 115 genlmsg_end(skb, reply);
117 if (rc < 0) {
118 nlmsg_free(skb);
119 return rc;
120 }
121 116
122 return genlmsg_reply(skb, info); 117 return genlmsg_reply(skb, info);
123} 118}
@@ -134,11 +129,7 @@ static void send_cpu_listeners(struct sk_buff *skb,
134 void *reply = genlmsg_data(genlhdr); 129 void *reply = genlmsg_data(genlhdr);
135 int rc, delcount = 0; 130 int rc, delcount = 0;
136 131
137 rc = genlmsg_end(skb, reply); 132 genlmsg_end(skb, reply);
138 if (rc < 0) {
139 nlmsg_free(skb);
140 return;
141 }
142 133
143 rc = 0; 134 rc = 0;
144 down_read(&listeners->sem); 135 down_read(&listeners->sem);
diff --git a/kernel/time/Makefile b/kernel/time/Makefile
index f622cf28628a..c09c07817d7a 100644
--- a/kernel/time/Makefile
+++ b/kernel/time/Makefile
@@ -1,6 +1,6 @@
1obj-y += time.o timer.o hrtimer.o itimer.o posix-timers.o posix-cpu-timers.o 1obj-y += time.o timer.o hrtimer.o itimer.o posix-timers.o posix-cpu-timers.o
2obj-y += timekeeping.o ntp.o clocksource.o jiffies.o timer_list.o 2obj-y += timekeeping.o ntp.o clocksource.o jiffies.o timer_list.o
3obj-y += timeconv.o posix-clock.o alarmtimer.o 3obj-y += timeconv.o timecounter.o posix-clock.o alarmtimer.o
4 4
5obj-$(CONFIG_GENERIC_CLOCKEVENTS_BUILD) += clockevents.o 5obj-$(CONFIG_GENERIC_CLOCKEVENTS_BUILD) += clockevents.o
6obj-$(CONFIG_GENERIC_CLOCKEVENTS) += tick-common.o 6obj-$(CONFIG_GENERIC_CLOCKEVENTS) += tick-common.o
diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c
index a7077d3ae52f..1b001ed1edb9 100644
--- a/kernel/time/alarmtimer.c
+++ b/kernel/time/alarmtimer.c
@@ -788,7 +788,7 @@ static int alarm_timer_nsleep(const clockid_t which_clock, int flags,
788 goto out; 788 goto out;
789 } 789 }
790 790
791 restart = &current_thread_info()->restart_block; 791 restart = &current->restart_block;
792 restart->fn = alarm_timer_nsleep_restart; 792 restart->fn = alarm_timer_nsleep_restart;
793 restart->nanosleep.clockid = type; 793 restart->nanosleep.clockid = type;
794 restart->nanosleep.expires = exp.tv64; 794 restart->nanosleep.expires = exp.tv64;
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index b79f39bda7e1..4892352f0e49 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -34,82 +34,6 @@
34#include "tick-internal.h" 34#include "tick-internal.h"
35#include "timekeeping_internal.h" 35#include "timekeeping_internal.h"
36 36
37void timecounter_init(struct timecounter *tc,
38 const struct cyclecounter *cc,
39 u64 start_tstamp)
40{
41 tc->cc = cc;
42 tc->cycle_last = cc->read(cc);
43 tc->nsec = start_tstamp;
44}
45EXPORT_SYMBOL_GPL(timecounter_init);
46
47/**
48 * timecounter_read_delta - get nanoseconds since last call of this function
49 * @tc: Pointer to time counter
50 *
51 * When the underlying cycle counter runs over, this will be handled
52 * correctly as long as it does not run over more than once between
53 * calls.
54 *
55 * The first call to this function for a new time counter initializes
56 * the time tracking and returns an undefined result.
57 */
58static u64 timecounter_read_delta(struct timecounter *tc)
59{
60 cycle_t cycle_now, cycle_delta;
61 u64 ns_offset;
62
63 /* read cycle counter: */
64 cycle_now = tc->cc->read(tc->cc);
65
66 /* calculate the delta since the last timecounter_read_delta(): */
67 cycle_delta = (cycle_now - tc->cycle_last) & tc->cc->mask;
68
69 /* convert to nanoseconds: */
70 ns_offset = cyclecounter_cyc2ns(tc->cc, cycle_delta);
71
72 /* update time stamp of timecounter_read_delta() call: */
73 tc->cycle_last = cycle_now;
74
75 return ns_offset;
76}
77
78u64 timecounter_read(struct timecounter *tc)
79{
80 u64 nsec;
81
82 /* increment time by nanoseconds since last call */
83 nsec = timecounter_read_delta(tc);
84 nsec += tc->nsec;
85 tc->nsec = nsec;
86
87 return nsec;
88}
89EXPORT_SYMBOL_GPL(timecounter_read);
90
91u64 timecounter_cyc2time(struct timecounter *tc,
92 cycle_t cycle_tstamp)
93{
94 u64 cycle_delta = (cycle_tstamp - tc->cycle_last) & tc->cc->mask;
95 u64 nsec;
96
97 /*
98 * Instead of always treating cycle_tstamp as more recent
99 * than tc->cycle_last, detect when it is too far in the
100 * future and treat it as old time stamp instead.
101 */
102 if (cycle_delta > tc->cc->mask / 2) {
103 cycle_delta = (tc->cycle_last - cycle_tstamp) & tc->cc->mask;
104 nsec = tc->nsec - cyclecounter_cyc2ns(tc->cc, cycle_delta);
105 } else {
106 nsec = cyclecounter_cyc2ns(tc->cc, cycle_delta) + tc->nsec;
107 }
108
109 return nsec;
110}
111EXPORT_SYMBOL_GPL(timecounter_cyc2time);
112
113/** 37/**
114 * clocks_calc_mult_shift - calculate mult/shift factors for scaled math of clocks 38 * clocks_calc_mult_shift - calculate mult/shift factors for scaled math of clocks
115 * @mult: pointer to mult variable 39 * @mult: pointer to mult variable
diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c
index 37e50aadd471..bee0c1f78091 100644
--- a/kernel/time/hrtimer.c
+++ b/kernel/time/hrtimer.c
@@ -122,7 +122,7 @@ static void hrtimer_get_softirq_time(struct hrtimer_cpu_base *base)
122 mono = ktime_get_update_offsets_tick(&off_real, &off_boot, &off_tai); 122 mono = ktime_get_update_offsets_tick(&off_real, &off_boot, &off_tai);
123 boot = ktime_add(mono, off_boot); 123 boot = ktime_add(mono, off_boot);
124 xtim = ktime_add(mono, off_real); 124 xtim = ktime_add(mono, off_real);
125 tai = ktime_add(xtim, off_tai); 125 tai = ktime_add(mono, off_tai);
126 126
127 base->clock_base[HRTIMER_BASE_REALTIME].softirq_time = xtim; 127 base->clock_base[HRTIMER_BASE_REALTIME].softirq_time = xtim;
128 base->clock_base[HRTIMER_BASE_MONOTONIC].softirq_time = mono; 128 base->clock_base[HRTIMER_BASE_MONOTONIC].softirq_time = mono;
@@ -266,7 +266,7 @@ lock_hrtimer_base(const struct hrtimer *timer, unsigned long *flags)
266/* 266/*
267 * Divide a ktime value by a nanosecond value 267 * Divide a ktime value by a nanosecond value
268 */ 268 */
269u64 ktime_divns(const ktime_t kt, s64 div) 269u64 __ktime_divns(const ktime_t kt, s64 div)
270{ 270{
271 u64 dclc; 271 u64 dclc;
272 int sft = 0; 272 int sft = 0;
@@ -282,7 +282,7 @@ u64 ktime_divns(const ktime_t kt, s64 div)
282 282
283 return dclc; 283 return dclc;
284} 284}
285EXPORT_SYMBOL_GPL(ktime_divns); 285EXPORT_SYMBOL_GPL(__ktime_divns);
286#endif /* BITS_PER_LONG >= 64 */ 286#endif /* BITS_PER_LONG >= 64 */
287 287
288/* 288/*
@@ -440,6 +440,37 @@ static inline void debug_deactivate(struct hrtimer *timer)
440 trace_hrtimer_cancel(timer); 440 trace_hrtimer_cancel(timer);
441} 441}
442 442
443#if defined(CONFIG_NO_HZ_COMMON) || defined(CONFIG_HIGH_RES_TIMERS)
444static ktime_t __hrtimer_get_next_event(struct hrtimer_cpu_base *cpu_base)
445{
446 struct hrtimer_clock_base *base = cpu_base->clock_base;
447 ktime_t expires, expires_next = { .tv64 = KTIME_MAX };
448 int i;
449
450 for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++, base++) {
451 struct timerqueue_node *next;
452 struct hrtimer *timer;
453
454 next = timerqueue_getnext(&base->active);
455 if (!next)
456 continue;
457
458 timer = container_of(next, struct hrtimer, node);
459 expires = ktime_sub(hrtimer_get_expires(timer), base->offset);
460 if (expires.tv64 < expires_next.tv64)
461 expires_next = expires;
462 }
463 /*
464 * clock_was_set() might have changed base->offset of any of
465 * the clock bases so the result might be negative. Fix it up
466 * to prevent a false positive in clockevents_program_event().
467 */
468 if (expires_next.tv64 < 0)
469 expires_next.tv64 = 0;
470 return expires_next;
471}
472#endif
473
443/* High resolution timer related functions */ 474/* High resolution timer related functions */
444#ifdef CONFIG_HIGH_RES_TIMERS 475#ifdef CONFIG_HIGH_RES_TIMERS
445 476
@@ -488,32 +519,7 @@ static inline int hrtimer_hres_active(void)
488static void 519static void
489hrtimer_force_reprogram(struct hrtimer_cpu_base *cpu_base, int skip_equal) 520hrtimer_force_reprogram(struct hrtimer_cpu_base *cpu_base, int skip_equal)
490{ 521{
491 int i; 522 ktime_t expires_next = __hrtimer_get_next_event(cpu_base);
492 struct hrtimer_clock_base *base = cpu_base->clock_base;
493 ktime_t expires, expires_next;
494
495 expires_next.tv64 = KTIME_MAX;
496
497 for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++, base++) {
498 struct hrtimer *timer;
499 struct timerqueue_node *next;
500
501 next = timerqueue_getnext(&base->active);
502 if (!next)
503 continue;
504 timer = container_of(next, struct hrtimer, node);
505
506 expires = ktime_sub(hrtimer_get_expires(timer), base->offset);
507 /*
508 * clock_was_set() has changed base->offset so the
509 * result might be negative. Fix it up to prevent a
510 * false positive in clockevents_program_event()
511 */
512 if (expires.tv64 < 0)
513 expires.tv64 = 0;
514 if (expires.tv64 < expires_next.tv64)
515 expires_next = expires;
516 }
517 523
518 if (skip_equal && expires_next.tv64 == cpu_base->expires_next.tv64) 524 if (skip_equal && expires_next.tv64 == cpu_base->expires_next.tv64)
519 return; 525 return;
@@ -587,6 +593,15 @@ static int hrtimer_reprogram(struct hrtimer *timer,
587 return 0; 593 return 0;
588 594
589 /* 595 /*
596 * When the target cpu of the timer is currently executing
597 * hrtimer_interrupt(), then we do not touch the clock event
598 * device. hrtimer_interrupt() will reevaluate all clock bases
599 * before reprogramming the device.
600 */
601 if (cpu_base->in_hrtirq)
602 return 0;
603
604 /*
590 * If a hang was detected in the last timer interrupt then we 605 * If a hang was detected in the last timer interrupt then we
591 * do not schedule a timer which is earlier than the expiry 606 * do not schedule a timer which is earlier than the expiry
592 * which we enforced in the hang detection. We want the system 607 * which we enforced in the hang detection. We want the system
@@ -1104,29 +1119,14 @@ EXPORT_SYMBOL_GPL(hrtimer_get_remaining);
1104ktime_t hrtimer_get_next_event(void) 1119ktime_t hrtimer_get_next_event(void)
1105{ 1120{
1106 struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases); 1121 struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases);
1107 struct hrtimer_clock_base *base = cpu_base->clock_base; 1122 ktime_t mindelta = { .tv64 = KTIME_MAX };
1108 ktime_t delta, mindelta = { .tv64 = KTIME_MAX };
1109 unsigned long flags; 1123 unsigned long flags;
1110 int i;
1111 1124
1112 raw_spin_lock_irqsave(&cpu_base->lock, flags); 1125 raw_spin_lock_irqsave(&cpu_base->lock, flags);
1113 1126
1114 if (!hrtimer_hres_active()) { 1127 if (!hrtimer_hres_active())
1115 for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++, base++) { 1128 mindelta = ktime_sub(__hrtimer_get_next_event(cpu_base),
1116 struct hrtimer *timer; 1129 ktime_get());
1117 struct timerqueue_node *next;
1118
1119 next = timerqueue_getnext(&base->active);
1120 if (!next)
1121 continue;
1122
1123 timer = container_of(next, struct hrtimer, node);
1124 delta.tv64 = hrtimer_get_expires_tv64(timer);
1125 delta = ktime_sub(delta, base->get_time());
1126 if (delta.tv64 < mindelta.tv64)
1127 mindelta.tv64 = delta.tv64;
1128 }
1129 }
1130 1130
1131 raw_spin_unlock_irqrestore(&cpu_base->lock, flags); 1131 raw_spin_unlock_irqrestore(&cpu_base->lock, flags);
1132 1132
@@ -1253,7 +1253,7 @@ void hrtimer_interrupt(struct clock_event_device *dev)
1253 raw_spin_lock(&cpu_base->lock); 1253 raw_spin_lock(&cpu_base->lock);
1254 entry_time = now = hrtimer_update_base(cpu_base); 1254 entry_time = now = hrtimer_update_base(cpu_base);
1255retry: 1255retry:
1256 expires_next.tv64 = KTIME_MAX; 1256 cpu_base->in_hrtirq = 1;
1257 /* 1257 /*
1258 * We set expires_next to KTIME_MAX here with cpu_base->lock 1258 * We set expires_next to KTIME_MAX here with cpu_base->lock
1259 * held to prevent that a timer is enqueued in our queue via 1259 * held to prevent that a timer is enqueued in our queue via
@@ -1291,28 +1291,20 @@ retry:
1291 * are right-of a not yet expired timer, because that 1291 * are right-of a not yet expired timer, because that
1292 * timer will have to trigger a wakeup anyway. 1292 * timer will have to trigger a wakeup anyway.
1293 */ 1293 */
1294 1294 if (basenow.tv64 < hrtimer_get_softexpires_tv64(timer))
1295 if (basenow.tv64 < hrtimer_get_softexpires_tv64(timer)) {
1296 ktime_t expires;
1297
1298 expires = ktime_sub(hrtimer_get_expires(timer),
1299 base->offset);
1300 if (expires.tv64 < 0)
1301 expires.tv64 = KTIME_MAX;
1302 if (expires.tv64 < expires_next.tv64)
1303 expires_next = expires;
1304 break; 1295 break;
1305 }
1306 1296
1307 __run_hrtimer(timer, &basenow); 1297 __run_hrtimer(timer, &basenow);
1308 } 1298 }
1309 } 1299 }
1310 1300 /* Reevaluate the clock bases for the next expiry */
1301 expires_next = __hrtimer_get_next_event(cpu_base);
1311 /* 1302 /*
1312 * Store the new expiry value so the migration code can verify 1303 * Store the new expiry value so the migration code can verify
1313 * against it. 1304 * against it.
1314 */ 1305 */
1315 cpu_base->expires_next = expires_next; 1306 cpu_base->expires_next = expires_next;
1307 cpu_base->in_hrtirq = 0;
1316 raw_spin_unlock(&cpu_base->lock); 1308 raw_spin_unlock(&cpu_base->lock);
1317 1309
1318 /* Reprogramming necessary ? */ 1310 /* Reprogramming necessary ? */
@@ -1591,7 +1583,7 @@ long hrtimer_nanosleep(struct timespec *rqtp, struct timespec __user *rmtp,
1591 goto out; 1583 goto out;
1592 } 1584 }
1593 1585
1594 restart = &current_thread_info()->restart_block; 1586 restart = &current->restart_block;
1595 restart->fn = hrtimer_nanosleep_restart; 1587 restart->fn = hrtimer_nanosleep_restart;
1596 restart->nanosleep.clockid = t.timer.base->clockid; 1588 restart->nanosleep.clockid = t.timer.base->clockid;
1597 restart->nanosleep.rmtp = rmtp; 1589 restart->nanosleep.rmtp = rmtp;
diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c
index 87a346fd6d61..4b585e0fdd22 100644
--- a/kernel/time/ntp.c
+++ b/kernel/time/ntp.c
@@ -488,13 +488,13 @@ static void sync_cmos_clock(struct work_struct *work)
488 488
489 getnstimeofday64(&now); 489 getnstimeofday64(&now);
490 if (abs(now.tv_nsec - (NSEC_PER_SEC / 2)) <= tick_nsec * 5) { 490 if (abs(now.tv_nsec - (NSEC_PER_SEC / 2)) <= tick_nsec * 5) {
491 struct timespec adjust = timespec64_to_timespec(now); 491 struct timespec64 adjust = now;
492 492
493 fail = -ENODEV; 493 fail = -ENODEV;
494 if (persistent_clock_is_local) 494 if (persistent_clock_is_local)
495 adjust.tv_sec -= (sys_tz.tz_minuteswest * 60); 495 adjust.tv_sec -= (sys_tz.tz_minuteswest * 60);
496#ifdef CONFIG_GENERIC_CMOS_UPDATE 496#ifdef CONFIG_GENERIC_CMOS_UPDATE
497 fail = update_persistent_clock(adjust); 497 fail = update_persistent_clock(timespec64_to_timespec(adjust));
498#endif 498#endif
499#ifdef CONFIG_RTC_SYSTOHC 499#ifdef CONFIG_RTC_SYSTOHC
500 if (fail == -ENODEV) 500 if (fail == -ENODEV)
@@ -633,6 +633,13 @@ int ntp_validate_timex(struct timex *txc)
633 if ((txc->modes & ADJ_SETOFFSET) && (!capable(CAP_SYS_TIME))) 633 if ((txc->modes & ADJ_SETOFFSET) && (!capable(CAP_SYS_TIME)))
634 return -EPERM; 634 return -EPERM;
635 635
636 if (txc->modes & ADJ_FREQUENCY) {
637 if (LONG_MIN / PPM_SCALE > txc->freq)
638 return -EINVAL;
639 if (LONG_MAX / PPM_SCALE < txc->freq)
640 return -EINVAL;
641 }
642
636 return 0; 643 return 0;
637} 644}
638 645
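The new ADJ_FREQUENCY check rejects values whose later scaling by PPM_SCALE would overflow a signed long before they reach the frequency-adjustment math. The generic shape of the guard, with scale passed in rather than being the kernel's PPM_SCALE constant:

#include <limits.h>
#include <stdbool.h>

/* true if freq * scale cannot be represented in a long (assumes scale > 0) */
static bool mul_would_overflow(long freq, long scale)
{
	return freq > LONG_MAX / scale || freq < LONG_MIN / scale;
}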
diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c
index a16b67859e2a..0075da74abf0 100644
--- a/kernel/time/posix-cpu-timers.c
+++ b/kernel/time/posix-cpu-timers.c
@@ -1334,8 +1334,7 @@ static long posix_cpu_nsleep_restart(struct restart_block *restart_block);
1334static int posix_cpu_nsleep(const clockid_t which_clock, int flags, 1334static int posix_cpu_nsleep(const clockid_t which_clock, int flags,
1335 struct timespec *rqtp, struct timespec __user *rmtp) 1335 struct timespec *rqtp, struct timespec __user *rmtp)
1336{ 1336{
1337 struct restart_block *restart_block = 1337 struct restart_block *restart_block = &current->restart_block;
1338 &current_thread_info()->restart_block;
1339 struct itimerspec it; 1338 struct itimerspec it;
1340 int error; 1339 int error;
1341 1340
diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c
index 7efeedf53ebd..f7c515595b42 100644
--- a/kernel/time/tick-common.c
+++ b/kernel/time/tick-common.c
@@ -394,6 +394,56 @@ void tick_resume(void)
394 } 394 }
395} 395}
396 396
397static DEFINE_RAW_SPINLOCK(tick_freeze_lock);
398static unsigned int tick_freeze_depth;
399
400/**
401 * tick_freeze - Suspend the local tick and (possibly) timekeeping.
402 *
403 * Check if this is the last online CPU executing the function and if so,
404 * suspend timekeeping. Otherwise suspend the local tick.
405 *
406 * Call with interrupts disabled. Must be balanced with %tick_unfreeze().
407 * Interrupts must not be enabled before the subsequent %tick_unfreeze().
408 */
409void tick_freeze(void)
410{
411 raw_spin_lock(&tick_freeze_lock);
412
413 tick_freeze_depth++;
414 if (tick_freeze_depth == num_online_cpus()) {
415 timekeeping_suspend();
416 } else {
417 tick_suspend();
418 tick_suspend_broadcast();
419 }
420
421 raw_spin_unlock(&tick_freeze_lock);
422}
423
424/**
425 * tick_unfreeze - Resume the local tick and (possibly) timekeeping.
426 *
427 * Check if this is the first CPU executing the function and if so, resume
428 * timekeeping. Otherwise resume the local tick.
429 *
430 * Call with interrupts disabled. Must be balanced with %tick_freeze().
431 * Interrupts must not be enabled after the preceding %tick_freeze().
432 */
433void tick_unfreeze(void)
434{
435 raw_spin_lock(&tick_freeze_lock);
436
437 if (tick_freeze_depth == num_online_cpus())
438 timekeeping_resume();
439 else
440 tick_resume();
441
442 tick_freeze_depth--;
443
444 raw_spin_unlock(&tick_freeze_lock);
445}
446
397/** 447/**
398 * tick_init - initialize the tick control 448 * tick_init - initialize the tick control
399 */ 449 */
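These helpers are meant for the suspend-to-idle path added in the idle.c hunk earlier: with interrupts disabled on each CPU, entry into the deepest idle state is expected to be bracketed roughly as below (a sketch; s2idle_enter_sketch() and enter_deepest_idle_state() are placeholders, not the real cpuidle code):

static void s2idle_enter_sketch(void)
{
	/* interrupts are already disabled on this CPU */
	tick_freeze();			/* last CPU in also suspends timekeeping */

	enter_deepest_idle_state();	/* placeholder for the cpuidle enter hook */

	tick_unfreeze();		/* first CPU out also resumes timekeeping */
	/* the caller re-enables interrupts afterwards */
}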
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 1363d58f07e9..a4c4edac4528 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -326,13 +326,6 @@ static int tick_nohz_cpu_down_callback(struct notifier_block *nfb,
326 return NOTIFY_OK; 326 return NOTIFY_OK;
327} 327}
328 328
329/*
330 * Worst case string length in chunks of CPU range seems 2 steps
331 * separations: 0,2,4,6,...
332 * This is NR_CPUS + sizeof('\0')
333 */
334static char __initdata nohz_full_buf[NR_CPUS + 1];
335
336static int tick_nohz_init_all(void) 329static int tick_nohz_init_all(void)
337{ 330{
338 int err = -1; 331 int err = -1;
@@ -393,8 +386,8 @@ void __init tick_nohz_init(void)
393 context_tracking_cpu_set(cpu); 386 context_tracking_cpu_set(cpu);
394 387
395 cpu_notifier(tick_nohz_cpu_down_callback, 0); 388 cpu_notifier(tick_nohz_cpu_down_callback, 0);
396 cpulist_scnprintf(nohz_full_buf, sizeof(nohz_full_buf), tick_nohz_full_mask); 389 pr_info("NO_HZ: Full dynticks CPUs: %*pbl.\n",
397 pr_info("NO_HZ: Full dynticks CPUs: %s.\n", nohz_full_buf); 390 cpumask_pr_args(tick_nohz_full_mask));
398} 391}
399#endif 392#endif
400 393
diff --git a/kernel/time/time.c b/kernel/time/time.c
index 6390517e77d4..2c85b7724af4 100644
--- a/kernel/time/time.c
+++ b/kernel/time/time.c
@@ -196,6 +196,10 @@ SYSCALL_DEFINE2(settimeofday, struct timeval __user *, tv,
196 if (tv) { 196 if (tv) {
197 if (copy_from_user(&user_tv, tv, sizeof(*tv))) 197 if (copy_from_user(&user_tv, tv, sizeof(*tv)))
198 return -EFAULT; 198 return -EFAULT;
199
200 if (!timeval_valid(&user_tv))
201 return -EINVAL;
202
199 new_ts.tv_sec = user_tv.tv_sec; 203 new_ts.tv_sec = user_tv.tv_sec;
200 new_ts.tv_nsec = user_tv.tv_usec * NSEC_PER_USEC; 204 new_ts.tv_nsec = user_tv.tv_usec * NSEC_PER_USEC;
201 } 205 }
diff --git a/kernel/time/timecounter.c b/kernel/time/timecounter.c
new file mode 100644
index 000000000000..4687b3104bae
--- /dev/null
+++ b/kernel/time/timecounter.c
@@ -0,0 +1,112 @@
1/*
2 * linux/kernel/time/timecounter.c
3 *
4 * based on code that migrated away from
5 * linux/kernel/time/clocksource.c
6 *
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation; either version 2 of the License, or
10 * (at your option) any later version.
11 *
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
16 */
17
18#include <linux/export.h>
19#include <linux/timecounter.h>
20
21void timecounter_init(struct timecounter *tc,
22 const struct cyclecounter *cc,
23 u64 start_tstamp)
24{
25 tc->cc = cc;
26 tc->cycle_last = cc->read(cc);
27 tc->nsec = start_tstamp;
28 tc->mask = (1ULL << cc->shift) - 1;
29 tc->frac = 0;
30}
31EXPORT_SYMBOL_GPL(timecounter_init);
32
33/**
34 * timecounter_read_delta - get nanoseconds since last call of this function
35 * @tc: Pointer to time counter
36 *
37 * When the underlying cycle counter runs over, this will be handled
38 * correctly as long as it does not run over more than once between
39 * calls.
40 *
41 * The first call to this function for a new time counter initializes
42 * the time tracking and returns an undefined result.
43 */
44static u64 timecounter_read_delta(struct timecounter *tc)
45{
46 cycle_t cycle_now, cycle_delta;
47 u64 ns_offset;
48
49 /* read cycle counter: */
50 cycle_now = tc->cc->read(tc->cc);
51
52 /* calculate the delta since the last timecounter_read_delta(): */
53 cycle_delta = (cycle_now - tc->cycle_last) & tc->cc->mask;
54
55 /* convert to nanoseconds: */
56 ns_offset = cyclecounter_cyc2ns(tc->cc, cycle_delta,
57 tc->mask, &tc->frac);
58
59 /* update time stamp of timecounter_read_delta() call: */
60 tc->cycle_last = cycle_now;
61
62 return ns_offset;
63}
64
65u64 timecounter_read(struct timecounter *tc)
66{
67 u64 nsec;
68
69 /* increment time by nanoseconds since last call */
70 nsec = timecounter_read_delta(tc);
71 nsec += tc->nsec;
72 tc->nsec = nsec;
73
74 return nsec;
75}
76EXPORT_SYMBOL_GPL(timecounter_read);
77
78/*
79 * This is like cyclecounter_cyc2ns(), but it is used for computing a
80 * time previous to the time stored in the cycle counter.
81 */
82static u64 cc_cyc2ns_backwards(const struct cyclecounter *cc,
83 cycle_t cycles, u64 mask, u64 frac)
84{
85 u64 ns = (u64) cycles;
86
87 ns = ((ns * cc->mult) - frac) >> cc->shift;
88
89 return ns;
90}
91
92u64 timecounter_cyc2time(struct timecounter *tc,
93 cycle_t cycle_tstamp)
94{
95 u64 delta = (cycle_tstamp - tc->cycle_last) & tc->cc->mask;
96 u64 nsec = tc->nsec, frac = tc->frac;
97
98 /*
99 * Instead of always treating cycle_tstamp as more recent
100 * than tc->cycle_last, detect when it is too far in the
101 * future and treat it as old time stamp instead.
102 */
103 if (delta > tc->cc->mask / 2) {
104 delta = (tc->cycle_last - cycle_tstamp) & tc->cc->mask;
105 nsec -= cc_cyc2ns_backwards(tc->cc, delta, tc->mask, frac);
106 } else {
107 nsec += cyclecounter_cyc2ns(tc->cc, delta, tc->mask, &frac);
108 }
109
110 return nsec;
111}
112EXPORT_SYMBOL_GPL(timecounter_cyc2time);
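A minimal driver-style sketch of how the relocated API is used; the hardware read function, counter width and scaling factors below are placeholders, not values from any real device:

#include <linux/timecounter.h>

static cycle_t my_hw_read(const struct cyclecounter *cc)
{
	return 0;	/* would read the device's free-running cycle register */
}

static const struct cyclecounter my_cc = {
	.read	= my_hw_read,
	.mask	= (1ULL << 48) - 1,	/* 48-bit counter */
	.mult	= 1 << 20,		/* device-specific cyc->ns scaling */
	.shift	= 20,
};

static struct timecounter my_tc;

static void my_start(u64 boot_ns)
{
	/* seeds cycle_last, nsec and the new mask/frac fields */
	timecounter_init(&my_tc, &my_cc, boot_ns);
}

static u64 my_now_ns(void)
{
	/* nanoseconds (now including the fractional remainder) since my_start() */
	return timecounter_read(&my_tc);
}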
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index 6a931852082f..91db94136c10 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -230,9 +230,7 @@ static inline s64 timekeeping_get_ns_raw(struct timekeeper *tk)
230 230
231/** 231/**
232 * update_fast_timekeeper - Update the fast and NMI safe monotonic timekeeper. 232 * update_fast_timekeeper - Update the fast and NMI safe monotonic timekeeper.
233 * @tk: The timekeeper from which we take the update 233 * @tkr: Timekeeping readout base from which we take the update
234 * @tkf: The fast timekeeper to update
235 * @tbase: The time base for the fast timekeeper (mono/raw)
236 * 234 *
237 * We want to use this from any context including NMI and tracing / 235 * We want to use this from any context including NMI and tracing /
238 * instrumenting the timekeeping code itself. 236 * instrumenting the timekeeping code itself.
@@ -244,11 +242,11 @@ static inline s64 timekeeping_get_ns_raw(struct timekeeper *tk)
244 * smp_wmb(); <- Ensure that the last base[1] update is visible 242 * smp_wmb(); <- Ensure that the last base[1] update is visible
245 * tkf->seq++; 243 * tkf->seq++;
246 * smp_wmb(); <- Ensure that the seqcount update is visible 244 * smp_wmb(); <- Ensure that the seqcount update is visible
247 * update(tkf->base[0], tk); 245 * update(tkf->base[0], tkr);
248 * smp_wmb(); <- Ensure that the base[0] update is visible 246 * smp_wmb(); <- Ensure that the base[0] update is visible
249 * tkf->seq++; 247 * tkf->seq++;
250 * smp_wmb(); <- Ensure that the seqcount update is visible 248 * smp_wmb(); <- Ensure that the seqcount update is visible
251 * update(tkf->base[1], tk); 249 * update(tkf->base[1], tkr);
252 * 250 *
253 * The reader side does: 251 * The reader side does:
254 * 252 *
@@ -269,7 +267,7 @@ static inline s64 timekeeping_get_ns_raw(struct timekeeper *tk)
269 * slightly wrong timestamp (a few nanoseconds). See 267 * slightly wrong timestamp (a few nanoseconds). See
270 * @ktime_get_mono_fast_ns. 268 * @ktime_get_mono_fast_ns.
271 */ 269 */
272static void update_fast_timekeeper(struct timekeeper *tk) 270static void update_fast_timekeeper(struct tk_read_base *tkr)
273{ 271{
274 struct tk_read_base *base = tk_fast_mono.base; 272 struct tk_read_base *base = tk_fast_mono.base;
275 273
@@ -277,7 +275,7 @@ static void update_fast_timekeeper(struct timekeeper *tk)
277 raw_write_seqcount_latch(&tk_fast_mono.seq); 275 raw_write_seqcount_latch(&tk_fast_mono.seq);
278 276
279 /* Update base[0] */ 277 /* Update base[0] */
280 memcpy(base, &tk->tkr, sizeof(*base)); 278 memcpy(base, tkr, sizeof(*base));
281 279
282 /* Force readers back to base[0] */ 280 /* Force readers back to base[0] */
283 raw_write_seqcount_latch(&tk_fast_mono.seq); 281 raw_write_seqcount_latch(&tk_fast_mono.seq);
@@ -334,6 +332,35 @@ u64 notrace ktime_get_mono_fast_ns(void)
334} 332}
335EXPORT_SYMBOL_GPL(ktime_get_mono_fast_ns); 333EXPORT_SYMBOL_GPL(ktime_get_mono_fast_ns);
336 334
335/* Suspend-time cycles value for halted fast timekeeper. */
336static cycle_t cycles_at_suspend;
337
338static cycle_t dummy_clock_read(struct clocksource *cs)
339{
340 return cycles_at_suspend;
341}
342
343/**
344 * halt_fast_timekeeper - Prevent fast timekeeper from accessing clocksource.
345 * @tk: Timekeeper to snapshot.
346 *
347 * It generally is unsafe to access the clocksource after timekeeping has been
348 * suspended, so take a snapshot of the readout base of @tk and use it as the
349 * fast timekeeper's readout base while suspended. It will return the same
350 * number of cycles every time until timekeeping is resumed at which time the
351 * proper readout base for the fast timekeeper will be restored automatically.
352 */
353static void halt_fast_timekeeper(struct timekeeper *tk)
354{
355 static struct tk_read_base tkr_dummy;
356 struct tk_read_base *tkr = &tk->tkr;
357
358 memcpy(&tkr_dummy, tkr, sizeof(tkr_dummy));
359 cycles_at_suspend = tkr->read(tkr->clock);
360 tkr_dummy.read = dummy_clock_read;
361 update_fast_timekeeper(&tkr_dummy);
362}
363
337#ifdef CONFIG_GENERIC_TIME_VSYSCALL_OLD 364#ifdef CONFIG_GENERIC_TIME_VSYSCALL_OLD
338 365
339static inline void update_vsyscall(struct timekeeper *tk) 366static inline void update_vsyscall(struct timekeeper *tk)
@@ -462,7 +489,7 @@ static void timekeeping_update(struct timekeeper *tk, unsigned int action)
462 memcpy(&shadow_timekeeper, &tk_core.timekeeper, 489 memcpy(&shadow_timekeeper, &tk_core.timekeeper,
463 sizeof(tk_core.timekeeper)); 490 sizeof(tk_core.timekeeper));
464 491
465 update_fast_timekeeper(tk); 492 update_fast_timekeeper(&tk->tkr);
466} 493}
467 494
468/** 495/**
@@ -1170,7 +1197,7 @@ void timekeeping_inject_sleeptime64(struct timespec64 *delta)
1170 * xtime/wall_to_monotonic/jiffies/etc are 1197 * xtime/wall_to_monotonic/jiffies/etc are
1171 * still managed by arch specific suspend/resume code. 1198 * still managed by arch specific suspend/resume code.
1172 */ 1199 */
1173static void timekeeping_resume(void) 1200void timekeeping_resume(void)
1174{ 1201{
1175 struct timekeeper *tk = &tk_core.timekeeper; 1202 struct timekeeper *tk = &tk_core.timekeeper;
1176 struct clocksource *clock = tk->tkr.clock; 1203 struct clocksource *clock = tk->tkr.clock;
@@ -1251,7 +1278,7 @@ static void timekeeping_resume(void)
1251 hrtimers_resume(); 1278 hrtimers_resume();
1252} 1279}
1253 1280
1254static int timekeeping_suspend(void) 1281int timekeeping_suspend(void)
1255{ 1282{
1256 struct timekeeper *tk = &tk_core.timekeeper; 1283 struct timekeeper *tk = &tk_core.timekeeper;
1257 unsigned long flags; 1284 unsigned long flags;
@@ -1296,6 +1323,7 @@ static int timekeeping_suspend(void)
1296 } 1323 }
1297 1324
1298 timekeeping_update(tk, TK_MIRROR); 1325 timekeeping_update(tk, TK_MIRROR);
1326 halt_fast_timekeeper(tk);
1299 write_seqcount_end(&tk_core.seq); 1327 write_seqcount_end(&tk_core.seq);
1300 raw_spin_unlock_irqrestore(&timekeeper_lock, flags); 1328 raw_spin_unlock_irqrestore(&timekeeper_lock, flags);
1301 1329
@@ -1659,24 +1687,24 @@ out:
1659} 1687}
1660 1688
1661/** 1689/**
1662 * getboottime - Return the real time of system boot. 1690 * getboottime64 - Return the real time of system boot.
1663 * @ts: pointer to the timespec to be set 1691 * @ts: pointer to the timespec64 to be set
1664 * 1692 *
1665 * Returns the wall-time of boot in a timespec. 1693 * Returns the wall-time of boot in a timespec64.
1666 * 1694 *
1667 * This is based on the wall_to_monotonic offset and the total suspend 1695 * This is based on the wall_to_monotonic offset and the total suspend
1668 * time. Calls to settimeofday will affect the value returned (which 1696 * time. Calls to settimeofday will affect the value returned (which
1669 * basically means that however wrong your real time clock is at boot time, 1697 * basically means that however wrong your real time clock is at boot time,
1670 * you get the right time here). 1698 * you get the right time here).
1671 */ 1699 */
1672void getboottime(struct timespec *ts) 1700void getboottime64(struct timespec64 *ts)
1673{ 1701{
1674 struct timekeeper *tk = &tk_core.timekeeper; 1702 struct timekeeper *tk = &tk_core.timekeeper;
1675 ktime_t t = ktime_sub(tk->offs_real, tk->offs_boot); 1703 ktime_t t = ktime_sub(tk->offs_real, tk->offs_boot);
1676 1704
1677 *ts = ktime_to_timespec(t); 1705 *ts = ktime_to_timespec64(t);
1678} 1706}
1679EXPORT_SYMBOL_GPL(getboottime); 1707EXPORT_SYMBOL_GPL(getboottime64);
1680 1708
1681unsigned long get_seconds(void) 1709unsigned long get_seconds(void)
1682{ 1710{
diff --git a/kernel/time/timekeeping.h b/kernel/time/timekeeping.h
index adc1fc98bde3..1d91416055d5 100644
--- a/kernel/time/timekeeping.h
+++ b/kernel/time/timekeeping.h
@@ -16,5 +16,7 @@ extern int timekeeping_inject_offset(struct timespec *ts);
16extern s32 timekeeping_get_tai_offset(void); 16extern s32 timekeeping_get_tai_offset(void);
17extern void timekeeping_set_tai_offset(s32 tai_offset); 17extern void timekeeping_set_tai_offset(s32 tai_offset);
18extern void timekeeping_clocktai(struct timespec *ts); 18extern void timekeeping_clocktai(struct timespec *ts);
19extern int timekeeping_suspend(void);
20extern void timekeeping_resume(void);
19 21
20#endif 22#endif
diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile
index 979ccde26720..98f26588255e 100644
--- a/kernel/trace/Makefile
+++ b/kernel/trace/Makefile
@@ -3,11 +3,11 @@
3 3
4ifdef CONFIG_FUNCTION_TRACER 4ifdef CONFIG_FUNCTION_TRACER
5ORIG_CFLAGS := $(KBUILD_CFLAGS) 5ORIG_CFLAGS := $(KBUILD_CFLAGS)
6KBUILD_CFLAGS = $(subst -pg,,$(ORIG_CFLAGS)) 6KBUILD_CFLAGS = $(subst $(CC_FLAGS_FTRACE),,$(ORIG_CFLAGS))
7 7
8ifdef CONFIG_FTRACE_SELFTEST 8ifdef CONFIG_FTRACE_SELFTEST
9# selftest needs instrumentation 9# selftest needs instrumentation
10CFLAGS_trace_selftest_dynamic.o = -pg 10CFLAGS_trace_selftest_dynamic.o = $(CC_FLAGS_FTRACE)
11obj-y += trace_selftest_dynamic.o 11obj-y += trace_selftest_dynamic.o
12endif 12endif
13endif 13endif
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 929a733d302e..45e5cb143d17 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -2497,12 +2497,14 @@ static void ftrace_run_update_code(int command)
2497} 2497}
2498 2498
2499static void ftrace_run_modify_code(struct ftrace_ops *ops, int command, 2499static void ftrace_run_modify_code(struct ftrace_ops *ops, int command,
2500 struct ftrace_hash *old_hash) 2500 struct ftrace_ops_hash *old_hash)
2501{ 2501{
2502 ops->flags |= FTRACE_OPS_FL_MODIFYING; 2502 ops->flags |= FTRACE_OPS_FL_MODIFYING;
2503 ops->old_hash.filter_hash = old_hash; 2503 ops->old_hash.filter_hash = old_hash->filter_hash;
2504 ops->old_hash.notrace_hash = old_hash->notrace_hash;
2504 ftrace_run_update_code(command); 2505 ftrace_run_update_code(command);
2505 ops->old_hash.filter_hash = NULL; 2506 ops->old_hash.filter_hash = NULL;
2507 ops->old_hash.notrace_hash = NULL;
2506 ops->flags &= ~FTRACE_OPS_FL_MODIFYING; 2508 ops->flags &= ~FTRACE_OPS_FL_MODIFYING;
2507} 2509}
2508 2510
@@ -3579,7 +3581,7 @@ static struct ftrace_ops trace_probe_ops __read_mostly =
3579 3581
3580static int ftrace_probe_registered; 3582static int ftrace_probe_registered;
3581 3583
3582static void __enable_ftrace_function_probe(struct ftrace_hash *old_hash) 3584static void __enable_ftrace_function_probe(struct ftrace_ops_hash *old_hash)
3583{ 3585{
3584 int ret; 3586 int ret;
3585 int i; 3587 int i;
@@ -3637,6 +3639,7 @@ int
3637register_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops, 3639register_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops,
3638 void *data) 3640 void *data)
3639{ 3641{
3642 struct ftrace_ops_hash old_hash_ops;
3640 struct ftrace_func_probe *entry; 3643 struct ftrace_func_probe *entry;
3641 struct ftrace_hash **orig_hash = &trace_probe_ops.func_hash->filter_hash; 3644 struct ftrace_hash **orig_hash = &trace_probe_ops.func_hash->filter_hash;
3642 struct ftrace_hash *old_hash = *orig_hash; 3645 struct ftrace_hash *old_hash = *orig_hash;
@@ -3658,6 +3661,10 @@ register_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops,
3658 3661
3659 mutex_lock(&trace_probe_ops.func_hash->regex_lock); 3662 mutex_lock(&trace_probe_ops.func_hash->regex_lock);
3660 3663
3664 old_hash_ops.filter_hash = old_hash;
3665 /* Probes only have filters */
3666 old_hash_ops.notrace_hash = NULL;
3667
3661 hash = alloc_and_copy_ftrace_hash(FTRACE_HASH_DEFAULT_BITS, old_hash); 3668 hash = alloc_and_copy_ftrace_hash(FTRACE_HASH_DEFAULT_BITS, old_hash);
3662 if (!hash) { 3669 if (!hash) {
3663 count = -ENOMEM; 3670 count = -ENOMEM;
@@ -3718,7 +3725,7 @@ register_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops,
3718 3725
3719 ret = ftrace_hash_move(&trace_probe_ops, 1, orig_hash, hash); 3726 ret = ftrace_hash_move(&trace_probe_ops, 1, orig_hash, hash);
3720 3727
3721 __enable_ftrace_function_probe(old_hash); 3728 __enable_ftrace_function_probe(&old_hash_ops);
3722 3729
3723 if (!ret) 3730 if (!ret)
3724 free_ftrace_hash_rcu(old_hash); 3731 free_ftrace_hash_rcu(old_hash);
@@ -4006,10 +4013,34 @@ ftrace_match_addr(struct ftrace_hash *hash, unsigned long ip, int remove)
4006} 4013}
4007 4014
4008static void ftrace_ops_update_code(struct ftrace_ops *ops, 4015static void ftrace_ops_update_code(struct ftrace_ops *ops,
4009 struct ftrace_hash *old_hash) 4016 struct ftrace_ops_hash *old_hash)
4010{ 4017{
4011 if (ops->flags & FTRACE_OPS_FL_ENABLED && ftrace_enabled) 4018 struct ftrace_ops *op;
4019
4020 if (!ftrace_enabled)
4021 return;
4022
4023 if (ops->flags & FTRACE_OPS_FL_ENABLED) {
4012 ftrace_run_modify_code(ops, FTRACE_UPDATE_CALLS, old_hash); 4024 ftrace_run_modify_code(ops, FTRACE_UPDATE_CALLS, old_hash);
4025 return;
4026 }
4027
4028 /*
4029 * If this is the shared global_ops filter, then we need to
4030 * check if there is another ops that shares it, is enabled.
4031 * If so, we still need to run the modify code.
4032 */
4033 if (ops->func_hash != &global_ops.local_hash)
4034 return;
4035
4036 do_for_each_ftrace_op(op, ftrace_ops_list) {
4037 if (op->func_hash == &global_ops.local_hash &&
4038 op->flags & FTRACE_OPS_FL_ENABLED) {
4039 ftrace_run_modify_code(op, FTRACE_UPDATE_CALLS, old_hash);
4040 /* Only need to do this once */
4041 return;
4042 }
4043 } while_for_each_ftrace_op(op);
4013} 4044}
4014 4045
4015static int 4046static int
@@ -4017,6 +4048,7 @@ ftrace_set_hash(struct ftrace_ops *ops, unsigned char *buf, int len,
4017 unsigned long ip, int remove, int reset, int enable) 4048 unsigned long ip, int remove, int reset, int enable)
4018{ 4049{
4019 struct ftrace_hash **orig_hash; 4050 struct ftrace_hash **orig_hash;
4051 struct ftrace_ops_hash old_hash_ops;
4020 struct ftrace_hash *old_hash; 4052 struct ftrace_hash *old_hash;
4021 struct ftrace_hash *hash; 4053 struct ftrace_hash *hash;
4022 int ret; 4054 int ret;
@@ -4053,9 +4085,11 @@ ftrace_set_hash(struct ftrace_ops *ops, unsigned char *buf, int len,
4053 4085
4054 mutex_lock(&ftrace_lock); 4086 mutex_lock(&ftrace_lock);
4055 old_hash = *orig_hash; 4087 old_hash = *orig_hash;
4088 old_hash_ops.filter_hash = ops->func_hash->filter_hash;
4089 old_hash_ops.notrace_hash = ops->func_hash->notrace_hash;
4056 ret = ftrace_hash_move(ops, enable, orig_hash, hash); 4090 ret = ftrace_hash_move(ops, enable, orig_hash, hash);
4057 if (!ret) { 4091 if (!ret) {
4058 ftrace_ops_update_code(ops, old_hash); 4092 ftrace_ops_update_code(ops, &old_hash_ops);
4059 free_ftrace_hash_rcu(old_hash); 4093 free_ftrace_hash_rcu(old_hash);
4060 } 4094 }
4061 mutex_unlock(&ftrace_lock); 4095 mutex_unlock(&ftrace_lock);
@@ -4267,6 +4301,7 @@ static void __init set_ftrace_early_filters(void)
4267int ftrace_regex_release(struct inode *inode, struct file *file) 4301int ftrace_regex_release(struct inode *inode, struct file *file)
4268{ 4302{
4269 struct seq_file *m = (struct seq_file *)file->private_data; 4303 struct seq_file *m = (struct seq_file *)file->private_data;
4304 struct ftrace_ops_hash old_hash_ops;
4270 struct ftrace_iterator *iter; 4305 struct ftrace_iterator *iter;
4271 struct ftrace_hash **orig_hash; 4306 struct ftrace_hash **orig_hash;
4272 struct ftrace_hash *old_hash; 4307 struct ftrace_hash *old_hash;
@@ -4300,10 +4335,12 @@ int ftrace_regex_release(struct inode *inode, struct file *file)
4300 4335
4301 mutex_lock(&ftrace_lock); 4336 mutex_lock(&ftrace_lock);
4302 old_hash = *orig_hash; 4337 old_hash = *orig_hash;
4338 old_hash_ops.filter_hash = iter->ops->func_hash->filter_hash;
4339 old_hash_ops.notrace_hash = iter->ops->func_hash->notrace_hash;
4303 ret = ftrace_hash_move(iter->ops, filter_hash, 4340 ret = ftrace_hash_move(iter->ops, filter_hash,
4304 orig_hash, iter->hash); 4341 orig_hash, iter->hash);
4305 if (!ret) { 4342 if (!ret) {
4306 ftrace_ops_update_code(iter->ops, old_hash); 4343 ftrace_ops_update_code(iter->ops, &old_hash_ops);
4307 free_ftrace_hash_rcu(old_hash); 4344 free_ftrace_hash_rcu(old_hash);
4308 } 4345 }
4309 mutex_unlock(&ftrace_lock); 4346 mutex_unlock(&ftrace_lock);
@@ -5419,7 +5456,7 @@ static __init int ftrace_init_debugfs(void)
5419 struct dentry *d_tracer; 5456 struct dentry *d_tracer;
5420 5457
5421 d_tracer = tracing_init_dentry(); 5458 d_tracer = tracing_init_dentry();
5422 if (!d_tracer) 5459 if (IS_ERR(d_tracer))
5423 return 0; 5460 return 0;
5424 5461
5425 ftrace_init_dyn_debugfs(d_tracer); 5462 ftrace_init_dyn_debugfs(d_tracer);
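The ftrace.c hunks above all follow one pattern: before ftrace_hash_move() replaces a hash, the caller snapshots both the filter and notrace hashes into a struct ftrace_ops_hash so the update path can see what was live before the move. Below is a minimal stand-alone sketch of that idea with stand-in types; ops_hash_snapshot, update_code and move_and_update are illustrative names, not kernel symbols.

struct hash;				/* stand-in for struct ftrace_hash */

struct ops_hash_snapshot {		/* stand-in for struct ftrace_ops_hash */
	struct hash *filter_hash;
	struct hash *notrace_hash;
};

static void update_code(struct ops_hash_snapshot *old_hash_ops)
{
	/* in the kernel this role is played by ftrace_ops_update_code() */
	(void)old_hash_ops;
}

static void move_and_update(struct hash **filter, struct hash **notrace,
			    struct hash *new_filter)
{
	struct ops_hash_snapshot old = {
		.filter_hash  = *filter,
		.notrace_hash = *notrace,
	};

	*filter = new_filter;		/* the "hash move" */
	update_code(&old);		/* update path compares old vs. new state */
}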
diff --git a/kernel/trace/power-traces.c b/kernel/trace/power-traces.c
index 1c71382b283d..eb4220a132ec 100644
--- a/kernel/trace/power-traces.c
+++ b/kernel/trace/power-traces.c
@@ -13,5 +13,6 @@
13#define CREATE_TRACE_POINTS 13#define CREATE_TRACE_POINTS
14#include <trace/events/power.h> 14#include <trace/events/power.h>
15 15
16EXPORT_TRACEPOINT_SYMBOL_GPL(suspend_resume);
16EXPORT_TRACEPOINT_SYMBOL_GPL(cpu_idle); 17EXPORT_TRACEPOINT_SYMBOL_GPL(cpu_idle);
17 18
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 7a4104cb95cb..5040d44fe5a3 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -9,7 +9,6 @@
9#include <linux/trace_seq.h> 9#include <linux/trace_seq.h>
10#include <linux/spinlock.h> 10#include <linux/spinlock.h>
11#include <linux/irq_work.h> 11#include <linux/irq_work.h>
12#include <linux/debugfs.h>
13#include <linux/uaccess.h> 12#include <linux/uaccess.h>
14#include <linux/hardirq.h> 13#include <linux/hardirq.h>
15#include <linux/kthread.h> /* for self test */ 14#include <linux/kthread.h> /* for self test */
@@ -23,7 +22,6 @@
23#include <linux/hash.h> 22#include <linux/hash.h>
24#include <linux/list.h> 23#include <linux/list.h>
25#include <linux/cpu.h> 24#include <linux/cpu.h>
26#include <linux/fs.h>
27 25
28#include <asm/local.h> 26#include <asm/local.h>
29 27
@@ -447,7 +445,10 @@ int ring_buffer_print_page_header(struct trace_seq *s)
447struct rb_irq_work { 445struct rb_irq_work {
448 struct irq_work work; 446 struct irq_work work;
449 wait_queue_head_t waiters; 447 wait_queue_head_t waiters;
448 wait_queue_head_t full_waiters;
450 bool waiters_pending; 449 bool waiters_pending;
450 bool full_waiters_pending;
451 bool wakeup_full;
451}; 452};
452 453
453/* 454/*
@@ -529,6 +530,10 @@ static void rb_wake_up_waiters(struct irq_work *work)
529 struct rb_irq_work *rbwork = container_of(work, struct rb_irq_work, work); 530 struct rb_irq_work *rbwork = container_of(work, struct rb_irq_work, work);
530 531
531 wake_up_all(&rbwork->waiters); 532 wake_up_all(&rbwork->waiters);
533 if (rbwork->wakeup_full) {
534 rbwork->wakeup_full = false;
535 wake_up_all(&rbwork->full_waiters);
536 }
532} 537}
533 538
534/** 539/**
@@ -553,9 +558,11 @@ int ring_buffer_wait(struct ring_buffer *buffer, int cpu, bool full)
553 * data in any cpu buffer, or a specific buffer, put the 558 * data in any cpu buffer, or a specific buffer, put the
554 * caller on the appropriate wait queue. 559 * caller on the appropriate wait queue.
555 */ 560 */
556 if (cpu == RING_BUFFER_ALL_CPUS) 561 if (cpu == RING_BUFFER_ALL_CPUS) {
557 work = &buffer->irq_work; 562 work = &buffer->irq_work;
558 else { 563 /* Full only makes sense on per cpu reads */
564 full = false;
565 } else {
559 if (!cpumask_test_cpu(cpu, buffer->cpumask)) 566 if (!cpumask_test_cpu(cpu, buffer->cpumask))
560 return -ENODEV; 567 return -ENODEV;
561 cpu_buffer = buffer->buffers[cpu]; 568 cpu_buffer = buffer->buffers[cpu];
@@ -564,7 +571,10 @@ int ring_buffer_wait(struct ring_buffer *buffer, int cpu, bool full)
564 571
565 572
566 while (true) { 573 while (true) {
567 prepare_to_wait(&work->waiters, &wait, TASK_INTERRUPTIBLE); 574 if (full)
575 prepare_to_wait(&work->full_waiters, &wait, TASK_INTERRUPTIBLE);
576 else
577 prepare_to_wait(&work->waiters, &wait, TASK_INTERRUPTIBLE);
568 578
569 /* 579 /*
570 * The events can happen in critical sections where 580 * The events can happen in critical sections where
@@ -586,7 +596,10 @@ int ring_buffer_wait(struct ring_buffer *buffer, int cpu, bool full)
586 * that is necessary is that the wake up happens after 596 * that is necessary is that the wake up happens after
587 * a task has been queued. It's OK for spurious wake ups. 597 * a task has been queued. It's OK for spurious wake ups.
588 */ 598 */
589 work->waiters_pending = true; 599 if (full)
600 work->full_waiters_pending = true;
601 else
602 work->waiters_pending = true;
590 603
591 if (signal_pending(current)) { 604 if (signal_pending(current)) {
592 ret = -EINTR; 605 ret = -EINTR;
@@ -615,7 +628,10 @@ int ring_buffer_wait(struct ring_buffer *buffer, int cpu, bool full)
615 schedule(); 628 schedule();
616 } 629 }
617 630
618 finish_wait(&work->waiters, &wait); 631 if (full)
632 finish_wait(&work->full_waiters, &wait);
633 else
634 finish_wait(&work->waiters, &wait);
619 635
620 return ret; 636 return ret;
621} 637}
@@ -1230,6 +1246,7 @@ rb_allocate_cpu_buffer(struct ring_buffer *buffer, int nr_pages, int cpu)
1230 init_completion(&cpu_buffer->update_done); 1246 init_completion(&cpu_buffer->update_done);
1231 init_irq_work(&cpu_buffer->irq_work.work, rb_wake_up_waiters); 1247 init_irq_work(&cpu_buffer->irq_work.work, rb_wake_up_waiters);
1232 init_waitqueue_head(&cpu_buffer->irq_work.waiters); 1248 init_waitqueue_head(&cpu_buffer->irq_work.waiters);
1249 init_waitqueue_head(&cpu_buffer->irq_work.full_waiters);
1233 1250
1234 bpage = kzalloc_node(ALIGN(sizeof(*bpage), cache_line_size()), 1251 bpage = kzalloc_node(ALIGN(sizeof(*bpage), cache_line_size()),
1235 GFP_KERNEL, cpu_to_node(cpu)); 1252 GFP_KERNEL, cpu_to_node(cpu));
@@ -2801,6 +2818,8 @@ static void rb_commit(struct ring_buffer_per_cpu *cpu_buffer,
2801static __always_inline void 2818static __always_inline void
2802rb_wakeups(struct ring_buffer *buffer, struct ring_buffer_per_cpu *cpu_buffer) 2819rb_wakeups(struct ring_buffer *buffer, struct ring_buffer_per_cpu *cpu_buffer)
2803{ 2820{
2821 bool pagebusy;
2822
2804 if (buffer->irq_work.waiters_pending) { 2823 if (buffer->irq_work.waiters_pending) {
2805 buffer->irq_work.waiters_pending = false; 2824 buffer->irq_work.waiters_pending = false;
2806 /* irq_work_queue() supplies it's own memory barriers */ 2825 /* irq_work_queue() supplies it's own memory barriers */
@@ -2812,6 +2831,15 @@ rb_wakeups(struct ring_buffer *buffer, struct ring_buffer_per_cpu *cpu_buffer)
2812 /* irq_work_queue() supplies it's own memory barriers */ 2831 /* irq_work_queue() supplies it's own memory barriers */
2813 irq_work_queue(&cpu_buffer->irq_work.work); 2832 irq_work_queue(&cpu_buffer->irq_work.work);
2814 } 2833 }
2834
2835 pagebusy = cpu_buffer->reader_page == cpu_buffer->commit_page;
2836
2837 if (!pagebusy && cpu_buffer->irq_work.full_waiters_pending) {
2838 cpu_buffer->irq_work.wakeup_full = true;
2839 cpu_buffer->irq_work.full_waiters_pending = false;
2840 /* irq_work_queue() supplies its own memory barriers */
2841 irq_work_queue(&cpu_buffer->irq_work.work);
2842 }
2815} 2843}
2816 2844
2817/** 2845/**
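The ring_buffer.c hunks above add a second wait queue (full_waiters) so a reader can sleep until a whole page of data is ready instead of waking on every event. A hedged sketch of a consumer using the extended ring_buffer_wait(); the function below is illustrative and not part of the patch.

static int wait_for_full_page(struct ring_buffer *buffer, int cpu)
{
	int ret;

	/*
	 * full=true is only honoured for a specific CPU; for
	 * RING_BUFFER_ALL_CPUS the code above forces full=false.
	 */
	ret = ring_buffer_wait(buffer, cpu, true);
	if (ret)	/* e.g. -EINTR on a signal, -ENODEV for an invalid CPU */
		return ret;

	/*
	 * At least one full page is now available; the caller can consume
	 * it, e.g. via ring_buffer_read_page() in the splice path.
	 */
	return 0;
}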
diff --git a/kernel/trace/ring_buffer_benchmark.c b/kernel/trace/ring_buffer_benchmark.c
index 3f9e328c30b5..13d945c0d03f 100644
--- a/kernel/trace/ring_buffer_benchmark.c
+++ b/kernel/trace/ring_buffer_benchmark.c
@@ -7,7 +7,7 @@
7#include <linux/completion.h> 7#include <linux/completion.h>
8#include <linux/kthread.h> 8#include <linux/kthread.h>
9#include <linux/module.h> 9#include <linux/module.h>
10#include <linux/time.h> 10#include <linux/ktime.h>
11#include <asm/local.h> 11#include <asm/local.h>
12 12
13struct rb_page { 13struct rb_page {
@@ -17,7 +17,7 @@ struct rb_page {
17}; 17};
18 18
19/* run time and sleep time in seconds */ 19/* run time and sleep time in seconds */
20#define RUN_TIME 10 20#define RUN_TIME 10ULL
21#define SLEEP_TIME 10 21#define SLEEP_TIME 10
22 22
23/* number of events for writer to wake up the reader */ 23/* number of events for writer to wake up the reader */
@@ -212,8 +212,7 @@ static void ring_buffer_consumer(void)
212 212
213static void ring_buffer_producer(void) 213static void ring_buffer_producer(void)
214{ 214{
215 struct timeval start_tv; 215 ktime_t start_time, end_time, timeout;
216 struct timeval end_tv;
217 unsigned long long time; 216 unsigned long long time;
218 unsigned long long entries; 217 unsigned long long entries;
219 unsigned long long overruns; 218 unsigned long long overruns;
@@ -227,7 +226,8 @@ static void ring_buffer_producer(void)
227 * make the system stall) 226 * make the system stall)
228 */ 227 */
229 trace_printk("Starting ring buffer hammer\n"); 228 trace_printk("Starting ring buffer hammer\n");
230 do_gettimeofday(&start_tv); 229 start_time = ktime_get();
230 timeout = ktime_add_ns(start_time, RUN_TIME * NSEC_PER_SEC);
231 do { 231 do {
232 struct ring_buffer_event *event; 232 struct ring_buffer_event *event;
233 int *entry; 233 int *entry;
@@ -244,7 +244,7 @@ static void ring_buffer_producer(void)
244 ring_buffer_unlock_commit(buffer, event); 244 ring_buffer_unlock_commit(buffer, event);
245 } 245 }
246 } 246 }
247 do_gettimeofday(&end_tv); 247 end_time = ktime_get();
248 248
249 cnt++; 249 cnt++;
250 if (consumer && !(cnt % wakeup_interval)) 250 if (consumer && !(cnt % wakeup_interval))
@@ -264,7 +264,7 @@ static void ring_buffer_producer(void)
264 cond_resched(); 264 cond_resched();
265#endif 265#endif
266 266
267 } while (end_tv.tv_sec < (start_tv.tv_sec + RUN_TIME) && !kill_test); 267 } while (ktime_before(end_time, timeout) && !kill_test);
268 trace_printk("End ring buffer hammer\n"); 268 trace_printk("End ring buffer hammer\n");
269 269
270 if (consumer) { 270 if (consumer) {
@@ -280,9 +280,7 @@ static void ring_buffer_producer(void)
280 wait_for_completion(&read_done); 280 wait_for_completion(&read_done);
281 } 281 }
282 282
283 time = end_tv.tv_sec - start_tv.tv_sec; 283 time = ktime_us_delta(end_time, start_time);
284 time *= USEC_PER_SEC;
285 time += (long long)((long)end_tv.tv_usec - (long)start_tv.tv_usec);
286 284
287 entries = ring_buffer_entries(buffer); 285 entries = ring_buffer_entries(buffer);
288 overruns = ring_buffer_overruns(buffer); 286 overruns = ring_buffer_overruns(buffer);
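The benchmark now measures elapsed time with ktime_t instead of do_gettimeofday() and struct timeval. A minimal sketch of the same timing pattern in isolation; run_for_seconds() and its parameters are illustrative.

#include <linux/ktime.h>

static s64 run_for_seconds(unsigned long long secs, void (*body)(void))
{
	ktime_t start = ktime_get();
	ktime_t timeout = ktime_add_ns(start, secs * NSEC_PER_SEC);
	ktime_t now;

	do {
		body();				/* work being timed */
		now = ktime_get();
	} while (ktime_before(now, timeout));

	return ktime_us_delta(now, start);	/* elapsed microseconds */
}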
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 2e767972e99c..62c6506d663f 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -2036,7 +2036,8 @@ void trace_printk_init_buffers(void)
2036 2036
2037 /* trace_printk() is for debug use only. Don't use it in production. */ 2037 /* trace_printk() is for debug use only. Don't use it in production. */
2038 2038
2039 pr_warning("\n**********************************************************\n"); 2039 pr_warning("\n");
2040 pr_warning("**********************************************************\n");
2040 pr_warning("** NOTICE NOTICE NOTICE NOTICE NOTICE NOTICE NOTICE **\n"); 2041 pr_warning("** NOTICE NOTICE NOTICE NOTICE NOTICE NOTICE NOTICE **\n");
2041 pr_warning("** **\n"); 2042 pr_warning("** **\n");
2042 pr_warning("** trace_printk() being used. Allocating extra memory. **\n"); 2043 pr_warning("** trace_printk() being used. Allocating extra memory. **\n");
@@ -3352,12 +3353,12 @@ tracing_cpumask_read(struct file *filp, char __user *ubuf,
3352 3353
3353 mutex_lock(&tracing_cpumask_update_lock); 3354 mutex_lock(&tracing_cpumask_update_lock);
3354 3355
3355 len = cpumask_scnprintf(mask_str, count, tr->tracing_cpumask); 3356 len = snprintf(mask_str, count, "%*pb\n",
3356 if (count - len < 2) { 3357 cpumask_pr_args(tr->tracing_cpumask));
3358 if (len >= count) {
3357 count = -EINVAL; 3359 count = -EINVAL;
3358 goto out_err; 3360 goto out_err;
3359 } 3361 }
3360 len += sprintf(mask_str + len, "\n");
3361 count = simple_read_from_buffer(ubuf, count, ppos, mask_str, NR_CPUS+1); 3362 count = simple_read_from_buffer(ubuf, count, ppos, mask_str, NR_CPUS+1);
3362 3363
3363out_err: 3364out_err:
@@ -4140,6 +4141,12 @@ static int tracing_set_tracer(struct trace_array *tr, const char *buf)
4140 goto out; 4141 goto out;
4141 } 4142 }
4142 4143
4144 /* If trace pipe files are being read, we can't change the tracer */
4145 if (tr->current_trace->ref) {
4146 ret = -EBUSY;
4147 goto out;
4148 }
4149
4143 trace_branch_disable(); 4150 trace_branch_disable();
4144 4151
4145 tr->current_trace->enabled--; 4152 tr->current_trace->enabled--;
@@ -4326,17 +4333,7 @@ static int tracing_open_pipe(struct inode *inode, struct file *filp)
4326 } 4333 }
4327 4334
4328 trace_seq_init(&iter->seq); 4335 trace_seq_init(&iter->seq);
4329 4336 iter->trace = tr->current_trace;
4330 /*
4331 * We make a copy of the current tracer to avoid concurrent
4332 * changes on it while we are reading.
4333 */
4334 iter->trace = kmalloc(sizeof(*iter->trace), GFP_KERNEL);
4335 if (!iter->trace) {
4336 ret = -ENOMEM;
4337 goto fail;
4338 }
4339 *iter->trace = *tr->current_trace;
4340 4337
4341 if (!alloc_cpumask_var(&iter->started, GFP_KERNEL)) { 4338 if (!alloc_cpumask_var(&iter->started, GFP_KERNEL)) {
4342 ret = -ENOMEM; 4339 ret = -ENOMEM;
@@ -4363,6 +4360,8 @@ static int tracing_open_pipe(struct inode *inode, struct file *filp)
4363 iter->trace->pipe_open(iter); 4360 iter->trace->pipe_open(iter);
4364 4361
4365 nonseekable_open(inode, filp); 4362 nonseekable_open(inode, filp);
4363
4364 tr->current_trace->ref++;
4366out: 4365out:
4367 mutex_unlock(&trace_types_lock); 4366 mutex_unlock(&trace_types_lock);
4368 return ret; 4367 return ret;
@@ -4382,6 +4381,8 @@ static int tracing_release_pipe(struct inode *inode, struct file *file)
4382 4381
4383 mutex_lock(&trace_types_lock); 4382 mutex_lock(&trace_types_lock);
4384 4383
4384 tr->current_trace->ref--;
4385
4385 if (iter->trace->pipe_close) 4386 if (iter->trace->pipe_close)
4386 iter->trace->pipe_close(iter); 4387 iter->trace->pipe_close(iter);
4387 4388
@@ -4389,7 +4390,6 @@ static int tracing_release_pipe(struct inode *inode, struct file *file)
4389 4390
4390 free_cpumask_var(iter->started); 4391 free_cpumask_var(iter->started);
4391 mutex_destroy(&iter->mutex); 4392 mutex_destroy(&iter->mutex);
4392 kfree(iter->trace);
4393 kfree(iter); 4393 kfree(iter);
4394 4394
4395 trace_array_put(tr); 4395 trace_array_put(tr);
@@ -4422,7 +4422,7 @@ tracing_poll_pipe(struct file *filp, poll_table *poll_table)
4422 return trace_poll(iter, filp, poll_table); 4422 return trace_poll(iter, filp, poll_table);
4423} 4423}
4424 4424
4425/* Must be called with trace_types_lock mutex held. */ 4425/* Must be called with iter->mutex held. */
4426static int tracing_wait_pipe(struct file *filp) 4426static int tracing_wait_pipe(struct file *filp)
4427{ 4427{
4428 struct trace_iterator *iter = filp->private_data; 4428 struct trace_iterator *iter = filp->private_data;
@@ -4467,7 +4467,6 @@ tracing_read_pipe(struct file *filp, char __user *ubuf,
4467 size_t cnt, loff_t *ppos) 4467 size_t cnt, loff_t *ppos)
4468{ 4468{
4469 struct trace_iterator *iter = filp->private_data; 4469 struct trace_iterator *iter = filp->private_data;
4470 struct trace_array *tr = iter->tr;
4471 ssize_t sret; 4470 ssize_t sret;
4472 4471
4473 /* return any leftover data */ 4472 /* return any leftover data */
@@ -4477,12 +4476,6 @@ tracing_read_pipe(struct file *filp, char __user *ubuf,
4477 4476
4478 trace_seq_init(&iter->seq); 4477 trace_seq_init(&iter->seq);
4479 4478
4480 /* copy the tracer to avoid using a global lock all around */
4481 mutex_lock(&trace_types_lock);
4482 if (unlikely(iter->trace->name != tr->current_trace->name))
4483 *iter->trace = *tr->current_trace;
4484 mutex_unlock(&trace_types_lock);
4485
4486 /* 4479 /*
4487 * Avoid more than one consumer on a single file descriptor 4480 * Avoid more than one consumer on a single file descriptor
4488 * This is just a matter of traces coherency, the ring buffer itself 4481 * This is just a matter of traces coherency, the ring buffer itself
@@ -4642,7 +4635,6 @@ static ssize_t tracing_splice_read_pipe(struct file *filp,
4642 .ops = &tracing_pipe_buf_ops, 4635 .ops = &tracing_pipe_buf_ops,
4643 .spd_release = tracing_spd_release_pipe, 4636 .spd_release = tracing_spd_release_pipe,
4644 }; 4637 };
4645 struct trace_array *tr = iter->tr;
4646 ssize_t ret; 4638 ssize_t ret;
4647 size_t rem; 4639 size_t rem;
4648 unsigned int i; 4640 unsigned int i;
@@ -4650,12 +4642,6 @@ static ssize_t tracing_splice_read_pipe(struct file *filp,
4650 if (splice_grow_spd(pipe, &spd)) 4642 if (splice_grow_spd(pipe, &spd))
4651 return -ENOMEM; 4643 return -ENOMEM;
4652 4644
4653 /* copy the tracer to avoid using a global lock all around */
4654 mutex_lock(&trace_types_lock);
4655 if (unlikely(iter->trace->name != tr->current_trace->name))
4656 *iter->trace = *tr->current_trace;
4657 mutex_unlock(&trace_types_lock);
4658
4659 mutex_lock(&iter->mutex); 4645 mutex_lock(&iter->mutex);
4660 4646
4661 if (iter->trace->splice_read) { 4647 if (iter->trace->splice_read) {
@@ -4942,7 +4928,7 @@ tracing_mark_write(struct file *filp, const char __user *ubuf,
4942 *fpos += written; 4928 *fpos += written;
4943 4929
4944 out_unlock: 4930 out_unlock:
4945 for (i = 0; i < nr_pages; i++){ 4931 for (i = nr_pages - 1; i >= 0; i--) {
4946 kunmap_atomic(map_page[i]); 4932 kunmap_atomic(map_page[i]);
4947 put_page(pages[i]); 4933 put_page(pages[i]);
4948 } 4934 }
@@ -5331,6 +5317,8 @@ static int tracing_buffers_open(struct inode *inode, struct file *filp)
5331 5317
5332 filp->private_data = info; 5318 filp->private_data = info;
5333 5319
5320 tr->current_trace->ref++;
5321
5334 mutex_unlock(&trace_types_lock); 5322 mutex_unlock(&trace_types_lock);
5335 5323
5336 ret = nonseekable_open(inode, filp); 5324 ret = nonseekable_open(inode, filp);
@@ -5361,21 +5349,16 @@ tracing_buffers_read(struct file *filp, char __user *ubuf,
5361 if (!count) 5349 if (!count)
5362 return 0; 5350 return 0;
5363 5351
5364 mutex_lock(&trace_types_lock);
5365
5366#ifdef CONFIG_TRACER_MAX_TRACE 5352#ifdef CONFIG_TRACER_MAX_TRACE
5367 if (iter->snapshot && iter->tr->current_trace->use_max_tr) { 5353 if (iter->snapshot && iter->tr->current_trace->use_max_tr)
5368 size = -EBUSY; 5354 return -EBUSY;
5369 goto out_unlock;
5370 }
5371#endif 5355#endif
5372 5356
5373 if (!info->spare) 5357 if (!info->spare)
5374 info->spare = ring_buffer_alloc_read_page(iter->trace_buffer->buffer, 5358 info->spare = ring_buffer_alloc_read_page(iter->trace_buffer->buffer,
5375 iter->cpu_file); 5359 iter->cpu_file);
5376 size = -ENOMEM;
5377 if (!info->spare) 5360 if (!info->spare)
5378 goto out_unlock; 5361 return -ENOMEM;
5379 5362
5380 /* Do we have previous read data to read? */ 5363 /* Do we have previous read data to read? */
5381 if (info->read < PAGE_SIZE) 5364 if (info->read < PAGE_SIZE)
@@ -5391,21 +5374,16 @@ tracing_buffers_read(struct file *filp, char __user *ubuf,
5391 5374
5392 if (ret < 0) { 5375 if (ret < 0) {
5393 if (trace_empty(iter)) { 5376 if (trace_empty(iter)) {
5394 if ((filp->f_flags & O_NONBLOCK)) { 5377 if ((filp->f_flags & O_NONBLOCK))
5395 size = -EAGAIN; 5378 return -EAGAIN;
5396 goto out_unlock; 5379
5397 }
5398 mutex_unlock(&trace_types_lock);
5399 ret = wait_on_pipe(iter, false); 5380 ret = wait_on_pipe(iter, false);
5400 mutex_lock(&trace_types_lock); 5381 if (ret)
5401 if (ret) { 5382 return ret;
5402 size = ret; 5383
5403 goto out_unlock;
5404 }
5405 goto again; 5384 goto again;
5406 } 5385 }
5407 size = 0; 5386 return 0;
5408 goto out_unlock;
5409 } 5387 }
5410 5388
5411 info->read = 0; 5389 info->read = 0;
@@ -5415,18 +5393,14 @@ tracing_buffers_read(struct file *filp, char __user *ubuf,
5415 size = count; 5393 size = count;
5416 5394
5417 ret = copy_to_user(ubuf, info->spare + info->read, size); 5395 ret = copy_to_user(ubuf, info->spare + info->read, size);
5418 if (ret == size) { 5396 if (ret == size)
5419 size = -EFAULT; 5397 return -EFAULT;
5420 goto out_unlock; 5398
5421 }
5422 size -= ret; 5399 size -= ret;
5423 5400
5424 *ppos += size; 5401 *ppos += size;
5425 info->read += size; 5402 info->read += size;
5426 5403
5427 out_unlock:
5428 mutex_unlock(&trace_types_lock);
5429
5430 return size; 5404 return size;
5431} 5405}
5432 5406
@@ -5437,6 +5411,8 @@ static int tracing_buffers_release(struct inode *inode, struct file *file)
5437 5411
5438 mutex_lock(&trace_types_lock); 5412 mutex_lock(&trace_types_lock);
5439 5413
5414 iter->tr->current_trace->ref--;
5415
5440 __trace_array_put(iter->tr); 5416 __trace_array_put(iter->tr);
5441 5417
5442 if (info->spare) 5418 if (info->spare)
@@ -5522,30 +5498,20 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
5522 int entries, size, i; 5498 int entries, size, i;
5523 ssize_t ret = 0; 5499 ssize_t ret = 0;
5524 5500
5525 mutex_lock(&trace_types_lock);
5526
5527#ifdef CONFIG_TRACER_MAX_TRACE 5501#ifdef CONFIG_TRACER_MAX_TRACE
5528 if (iter->snapshot && iter->tr->current_trace->use_max_tr) { 5502 if (iter->snapshot && iter->tr->current_trace->use_max_tr)
5529 ret = -EBUSY; 5503 return -EBUSY;
5530 goto out;
5531 }
5532#endif 5504#endif
5533 5505
5534 if (splice_grow_spd(pipe, &spd)) { 5506 if (splice_grow_spd(pipe, &spd))
5535 ret = -ENOMEM; 5507 return -ENOMEM;
5536 goto out;
5537 }
5538 5508
5539 if (*ppos & (PAGE_SIZE - 1)) { 5509 if (*ppos & (PAGE_SIZE - 1))
5540 ret = -EINVAL; 5510 return -EINVAL;
5541 goto out;
5542 }
5543 5511
5544 if (len & (PAGE_SIZE - 1)) { 5512 if (len & (PAGE_SIZE - 1)) {
5545 if (len < PAGE_SIZE) { 5513 if (len < PAGE_SIZE)
5546 ret = -EINVAL; 5514 return -EINVAL;
5547 goto out;
5548 }
5549 len &= PAGE_MASK; 5515 len &= PAGE_MASK;
5550 } 5516 }
5551 5517
@@ -5606,25 +5572,20 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
5606 /* did we read anything? */ 5572 /* did we read anything? */
5607 if (!spd.nr_pages) { 5573 if (!spd.nr_pages) {
5608 if (ret) 5574 if (ret)
5609 goto out; 5575 return ret;
5576
5577 if ((file->f_flags & O_NONBLOCK) || (flags & SPLICE_F_NONBLOCK))
5578 return -EAGAIN;
5610 5579
5611 if ((file->f_flags & O_NONBLOCK) || (flags & SPLICE_F_NONBLOCK)) {
5612 ret = -EAGAIN;
5613 goto out;
5614 }
5615 mutex_unlock(&trace_types_lock);
5616 ret = wait_on_pipe(iter, true); 5580 ret = wait_on_pipe(iter, true);
5617 mutex_lock(&trace_types_lock);
5618 if (ret) 5581 if (ret)
5619 goto out; 5582 return ret;
5620 5583
5621 goto again; 5584 goto again;
5622 } 5585 }
5623 5586
5624 ret = splice_to_pipe(pipe, &spd); 5587 ret = splice_to_pipe(pipe, &spd);
5625 splice_shrink_spd(&spd); 5588 splice_shrink_spd(&spd);
5626out:
5627 mutex_unlock(&trace_types_lock);
5628 5589
5629 return ret; 5590 return ret;
5630} 5591}
@@ -5854,28 +5815,11 @@ static __init int register_snapshot_cmd(void)
5854static inline __init int register_snapshot_cmd(void) { return 0; } 5815static inline __init int register_snapshot_cmd(void) { return 0; }
5855#endif /* defined(CONFIG_TRACER_SNAPSHOT) && defined(CONFIG_DYNAMIC_FTRACE) */ 5816#endif /* defined(CONFIG_TRACER_SNAPSHOT) && defined(CONFIG_DYNAMIC_FTRACE) */
5856 5817
5857struct dentry *tracing_init_dentry_tr(struct trace_array *tr) 5818static struct dentry *tracing_get_dentry(struct trace_array *tr)
5858{ 5819{
5859 if (tr->dir)
5860 return tr->dir;
5861
5862 if (!debugfs_initialized())
5863 return NULL;
5864
5865 if (tr->flags & TRACE_ARRAY_FL_GLOBAL)
5866 tr->dir = debugfs_create_dir("tracing", NULL);
5867
5868 if (!tr->dir)
5869 pr_warn_once("Could not create debugfs directory 'tracing'\n");
5870
5871 return tr->dir; 5820 return tr->dir;
5872} 5821}
5873 5822
5874struct dentry *tracing_init_dentry(void)
5875{
5876 return tracing_init_dentry_tr(&global_trace);
5877}
5878
5879static struct dentry *tracing_dentry_percpu(struct trace_array *tr, int cpu) 5823static struct dentry *tracing_dentry_percpu(struct trace_array *tr, int cpu)
5880{ 5824{
5881 struct dentry *d_tracer; 5825 struct dentry *d_tracer;
@@ -5883,8 +5827,8 @@ static struct dentry *tracing_dentry_percpu(struct trace_array *tr, int cpu)
5883 if (tr->percpu_dir) 5827 if (tr->percpu_dir)
5884 return tr->percpu_dir; 5828 return tr->percpu_dir;
5885 5829
5886 d_tracer = tracing_init_dentry_tr(tr); 5830 d_tracer = tracing_get_dentry(tr);
5887 if (!d_tracer) 5831 if (IS_ERR(d_tracer))
5888 return NULL; 5832 return NULL;
5889 5833
5890 tr->percpu_dir = debugfs_create_dir("per_cpu", d_tracer); 5834 tr->percpu_dir = debugfs_create_dir("per_cpu", d_tracer);
@@ -6086,8 +6030,8 @@ static struct dentry *trace_options_init_dentry(struct trace_array *tr)
6086 if (tr->options) 6030 if (tr->options)
6087 return tr->options; 6031 return tr->options;
6088 6032
6089 d_tracer = tracing_init_dentry_tr(tr); 6033 d_tracer = tracing_get_dentry(tr);
6090 if (!d_tracer) 6034 if (IS_ERR(d_tracer))
6091 return NULL; 6035 return NULL;
6092 6036
6093 tr->options = debugfs_create_dir("options", d_tracer); 6037 tr->options = debugfs_create_dir("options", d_tracer);
@@ -6416,7 +6360,7 @@ static int instance_delete(const char *name)
6416 goto out_unlock; 6360 goto out_unlock;
6417 6361
6418 ret = -EBUSY; 6362 ret = -EBUSY;
6419 if (tr->ref) 6363 if (tr->ref || (tr->current_trace && tr->current_trace->ref))
6420 goto out_unlock; 6364 goto out_unlock;
6421 6365
6422 list_del(&tr->list); 6366 list_del(&tr->list);
@@ -6571,6 +6515,33 @@ init_tracer_debugfs(struct trace_array *tr, struct dentry *d_tracer)
6571 6515
6572} 6516}
6573 6517
6518/**
6519 * tracing_init_dentry - initialize top level trace array
6520 *
6521 * This is called when creating files or directories in the tracing
6522 * directory. It is called via fs_initcall() by any of the boot up code
6523 * and expects to return the dentry of the top level tracing directory.
6524 */
6525struct dentry *tracing_init_dentry(void)
6526{
6527 struct trace_array *tr = &global_trace;
6528
6529 if (tr->dir)
6530 return tr->dir;
6531
6532 if (WARN_ON(!debugfs_initialized()))
6533 return ERR_PTR(-ENODEV);
6534
6535 tr->dir = debugfs_create_dir("tracing", NULL);
6536
6537 if (!tr->dir) {
6538 pr_warn_once("Could not create debugfs directory 'tracing'\n");
6539 return ERR_PTR(-ENOMEM);
6540 }
6541
6542 return tr->dir;
6543}
6544
6574static __init int tracer_init_debugfs(void) 6545static __init int tracer_init_debugfs(void)
6575{ 6546{
6576 struct dentry *d_tracer; 6547 struct dentry *d_tracer;
@@ -6578,7 +6549,7 @@ static __init int tracer_init_debugfs(void)
6578 trace_access_lock_init(); 6549 trace_access_lock_init();
6579 6550
6580 d_tracer = tracing_init_dentry(); 6551 d_tracer = tracing_init_dentry();
6581 if (!d_tracer) 6552 if (IS_ERR(d_tracer))
6582 return 0; 6553 return 0;
6583 6554
6584 init_tracer_debugfs(&global_trace, d_tracer); 6555 init_tracer_debugfs(&global_trace, d_tracer);
@@ -6811,7 +6782,6 @@ __init static int tracer_alloc_buffers(void)
6811 int ring_buf_size; 6782 int ring_buf_size;
6812 int ret = -ENOMEM; 6783 int ret = -ENOMEM;
6813 6784
6814
6815 if (!alloc_cpumask_var(&tracing_buffer_mask, GFP_KERNEL)) 6785 if (!alloc_cpumask_var(&tracing_buffer_mask, GFP_KERNEL))
6816 goto out; 6786 goto out;
6817 6787
@@ -6918,7 +6888,6 @@ void __init trace_init(void)
6918 tracepoint_printk = 0; 6888 tracepoint_printk = 0;
6919 } 6889 }
6920 tracer_alloc_buffers(); 6890 tracer_alloc_buffers();
6921 init_ftrace_syscalls();
6922 trace_event_init(); 6891 trace_event_init();
6923} 6892}
6924 6893
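tracing_init_dentry() now reports failure with ERR_PTR() values rather than NULL, which is why every caller in this series switches from a NULL check to IS_ERR(). A hedged sketch of the updated consumer pattern; the initcall, file name and fops below are illustrative placeholders.

#include <linux/err.h>
#include "trace.h"	/* trace_create_file(), tracing_init_dentry() */

static const struct file_operations example_fops;	/* placeholder fops */

static __init int example_trace_initcall(void)
{
	struct dentry *d_tracer;

	d_tracer = tracing_init_dentry();
	if (IS_ERR(d_tracer))		/* no longer compared against NULL */
		return 0;

	trace_create_file("example_file", 0444, d_tracer, NULL,
			  &example_fops);
	return 0;
}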
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 8de48bac1ce2..dd8205a35760 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -388,6 +388,7 @@ struct tracer {
388 struct tracer *next; 388 struct tracer *next;
389 struct tracer_flags *flags; 389 struct tracer_flags *flags;
390 int enabled; 390 int enabled;
391 int ref;
391 bool print_max; 392 bool print_max;
392 bool allow_instances; 393 bool allow_instances;
393#ifdef CONFIG_TRACER_MAX_TRACE 394#ifdef CONFIG_TRACER_MAX_TRACE
@@ -541,7 +542,6 @@ struct dentry *trace_create_file(const char *name,
541 void *data, 542 void *data,
542 const struct file_operations *fops); 543 const struct file_operations *fops);
543 544
544struct dentry *tracing_init_dentry_tr(struct trace_array *tr);
545struct dentry *tracing_init_dentry(void); 545struct dentry *tracing_init_dentry(void);
546 546
547struct ring_buffer_event; 547struct ring_buffer_event;
diff --git a/kernel/trace/trace_branch.c b/kernel/trace/trace_branch.c
index 7d6e2afde669..57cbf1efdd44 100644
--- a/kernel/trace/trace_branch.c
+++ b/kernel/trace/trace_branch.c
@@ -7,7 +7,6 @@
7#include <linux/seq_file.h> 7#include <linux/seq_file.h>
8#include <linux/spinlock.h> 8#include <linux/spinlock.h>
9#include <linux/irqflags.h> 9#include <linux/irqflags.h>
10#include <linux/debugfs.h>
11#include <linux/uaccess.h> 10#include <linux/uaccess.h>
12#include <linux/module.h> 11#include <linux/module.h>
13#include <linux/ftrace.h> 12#include <linux/ftrace.h>
diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c
index 4b9c114ee9de..6fa484de2ba1 100644
--- a/kernel/trace/trace_event_perf.c
+++ b/kernel/trace/trace_event_perf.c
@@ -261,7 +261,7 @@ void perf_trace_del(struct perf_event *p_event, int flags)
261} 261}
262 262
263void *perf_trace_buf_prepare(int size, unsigned short type, 263void *perf_trace_buf_prepare(int size, unsigned short type,
264 struct pt_regs *regs, int *rctxp) 264 struct pt_regs **regs, int *rctxp)
265{ 265{
266 struct trace_entry *entry; 266 struct trace_entry *entry;
267 unsigned long flags; 267 unsigned long flags;
@@ -280,6 +280,8 @@ void *perf_trace_buf_prepare(int size, unsigned short type,
280 if (*rctxp < 0) 280 if (*rctxp < 0)
281 return NULL; 281 return NULL;
282 282
283 if (regs)
284 *regs = this_cpu_ptr(&__perf_regs[*rctxp]);
283 raw_data = this_cpu_ptr(perf_trace_buf[*rctxp]); 285 raw_data = this_cpu_ptr(perf_trace_buf[*rctxp]);
284 286
285 /* zero the dead bytes from align to not leak stack to user */ 287 /* zero the dead bytes from align to not leak stack to user */
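perf_trace_buf_prepare() now takes a struct pt_regs ** and, when it is non-NULL, hands back a per-CPU pt_regs scratch area; callers that already have real registers pass NULL, as the kprobe, uprobe and syscall hunks later in this diff do. A hedged sketch of a caller under the new contract; names and the entry layout are placeholders.

static void example_perf_probe(struct ftrace_event_call *call, int size)
{
	struct pt_regs *regs;
	struct trace_entry *entry;
	int rctx;

	/* request the per-CPU scratch regs by passing &regs */
	entry = perf_trace_buf_prepare(size, call->event.type, &regs, &rctx);
	if (!entry)
		return;

	/*
	 * Fill *regs (e.g. with perf_fetch_caller_regs(regs)) and the entry
	 * payload, then hand everything off with perf_trace_buf_submit().
	 */
}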
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 366a78a3e61e..db54dda10ccc 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -2429,12 +2429,39 @@ static __init int event_trace_memsetup(void)
2429 return 0; 2429 return 0;
2430} 2430}
2431 2431
2432static __init void
2433early_enable_events(struct trace_array *tr, bool disable_first)
2434{
2435 char *buf = bootup_event_buf;
2436 char *token;
2437 int ret;
2438
2439 while (true) {
2440 token = strsep(&buf, ",");
2441
2442 if (!token)
2443 break;
2444 if (!*token)
2445 continue;
2446
2447 /* Restarting syscalls requires that we stop them first */
2448 if (disable_first)
2449 ftrace_set_clr_event(tr, token, 0);
2450
2451 ret = ftrace_set_clr_event(tr, token, 1);
2452 if (ret)
2453 pr_warn("Failed to enable trace event: %s\n", token);
2454
2455 /* Put back the comma to allow this to be called again */
2456 if (buf)
2457 *(buf - 1) = ',';
2458 }
2459}
2460
2432static __init int event_trace_enable(void) 2461static __init int event_trace_enable(void)
2433{ 2462{
2434 struct trace_array *tr = top_trace_array(); 2463 struct trace_array *tr = top_trace_array();
2435 struct ftrace_event_call **iter, *call; 2464 struct ftrace_event_call **iter, *call;
2436 char *buf = bootup_event_buf;
2437 char *token;
2438 int ret; 2465 int ret;
2439 2466
2440 if (!tr) 2467 if (!tr)
@@ -2456,18 +2483,7 @@ static __init int event_trace_enable(void)
2456 */ 2483 */
2457 __trace_early_add_events(tr); 2484 __trace_early_add_events(tr);
2458 2485
2459 while (true) { 2486 early_enable_events(tr, false);
2460 token = strsep(&buf, ",");
2461
2462 if (!token)
2463 break;
2464 if (!*token)
2465 continue;
2466
2467 ret = ftrace_set_clr_event(tr, token, 1);
2468 if (ret)
2469 pr_warn("Failed to enable trace event: %s\n", token);
2470 }
2471 2487
2472 trace_printk_start_comm(); 2488 trace_printk_start_comm();
2473 2489
@@ -2478,6 +2494,31 @@ static __init int event_trace_enable(void)
2478 return 0; 2494 return 0;
2479} 2495}
2480 2496
2497/*
2498 * event_trace_enable() is called from trace_event_init() first to
2499 * initialize events and perhaps start any events that are on the
2500 * command line. Unfortunately, there are some events that will not
2501 * start this early, like the system call tracepoints that need
2502 * to set the TIF_SYSCALL_TRACEPOINT flag of pid 1. But event_trace_enable()
2503 * is called before pid 1 starts, so the flag is never set, the syscall
2504 * tracepoints are never reached, and the events are enabled without
2505 * actually doing anything.
2506 */
2507static __init int event_trace_enable_again(void)
2508{
2509 struct trace_array *tr;
2510
2511 tr = top_trace_array();
2512 if (!tr)
2513 return -ENODEV;
2514
2515 early_enable_events(tr, true);
2516
2517 return 0;
2518}
2519
2520early_initcall(event_trace_enable_again);
2521
2481static __init int event_trace_init(void) 2522static __init int event_trace_init(void)
2482{ 2523{
2483 struct trace_array *tr; 2524 struct trace_array *tr;
@@ -2490,7 +2531,7 @@ static __init int event_trace_init(void)
2490 return -ENODEV; 2531 return -ENODEV;
2491 2532
2492 d_tracer = tracing_init_dentry(); 2533 d_tracer = tracing_init_dentry();
2493 if (!d_tracer) 2534 if (IS_ERR(d_tracer))
2494 return 0; 2535 return 0;
2495 2536
2496 entry = debugfs_create_file("available_events", 0444, d_tracer, 2537 entry = debugfs_create_file("available_events", 0444, d_tracer,
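early_enable_events() relies on strsep() overwriting each ',' with a NUL and then restores the comma, so the boot string can be walked a second time by event_trace_enable_again(). A small stand-alone sketch of that re-scannable tokenizing trick; for_each_token() and handle() are illustrative.

#include <linux/string.h>

static void for_each_token(char *list, void (*handle)(const char *tok))
{
	char *buf = list;
	char *token;

	while ((token = strsep(&buf, ",")) != NULL) {
		if (*token)
			handle(token);
		if (buf)	/* restore the ',' that strsep() replaced with '\0' */
			*(buf - 1) = ',';
	}
}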
diff --git a/kernel/trace/trace_export.c b/kernel/trace/trace_export.c
index d4ddde28a81a..12e2b99be862 100644
--- a/kernel/trace/trace_export.c
+++ b/kernel/trace/trace_export.c
@@ -6,12 +6,10 @@
6#include <linux/stringify.h> 6#include <linux/stringify.h>
7#include <linux/kallsyms.h> 7#include <linux/kallsyms.h>
8#include <linux/seq_file.h> 8#include <linux/seq_file.h>
9#include <linux/debugfs.h>
10#include <linux/uaccess.h> 9#include <linux/uaccess.h>
11#include <linux/ftrace.h> 10#include <linux/ftrace.h>
12#include <linux/module.h> 11#include <linux/module.h>
13#include <linux/init.h> 12#include <linux/init.h>
14#include <linux/fs.h>
15 13
16#include "trace_output.h" 14#include "trace_output.h"
17 15
diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c
index ba476009e5de..2d25ad1526bb 100644
--- a/kernel/trace/trace_functions_graph.c
+++ b/kernel/trace/trace_functions_graph.c
@@ -1437,7 +1437,7 @@ static __init int init_graph_debugfs(void)
1437 struct dentry *d_tracer; 1437 struct dentry *d_tracer;
1438 1438
1439 d_tracer = tracing_init_dentry(); 1439 d_tracer = tracing_init_dentry();
1440 if (!d_tracer) 1440 if (IS_ERR(d_tracer))
1441 return 0; 1441 return 0;
1442 1442
1443 trace_create_file("max_graph_depth", 0644, d_tracer, 1443 trace_create_file("max_graph_depth", 0644, d_tracer,
diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c
index 9bb104f748d0..8523ea345f2b 100644
--- a/kernel/trace/trace_irqsoff.c
+++ b/kernel/trace/trace_irqsoff.c
@@ -10,11 +10,9 @@
10 * Copyright (C) 2004 Nadia Yvette Chambers 10 * Copyright (C) 2004 Nadia Yvette Chambers
11 */ 11 */
12#include <linux/kallsyms.h> 12#include <linux/kallsyms.h>
13#include <linux/debugfs.h>
14#include <linux/uaccess.h> 13#include <linux/uaccess.h>
15#include <linux/module.h> 14#include <linux/module.h>
16#include <linux/ftrace.h> 15#include <linux/ftrace.h>
17#include <linux/fs.h>
18 16
19#include "trace.h" 17#include "trace.h"
20 18
diff --git a/kernel/trace/trace_kdb.c b/kernel/trace/trace_kdb.c
index b0b1c44e923a..3ccf5c2c1320 100644
--- a/kernel/trace/trace_kdb.c
+++ b/kernel/trace/trace_kdb.c
@@ -132,8 +132,8 @@ static int kdb_ftdump(int argc, const char **argv)
132 132
133static __init int kdb_ftrace_register(void) 133static __init int kdb_ftrace_register(void)
134{ 134{
135 kdb_register_repeat("ftdump", kdb_ftdump, "[skip_#lines] [cpu]", 135 kdb_register_flags("ftdump", kdb_ftdump, "[skip_#lines] [cpu]",
136 "Dump ftrace log", 0, KDB_REPEAT_NONE); 136 "Dump ftrace log", 0, KDB_ENABLE_ALWAYS_SAFE);
137 return 0; 137 return 0;
138} 138}
139 139
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index 5edb518be345..d73f565b4e06 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -1148,7 +1148,7 @@ kprobe_perf_func(struct trace_kprobe *tk, struct pt_regs *regs)
1148 size = ALIGN(__size + sizeof(u32), sizeof(u64)); 1148 size = ALIGN(__size + sizeof(u32), sizeof(u64));
1149 size -= sizeof(u32); 1149 size -= sizeof(u32);
1150 1150
1151 entry = perf_trace_buf_prepare(size, call->event.type, regs, &rctx); 1151 entry = perf_trace_buf_prepare(size, call->event.type, NULL, &rctx);
1152 if (!entry) 1152 if (!entry)
1153 return; 1153 return;
1154 1154
@@ -1179,7 +1179,7 @@ kretprobe_perf_func(struct trace_kprobe *tk, struct kretprobe_instance *ri,
1179 size = ALIGN(__size + sizeof(u32), sizeof(u64)); 1179 size = ALIGN(__size + sizeof(u32), sizeof(u64));
1180 size -= sizeof(u32); 1180 size -= sizeof(u32);
1181 1181
1182 entry = perf_trace_buf_prepare(size, call->event.type, regs, &rctx); 1182 entry = perf_trace_buf_prepare(size, call->event.type, NULL, &rctx);
1183 if (!entry) 1183 if (!entry)
1184 return; 1184 return;
1185 1185
@@ -1320,7 +1320,7 @@ static __init int init_kprobe_trace(void)
1320 return -EINVAL; 1320 return -EINVAL;
1321 1321
1322 d_tracer = tracing_init_dentry(); 1322 d_tracer = tracing_init_dentry();
1323 if (!d_tracer) 1323 if (IS_ERR(d_tracer))
1324 return 0; 1324 return 0;
1325 1325
1326 entry = debugfs_create_file("kprobe_events", 0644, d_tracer, 1326 entry = debugfs_create_file("kprobe_events", 0644, d_tracer,
diff --git a/kernel/trace/trace_nop.c b/kernel/trace/trace_nop.c
index fcf0a9e48916..8bb2071474dd 100644
--- a/kernel/trace/trace_nop.c
+++ b/kernel/trace/trace_nop.c
@@ -6,8 +6,6 @@
6 */ 6 */
7 7
8#include <linux/module.h> 8#include <linux/module.h>
9#include <linux/fs.h>
10#include <linux/debugfs.h>
11#include <linux/ftrace.h> 9#include <linux/ftrace.h>
12 10
13#include "trace.h" 11#include "trace.h"
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index b77b9a697619..692bf7184c8c 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -177,6 +177,50 @@ ftrace_print_hex_seq(struct trace_seq *p, const unsigned char *buf, int buf_len)
177} 177}
178EXPORT_SYMBOL(ftrace_print_hex_seq); 178EXPORT_SYMBOL(ftrace_print_hex_seq);
179 179
180const char *
181ftrace_print_array_seq(struct trace_seq *p, const void *buf, int buf_len,
182 size_t el_size)
183{
184 const char *ret = trace_seq_buffer_ptr(p);
185 const char *prefix = "";
186 void *ptr = (void *)buf;
187
188 trace_seq_putc(p, '{');
189
190 while (ptr < buf + buf_len) {
191 switch (el_size) {
192 case 1:
193 trace_seq_printf(p, "%s0x%x", prefix,
194 *(u8 *)ptr);
195 break;
196 case 2:
197 trace_seq_printf(p, "%s0x%x", prefix,
198 *(u16 *)ptr);
199 break;
200 case 4:
201 trace_seq_printf(p, "%s0x%x", prefix,
202 *(u32 *)ptr);
203 break;
204 case 8:
205 trace_seq_printf(p, "%s0x%llx", prefix,
206 *(u64 *)ptr);
207 break;
208 default:
209 trace_seq_printf(p, "BAD SIZE:%zu 0x%x", el_size,
210 *(u8 *)ptr);
211 el_size = 1;
212 }
213 prefix = ",";
214 ptr += el_size;
215 }
216
217 trace_seq_putc(p, '}');
218 trace_seq_putc(p, 0);
219
220 return ret;
221}
222EXPORT_SYMBOL(ftrace_print_array_seq);
223
180int ftrace_raw_output_prep(struct trace_iterator *iter, 224int ftrace_raw_output_prep(struct trace_iterator *iter,
181 struct trace_event *trace_event) 225 struct trace_event *trace_event)
182{ 226{
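A hedged sketch of calling the new ftrace_print_array_seq() helper directly: note that, as written above, the third argument is the buffer length in bytes and the last is the element size that selects the switch arm. show_u32_array() is an illustrative wrapper, not part of the patch.

static const char *show_u32_array(struct trace_seq *p,
				  const u32 *vals, int nr_vals)
{
	/* emits the array as "{0x1,0x2,...}" into the trace_seq */
	return ftrace_print_array_seq(p, vals, nr_vals * sizeof(u32),
				      sizeof(u32));
}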
diff --git a/kernel/trace/trace_printk.c b/kernel/trace/trace_printk.c
index c4e70b6bd7fa..36c1455b7567 100644
--- a/kernel/trace/trace_printk.c
+++ b/kernel/trace/trace_printk.c
@@ -5,7 +5,6 @@
5 * 5 *
6 */ 6 */
7#include <linux/seq_file.h> 7#include <linux/seq_file.h>
8#include <linux/debugfs.h>
9#include <linux/uaccess.h> 8#include <linux/uaccess.h>
10#include <linux/kernel.h> 9#include <linux/kernel.h>
11#include <linux/ftrace.h> 10#include <linux/ftrace.h>
@@ -15,7 +14,6 @@
15#include <linux/ctype.h> 14#include <linux/ctype.h>
16#include <linux/list.h> 15#include <linux/list.h>
17#include <linux/slab.h> 16#include <linux/slab.h>
18#include <linux/fs.h>
19 17
20#include "trace.h" 18#include "trace.h"
21 19
@@ -349,7 +347,7 @@ static __init int init_trace_printk_function_export(void)
349 struct dentry *d_tracer; 347 struct dentry *d_tracer;
350 348
351 d_tracer = tracing_init_dentry(); 349 d_tracer = tracing_init_dentry();
352 if (!d_tracer) 350 if (IS_ERR(d_tracer))
353 return 0; 351 return 0;
354 352
355 trace_create_file("printk_formats", 0444, d_tracer, 353 trace_create_file("printk_formats", 0444, d_tracer,
diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c
index 2e293beb186e..419ca37e72c9 100644
--- a/kernel/trace/trace_sched_switch.c
+++ b/kernel/trace/trace_sched_switch.c
@@ -5,8 +5,6 @@
5 * 5 *
6 */ 6 */
7#include <linux/module.h> 7#include <linux/module.h>
8#include <linux/fs.h>
9#include <linux/debugfs.h>
10#include <linux/kallsyms.h> 8#include <linux/kallsyms.h>
11#include <linux/uaccess.h> 9#include <linux/uaccess.h>
12#include <linux/ftrace.h> 10#include <linux/ftrace.h>
diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c
index 8fb84b362816..d6e1003724e9 100644
--- a/kernel/trace/trace_sched_wakeup.c
+++ b/kernel/trace/trace_sched_wakeup.c
@@ -10,8 +10,6 @@
10 * Copyright (C) 2004 Nadia Yvette Chambers 10 * Copyright (C) 2004 Nadia Yvette Chambers
11 */ 11 */
12#include <linux/module.h> 12#include <linux/module.h>
13#include <linux/fs.h>
14#include <linux/debugfs.h>
15#include <linux/kallsyms.h> 13#include <linux/kallsyms.h>
16#include <linux/uaccess.h> 14#include <linux/uaccess.h>
17#include <linux/ftrace.h> 15#include <linux/ftrace.h>
diff --git a/kernel/trace/trace_seq.c b/kernel/trace/trace_seq.c
index f8b45d8792f9..e694c9f9efa4 100644
--- a/kernel/trace/trace_seq.c
+++ b/kernel/trace/trace_seq.c
@@ -120,7 +120,7 @@ void trace_seq_bitmask(struct trace_seq *s, const unsigned long *maskp,
120 120
121 __trace_seq_init(s); 121 __trace_seq_init(s);
122 122
123 seq_buf_bitmask(&s->seq, maskp, nmaskbits); 123 seq_buf_printf(&s->seq, "%*pb", nmaskbits, maskp);
124 124
125 if (unlikely(seq_buf_has_overflowed(&s->seq))) { 125 if (unlikely(seq_buf_has_overflowed(&s->seq))) {
126 s->seq.len = save_len; 126 s->seq.len = save_len;
diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c
index 16eddb308c33..c3e4fcfddd45 100644
--- a/kernel/trace/trace_stack.c
+++ b/kernel/trace/trace_stack.c
@@ -7,12 +7,10 @@
7#include <linux/seq_file.h> 7#include <linux/seq_file.h>
8#include <linux/spinlock.h> 8#include <linux/spinlock.h>
9#include <linux/uaccess.h> 9#include <linux/uaccess.h>
10#include <linux/debugfs.h>
11#include <linux/ftrace.h> 10#include <linux/ftrace.h>
12#include <linux/module.h> 11#include <linux/module.h>
13#include <linux/sysctl.h> 12#include <linux/sysctl.h>
14#include <linux/init.h> 13#include <linux/init.h>
15#include <linux/fs.h>
16 14
17#include <asm/setup.h> 15#include <asm/setup.h>
18 16
@@ -462,7 +460,7 @@ static __init int stack_trace_init(void)
462 struct dentry *d_tracer; 460 struct dentry *d_tracer;
463 461
464 d_tracer = tracing_init_dentry(); 462 d_tracer = tracing_init_dentry();
465 if (!d_tracer) 463 if (IS_ERR(d_tracer))
466 return 0; 464 return 0;
467 465
468 trace_create_file("stack_max_size", 0644, d_tracer, 466 trace_create_file("stack_max_size", 0644, d_tracer,
diff --git a/kernel/trace/trace_stat.c b/kernel/trace/trace_stat.c
index 7af67360b330..75e19e86c954 100644
--- a/kernel/trace/trace_stat.c
+++ b/kernel/trace/trace_stat.c
@@ -276,7 +276,7 @@ static int tracing_stat_init(void)
276 struct dentry *d_tracing; 276 struct dentry *d_tracing;
277 277
278 d_tracing = tracing_init_dentry(); 278 d_tracing = tracing_init_dentry();
279 if (!d_tracing) 279 if (IS_ERR(d_tracing))
280 return 0; 280 return 0;
281 281
282 stat_dir = debugfs_create_dir("trace_stat", d_tracing); 282 stat_dir = debugfs_create_dir("trace_stat", d_tracing);
diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c
index c6ee36fcbf90..f97f6e3a676c 100644
--- a/kernel/trace/trace_syscalls.c
+++ b/kernel/trace/trace_syscalls.c
@@ -574,7 +574,7 @@ static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id)
574 size -= sizeof(u32); 574 size -= sizeof(u32);
575 575
576 rec = (struct syscall_trace_enter *)perf_trace_buf_prepare(size, 576 rec = (struct syscall_trace_enter *)perf_trace_buf_prepare(size,
577 sys_data->enter_event->event.type, regs, &rctx); 577 sys_data->enter_event->event.type, NULL, &rctx);
578 if (!rec) 578 if (!rec)
579 return; 579 return;
580 580
@@ -647,7 +647,7 @@ static void perf_syscall_exit(void *ignore, struct pt_regs *regs, long ret)
647 size -= sizeof(u32); 647 size -= sizeof(u32);
648 648
649 rec = (struct syscall_trace_exit *)perf_trace_buf_prepare(size, 649 rec = (struct syscall_trace_exit *)perf_trace_buf_prepare(size,
650 sys_data->exit_event->event.type, regs, &rctx); 650 sys_data->exit_event->event.type, NULL, &rctx);
651 if (!rec) 651 if (!rec)
652 return; 652 return;
653 653
diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c
index 8520acc34b18..7dc1c8abecd6 100644
--- a/kernel/trace/trace_uprobe.c
+++ b/kernel/trace/trace_uprobe.c
@@ -1111,7 +1111,7 @@ static void __uprobe_perf_func(struct trace_uprobe *tu,
1111 if (hlist_empty(head)) 1111 if (hlist_empty(head))
1112 goto out; 1112 goto out;
1113 1113
1114 entry = perf_trace_buf_prepare(size, call->event.type, regs, &rctx); 1114 entry = perf_trace_buf_prepare(size, call->event.type, NULL, &rctx);
1115 if (!entry) 1115 if (!entry)
1116 goto out; 1116 goto out;
1117 1117
@@ -1321,7 +1321,7 @@ static __init int init_uprobe_trace(void)
1321 struct dentry *d_tracer; 1321 struct dentry *d_tracer;
1322 1322
1323 d_tracer = tracing_init_dentry(); 1323 d_tracer = tracing_init_dentry();
1324 if (!d_tracer) 1324 if (IS_ERR(d_tracer))
1325 return 0; 1325 return 0;
1326 1326
1327 trace_create_file("uprobe_events", 0644, d_tracer, 1327 trace_create_file("uprobe_events", 0644, d_tracer,
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index 70bf11815f84..3174bf8e3538 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -154,7 +154,7 @@ static int get_softlockup_thresh(void)
154 */ 154 */
155static unsigned long get_timestamp(void) 155static unsigned long get_timestamp(void)
156{ 156{
157 return local_clock() >> 30LL; /* 2^30 ~= 10^9 */ 157 return running_clock() >> 30LL; /* 2^30 ~= 10^9 */
158} 158}
159 159
160static void set_sample_period(void) 160static void set_sample_period(void)
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 6202b08f1933..f28849394791 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -1841,17 +1841,11 @@ static void pool_mayday_timeout(unsigned long __pool)
1841 * spin_lock_irq(pool->lock) which may be released and regrabbed 1841 * spin_lock_irq(pool->lock) which may be released and regrabbed
1842 * multiple times. Does GFP_KERNEL allocations. Called only from 1842 * multiple times. Does GFP_KERNEL allocations. Called only from
1843 * manager. 1843 * manager.
1844 *
1845 * Return:
1846 * %false if no action was taken and pool->lock stayed locked, %true
1847 * otherwise.
1848 */ 1844 */
1849static bool maybe_create_worker(struct worker_pool *pool) 1845static void maybe_create_worker(struct worker_pool *pool)
1850__releases(&pool->lock) 1846__releases(&pool->lock)
1851__acquires(&pool->lock) 1847__acquires(&pool->lock)
1852{ 1848{
1853 if (!need_to_create_worker(pool))
1854 return false;
1855restart: 1849restart:
1856 spin_unlock_irq(&pool->lock); 1850 spin_unlock_irq(&pool->lock);
1857 1851
@@ -1877,7 +1871,6 @@ restart:
1877 */ 1871 */
1878 if (need_to_create_worker(pool)) 1872 if (need_to_create_worker(pool))
1879 goto restart; 1873 goto restart;
1880 return true;
1881} 1874}
1882 1875
1883/** 1876/**
@@ -1897,16 +1890,14 @@ restart:
1897 * multiple times. Does GFP_KERNEL allocations. 1890 * multiple times. Does GFP_KERNEL allocations.
1898 * 1891 *
1899 * Return: 1892 * Return:
1900 * %false if the pool don't need management and the caller can safely start 1893 * %false if the pool doesn't need management and the caller can safely
1901 * processing works, %true indicates that the function released pool->lock 1894 * start processing works, %true if management function was performed and
1902 * and reacquired it to perform some management function and that the 1895 * the conditions that the caller verified before calling the function may
1903 * conditions that the caller verified while holding the lock before 1896 * no longer be true.
1904 * calling the function might no longer be true.
1905 */ 1897 */
1906static bool manage_workers(struct worker *worker) 1898static bool manage_workers(struct worker *worker)
1907{ 1899{
1908 struct worker_pool *pool = worker->pool; 1900 struct worker_pool *pool = worker->pool;
1909 bool ret = false;
1910 1901
1911 /* 1902 /*
1912 * Anyone who successfully grabs manager_arb wins the arbitration 1903 * Anyone who successfully grabs manager_arb wins the arbitration
@@ -1919,12 +1910,12 @@ static bool manage_workers(struct worker *worker)
1919 * actual management, the pool may stall indefinitely. 1910 * actual management, the pool may stall indefinitely.
1920 */ 1911 */
1921 if (!mutex_trylock(&pool->manager_arb)) 1912 if (!mutex_trylock(&pool->manager_arb))
1922 return ret; 1913 return false;
1923 1914
1924 ret |= maybe_create_worker(pool); 1915 maybe_create_worker(pool);
1925 1916
1926 mutex_unlock(&pool->manager_arb); 1917 mutex_unlock(&pool->manager_arb);
1927 return ret; 1918 return true;
1928} 1919}
1929 1920
1930/** 1921/**
@@ -3092,10 +3083,9 @@ static ssize_t wq_cpumask_show(struct device *dev,
3092 int written; 3083 int written;
3093 3084
3094 mutex_lock(&wq->mutex); 3085 mutex_lock(&wq->mutex);
3095 written = cpumask_scnprintf(buf, PAGE_SIZE, wq->unbound_attrs->cpumask); 3086 written = scnprintf(buf, PAGE_SIZE, "%*pb\n",
3087 cpumask_pr_args(wq->unbound_attrs->cpumask));
3096 mutex_unlock(&wq->mutex); 3088 mutex_unlock(&wq->mutex);
3097
3098 written += scnprintf(buf + written, PAGE_SIZE - written, "\n");
3099 return written; 3089 return written;
3100} 3090}
3101 3091