Diffstat (limited to 'kernel')
-rw-r--r--  kernel/Kconfig.locks | 4
-rw-r--r--  kernel/Makefile | 7
-rw-r--r--  kernel/acct.c | 94
-rw-r--r--  kernel/audit.h | 17
-rw-r--r--  kernel/auditfilter.c | 2
-rw-r--r--  kernel/auditsc.c | 176
-rw-r--r--  kernel/cgroup.c | 12
-rw-r--r--  kernel/compat.c | 5
-rw-r--r--  kernel/cpu.c | 56
-rw-r--r--  kernel/cpuset.c | 44
-rw-r--r--  kernel/events/Makefile | 2
-rw-r--r--  kernel/events/core.c | 490
-rw-r--r--  kernel/events/ring_buffer.c | 3
-rw-r--r--  kernel/exit.c | 3
-rw-r--r--  kernel/fork.c | 17
-rw-r--r--  kernel/futex.c | 8
-rw-r--r--  kernel/gcov/Makefile | 36
-rw-r--r--  kernel/irq/manage.c | 3
-rw-r--r--  kernel/irq/proc.c | 11
-rw-r--r--  kernel/kexec.c | 25
-rw-r--r--  kernel/kprobes.c | 22
-rw-r--r--  kernel/livepatch/Kconfig | 18
-rw-r--r--  kernel/livepatch/Makefile | 3
-rw-r--r--  kernel/livepatch/core.c | 1015
-rw-r--r--  kernel/locking/Makefile | 11
-rw-r--r--  kernel/locking/mcs_spinlock.h | 16
-rw-r--r--  kernel/locking/mutex.c | 62
-rw-r--r--  kernel/locking/osq_lock.c (renamed from kernel/locking/mcs_spinlock.c) | 9
-rw-r--r--  kernel/locking/rtmutex.c | 7
-rw-r--r--  kernel/locking/rwsem-spinlock.c | 2
-rw-r--r--  kernel/locking/rwsem-xadd.c | 3
-rw-r--r--  kernel/locking/spinlock.c | 8
-rw-r--r--  kernel/module.c | 58
-rw-r--r--  kernel/notifier.c | 3
-rw-r--r--  kernel/padata.c | 11
-rw-r--r--  kernel/panic.c | 2
-rw-r--r--  kernel/power/Kconfig | 1
-rw-r--r--  kernel/power/process.c | 75
-rw-r--r--  kernel/power/qos.c | 91
-rw-r--r--  kernel/power/snapshot.c | 11
-rw-r--r--  kernel/power/suspend.c | 43
-rw-r--r--  kernel/printk/printk.c | 12
-rw-r--r--  kernel/profile.c | 3
-rw-r--r--  kernel/ptrace.c | 1
-rw-r--r--  kernel/rcu/Makefile | 3
-rw-r--r--  kernel/rcu/rcu.h | 6
-rw-r--r--  kernel/rcu/rcutorture.c | 66
-rw-r--r--  kernel/rcu/srcu.c | 2
-rw-r--r--  kernel/rcu/tiny.c | 113
-rw-r--r--  kernel/rcu/tiny_plugin.h | 9
-rw-r--r--  kernel/rcu/tree.c | 355
-rw-r--r--  kernel/rcu/tree.h | 62
-rw-r--r--  kernel/rcu/tree_plugin.h | 276
-rw-r--r--  kernel/rcu/tree_trace.c | 8
-rw-r--r--  kernel/resource.c | 25
-rw-r--r--  kernel/sched/Makefile | 2
-rw-r--r--  kernel/sched/clock.c | 13
-rw-r--r--  kernel/sched/completion.c | 18
-rw-r--r--  kernel/sched/core.c | 119
-rw-r--r--  kernel/sched/cpudeadline.c | 27
-rw-r--r--  kernel/sched/cpudeadline.h | 2
-rw-r--r--  kernel/sched/deadline.c | 51
-rw-r--r--  kernel/sched/debug.c | 1
-rw-r--r--  kernel/sched/fair.c | 7
-rw-r--r--  kernel/sched/idle.c | 19
-rw-r--r--  kernel/sched/rt.c | 26
-rw-r--r--  kernel/sched/sched.h | 22
-rw-r--r--  kernel/sched/stats.c | 11
-rw-r--r--  kernel/seccomp.c | 4
-rw-r--r--  kernel/signal.c | 6
-rw-r--r--  kernel/softirq.c | 9
-rw-r--r--  kernel/sysctl.c | 3
-rw-r--r--  kernel/taskstats.c | 13
-rw-r--r--  kernel/time/Makefile | 2
-rw-r--r--  kernel/time/alarmtimer.c | 2
-rw-r--r--  kernel/time/clocksource.c | 76
-rw-r--r--  kernel/time/hrtimer.c | 114
-rw-r--r--  kernel/time/ntp.c | 4
-rw-r--r--  kernel/time/posix-cpu-timers.c | 3
-rw-r--r--  kernel/time/tick-common.c | 50
-rw-r--r--  kernel/time/tick-sched.c | 11
-rw-r--r--  kernel/time/timecounter.c | 112
-rw-r--r--  kernel/time/timekeeping.c | 60
-rw-r--r--  kernel/time/timekeeping.h | 2
-rw-r--r--  kernel/trace/Makefile | 4
-rw-r--r--  kernel/trace/ftrace.c | 2
-rw-r--r--  kernel/trace/power-traces.c | 1
-rw-r--r--  kernel/trace/ring_buffer.c | 42
-rw-r--r--  kernel/trace/ring_buffer_benchmark.c | 18
-rw-r--r--  kernel/trace/trace.c | 194
-rw-r--r--  kernel/trace/trace.h | 2
-rw-r--r--  kernel/trace/trace_branch.c | 1
-rw-r--r--  kernel/trace/trace_event_perf.c | 4
-rw-r--r--  kernel/trace/trace_events.c | 2
-rw-r--r--  kernel/trace/trace_export.c | 2
-rw-r--r--  kernel/trace/trace_functions_graph.c | 2
-rw-r--r--  kernel/trace/trace_irqsoff.c | 2
-rw-r--r--  kernel/trace/trace_kprobe.c | 6
-rw-r--r--  kernel/trace/trace_nop.c | 2
-rw-r--r--  kernel/trace/trace_output.c | 44
-rw-r--r--  kernel/trace/trace_printk.c | 4
-rw-r--r--  kernel/trace/trace_sched_switch.c | 2
-rw-r--r--  kernel/trace/trace_sched_wakeup.c | 2
-rw-r--r--  kernel/trace/trace_seq.c | 2
-rw-r--r--  kernel/trace/trace_stack.c | 4
-rw-r--r--  kernel/trace/trace_stat.c | 2
-rw-r--r--  kernel/trace/trace_syscalls.c | 4
-rw-r--r--  kernel/trace/trace_uprobe.c | 4
-rw-r--r--  kernel/watchdog.c | 2
-rw-r--r--  kernel/workqueue.c | 5
110 files changed, 2981 insertions, 1594 deletions
diff --git a/kernel/Kconfig.locks b/kernel/Kconfig.locks
index 76768ee812b2..08561f1acd13 100644
--- a/kernel/Kconfig.locks
+++ b/kernel/Kconfig.locks
@@ -231,6 +231,10 @@ config RWSEM_SPIN_ON_OWNER
231 def_bool y 231 def_bool y
232 depends on SMP && RWSEM_XCHGADD_ALGORITHM && ARCH_SUPPORTS_ATOMIC_RMW 232 depends on SMP && RWSEM_XCHGADD_ALGORITHM && ARCH_SUPPORTS_ATOMIC_RMW
233 233
234config LOCK_SPIN_ON_OWNER
235 def_bool y
236 depends on MUTEX_SPIN_ON_OWNER || RWSEM_SPIN_ON_OWNER
237
234config ARCH_USE_QUEUE_RWLOCK 238config ARCH_USE_QUEUE_RWLOCK
235 bool 239 bool
236 240
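
The new LOCK_SPIN_ON_OWNER symbol is simply an umbrella over the two existing owner-spinning options, presumably so the optimistic-spin-queue code shared by mutexes and rwsems can be built once (note the mcs_spinlock.c -> osq_lock.c rename in the diffstat). A minimal sketch of how a lock type might key off it; the struct and its fields below are illustrative, not taken from this diff:

#include <linux/atomic.h>
#include <linux/list.h>
#include <linux/sched.h>
#include <linux/spinlock.h>

/* Hypothetical sleeping lock that only carries owner-spinning state
 * when some form of owner spinning is configured. */
struct my_sleeping_lock {
	atomic_t		count;
	raw_spinlock_t		wait_lock;
	struct list_head	wait_list;
#ifdef CONFIG_LOCK_SPIN_ON_OWNER
	struct task_struct	*owner;		/* polled by spinners */
#endif
};
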
diff --git a/kernel/Makefile b/kernel/Makefile
index a59481a3fa6c..1408b3353a3c 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -13,8 +13,8 @@ obj-y = fork.o exec_domain.o panic.o \
13 13
14ifdef CONFIG_FUNCTION_TRACER 14ifdef CONFIG_FUNCTION_TRACER
15# Do not trace debug files and internal ftrace files 15# Do not trace debug files and internal ftrace files
16CFLAGS_REMOVE_cgroup-debug.o = -pg 16CFLAGS_REMOVE_cgroup-debug.o = $(CC_FLAGS_FTRACE)
17CFLAGS_REMOVE_irq_work.o = -pg 17CFLAGS_REMOVE_irq_work.o = $(CC_FLAGS_FTRACE)
18endif 18endif
19 19
20# cond_syscall is currently not LTO compatible 20# cond_syscall is currently not LTO compatible
@@ -26,6 +26,7 @@ obj-y += power/
26obj-y += printk/ 26obj-y += printk/
27obj-y += irq/ 27obj-y += irq/
28obj-y += rcu/ 28obj-y += rcu/
29obj-y += livepatch/
29 30
30obj-$(CONFIG_CHECKPOINT_RESTORE) += kcmp.o 31obj-$(CONFIG_CHECKPOINT_RESTORE) += kcmp.o
31obj-$(CONFIG_FREEZER) += freezer.o 32obj-$(CONFIG_FREEZER) += freezer.o
@@ -142,7 +143,7 @@ endif
142kernel/system_certificates.o: $(obj)/x509_certificate_list 143kernel/system_certificates.o: $(obj)/x509_certificate_list
143 144
144quiet_cmd_x509certs = CERTS $@ 145quiet_cmd_x509certs = CERTS $@
145 cmd_x509certs = cat $(X509_CERTIFICATES) /dev/null >$@ $(foreach X509,$(X509_CERTIFICATES),; echo " - Including cert $(X509)") 146 cmd_x509certs = cat $(X509_CERTIFICATES) /dev/null >$@ $(foreach X509,$(X509_CERTIFICATES),; $(kecho) " - Including cert $(X509)")
146 147
147targets += $(obj)/x509_certificate_list 148targets += $(obj)/x509_certificate_list
148$(obj)/x509_certificate_list: $(X509_CERTIFICATES) $(obj)/.x509.list 149$(obj)/x509_certificate_list: $(X509_CERTIFICATES) $(obj)/.x509.list
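
Both Makefile hunks are build tweaks: the per-object ftrace exclusions now use $(CC_FLAGS_FTRACE) instead of a hard-coded -pg, and the certificate message goes through $(kecho) so it follows the build's verbosity settings. At function granularity the same exclusion is usually done in C with the notrace annotation; a tiny illustrative example:

#include <linux/compiler.h>

/* Keep a single helper out of the function tracer without pulling the
 * whole object file out of instrumentation. */
static unsigned long notrace my_untraced_helper(unsigned long x)
{
	return x * 2;
}
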
diff --git a/kernel/acct.c b/kernel/acct.c
index 33738ef972f3..e6c10d1a4058 100644
--- a/kernel/acct.c
+++ b/kernel/acct.c
@@ -76,10 +76,11 @@ int acct_parm[3] = {4, 2, 30};
76/* 76/*
77 * External references and all of the globals. 77 * External references and all of the globals.
78 */ 78 */
79static void do_acct_process(struct bsd_acct_struct *acct);
80 79
81struct bsd_acct_struct { 80struct bsd_acct_struct {
82 struct fs_pin pin; 81 struct fs_pin pin;
82 atomic_long_t count;
83 struct rcu_head rcu;
83 struct mutex lock; 84 struct mutex lock;
84 int active; 85 int active;
85 unsigned long needcheck; 86 unsigned long needcheck;
@@ -89,6 +90,8 @@ struct bsd_acct_struct {
89 struct completion done; 90 struct completion done;
90}; 91};
91 92
93static void do_acct_process(struct bsd_acct_struct *acct);
94
92/* 95/*
93 * Check the amount of free space and suspend/resume accordingly. 96 * Check the amount of free space and suspend/resume accordingly.
94 */ 97 */
@@ -124,32 +127,56 @@ out:
124 return acct->active; 127 return acct->active;
125} 128}
126 129
130static void acct_put(struct bsd_acct_struct *p)
131{
132 if (atomic_long_dec_and_test(&p->count))
133 kfree_rcu(p, rcu);
134}
135
136static inline struct bsd_acct_struct *to_acct(struct fs_pin *p)
137{
138 return p ? container_of(p, struct bsd_acct_struct, pin) : NULL;
139}
140
127static struct bsd_acct_struct *acct_get(struct pid_namespace *ns) 141static struct bsd_acct_struct *acct_get(struct pid_namespace *ns)
128{ 142{
129 struct bsd_acct_struct *res; 143 struct bsd_acct_struct *res;
130again: 144again:
131 smp_rmb(); 145 smp_rmb();
132 rcu_read_lock(); 146 rcu_read_lock();
133 res = ACCESS_ONCE(ns->bacct); 147 res = to_acct(ACCESS_ONCE(ns->bacct));
134 if (!res) { 148 if (!res) {
135 rcu_read_unlock(); 149 rcu_read_unlock();
136 return NULL; 150 return NULL;
137 } 151 }
138 if (!atomic_long_inc_not_zero(&res->pin.count)) { 152 if (!atomic_long_inc_not_zero(&res->count)) {
139 rcu_read_unlock(); 153 rcu_read_unlock();
140 cpu_relax(); 154 cpu_relax();
141 goto again; 155 goto again;
142 } 156 }
143 rcu_read_unlock(); 157 rcu_read_unlock();
144 mutex_lock(&res->lock); 158 mutex_lock(&res->lock);
145 if (!res->ns) { 159 if (res != to_acct(ACCESS_ONCE(ns->bacct))) {
146 mutex_unlock(&res->lock); 160 mutex_unlock(&res->lock);
147 pin_put(&res->pin); 161 acct_put(res);
148 goto again; 162 goto again;
149 } 163 }
150 return res; 164 return res;
151} 165}
152 166
167static void acct_pin_kill(struct fs_pin *pin)
168{
169 struct bsd_acct_struct *acct = to_acct(pin);
170 mutex_lock(&acct->lock);
171 do_acct_process(acct);
172 schedule_work(&acct->work);
173 wait_for_completion(&acct->done);
174 cmpxchg(&acct->ns->bacct, pin, NULL);
175 mutex_unlock(&acct->lock);
176 pin_remove(pin);
177 acct_put(acct);
178}
179
153static void close_work(struct work_struct *work) 180static void close_work(struct work_struct *work)
154{ 181{
155 struct bsd_acct_struct *acct = container_of(work, struct bsd_acct_struct, work); 182 struct bsd_acct_struct *acct = container_of(work, struct bsd_acct_struct, work);
@@ -160,44 +187,13 @@ static void close_work(struct work_struct *work)
160 complete(&acct->done); 187 complete(&acct->done);
161} 188}
162 189
163static void acct_kill(struct bsd_acct_struct *acct,
164 struct bsd_acct_struct *new)
165{
166 if (acct) {
167 struct pid_namespace *ns = acct->ns;
168 do_acct_process(acct);
169 INIT_WORK(&acct->work, close_work);
170 init_completion(&acct->done);
171 schedule_work(&acct->work);
172 wait_for_completion(&acct->done);
173 pin_remove(&acct->pin);
174 ns->bacct = new;
175 acct->ns = NULL;
176 atomic_long_dec(&acct->pin.count);
177 mutex_unlock(&acct->lock);
178 pin_put(&acct->pin);
179 }
180}
181
182static void acct_pin_kill(struct fs_pin *pin)
183{
184 struct bsd_acct_struct *acct;
185 acct = container_of(pin, struct bsd_acct_struct, pin);
186 mutex_lock(&acct->lock);
187 if (!acct->ns) {
188 mutex_unlock(&acct->lock);
189 pin_put(pin);
190 acct = NULL;
191 }
192 acct_kill(acct, NULL);
193}
194
195static int acct_on(struct filename *pathname) 190static int acct_on(struct filename *pathname)
196{ 191{
197 struct file *file; 192 struct file *file;
198 struct vfsmount *mnt, *internal; 193 struct vfsmount *mnt, *internal;
199 struct pid_namespace *ns = task_active_pid_ns(current); 194 struct pid_namespace *ns = task_active_pid_ns(current);
200 struct bsd_acct_struct *acct, *old; 195 struct bsd_acct_struct *acct;
196 struct fs_pin *old;
201 int err; 197 int err;
202 198
203 acct = kzalloc(sizeof(struct bsd_acct_struct), GFP_KERNEL); 199 acct = kzalloc(sizeof(struct bsd_acct_struct), GFP_KERNEL);
@@ -238,21 +234,21 @@ static int acct_on(struct filename *pathname)
238 mnt = file->f_path.mnt; 234 mnt = file->f_path.mnt;
239 file->f_path.mnt = internal; 235 file->f_path.mnt = internal;
240 236
241 atomic_long_set(&acct->pin.count, 1); 237 atomic_long_set(&acct->count, 1);
242 acct->pin.kill = acct_pin_kill; 238 init_fs_pin(&acct->pin, acct_pin_kill);
243 acct->file = file; 239 acct->file = file;
244 acct->needcheck = jiffies; 240 acct->needcheck = jiffies;
245 acct->ns = ns; 241 acct->ns = ns;
246 mutex_init(&acct->lock); 242 mutex_init(&acct->lock);
243 INIT_WORK(&acct->work, close_work);
244 init_completion(&acct->done);
247 mutex_lock_nested(&acct->lock, 1); /* nobody has seen it yet */ 245 mutex_lock_nested(&acct->lock, 1); /* nobody has seen it yet */
248 pin_insert(&acct->pin, mnt); 246 pin_insert(&acct->pin, mnt);
249 247
250 old = acct_get(ns); 248 rcu_read_lock();
251 if (old) 249 old = xchg(&ns->bacct, &acct->pin);
252 acct_kill(old, acct);
253 else
254 ns->bacct = acct;
255 mutex_unlock(&acct->lock); 250 mutex_unlock(&acct->lock);
251 pin_kill(old);
256 mnt_drop_write(mnt); 252 mnt_drop_write(mnt);
257 mntput(mnt); 253 mntput(mnt);
258 return 0; 254 return 0;
@@ -288,7 +284,8 @@ SYSCALL_DEFINE1(acct, const char __user *, name)
288 mutex_unlock(&acct_on_mutex); 284 mutex_unlock(&acct_on_mutex);
289 putname(tmp); 285 putname(tmp);
290 } else { 286 } else {
291 acct_kill(acct_get(task_active_pid_ns(current)), NULL); 287 rcu_read_lock();
288 pin_kill(task_active_pid_ns(current)->bacct);
292 } 289 }
293 290
294 return error; 291 return error;
@@ -296,7 +293,8 @@ SYSCALL_DEFINE1(acct, const char __user *, name)
296 293
297void acct_exit_ns(struct pid_namespace *ns) 294void acct_exit_ns(struct pid_namespace *ns)
298{ 295{
299 acct_kill(acct_get(ns), NULL); 296 rcu_read_lock();
297 pin_kill(ns->bacct);
300} 298}
301 299
302/* 300/*
@@ -576,7 +574,7 @@ static void slow_acct_process(struct pid_namespace *ns)
576 if (acct) { 574 if (acct) {
577 do_acct_process(acct); 575 do_acct_process(acct);
578 mutex_unlock(&acct->lock); 576 mutex_unlock(&acct->lock);
579 pin_put(&acct->pin); 577 acct_put(acct);
580 } 578 }
581 } 579 }
582} 580}
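
The acct.c rework moves the reference count out of fs_pin and into bsd_acct_struct itself (count plus rcu head), so the last acct_put() can free with kfree_rcu() while acct_get() keeps its lockless lookup-and-revalidate loop. A generic sketch of that pattern under the same assumptions; my_obj and my_slot are invented names:

#include <linux/atomic.h>
#include <linux/mutex.h>
#include <linux/rcupdate.h>
#include <linux/sched.h>
#include <linux/slab.h>

struct my_obj {
	atomic_long_t	count;
	struct rcu_head	rcu;
	struct mutex	lock;
};

static struct my_obj __rcu *my_slot;		/* published pointer */

static void my_obj_put(struct my_obj *p)
{
	if (atomic_long_dec_and_test(&p->count))
		kfree_rcu(p, rcu);		/* free after a grace period */
}

static struct my_obj *my_obj_get(void)
{
	struct my_obj *res;
again:
	rcu_read_lock();
	res = rcu_dereference(my_slot);
	if (!res) {
		rcu_read_unlock();
		return NULL;
	}
	/* the object may already be dying; only pin it if it is still live */
	if (!atomic_long_inc_not_zero(&res->count)) {
		rcu_read_unlock();
		cpu_relax();
		goto again;
	}
	rcu_read_unlock();

	mutex_lock(&res->lock);
	/* revalidate: the slot may have been repointed while we slept */
	if (res != rcu_access_pointer(my_slot)) {
		mutex_unlock(&res->lock);
		my_obj_put(res);
		goto again;
	}
	return res;
}
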
diff --git a/kernel/audit.h b/kernel/audit.h
index 3cdffad5a1d9..1caa0d345d90 100644
--- a/kernel/audit.h
+++ b/kernel/audit.h
@@ -24,12 +24,6 @@
24#include <linux/skbuff.h> 24#include <linux/skbuff.h>
25#include <uapi/linux/mqueue.h> 25#include <uapi/linux/mqueue.h>
26 26
27/* 0 = no checking
28 1 = put_count checking
29 2 = verbose put_count checking
30*/
31#define AUDIT_DEBUG 0
32
33/* AUDIT_NAMES is the number of slots we reserve in the audit_context 27/* AUDIT_NAMES is the number of slots we reserve in the audit_context
34 * for saving names from getname(). If we get more names we will allocate 28 * for saving names from getname(). If we get more names we will allocate
35 * a name dynamically and also add those to the list anchored by names_list. */ 29 * a name dynamically and also add those to the list anchored by names_list. */
@@ -74,9 +68,8 @@ struct audit_cap_data {
74 }; 68 };
75}; 69};
76 70
77/* When fs/namei.c:getname() is called, we store the pointer in name and 71/* When fs/namei.c:getname() is called, we store the pointer in name and bump
78 * we don't let putname() free it (instead we free all of the saved 72 * the refcnt in the associated filename struct.
79 * pointers at syscall exit time).
80 * 73 *
81 * Further, in fs/namei.c:path_lookup() we store the inode and device. 74 * Further, in fs/namei.c:path_lookup() we store the inode and device.
82 */ 75 */
@@ -86,7 +79,6 @@ struct audit_names {
86 struct filename *name; 79 struct filename *name;
87 int name_len; /* number of chars to log */ 80 int name_len; /* number of chars to log */
88 bool hidden; /* don't log this record */ 81 bool hidden; /* don't log this record */
89 bool name_put; /* call __putname()? */
90 82
91 unsigned long ino; 83 unsigned long ino;
92 dev_t dev; 84 dev_t dev;
@@ -208,11 +200,6 @@ struct audit_context {
208 }; 200 };
209 int fds[2]; 201 int fds[2];
210 struct audit_proctitle proctitle; 202 struct audit_proctitle proctitle;
211
212#if AUDIT_DEBUG
213 int put_count;
214 int ino_count;
215#endif
216}; 203};
217 204
218extern u32 audit_ever_enabled; 205extern u32 audit_ever_enabled;
diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c
index 4f68a326d92e..72e1660a79a3 100644
--- a/kernel/auditfilter.c
+++ b/kernel/auditfilter.c
@@ -425,7 +425,6 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data,
425 goto exit_nofree; 425 goto exit_nofree;
426 426
427 bufp = data->buf; 427 bufp = data->buf;
428 entry->rule.vers_ops = 2;
429 for (i = 0; i < data->field_count; i++) { 428 for (i = 0; i < data->field_count; i++) {
430 struct audit_field *f = &entry->rule.fields[i]; 429 struct audit_field *f = &entry->rule.fields[i];
431 430
@@ -758,7 +757,6 @@ struct audit_entry *audit_dupe_rule(struct audit_krule *old)
758 return ERR_PTR(-ENOMEM); 757 return ERR_PTR(-ENOMEM);
759 758
760 new = &entry->rule; 759 new = &entry->rule;
761 new->vers_ops = old->vers_ops;
762 new->flags = old->flags; 760 new->flags = old->flags;
763 new->pflags = old->pflags; 761 new->pflags = old->pflags;
764 new->listnr = old->listnr; 762 new->listnr = old->listnr;
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index 072566dd0caf..dc4ae70a7413 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -866,33 +866,10 @@ static inline void audit_free_names(struct audit_context *context)
866{ 866{
867 struct audit_names *n, *next; 867 struct audit_names *n, *next;
868 868
869#if AUDIT_DEBUG == 2
870 if (context->put_count + context->ino_count != context->name_count) {
871 int i = 0;
872
873 pr_err("%s:%d(:%d): major=%d in_syscall=%d"
874 " name_count=%d put_count=%d ino_count=%d"
875 " [NOT freeing]\n", __FILE__, __LINE__,
876 context->serial, context->major, context->in_syscall,
877 context->name_count, context->put_count,
878 context->ino_count);
879 list_for_each_entry(n, &context->names_list, list) {
880 pr_err("names[%d] = %p = %s\n", i++, n->name,
881 n->name->name ?: "(null)");
882 }
883 dump_stack();
884 return;
885 }
886#endif
887#if AUDIT_DEBUG
888 context->put_count = 0;
889 context->ino_count = 0;
890#endif
891
892 list_for_each_entry_safe(n, next, &context->names_list, list) { 869 list_for_each_entry_safe(n, next, &context->names_list, list) {
893 list_del(&n->list); 870 list_del(&n->list);
894 if (n->name && n->name_put) 871 if (n->name)
895 final_putname(n->name); 872 putname(n->name);
896 if (n->should_free) 873 if (n->should_free)
897 kfree(n); 874 kfree(n);
898 } 875 }
@@ -1711,9 +1688,6 @@ static struct audit_names *audit_alloc_name(struct audit_context *context,
1711 list_add_tail(&aname->list, &context->names_list); 1688 list_add_tail(&aname->list, &context->names_list);
1712 1689
1713 context->name_count++; 1690 context->name_count++;
1714#if AUDIT_DEBUG
1715 context->ino_count++;
1716#endif
1717 return aname; 1691 return aname;
1718} 1692}
1719 1693
@@ -1734,8 +1708,10 @@ __audit_reusename(const __user char *uptr)
1734 list_for_each_entry(n, &context->names_list, list) { 1708 list_for_each_entry(n, &context->names_list, list) {
1735 if (!n->name) 1709 if (!n->name)
1736 continue; 1710 continue;
1737 if (n->name->uptr == uptr) 1711 if (n->name->uptr == uptr) {
1712 n->name->refcnt++;
1738 return n->name; 1713 return n->name;
1714 }
1739 } 1715 }
1740 return NULL; 1716 return NULL;
1741} 1717}
@@ -1752,19 +1728,8 @@ void __audit_getname(struct filename *name)
1752 struct audit_context *context = current->audit_context; 1728 struct audit_context *context = current->audit_context;
1753 struct audit_names *n; 1729 struct audit_names *n;
1754 1730
1755 if (!context->in_syscall) { 1731 if (!context->in_syscall)
1756#if AUDIT_DEBUG == 2
1757 pr_err("%s:%d(:%d): ignoring getname(%p)\n",
1758 __FILE__, __LINE__, context->serial, name);
1759 dump_stack();
1760#endif
1761 return; 1732 return;
1762 }
1763
1764#if AUDIT_DEBUG
1765 /* The filename _must_ have a populated ->name */
1766 BUG_ON(!name->name);
1767#endif
1768 1733
1769 n = audit_alloc_name(context, AUDIT_TYPE_UNKNOWN); 1734 n = audit_alloc_name(context, AUDIT_TYPE_UNKNOWN);
1770 if (!n) 1735 if (!n)
@@ -1772,56 +1737,13 @@ void __audit_getname(struct filename *name)
1772 1737
1773 n->name = name; 1738 n->name = name;
1774 n->name_len = AUDIT_NAME_FULL; 1739 n->name_len = AUDIT_NAME_FULL;
1775 n->name_put = true;
1776 name->aname = n; 1740 name->aname = n;
1741 name->refcnt++;
1777 1742
1778 if (!context->pwd.dentry) 1743 if (!context->pwd.dentry)
1779 get_fs_pwd(current->fs, &context->pwd); 1744 get_fs_pwd(current->fs, &context->pwd);
1780} 1745}
1781 1746
1782/* audit_putname - intercept a putname request
1783 * @name: name to intercept and delay for putname
1784 *
1785 * If we have stored the name from getname in the audit context,
1786 * then we delay the putname until syscall exit.
1787 * Called from include/linux/fs.h:putname().
1788 */
1789void audit_putname(struct filename *name)
1790{
1791 struct audit_context *context = current->audit_context;
1792
1793 BUG_ON(!context);
1794 if (!name->aname || !context->in_syscall) {
1795#if AUDIT_DEBUG == 2
1796 pr_err("%s:%d(:%d): final_putname(%p)\n",
1797 __FILE__, __LINE__, context->serial, name);
1798 if (context->name_count) {
1799 struct audit_names *n;
1800 int i = 0;
1801
1802 list_for_each_entry(n, &context->names_list, list)
1803 pr_err("name[%d] = %p = %s\n", i++, n->name,
1804 n->name->name ?: "(null)");
1805 }
1806#endif
1807 final_putname(name);
1808 }
1809#if AUDIT_DEBUG
1810 else {
1811 ++context->put_count;
1812 if (context->put_count > context->name_count) {
1813 pr_err("%s:%d(:%d): major=%d in_syscall=%d putname(%p)"
1814 " name_count=%d put_count=%d\n",
1815 __FILE__, __LINE__,
1816 context->serial, context->major,
1817 context->in_syscall, name->name,
1818 context->name_count, context->put_count);
1819 dump_stack();
1820 }
1821 }
1822#endif
1823}
1824
1825/** 1747/**
1826 * __audit_inode - store the inode and device from a lookup 1748 * __audit_inode - store the inode and device from a lookup
1827 * @name: name being audited 1749 * @name: name being audited
@@ -1842,10 +1764,6 @@ void __audit_inode(struct filename *name, const struct dentry *dentry,
1842 if (!name) 1764 if (!name)
1843 goto out_alloc; 1765 goto out_alloc;
1844 1766
1845#if AUDIT_DEBUG
1846 /* The struct filename _must_ have a populated ->name */
1847 BUG_ON(!name->name);
1848#endif
1849 /* 1767 /*
1850 * If we have a pointer to an audit_names entry already, then we can 1768 * If we have a pointer to an audit_names entry already, then we can
1851 * just use it directly if the type is correct. 1769 * just use it directly if the type is correct.
@@ -1863,7 +1781,17 @@ void __audit_inode(struct filename *name, const struct dentry *dentry,
1863 } 1781 }
1864 1782
1865 list_for_each_entry_reverse(n, &context->names_list, list) { 1783 list_for_each_entry_reverse(n, &context->names_list, list) {
1866 if (!n->name || strcmp(n->name->name, name->name)) 1784 if (n->ino) {
1785 /* valid inode number, use that for the comparison */
1786 if (n->ino != inode->i_ino ||
1787 n->dev != inode->i_sb->s_dev)
1788 continue;
1789 } else if (n->name) {
1790 /* inode number has not been set, check the name */
1791 if (strcmp(n->name->name, name->name))
1792 continue;
1793 } else
1794 /* no inode and no name (?!) ... this is odd ... */
1867 continue; 1795 continue;
1868 1796
1869 /* match the correct record type */ 1797 /* match the correct record type */
@@ -1882,44 +1810,11 @@ out_alloc:
1882 n = audit_alloc_name(context, AUDIT_TYPE_UNKNOWN); 1810 n = audit_alloc_name(context, AUDIT_TYPE_UNKNOWN);
1883 if (!n) 1811 if (!n)
1884 return; 1812 return;
1885 /* unfortunately, while we may have a path name to record with the
1886 * inode, we can't always rely on the string lasting until the end of
1887 * the syscall so we need to create our own copy, it may fail due to
1888 * memory allocation issues, but we do our best */
1889 if (name) { 1813 if (name) {
1890 /* we can't use getname_kernel() due to size limits */ 1814 n->name = name;
1891 size_t len = strlen(name->name) + 1; 1815 name->refcnt++;
1892 struct filename *new = __getname();
1893
1894 if (unlikely(!new))
1895 goto out;
1896
1897 if (len <= (PATH_MAX - sizeof(*new))) {
1898 new->name = (char *)(new) + sizeof(*new);
1899 new->separate = false;
1900 } else if (len <= PATH_MAX) {
1901 /* this looks odd, but is due to final_putname() */
1902 struct filename *new2;
1903
1904 new2 = kmalloc(sizeof(*new2), GFP_KERNEL);
1905 if (unlikely(!new2)) {
1906 __putname(new);
1907 goto out;
1908 }
1909 new2->name = (char *)new;
1910 new2->separate = true;
1911 new = new2;
1912 } else {
1913 /* we should never get here, but let's be safe */
1914 __putname(new);
1915 goto out;
1916 }
1917 strlcpy((char *)new->name, name->name, len);
1918 new->uptr = NULL;
1919 new->aname = n;
1920 n->name = new;
1921 n->name_put = true;
1922 } 1816 }
1817
1923out: 1818out:
1924 if (parent) { 1819 if (parent) {
1925 n->name_len = n->name ? parent_len(n->name->name) : AUDIT_NAME_FULL; 1820 n->name_len = n->name ? parent_len(n->name->name) : AUDIT_NAME_FULL;
@@ -1970,11 +1865,16 @@ void __audit_inode_child(const struct inode *parent,
1970 1865
1971 /* look for a parent entry first */ 1866 /* look for a parent entry first */
1972 list_for_each_entry(n, &context->names_list, list) { 1867 list_for_each_entry(n, &context->names_list, list) {
1973 if (!n->name || n->type != AUDIT_TYPE_PARENT) 1868 if (!n->name ||
1869 (n->type != AUDIT_TYPE_PARENT &&
1870 n->type != AUDIT_TYPE_UNKNOWN))
1974 continue; 1871 continue;
1975 1872
1976 if (n->ino == parent->i_ino && 1873 if (n->ino == parent->i_ino && n->dev == parent->i_sb->s_dev &&
1977 !audit_compare_dname_path(dname, n->name->name, n->name_len)) { 1874 !audit_compare_dname_path(dname,
1875 n->name->name, n->name_len)) {
1876 if (n->type == AUDIT_TYPE_UNKNOWN)
1877 n->type = AUDIT_TYPE_PARENT;
1978 found_parent = n; 1878 found_parent = n;
1979 break; 1879 break;
1980 } 1880 }
@@ -1983,11 +1883,8 @@ void __audit_inode_child(const struct inode *parent,
1983 /* is there a matching child entry? */ 1883 /* is there a matching child entry? */
1984 list_for_each_entry(n, &context->names_list, list) { 1884 list_for_each_entry(n, &context->names_list, list) {
1985 /* can only match entries that have a name */ 1885 /* can only match entries that have a name */
1986 if (!n->name || n->type != type) 1886 if (!n->name ||
1987 continue; 1887 (n->type != type && n->type != AUDIT_TYPE_UNKNOWN))
1988
1989 /* if we found a parent, make sure this one is a child of it */
1990 if (found_parent && (n->name != found_parent->name))
1991 continue; 1888 continue;
1992 1889
1993 if (!strcmp(dname, n->name->name) || 1890 if (!strcmp(dname, n->name->name) ||
@@ -1995,6 +1892,8 @@ void __audit_inode_child(const struct inode *parent,
1995 found_parent ? 1892 found_parent ?
1996 found_parent->name_len : 1893 found_parent->name_len :
1997 AUDIT_NAME_FULL)) { 1894 AUDIT_NAME_FULL)) {
1895 if (n->type == AUDIT_TYPE_UNKNOWN)
1896 n->type = type;
1998 found_child = n; 1897 found_child = n;
1999 break; 1898 break;
2000 } 1899 }
@@ -2019,10 +1918,10 @@ void __audit_inode_child(const struct inode *parent,
2019 if (found_parent) { 1918 if (found_parent) {
2020 found_child->name = found_parent->name; 1919 found_child->name = found_parent->name;
2021 found_child->name_len = AUDIT_NAME_FULL; 1920 found_child->name_len = AUDIT_NAME_FULL;
2022 /* don't call __putname() */ 1921 found_child->name->refcnt++;
2023 found_child->name_put = false;
2024 } 1922 }
2025 } 1923 }
1924
2026 if (inode) 1925 if (inode)
2027 audit_copy_inode(found_child, dentry, inode); 1926 audit_copy_inode(found_child, dentry, inode);
2028 else 1927 else
@@ -2405,7 +2304,6 @@ int __audit_log_bprm_fcaps(struct linux_binprm *bprm,
2405 struct audit_aux_data_bprm_fcaps *ax; 2304 struct audit_aux_data_bprm_fcaps *ax;
2406 struct audit_context *context = current->audit_context; 2305 struct audit_context *context = current->audit_context;
2407 struct cpu_vfs_cap_data vcaps; 2306 struct cpu_vfs_cap_data vcaps;
2408 struct dentry *dentry;
2409 2307
2410 ax = kmalloc(sizeof(*ax), GFP_KERNEL); 2308 ax = kmalloc(sizeof(*ax), GFP_KERNEL);
2411 if (!ax) 2309 if (!ax)
@@ -2415,9 +2313,7 @@ int __audit_log_bprm_fcaps(struct linux_binprm *bprm,
2415 ax->d.next = context->aux; 2313 ax->d.next = context->aux;
2416 context->aux = (void *)ax; 2314 context->aux = (void *)ax;
2417 2315
2418 dentry = dget(bprm->file->f_path.dentry); 2316 get_vfs_caps_from_disk(bprm->file->f_path.dentry, &vcaps);
2419 get_vfs_caps_from_disk(dentry, &vcaps);
2420 dput(dentry);
2421 2317
2422 ax->fcap.permitted = vcaps.permitted; 2318 ax->fcap.permitted = vcaps.permitted;
2423 ax->fcap.inheritable = vcaps.inheritable; 2319 ax->fcap.inheritable = vcaps.inheritable;
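
The auditsc.c changes lean on the new reference count in struct filename: instead of copying path strings or deferring putname(), audit just bumps name->refcnt and calls putname() at syscall exit, so the AUDIT_DEBUG plumbing that policed the old scheme can go away. The reworked lookup in __audit_inode() also prefers an inode/device match and only falls back to comparing names. A condensed sketch of that matching rule; the helper name is made up, the fields are the ones used in the hunk above:

#include <linux/fs.h>
#include <linux/string.h>
/* struct audit_names comes from kernel/audit.h */

static bool audit_name_matches(const struct audit_names *n,
			       const struct filename *name,
			       const struct inode *inode)
{
	if (n->ino) {
		/* a valid inode number was recorded: compare inode + device */
		return n->ino == inode->i_ino &&
		       n->dev == inode->i_sb->s_dev;
	}
	if (n->name)
		/* no inode yet: fall back to the stored pathname */
		return !strcmp(n->name->name, name->name);
	/* neither an inode nor a name: nothing to match against */
	return false;
}
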
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 04cfe8ace520..29a7b2cc593e 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -3077,7 +3077,7 @@ static int cgroup_add_file(struct cgroup *cgrp, struct cftype *cft)
3077#endif 3077#endif
3078 kn = __kernfs_create_file(cgrp->kn, cgroup_file_name(cgrp, cft, name), 3078 kn = __kernfs_create_file(cgrp->kn, cgroup_file_name(cgrp, cft, name),
3079 cgroup_file_mode(cft), 0, cft->kf_ops, cft, 3079 cgroup_file_mode(cft), 0, cft->kf_ops, cft,
3080 NULL, false, key); 3080 NULL, key);
3081 if (IS_ERR(kn)) 3081 if (IS_ERR(kn))
3082 return PTR_ERR(kn); 3082 return PTR_ERR(kn);
3083 3083
@@ -4373,16 +4373,20 @@ static void css_free_work_fn(struct work_struct *work)
4373{ 4373{
4374 struct cgroup_subsys_state *css = 4374 struct cgroup_subsys_state *css =
4375 container_of(work, struct cgroup_subsys_state, destroy_work); 4375 container_of(work, struct cgroup_subsys_state, destroy_work);
4376 struct cgroup_subsys *ss = css->ss;
4376 struct cgroup *cgrp = css->cgroup; 4377 struct cgroup *cgrp = css->cgroup;
4377 4378
4378 percpu_ref_exit(&css->refcnt); 4379 percpu_ref_exit(&css->refcnt);
4379 4380
4380 if (css->ss) { 4381 if (ss) {
4381 /* css free path */ 4382 /* css free path */
4383 int id = css->id;
4384
4382 if (css->parent) 4385 if (css->parent)
4383 css_put(css->parent); 4386 css_put(css->parent);
4384 4387
4385 css->ss->css_free(css); 4388 ss->css_free(css);
4389 cgroup_idr_remove(&ss->css_idr, id);
4386 cgroup_put(cgrp); 4390 cgroup_put(cgrp);
4387 } else { 4391 } else {
4388 /* cgroup free path */ 4392 /* cgroup free path */
@@ -4434,7 +4438,7 @@ static void css_release_work_fn(struct work_struct *work)
4434 4438
4435 if (ss) { 4439 if (ss) {
4436 /* css release path */ 4440 /* css release path */
4437 cgroup_idr_remove(&ss->css_idr, css->id); 4441 cgroup_idr_replace(&ss->css_idr, NULL, css->id);
4438 if (ss->css_released) 4442 if (ss->css_released)
4439 ss->css_released(css); 4443 ss->css_released(css);
4440 } else { 4444 } else {
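
The cgroup hunks split css id teardown into two phases: css_release_work_fn() only NULLs the idr slot (cgroup_idr_replace above), so lookups start failing while the id stays reserved, and the id is finally released in css_free_work_fn() once the css can no longer be reached. A generic sketch of the same two-phase pattern using the plain idr API; locking is omitted and the names are invented:

#include <linux/idr.h>

static DEFINE_IDR(my_obj_idr);		/* hypothetical registry */

/* Phase 1 (release): lookups must now fail, but keep the id reserved
 * so it cannot be handed out again while the object is still around. */
static void my_obj_release(int id)
{
	idr_replace(&my_obj_idr, NULL, id);
}

/* Phase 2 (free): the object is truly gone, return the id. */
static void my_obj_free(int id)
{
	idr_remove(&my_obj_idr, id);
}
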
diff --git a/kernel/compat.c b/kernel/compat.c
index ebb3c369d03d..24f00610c575 100644
--- a/kernel/compat.c
+++ b/kernel/compat.c
@@ -276,8 +276,7 @@ COMPAT_SYSCALL_DEFINE2(nanosleep, struct compat_timespec __user *, rqtp,
276 * core implementation decides to return random nonsense. 276 * core implementation decides to return random nonsense.
277 */ 277 */
278 if (ret == -ERESTART_RESTARTBLOCK) { 278 if (ret == -ERESTART_RESTARTBLOCK) {
279 struct restart_block *restart 279 struct restart_block *restart = &current->restart_block;
280 = &current_thread_info()->restart_block;
281 280
282 restart->fn = compat_nanosleep_restart; 281 restart->fn = compat_nanosleep_restart;
283 restart->nanosleep.compat_rmtp = rmtp; 282 restart->nanosleep.compat_rmtp = rmtp;
@@ -860,7 +859,7 @@ COMPAT_SYSCALL_DEFINE4(clock_nanosleep, clockid_t, which_clock, int, flags,
860 return -EFAULT; 859 return -EFAULT;
861 860
862 if (err == -ERESTART_RESTARTBLOCK) { 861 if (err == -ERESTART_RESTARTBLOCK) {
863 restart = &current_thread_info()->restart_block; 862 restart = &current->restart_block;
864 restart->fn = compat_clock_nanosleep_restart; 863 restart->fn = compat_clock_nanosleep_restart;
865 restart->nanosleep.compat_rmtp = rmtp; 864 restart->nanosleep.compat_rmtp = rmtp;
866 } 865 }
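
Both compat.c hunks are fallout from restart_block moving out of thread_info and into task_struct: restartable syscalls now reach it through current. A minimal sketch of the idiom; my_op and my_op_restart are placeholders:

#include <linux/errno.h>
#include <linux/sched.h>

static long my_op_restart(struct restart_block *restart)
{
	/* re-issue the interrupted operation from the saved arguments */
	return 0;
}

static long my_op(void)
{
	struct restart_block *restart = &current->restart_block;

	/* ... interrupted, arrange for the syscall to be restarted ... */
	restart->fn = my_op_restart;
	return -ERESTART_RESTARTBLOCK;
}
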
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 5d220234b3ca..1972b161c61e 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -58,22 +58,23 @@ static int cpu_hotplug_disabled;
58 58
59static struct { 59static struct {
60 struct task_struct *active_writer; 60 struct task_struct *active_writer;
61 struct mutex lock; /* Synchronizes accesses to refcount, */ 61 /* wait queue to wake up the active_writer */
62 wait_queue_head_t wq;
63 /* verifies that no writer will get active while readers are active */
64 struct mutex lock;
62 /* 65 /*
63 * Also blocks the new readers during 66 * Also blocks the new readers during
64 * an ongoing cpu hotplug operation. 67 * an ongoing cpu hotplug operation.
65 */ 68 */
66 int refcount; 69 atomic_t refcount;
67 /* And allows lockless put_online_cpus(). */
68 atomic_t puts_pending;
69 70
70#ifdef CONFIG_DEBUG_LOCK_ALLOC 71#ifdef CONFIG_DEBUG_LOCK_ALLOC
71 struct lockdep_map dep_map; 72 struct lockdep_map dep_map;
72#endif 73#endif
73} cpu_hotplug = { 74} cpu_hotplug = {
74 .active_writer = NULL, 75 .active_writer = NULL,
76 .wq = __WAIT_QUEUE_HEAD_INITIALIZER(cpu_hotplug.wq),
75 .lock = __MUTEX_INITIALIZER(cpu_hotplug.lock), 77 .lock = __MUTEX_INITIALIZER(cpu_hotplug.lock),
76 .refcount = 0,
77#ifdef CONFIG_DEBUG_LOCK_ALLOC 78#ifdef CONFIG_DEBUG_LOCK_ALLOC
78 .dep_map = {.name = "cpu_hotplug.lock" }, 79 .dep_map = {.name = "cpu_hotplug.lock" },
79#endif 80#endif
@@ -86,15 +87,6 @@ static struct {
86#define cpuhp_lock_acquire() lock_map_acquire(&cpu_hotplug.dep_map) 87#define cpuhp_lock_acquire() lock_map_acquire(&cpu_hotplug.dep_map)
87#define cpuhp_lock_release() lock_map_release(&cpu_hotplug.dep_map) 88#define cpuhp_lock_release() lock_map_release(&cpu_hotplug.dep_map)
88 89
89static void apply_puts_pending(int max)
90{
91 int delta;
92
93 if (atomic_read(&cpu_hotplug.puts_pending) >= max) {
94 delta = atomic_xchg(&cpu_hotplug.puts_pending, 0);
95 cpu_hotplug.refcount -= delta;
96 }
97}
98 90
99void get_online_cpus(void) 91void get_online_cpus(void)
100{ 92{
@@ -103,8 +95,7 @@ void get_online_cpus(void)
103 return; 95 return;
104 cpuhp_lock_acquire_read(); 96 cpuhp_lock_acquire_read();
105 mutex_lock(&cpu_hotplug.lock); 97 mutex_lock(&cpu_hotplug.lock);
106 apply_puts_pending(65536); 98 atomic_inc(&cpu_hotplug.refcount);
107 cpu_hotplug.refcount++;
108 mutex_unlock(&cpu_hotplug.lock); 99 mutex_unlock(&cpu_hotplug.lock);
109} 100}
110EXPORT_SYMBOL_GPL(get_online_cpus); 101EXPORT_SYMBOL_GPL(get_online_cpus);
@@ -116,8 +107,7 @@ bool try_get_online_cpus(void)
116 if (!mutex_trylock(&cpu_hotplug.lock)) 107 if (!mutex_trylock(&cpu_hotplug.lock))
117 return false; 108 return false;
118 cpuhp_lock_acquire_tryread(); 109 cpuhp_lock_acquire_tryread();
119 apply_puts_pending(65536); 110 atomic_inc(&cpu_hotplug.refcount);
120 cpu_hotplug.refcount++;
121 mutex_unlock(&cpu_hotplug.lock); 111 mutex_unlock(&cpu_hotplug.lock);
122 return true; 112 return true;
123} 113}
@@ -125,20 +115,18 @@ EXPORT_SYMBOL_GPL(try_get_online_cpus);
125 115
126void put_online_cpus(void) 116void put_online_cpus(void)
127{ 117{
118 int refcount;
119
128 if (cpu_hotplug.active_writer == current) 120 if (cpu_hotplug.active_writer == current)
129 return; 121 return;
130 if (!mutex_trylock(&cpu_hotplug.lock)) {
131 atomic_inc(&cpu_hotplug.puts_pending);
132 cpuhp_lock_release();
133 return;
134 }
135 122
136 if (WARN_ON(!cpu_hotplug.refcount)) 123 refcount = atomic_dec_return(&cpu_hotplug.refcount);
137 cpu_hotplug.refcount++; /* try to fix things up */ 124 if (WARN_ON(refcount < 0)) /* try to fix things up */
125 atomic_inc(&cpu_hotplug.refcount);
126
127 if (refcount <= 0 && waitqueue_active(&cpu_hotplug.wq))
128 wake_up(&cpu_hotplug.wq);
138 129
139 if (!--cpu_hotplug.refcount && unlikely(cpu_hotplug.active_writer))
140 wake_up_process(cpu_hotplug.active_writer);
141 mutex_unlock(&cpu_hotplug.lock);
142 cpuhp_lock_release(); 130 cpuhp_lock_release();
143 131
144} 132}
@@ -168,18 +156,20 @@ EXPORT_SYMBOL_GPL(put_online_cpus);
168 */ 156 */
169void cpu_hotplug_begin(void) 157void cpu_hotplug_begin(void)
170{ 158{
171 cpu_hotplug.active_writer = current; 159 DEFINE_WAIT(wait);
172 160
161 cpu_hotplug.active_writer = current;
173 cpuhp_lock_acquire(); 162 cpuhp_lock_acquire();
163
174 for (;;) { 164 for (;;) {
175 mutex_lock(&cpu_hotplug.lock); 165 mutex_lock(&cpu_hotplug.lock);
176 apply_puts_pending(1); 166 prepare_to_wait(&cpu_hotplug.wq, &wait, TASK_UNINTERRUPTIBLE);
177 if (likely(!cpu_hotplug.refcount)) 167 if (likely(!atomic_read(&cpu_hotplug.refcount)))
178 break; 168 break;
179 __set_current_state(TASK_UNINTERRUPTIBLE);
180 mutex_unlock(&cpu_hotplug.lock); 169 mutex_unlock(&cpu_hotplug.lock);
181 schedule(); 170 schedule();
182 } 171 }
172 finish_wait(&cpu_hotplug.wq, &wait);
183} 173}
184 174
185void cpu_hotplug_done(void) 175void cpu_hotplug_done(void)
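
The cpu.c rework drops the puts_pending trick: the reader count becomes a plain atomic_t, put_online_cpus() no longer needs to trylock anything, and the last reader wakes a writer sleeping on a dedicated wait queue, while cpu_hotplug_begin() waits with the usual prepare_to_wait()/finish_wait() loop. A stripped-down sketch of the same reader/writer gate; the my_gate_* names are invented and the writer-recursion check is left out:

#include <linux/atomic.h>
#include <linux/mutex.h>
#include <linux/sched.h>
#include <linux/wait.h>

static DEFINE_MUTEX(my_gate_lock);		/* blocks new readers while a writer waits */
static DECLARE_WAIT_QUEUE_HEAD(my_gate_wq);	/* wakes up the pending writer */
static atomic_t my_gate_readers = ATOMIC_INIT(0);

static void my_gate_read_lock(void)
{
	mutex_lock(&my_gate_lock);
	atomic_inc(&my_gate_readers);
	mutex_unlock(&my_gate_lock);
}

static void my_gate_read_unlock(void)
{
	/* the last reader out kicks a writer parked in my_gate_write_lock() */
	if (atomic_dec_return(&my_gate_readers) <= 0 &&
	    waitqueue_active(&my_gate_wq))
		wake_up(&my_gate_wq);
}

static void my_gate_write_lock(void)
{
	DEFINE_WAIT(wait);

	for (;;) {
		mutex_lock(&my_gate_lock);
		prepare_to_wait(&my_gate_wq, &wait, TASK_UNINTERRUPTIBLE);
		if (!atomic_read(&my_gate_readers))
			break;
		mutex_unlock(&my_gate_lock);
		schedule();
	}
	finish_wait(&my_gate_wq, &wait);
	/* my_gate_lock stays held until the matching write unlock */
}
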
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 64b257f6bca2..1d1fe9361d29 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -1707,40 +1707,27 @@ static int cpuset_common_seq_show(struct seq_file *sf, void *v)
1707{ 1707{
1708 struct cpuset *cs = css_cs(seq_css(sf)); 1708 struct cpuset *cs = css_cs(seq_css(sf));
1709 cpuset_filetype_t type = seq_cft(sf)->private; 1709 cpuset_filetype_t type = seq_cft(sf)->private;
1710 ssize_t count;
1711 char *buf, *s;
1712 int ret = 0; 1710 int ret = 0;
1713 1711
1714 count = seq_get_buf(sf, &buf);
1715 s = buf;
1716
1717 spin_lock_irq(&callback_lock); 1712 spin_lock_irq(&callback_lock);
1718 1713
1719 switch (type) { 1714 switch (type) {
1720 case FILE_CPULIST: 1715 case FILE_CPULIST:
1721 s += cpulist_scnprintf(s, count, cs->cpus_allowed); 1716 seq_printf(sf, "%*pbl\n", cpumask_pr_args(cs->cpus_allowed));
1722 break; 1717 break;
1723 case FILE_MEMLIST: 1718 case FILE_MEMLIST:
1724 s += nodelist_scnprintf(s, count, cs->mems_allowed); 1719 seq_printf(sf, "%*pbl\n", nodemask_pr_args(&cs->mems_allowed));
1725 break; 1720 break;
1726 case FILE_EFFECTIVE_CPULIST: 1721 case FILE_EFFECTIVE_CPULIST:
1727 s += cpulist_scnprintf(s, count, cs->effective_cpus); 1722 seq_printf(sf, "%*pbl\n", cpumask_pr_args(cs->effective_cpus));
1728 break; 1723 break;
1729 case FILE_EFFECTIVE_MEMLIST: 1724 case FILE_EFFECTIVE_MEMLIST:
1730 s += nodelist_scnprintf(s, count, cs->effective_mems); 1725 seq_printf(sf, "%*pbl\n", nodemask_pr_args(&cs->effective_mems));
1731 break; 1726 break;
1732 default: 1727 default:
1733 ret = -EINVAL; 1728 ret = -EINVAL;
1734 goto out_unlock;
1735 } 1729 }
1736 1730
1737 if (s < buf + count - 1) {
1738 *s++ = '\n';
1739 seq_commit(sf, s - buf);
1740 } else {
1741 seq_commit(sf, -1);
1742 }
1743out_unlock:
1744 spin_unlock_irq(&callback_lock); 1731 spin_unlock_irq(&callback_lock);
1745 return ret; 1732 return ret;
1746} 1733}
@@ -2400,7 +2387,7 @@ void cpuset_cpus_allowed_fallback(struct task_struct *tsk)
2400 */ 2387 */
2401} 2388}
2402 2389
2403void cpuset_init_current_mems_allowed(void) 2390void __init cpuset_init_current_mems_allowed(void)
2404{ 2391{
2405 nodes_setall(current->mems_allowed); 2392 nodes_setall(current->mems_allowed);
2406} 2393}
@@ -2610,8 +2597,6 @@ int cpuset_mems_allowed_intersects(const struct task_struct *tsk1,
2610 return nodes_intersects(tsk1->mems_allowed, tsk2->mems_allowed); 2597 return nodes_intersects(tsk1->mems_allowed, tsk2->mems_allowed);
2611} 2598}
2612 2599
2613#define CPUSET_NODELIST_LEN (256)
2614
2615/** 2600/**
2616 * cpuset_print_task_mems_allowed - prints task's cpuset and mems_allowed 2601 * cpuset_print_task_mems_allowed - prints task's cpuset and mems_allowed
2617 * @tsk: pointer to task_struct of some task. 2602 * @tsk: pointer to task_struct of some task.
@@ -2621,23 +2606,16 @@ int cpuset_mems_allowed_intersects(const struct task_struct *tsk1,
2621 */ 2606 */
2622void cpuset_print_task_mems_allowed(struct task_struct *tsk) 2607void cpuset_print_task_mems_allowed(struct task_struct *tsk)
2623{ 2608{
2624 /* Statically allocated to prevent using excess stack. */
2625 static char cpuset_nodelist[CPUSET_NODELIST_LEN];
2626 static DEFINE_SPINLOCK(cpuset_buffer_lock);
2627 struct cgroup *cgrp; 2609 struct cgroup *cgrp;
2628 2610
2629 spin_lock(&cpuset_buffer_lock);
2630 rcu_read_lock(); 2611 rcu_read_lock();
2631 2612
2632 cgrp = task_cs(tsk)->css.cgroup; 2613 cgrp = task_cs(tsk)->css.cgroup;
2633 nodelist_scnprintf(cpuset_nodelist, CPUSET_NODELIST_LEN,
2634 tsk->mems_allowed);
2635 pr_info("%s cpuset=", tsk->comm); 2614 pr_info("%s cpuset=", tsk->comm);
2636 pr_cont_cgroup_name(cgrp); 2615 pr_cont_cgroup_name(cgrp);
2637 pr_cont(" mems_allowed=%s\n", cpuset_nodelist); 2616 pr_cont(" mems_allowed=%*pbl\n", nodemask_pr_args(&tsk->mems_allowed));
2638 2617
2639 rcu_read_unlock(); 2618 rcu_read_unlock();
2640 spin_unlock(&cpuset_buffer_lock);
2641} 2619}
2642 2620
2643/* 2621/*
@@ -2715,10 +2693,8 @@ out:
2715/* Display task mems_allowed in /proc/<pid>/status file. */ 2693/* Display task mems_allowed in /proc/<pid>/status file. */
2716void cpuset_task_status_allowed(struct seq_file *m, struct task_struct *task) 2694void cpuset_task_status_allowed(struct seq_file *m, struct task_struct *task)
2717{ 2695{
2718 seq_puts(m, "Mems_allowed:\t"); 2696 seq_printf(m, "Mems_allowed:\t%*pb\n",
2719 seq_nodemask(m, &task->mems_allowed); 2697 nodemask_pr_args(&task->mems_allowed));
2720 seq_puts(m, "\n"); 2698 seq_printf(m, "Mems_allowed_list:\t%*pbl\n",
2721 seq_puts(m, "Mems_allowed_list:\t"); 2699 nodemask_pr_args(&task->mems_allowed));
2722 seq_nodemask_list(m, &task->mems_allowed);
2723 seq_puts(m, "\n");
2724} 2700}
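
The cpuset hunks switch to the new bitmap printf extensions: %*pb prints a bitmap in hex and %*pbl as a range list, with cpumask_pr_args()/nodemask_pr_args() supplying the width and bits arguments, which is what lets the static CPUSET_NODELIST_LEN buffer and its spinlock disappear. Usage works anywhere printf-style formatting is accepted, for example:

#include <linux/cpumask.h>
#include <linux/nodemask.h>
#include <linux/printk.h>
#include <linux/seq_file.h>

static void show_masks(struct seq_file *sf, const struct cpumask *cpus,
		       const nodemask_t *mems)
{
	seq_printf(sf, "cpus: %*pb\n", cpumask_pr_args(cpus));	/* hex mask */
	seq_printf(sf, "cpus: %*pbl\n", cpumask_pr_args(cpus));	/* range list */
	pr_info("mems_allowed=%*pbl\n", nodemask_pr_args(mems));
}
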
diff --git a/kernel/events/Makefile b/kernel/events/Makefile
index 103f5d147b2f..2925188f50ea 100644
--- a/kernel/events/Makefile
+++ b/kernel/events/Makefile
@@ -1,5 +1,5 @@
1ifdef CONFIG_FUNCTION_TRACER 1ifdef CONFIG_FUNCTION_TRACER
2CFLAGS_REMOVE_core.o = -pg 2CFLAGS_REMOVE_core.o = $(CC_FLAGS_FTRACE)
3endif 3endif
4 4
5obj-y := core.o ring_buffer.o callchain.o 5obj-y := core.o ring_buffer.o callchain.o
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 19efcf13375a..f04daabfd1cf 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -872,22 +872,32 @@ void perf_pmu_enable(struct pmu *pmu)
872 pmu->pmu_enable(pmu); 872 pmu->pmu_enable(pmu);
873} 873}
874 874
875static DEFINE_PER_CPU(struct list_head, rotation_list); 875static DEFINE_PER_CPU(struct list_head, active_ctx_list);
876 876
877/* 877/*
878 * perf_pmu_rotate_start() and perf_rotate_context() are fully serialized 878 * perf_event_ctx_activate(), perf_event_ctx_deactivate(), and
879 * because they're strictly cpu affine and rotate_start is called with IRQs 879 * perf_event_task_tick() are fully serialized because they're strictly cpu
880 * disabled, while rotate_context is called from IRQ context. 880 * affine and perf_event_ctx{activate,deactivate} are called with IRQs
881 * disabled, while perf_event_task_tick is called from IRQ context.
881 */ 882 */
882static void perf_pmu_rotate_start(struct pmu *pmu) 883static void perf_event_ctx_activate(struct perf_event_context *ctx)
883{ 884{
884 struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context); 885 struct list_head *head = this_cpu_ptr(&active_ctx_list);
885 struct list_head *head = this_cpu_ptr(&rotation_list);
886 886
887 WARN_ON(!irqs_disabled()); 887 WARN_ON(!irqs_disabled());
888 888
889 if (list_empty(&cpuctx->rotation_list)) 889 WARN_ON(!list_empty(&ctx->active_ctx_list));
890 list_add(&cpuctx->rotation_list, head); 890
891 list_add(&ctx->active_ctx_list, head);
892}
893
894static void perf_event_ctx_deactivate(struct perf_event_context *ctx)
895{
896 WARN_ON(!irqs_disabled());
897
898 WARN_ON(list_empty(&ctx->active_ctx_list));
899
900 list_del_init(&ctx->active_ctx_list);
891} 901}
892 902
893static void get_ctx(struct perf_event_context *ctx) 903static void get_ctx(struct perf_event_context *ctx)
@@ -907,6 +917,84 @@ static void put_ctx(struct perf_event_context *ctx)
907} 917}
908 918
909/* 919/*
920 * Because of perf_event::ctx migration in sys_perf_event_open::move_group and
921 * perf_pmu_migrate_context() we need some magic.
922 *
923 * Those places that change perf_event::ctx will hold both
924 * perf_event_ctx::mutex of the 'old' and 'new' ctx value.
925 *
926 * Lock ordering is by mutex address. There is one other site where
927 * perf_event_context::mutex nests and that is put_event(). But remember that
928 * that is a parent<->child context relation, and migration does not affect
929 * children, therefore these two orderings should not interact.
930 *
931 * The change in perf_event::ctx does not affect children (as claimed above)
932 * because the sys_perf_event_open() case will install a new event and break
933 * the ctx parent<->child relation, and perf_pmu_migrate_context() is only
934 * concerned with cpuctx and that doesn't have children.
935 *
936 * The places that change perf_event::ctx will issue:
937 *
938 * perf_remove_from_context();
939 * synchronize_rcu();
940 * perf_install_in_context();
941 *
942 * to affect the change. The remove_from_context() + synchronize_rcu() should
943 * quiesce the event, after which we can install it in the new location. This
944 * means that only external vectors (perf_fops, prctl) can perturb the event
945 * while in transit. Therefore all such accessors should also acquire
946 * perf_event_context::mutex to serialize against this.
947 *
948 * However; because event->ctx can change while we're waiting to acquire
949 * ctx->mutex we must be careful and use the below perf_event_ctx_lock()
950 * function.
951 *
952 * Lock order:
953 * task_struct::perf_event_mutex
954 * perf_event_context::mutex
955 * perf_event_context::lock
956 * perf_event::child_mutex;
957 * perf_event::mmap_mutex
958 * mmap_sem
959 */
960static struct perf_event_context *
961perf_event_ctx_lock_nested(struct perf_event *event, int nesting)
962{
963 struct perf_event_context *ctx;
964
965again:
966 rcu_read_lock();
967 ctx = ACCESS_ONCE(event->ctx);
968 if (!atomic_inc_not_zero(&ctx->refcount)) {
969 rcu_read_unlock();
970 goto again;
971 }
972 rcu_read_unlock();
973
974 mutex_lock_nested(&ctx->mutex, nesting);
975 if (event->ctx != ctx) {
976 mutex_unlock(&ctx->mutex);
977 put_ctx(ctx);
978 goto again;
979 }
980
981 return ctx;
982}
983
984static inline struct perf_event_context *
985perf_event_ctx_lock(struct perf_event *event)
986{
987 return perf_event_ctx_lock_nested(event, 0);
988}
989
990static void perf_event_ctx_unlock(struct perf_event *event,
991 struct perf_event_context *ctx)
992{
993 mutex_unlock(&ctx->mutex);
994 put_ctx(ctx);
995}
996
997/*
910 * This must be done under the ctx->lock, such as to serialize against 998 * This must be done under the ctx->lock, such as to serialize against
911 * context_equiv(), therefore we cannot call put_ctx() since that might end up 999 * context_equiv(), therefore we cannot call put_ctx() since that might end up
912 * calling scheduler related locks and ctx->lock nests inside those. 1000 * calling scheduler related locks and ctx->lock nests inside those.
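
The helpers added in this hunk exist because event->ctx can be switched under a caller by sys_perf_event_open()'s move_group path and by perf_pmu_migrate_context(): an accessor pins the ctx it sees, takes ctx->mutex, and retries if the event has moved in the meantime. A hedged sketch of how a caller alongside these helpers in core.c would use the pair; do_something_locked() is a stand-in for the real work:

static int do_something_locked(struct perf_event *event,
			       struct perf_event_context *ctx)
{
	return 0;	/* placeholder for work that needs ctx->mutex */
}

static int event_op(struct perf_event *event)
{
	struct perf_event_context *ctx;
	int ret;

	ctx = perf_event_ctx_lock(event);	/* pins and locks the current ctx */
	ret = do_something_locked(event, ctx);
	perf_event_ctx_unlock(event, ctx);	/* unlocks and drops the reference */

	return ret;
}
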
@@ -1155,8 +1243,6 @@ list_add_event(struct perf_event *event, struct perf_event_context *ctx)
1155 ctx->nr_branch_stack++; 1243 ctx->nr_branch_stack++;
1156 1244
1157 list_add_rcu(&event->event_entry, &ctx->event_list); 1245 list_add_rcu(&event->event_entry, &ctx->event_list);
1158 if (!ctx->nr_events)
1159 perf_pmu_rotate_start(ctx->pmu);
1160 ctx->nr_events++; 1246 ctx->nr_events++;
1161 if (event->attr.inherit_stat) 1247 if (event->attr.inherit_stat)
1162 ctx->nr_stat++; 1248 ctx->nr_stat++;
@@ -1275,6 +1361,8 @@ static void perf_group_attach(struct perf_event *event)
1275 if (group_leader == event) 1361 if (group_leader == event)
1276 return; 1362 return;
1277 1363
1364 WARN_ON_ONCE(group_leader->ctx != event->ctx);
1365
1278 if (group_leader->group_flags & PERF_GROUP_SOFTWARE && 1366 if (group_leader->group_flags & PERF_GROUP_SOFTWARE &&
1279 !is_software_event(event)) 1367 !is_software_event(event))
1280 group_leader->group_flags &= ~PERF_GROUP_SOFTWARE; 1368 group_leader->group_flags &= ~PERF_GROUP_SOFTWARE;
@@ -1296,6 +1384,10 @@ static void
1296list_del_event(struct perf_event *event, struct perf_event_context *ctx) 1384list_del_event(struct perf_event *event, struct perf_event_context *ctx)
1297{ 1385{
1298 struct perf_cpu_context *cpuctx; 1386 struct perf_cpu_context *cpuctx;
1387
1388 WARN_ON_ONCE(event->ctx != ctx);
1389 lockdep_assert_held(&ctx->lock);
1390
1299 /* 1391 /*
1300 * We can have double detach due to exit/hot-unplug + close. 1392 * We can have double detach due to exit/hot-unplug + close.
1301 */ 1393 */
@@ -1380,6 +1472,8 @@ static void perf_group_detach(struct perf_event *event)
1380 1472
1381 /* Inherit group flags from the previous leader */ 1473 /* Inherit group flags from the previous leader */
1382 sibling->group_flags = event->group_flags; 1474 sibling->group_flags = event->group_flags;
1475
1476 WARN_ON_ONCE(sibling->ctx != event->ctx);
1383 } 1477 }
1384 1478
1385out: 1479out:
@@ -1442,6 +1536,10 @@ event_sched_out(struct perf_event *event,
1442{ 1536{
1443 u64 tstamp = perf_event_time(event); 1537 u64 tstamp = perf_event_time(event);
1444 u64 delta; 1538 u64 delta;
1539
1540 WARN_ON_ONCE(event->ctx != ctx);
1541 lockdep_assert_held(&ctx->lock);
1542
1445 /* 1543 /*
1446 * An event which could not be activated because of 1544 * An event which could not be activated because of
1447 * filter mismatch still needs to have its timings 1545 * filter mismatch still needs to have its timings
@@ -1471,7 +1569,8 @@ event_sched_out(struct perf_event *event,
1471 1569
1472 if (!is_software_event(event)) 1570 if (!is_software_event(event))
1473 cpuctx->active_oncpu--; 1571 cpuctx->active_oncpu--;
1474 ctx->nr_active--; 1572 if (!--ctx->nr_active)
1573 perf_event_ctx_deactivate(ctx);
1475 if (event->attr.freq && event->attr.sample_freq) 1574 if (event->attr.freq && event->attr.sample_freq)
1476 ctx->nr_freq--; 1575 ctx->nr_freq--;
1477 if (event->attr.exclusive || !cpuctx->active_oncpu) 1576 if (event->attr.exclusive || !cpuctx->active_oncpu)
@@ -1654,7 +1753,7 @@ int __perf_event_disable(void *info)
1654 * is the current context on this CPU and preemption is disabled, 1753 * is the current context on this CPU and preemption is disabled,
1655 * hence we can't get into perf_event_task_sched_out for this context. 1754 * hence we can't get into perf_event_task_sched_out for this context.
1656 */ 1755 */
1657void perf_event_disable(struct perf_event *event) 1756static void _perf_event_disable(struct perf_event *event)
1658{ 1757{
1659 struct perf_event_context *ctx = event->ctx; 1758 struct perf_event_context *ctx = event->ctx;
1660 struct task_struct *task = ctx->task; 1759 struct task_struct *task = ctx->task;
@@ -1695,6 +1794,19 @@ retry:
1695 } 1794 }
1696 raw_spin_unlock_irq(&ctx->lock); 1795 raw_spin_unlock_irq(&ctx->lock);
1697} 1796}
1797
1798/*
1799 * Strictly speaking kernel users cannot create groups and therefore this
1800 * interface does not need the perf_event_ctx_lock() magic.
1801 */
1802void perf_event_disable(struct perf_event *event)
1803{
1804 struct perf_event_context *ctx;
1805
1806 ctx = perf_event_ctx_lock(event);
1807 _perf_event_disable(event);
1808 perf_event_ctx_unlock(event, ctx);
1809}
1698EXPORT_SYMBOL_GPL(perf_event_disable); 1810EXPORT_SYMBOL_GPL(perf_event_disable);
1699 1811
1700static void perf_set_shadow_time(struct perf_event *event, 1812static void perf_set_shadow_time(struct perf_event *event,
@@ -1782,7 +1894,8 @@ event_sched_in(struct perf_event *event,
1782 1894
1783 if (!is_software_event(event)) 1895 if (!is_software_event(event))
1784 cpuctx->active_oncpu++; 1896 cpuctx->active_oncpu++;
1785 ctx->nr_active++; 1897 if (!ctx->nr_active++)
1898 perf_event_ctx_activate(ctx);
1786 if (event->attr.freq && event->attr.sample_freq) 1899 if (event->attr.freq && event->attr.sample_freq)
1787 ctx->nr_freq++; 1900 ctx->nr_freq++;
1788 1901
@@ -2158,7 +2271,7 @@ unlock:
2158 * perf_event_for_each_child or perf_event_for_each as described 2271 * perf_event_for_each_child or perf_event_for_each as described
2159 * for perf_event_disable. 2272 * for perf_event_disable.
2160 */ 2273 */
2161void perf_event_enable(struct perf_event *event) 2274static void _perf_event_enable(struct perf_event *event)
2162{ 2275{
2163 struct perf_event_context *ctx = event->ctx; 2276 struct perf_event_context *ctx = event->ctx;
2164 struct task_struct *task = ctx->task; 2277 struct task_struct *task = ctx->task;
@@ -2214,9 +2327,21 @@ retry:
2214out: 2327out:
2215 raw_spin_unlock_irq(&ctx->lock); 2328 raw_spin_unlock_irq(&ctx->lock);
2216} 2329}
2330
2331/*
2332 * See perf_event_disable();
2333 */
2334void perf_event_enable(struct perf_event *event)
2335{
2336 struct perf_event_context *ctx;
2337
2338 ctx = perf_event_ctx_lock(event);
2339 _perf_event_enable(event);
2340 perf_event_ctx_unlock(event, ctx);
2341}
2217EXPORT_SYMBOL_GPL(perf_event_enable); 2342EXPORT_SYMBOL_GPL(perf_event_enable);
2218 2343
2219int perf_event_refresh(struct perf_event *event, int refresh) 2344static int _perf_event_refresh(struct perf_event *event, int refresh)
2220{ 2345{
2221 /* 2346 /*
2222 * not supported on inherited events 2347 * not supported on inherited events
@@ -2225,10 +2350,25 @@ int perf_event_refresh(struct perf_event *event, int refresh)
2225 return -EINVAL; 2350 return -EINVAL;
2226 2351
2227 atomic_add(refresh, &event->event_limit); 2352 atomic_add(refresh, &event->event_limit);
2228 perf_event_enable(event); 2353 _perf_event_enable(event);
2229 2354
2230 return 0; 2355 return 0;
2231} 2356}
2357
2358/*
2359 * See perf_event_disable()
2360 */
2361int perf_event_refresh(struct perf_event *event, int refresh)
2362{
2363 struct perf_event_context *ctx;
2364 int ret;
2365
2366 ctx = perf_event_ctx_lock(event);
2367 ret = _perf_event_refresh(event, refresh);
2368 perf_event_ctx_unlock(event, ctx);
2369
2370 return ret;
2371}
2232EXPORT_SYMBOL_GPL(perf_event_refresh); 2372EXPORT_SYMBOL_GPL(perf_event_refresh);
2233 2373
2234static void ctx_sched_out(struct perf_event_context *ctx, 2374static void ctx_sched_out(struct perf_event_context *ctx,
@@ -2612,12 +2752,6 @@ static void perf_event_context_sched_in(struct perf_event_context *ctx,
2612 2752
2613 perf_pmu_enable(ctx->pmu); 2753 perf_pmu_enable(ctx->pmu);
2614 perf_ctx_unlock(cpuctx, ctx); 2754 perf_ctx_unlock(cpuctx, ctx);
2615
2616 /*
2617 * Since these rotations are per-cpu, we need to ensure the
2618 * cpu-context we got scheduled on is actually rotating.
2619 */
2620 perf_pmu_rotate_start(ctx->pmu);
2621} 2755}
2622 2756
2623/* 2757/*
@@ -2905,25 +3039,18 @@ static void rotate_ctx(struct perf_event_context *ctx)
2905 list_rotate_left(&ctx->flexible_groups); 3039 list_rotate_left(&ctx->flexible_groups);
2906} 3040}
2907 3041
2908/*
2909 * perf_pmu_rotate_start() and perf_rotate_context() are fully serialized
2910 * because they're strictly cpu affine and rotate_start is called with IRQs
2911 * disabled, while rotate_context is called from IRQ context.
2912 */
2913static int perf_rotate_context(struct perf_cpu_context *cpuctx) 3042static int perf_rotate_context(struct perf_cpu_context *cpuctx)
2914{ 3043{
2915 struct perf_event_context *ctx = NULL; 3044 struct perf_event_context *ctx = NULL;
2916 int rotate = 0, remove = 1; 3045 int rotate = 0;
2917 3046
2918 if (cpuctx->ctx.nr_events) { 3047 if (cpuctx->ctx.nr_events) {
2919 remove = 0;
2920 if (cpuctx->ctx.nr_events != cpuctx->ctx.nr_active) 3048 if (cpuctx->ctx.nr_events != cpuctx->ctx.nr_active)
2921 rotate = 1; 3049 rotate = 1;
2922 } 3050 }
2923 3051
2924 ctx = cpuctx->task_ctx; 3052 ctx = cpuctx->task_ctx;
2925 if (ctx && ctx->nr_events) { 3053 if (ctx && ctx->nr_events) {
2926 remove = 0;
2927 if (ctx->nr_events != ctx->nr_active) 3054 if (ctx->nr_events != ctx->nr_active)
2928 rotate = 1; 3055 rotate = 1;
2929 } 3056 }
@@ -2947,8 +3074,6 @@ static int perf_rotate_context(struct perf_cpu_context *cpuctx)
2947 perf_pmu_enable(cpuctx->ctx.pmu); 3074 perf_pmu_enable(cpuctx->ctx.pmu);
2948 perf_ctx_unlock(cpuctx, cpuctx->task_ctx); 3075 perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
2949done: 3076done:
2950 if (remove)
2951 list_del_init(&cpuctx->rotation_list);
2952 3077
2953 return rotate; 3078 return rotate;
2954} 3079}
@@ -2966,9 +3091,8 @@ bool perf_event_can_stop_tick(void)
2966 3091
2967void perf_event_task_tick(void) 3092void perf_event_task_tick(void)
2968{ 3093{
2969 struct list_head *head = this_cpu_ptr(&rotation_list); 3094 struct list_head *head = this_cpu_ptr(&active_ctx_list);
2970 struct perf_cpu_context *cpuctx, *tmp; 3095 struct perf_event_context *ctx, *tmp;
2971 struct perf_event_context *ctx;
2972 int throttled; 3096 int throttled;
2973 3097
2974 WARN_ON(!irqs_disabled()); 3098 WARN_ON(!irqs_disabled());
@@ -2976,14 +3100,8 @@ void perf_event_task_tick(void)
2976 __this_cpu_inc(perf_throttled_seq); 3100 __this_cpu_inc(perf_throttled_seq);
2977 throttled = __this_cpu_xchg(perf_throttled_count, 0); 3101 throttled = __this_cpu_xchg(perf_throttled_count, 0);
2978 3102
2979 list_for_each_entry_safe(cpuctx, tmp, head, rotation_list) { 3103 list_for_each_entry_safe(ctx, tmp, head, active_ctx_list)
2980 ctx = &cpuctx->ctx;
2981 perf_adjust_freq_unthr_context(ctx, throttled); 3104 perf_adjust_freq_unthr_context(ctx, throttled);
2982
2983 ctx = cpuctx->task_ctx;
2984 if (ctx)
2985 perf_adjust_freq_unthr_context(ctx, throttled);
2986 }
2987} 3105}
2988 3106
2989static int event_enable_on_exec(struct perf_event *event, 3107static int event_enable_on_exec(struct perf_event *event,
@@ -3142,6 +3260,7 @@ static void __perf_event_init_context(struct perf_event_context *ctx)
3142{ 3260{
3143 raw_spin_lock_init(&ctx->lock); 3261 raw_spin_lock_init(&ctx->lock);
3144 mutex_init(&ctx->mutex); 3262 mutex_init(&ctx->mutex);
3263 INIT_LIST_HEAD(&ctx->active_ctx_list);
3145 INIT_LIST_HEAD(&ctx->pinned_groups); 3264 INIT_LIST_HEAD(&ctx->pinned_groups);
3146 INIT_LIST_HEAD(&ctx->flexible_groups); 3265 INIT_LIST_HEAD(&ctx->flexible_groups);
3147 INIT_LIST_HEAD(&ctx->event_list); 3266 INIT_LIST_HEAD(&ctx->event_list);
@@ -3421,7 +3540,16 @@ static void perf_remove_from_owner(struct perf_event *event)
3421 rcu_read_unlock(); 3540 rcu_read_unlock();
3422 3541
3423 if (owner) { 3542 if (owner) {
3424 mutex_lock(&owner->perf_event_mutex); 3543 /*
3544 * If we're here through perf_event_exit_task() we're already
3545 * holding ctx->mutex which would be an inversion wrt. the
3546 * normal lock order.
3547 *
 3544 3548 * However we can safely take this lock because it's the child
3549 * ctx->mutex.
3550 */
3551 mutex_lock_nested(&owner->perf_event_mutex, SINGLE_DEPTH_NESTING);
3552
3425 /* 3553 /*
3426 * We have to re-check the event->owner field, if it is cleared 3554 * We have to re-check the event->owner field, if it is cleared
3427 * we raced with perf_event_exit_task(), acquiring the mutex 3555 * we raced with perf_event_exit_task(), acquiring the mutex
@@ -3440,7 +3568,7 @@ static void perf_remove_from_owner(struct perf_event *event)
3440 */ 3568 */
3441static void put_event(struct perf_event *event) 3569static void put_event(struct perf_event *event)
3442{ 3570{
3443 struct perf_event_context *ctx = event->ctx; 3571 struct perf_event_context *ctx;
3444 3572
3445 if (!atomic_long_dec_and_test(&event->refcount)) 3573 if (!atomic_long_dec_and_test(&event->refcount))
3446 return; 3574 return;
@@ -3448,7 +3576,6 @@ static void put_event(struct perf_event *event)
3448 if (!is_kernel_event(event)) 3576 if (!is_kernel_event(event))
3449 perf_remove_from_owner(event); 3577 perf_remove_from_owner(event);
3450 3578
3451 WARN_ON_ONCE(ctx->parent_ctx);
3452 /* 3579 /*
3453 * There are two ways this annotation is useful: 3580 * There are two ways this annotation is useful:
3454 * 3581 *
@@ -3461,7 +3588,8 @@ static void put_event(struct perf_event *event)
3461 * the last filedesc died, so there is no possibility 3588 * the last filedesc died, so there is no possibility
3462 * to trigger the AB-BA case. 3589 * to trigger the AB-BA case.
3463 */ 3590 */
3464 mutex_lock_nested(&ctx->mutex, SINGLE_DEPTH_NESTING); 3591 ctx = perf_event_ctx_lock_nested(event, SINGLE_DEPTH_NESTING);
3592 WARN_ON_ONCE(ctx->parent_ctx);
3465 perf_remove_from_context(event, true); 3593 perf_remove_from_context(event, true);
3466 mutex_unlock(&ctx->mutex); 3594 mutex_unlock(&ctx->mutex);
3467 3595
@@ -3547,12 +3675,13 @@ static int perf_event_read_group(struct perf_event *event,
3547 u64 read_format, char __user *buf) 3675 u64 read_format, char __user *buf)
3548{ 3676{
3549 struct perf_event *leader = event->group_leader, *sub; 3677 struct perf_event *leader = event->group_leader, *sub;
3550 int n = 0, size = 0, ret = -EFAULT;
3551 struct perf_event_context *ctx = leader->ctx; 3678 struct perf_event_context *ctx = leader->ctx;
3552 u64 values[5]; 3679 int n = 0, size = 0, ret;
3553 u64 count, enabled, running; 3680 u64 count, enabled, running;
3681 u64 values[5];
3682
3683 lockdep_assert_held(&ctx->mutex);
3554 3684
3555 mutex_lock(&ctx->mutex);
3556 count = perf_event_read_value(leader, &enabled, &running); 3685 count = perf_event_read_value(leader, &enabled, &running);
3557 3686
3558 values[n++] = 1 + leader->nr_siblings; 3687 values[n++] = 1 + leader->nr_siblings;
@@ -3567,7 +3696,7 @@ static int perf_event_read_group(struct perf_event *event,
3567 size = n * sizeof(u64); 3696 size = n * sizeof(u64);
3568 3697
3569 if (copy_to_user(buf, values, size)) 3698 if (copy_to_user(buf, values, size))
3570 goto unlock; 3699 return -EFAULT;
3571 3700
3572 ret = size; 3701 ret = size;
3573 3702
@@ -3581,14 +3710,11 @@ static int perf_event_read_group(struct perf_event *event,
3581 size = n * sizeof(u64); 3710 size = n * sizeof(u64);
3582 3711
3583 if (copy_to_user(buf + ret, values, size)) { 3712 if (copy_to_user(buf + ret, values, size)) {
3584 ret = -EFAULT; 3713 return -EFAULT;
3585 goto unlock;
3586 } 3714 }
3587 3715
3588 ret += size; 3716 ret += size;
3589 } 3717 }
3590unlock:
3591 mutex_unlock(&ctx->mutex);
3592 3718
3593 return ret; 3719 return ret;
3594} 3720}
@@ -3660,8 +3786,14 @@ static ssize_t
3660perf_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) 3786perf_read(struct file *file, char __user *buf, size_t count, loff_t *ppos)
3661{ 3787{
3662 struct perf_event *event = file->private_data; 3788 struct perf_event *event = file->private_data;
3789 struct perf_event_context *ctx;
3790 int ret;
3791
3792 ctx = perf_event_ctx_lock(event);
3793 ret = perf_read_hw(event, buf, count);
3794 perf_event_ctx_unlock(event, ctx);
3663 3795
3664 return perf_read_hw(event, buf, count); 3796 return ret;
3665} 3797}
3666 3798
3667static unsigned int perf_poll(struct file *file, poll_table *wait) 3799static unsigned int perf_poll(struct file *file, poll_table *wait)
@@ -3687,7 +3819,7 @@ static unsigned int perf_poll(struct file *file, poll_table *wait)
3687 return events; 3819 return events;
3688} 3820}
3689 3821
3690static void perf_event_reset(struct perf_event *event) 3822static void _perf_event_reset(struct perf_event *event)
3691{ 3823{
3692 (void)perf_event_read(event); 3824 (void)perf_event_read(event);
3693 local64_set(&event->count, 0); 3825 local64_set(&event->count, 0);
@@ -3706,6 +3838,7 @@ static void perf_event_for_each_child(struct perf_event *event,
3706 struct perf_event *child; 3838 struct perf_event *child;
3707 3839
3708 WARN_ON_ONCE(event->ctx->parent_ctx); 3840 WARN_ON_ONCE(event->ctx->parent_ctx);
3841
3709 mutex_lock(&event->child_mutex); 3842 mutex_lock(&event->child_mutex);
3710 func(event); 3843 func(event);
3711 list_for_each_entry(child, &event->child_list, child_list) 3844 list_for_each_entry(child, &event->child_list, child_list)
@@ -3719,14 +3852,13 @@ static void perf_event_for_each(struct perf_event *event,
3719 struct perf_event_context *ctx = event->ctx; 3852 struct perf_event_context *ctx = event->ctx;
3720 struct perf_event *sibling; 3853 struct perf_event *sibling;
3721 3854
3722 WARN_ON_ONCE(ctx->parent_ctx); 3855 lockdep_assert_held(&ctx->mutex);
3723 mutex_lock(&ctx->mutex); 3856
3724 event = event->group_leader; 3857 event = event->group_leader;
3725 3858
3726 perf_event_for_each_child(event, func); 3859 perf_event_for_each_child(event, func);
3727 list_for_each_entry(sibling, &event->sibling_list, group_entry) 3860 list_for_each_entry(sibling, &event->sibling_list, group_entry)
3728 perf_event_for_each_child(sibling, func); 3861 perf_event_for_each_child(sibling, func);
3729 mutex_unlock(&ctx->mutex);
3730} 3862}
3731 3863
3732static int perf_event_period(struct perf_event *event, u64 __user *arg) 3864static int perf_event_period(struct perf_event *event, u64 __user *arg)
@@ -3796,25 +3928,24 @@ static int perf_event_set_output(struct perf_event *event,
3796 struct perf_event *output_event); 3928 struct perf_event *output_event);
3797static int perf_event_set_filter(struct perf_event *event, void __user *arg); 3929static int perf_event_set_filter(struct perf_event *event, void __user *arg);
3798 3930
3799static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg) 3931static long _perf_ioctl(struct perf_event *event, unsigned int cmd, unsigned long arg)
3800{ 3932{
3801 struct perf_event *event = file->private_data;
3802 void (*func)(struct perf_event *); 3933 void (*func)(struct perf_event *);
3803 u32 flags = arg; 3934 u32 flags = arg;
3804 3935
3805 switch (cmd) { 3936 switch (cmd) {
3806 case PERF_EVENT_IOC_ENABLE: 3937 case PERF_EVENT_IOC_ENABLE:
3807 func = perf_event_enable; 3938 func = _perf_event_enable;
3808 break; 3939 break;
3809 case PERF_EVENT_IOC_DISABLE: 3940 case PERF_EVENT_IOC_DISABLE:
3810 func = perf_event_disable; 3941 func = _perf_event_disable;
3811 break; 3942 break;
3812 case PERF_EVENT_IOC_RESET: 3943 case PERF_EVENT_IOC_RESET:
3813 func = perf_event_reset; 3944 func = _perf_event_reset;
3814 break; 3945 break;
3815 3946
3816 case PERF_EVENT_IOC_REFRESH: 3947 case PERF_EVENT_IOC_REFRESH:
3817 return perf_event_refresh(event, arg); 3948 return _perf_event_refresh(event, arg);
3818 3949
3819 case PERF_EVENT_IOC_PERIOD: 3950 case PERF_EVENT_IOC_PERIOD:
3820 return perf_event_period(event, (u64 __user *)arg); 3951 return perf_event_period(event, (u64 __user *)arg);
@@ -3861,6 +3992,19 @@ static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
3861 return 0; 3992 return 0;
3862} 3993}
3863 3994
3995static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
3996{
3997 struct perf_event *event = file->private_data;
3998 struct perf_event_context *ctx;
3999 long ret;
4000
4001 ctx = perf_event_ctx_lock(event);
4002 ret = _perf_ioctl(event, cmd, arg);
4003 perf_event_ctx_unlock(event, ctx);
4004
4005 return ret;
4006}
4007
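The hunk above is one instance of a pattern used throughout this change: the file-operations entry point takes the event-context lock and then calls an unlocked _perf_* worker, so internal callers that already hold the context can invoke the worker directly. A minimal userspace analogue of that locked-wrapper/unlocked-worker split, with every name and command value below invented purely for illustration:

#include <pthread.h>

struct demo_event {
	pthread_mutex_t ctx_mutex;	/* stands in for perf_event_context::mutex */
	long period;
};

/* Worker: assumes the caller already holds ctx_mutex. */
static long demo_ioctl_locked(struct demo_event *ev, unsigned int cmd, unsigned long arg)
{
	switch (cmd) {
	case 0:				/* "reset" */
		ev->period = 0;
		return 0;
	case 1:				/* "set period" */
		ev->period = (long)arg;
		return 0;
	default:
		return -1;
	}
}

/* Entry point: lock, delegate, unlock -- the shape of perf_ioctl()/_perf_ioctl(). */
static long demo_ioctl(struct demo_event *ev, unsigned int cmd, unsigned long arg)
{
	long ret;

	pthread_mutex_lock(&ev->ctx_mutex);
	ret = demo_ioctl_locked(ev, cmd, arg);
	pthread_mutex_unlock(&ev->ctx_mutex);
	return ret;
}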
3864#ifdef CONFIG_COMPAT 4008#ifdef CONFIG_COMPAT
3865static long perf_compat_ioctl(struct file *file, unsigned int cmd, 4009static long perf_compat_ioctl(struct file *file, unsigned int cmd,
3866 unsigned long arg) 4010 unsigned long arg)
@@ -3883,11 +4027,15 @@ static long perf_compat_ioctl(struct file *file, unsigned int cmd,
3883 4027
3884int perf_event_task_enable(void) 4028int perf_event_task_enable(void)
3885{ 4029{
4030 struct perf_event_context *ctx;
3886 struct perf_event *event; 4031 struct perf_event *event;
3887 4032
3888 mutex_lock(&current->perf_event_mutex); 4033 mutex_lock(&current->perf_event_mutex);
3889 list_for_each_entry(event, &current->perf_event_list, owner_entry) 4034 list_for_each_entry(event, &current->perf_event_list, owner_entry) {
3890 perf_event_for_each_child(event, perf_event_enable); 4035 ctx = perf_event_ctx_lock(event);
4036 perf_event_for_each_child(event, _perf_event_enable);
4037 perf_event_ctx_unlock(event, ctx);
4038 }
3891 mutex_unlock(&current->perf_event_mutex); 4039 mutex_unlock(&current->perf_event_mutex);
3892 4040
3893 return 0; 4041 return 0;
@@ -3895,11 +4043,15 @@ int perf_event_task_enable(void)
3895 4043
3896int perf_event_task_disable(void) 4044int perf_event_task_disable(void)
3897{ 4045{
4046 struct perf_event_context *ctx;
3898 struct perf_event *event; 4047 struct perf_event *event;
3899 4048
3900 mutex_lock(&current->perf_event_mutex); 4049 mutex_lock(&current->perf_event_mutex);
3901 list_for_each_entry(event, &current->perf_event_list, owner_entry) 4050 list_for_each_entry(event, &current->perf_event_list, owner_entry) {
3902 perf_event_for_each_child(event, perf_event_disable); 4051 ctx = perf_event_ctx_lock(event);
4052 perf_event_for_each_child(event, _perf_event_disable);
4053 perf_event_ctx_unlock(event, ctx);
4054 }
3903 mutex_unlock(&current->perf_event_mutex); 4055 mutex_unlock(&current->perf_event_mutex);
3904 4056
3905 return 0; 4057 return 0;
@@ -3949,7 +4101,8 @@ unlock:
3949 rcu_read_unlock(); 4101 rcu_read_unlock();
3950} 4102}
3951 4103
3952void __weak arch_perf_update_userpage(struct perf_event_mmap_page *userpg, u64 now) 4104void __weak arch_perf_update_userpage(
4105 struct perf_event *event, struct perf_event_mmap_page *userpg, u64 now)
3953{ 4106{
3954} 4107}
3955 4108
@@ -3999,7 +4152,7 @@ void perf_event_update_userpage(struct perf_event *event)
3999 userpg->time_running = running + 4152 userpg->time_running = running +
4000 atomic64_read(&event->child_total_time_running); 4153 atomic64_read(&event->child_total_time_running);
4001 4154
4002 arch_perf_update_userpage(userpg, now); 4155 arch_perf_update_userpage(event, userpg, now);
4003 4156
4004 barrier(); 4157 barrier();
4005 ++userpg->lock; 4158 ++userpg->lock;
@@ -4141,6 +4294,9 @@ static void perf_mmap_open(struct vm_area_struct *vma)
4141 4294
4142 atomic_inc(&event->mmap_count); 4295 atomic_inc(&event->mmap_count);
4143 atomic_inc(&event->rb->mmap_count); 4296 atomic_inc(&event->rb->mmap_count);
4297
4298 if (event->pmu->event_mapped)
4299 event->pmu->event_mapped(event);
4144} 4300}
4145 4301
4146/* 4302/*
@@ -4160,6 +4316,9 @@ static void perf_mmap_close(struct vm_area_struct *vma)
4160 int mmap_locked = rb->mmap_locked; 4316 int mmap_locked = rb->mmap_locked;
4161 unsigned long size = perf_data_size(rb); 4317 unsigned long size = perf_data_size(rb);
4162 4318
4319 if (event->pmu->event_unmapped)
4320 event->pmu->event_unmapped(event);
4321
4163 atomic_dec(&rb->mmap_count); 4322 atomic_dec(&rb->mmap_count);
4164 4323
4165 if (!atomic_dec_and_mutex_lock(&event->mmap_count, &event->mmap_mutex)) 4324 if (!atomic_dec_and_mutex_lock(&event->mmap_count, &event->mmap_mutex))
@@ -4361,6 +4520,9 @@ unlock:
4361 vma->vm_flags |= VM_DONTCOPY | VM_DONTEXPAND | VM_DONTDUMP; 4520 vma->vm_flags |= VM_DONTCOPY | VM_DONTEXPAND | VM_DONTDUMP;
4362 vma->vm_ops = &perf_mmap_vmops; 4521 vma->vm_ops = &perf_mmap_vmops;
4363 4522
4523 if (event->pmu->event_mapped)
4524 event->pmu->event_mapped(event);
4525
4364 return ret; 4526 return ret;
4365} 4527}
4366 4528
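With the two hooks added above, a PMU driver can be notified when one of its events is mmap()ed and when the mapping goes away. A sketch of how a driver might wire them up; only the callbacks relevant here are shown, the names are illustrative, and a real struct pmu would also fill in its mandatory callbacks:

#include <linux/perf_event.h>

static void demo_pmu_event_mapped(struct perf_event *event)
{
	/* e.g. enable a user-visible counter register for the mapping task */
}

static void demo_pmu_event_unmapped(struct perf_event *event)
{
	/* revoke whatever event_mapped() granted */
}

static struct pmu demo_pmu = {
	/* .event_init, .add, .del, .start, .stop, .read omitted for brevity */
	.event_mapped	= demo_pmu_event_mapped,
	.event_unmapped	= demo_pmu_event_unmapped,
};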
@@ -5889,6 +6051,8 @@ end:
5889 rcu_read_unlock(); 6051 rcu_read_unlock();
5890} 6052}
5891 6053
6054DEFINE_PER_CPU(struct pt_regs, __perf_regs[4]);
6055
5892int perf_swevent_get_recursion_context(void) 6056int perf_swevent_get_recursion_context(void)
5893{ 6057{
5894 struct swevent_htable *swhash = this_cpu_ptr(&swevent_htable); 6058 struct swevent_htable *swhash = this_cpu_ptr(&swevent_htable);
@@ -5904,21 +6068,30 @@ inline void perf_swevent_put_recursion_context(int rctx)
5904 put_recursion_context(swhash->recursion, rctx); 6068 put_recursion_context(swhash->recursion, rctx);
5905} 6069}
5906 6070
5907void __perf_sw_event(u32 event_id, u64 nr, struct pt_regs *regs, u64 addr) 6071void ___perf_sw_event(u32 event_id, u64 nr, struct pt_regs *regs, u64 addr)
5908{ 6072{
5909 struct perf_sample_data data; 6073 struct perf_sample_data data;
5910 int rctx;
5911 6074
5912 preempt_disable_notrace(); 6075 if (WARN_ON_ONCE(!regs))
5913 rctx = perf_swevent_get_recursion_context();
5914 if (rctx < 0)
5915 return; 6076 return;
5916 6077
5917 perf_sample_data_init(&data, addr, 0); 6078 perf_sample_data_init(&data, addr, 0);
5918
5919 do_perf_sw_event(PERF_TYPE_SOFTWARE, event_id, nr, &data, regs); 6079 do_perf_sw_event(PERF_TYPE_SOFTWARE, event_id, nr, &data, regs);
6080}
6081
6082void __perf_sw_event(u32 event_id, u64 nr, struct pt_regs *regs, u64 addr)
6083{
6084 int rctx;
6085
6086 preempt_disable_notrace();
6087 rctx = perf_swevent_get_recursion_context();
6088 if (unlikely(rctx < 0))
6089 goto fail;
6090
6091 ___perf_sw_event(event_id, nr, regs, addr);
5920 6092
5921 perf_swevent_put_recursion_context(rctx); 6093 perf_swevent_put_recursion_context(rctx);
6094fail:
5922 preempt_enable_notrace(); 6095 preempt_enable_notrace();
5923} 6096}
5924 6097
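The split above separates the recursion bookkeeping from the sample emission: __perf_sw_event() still disables preemption and takes a recursion context, while callers that already hold both can use ___perf_sw_event() directly and skip the setup. A userspace analogue of that guard-plus-worker arrangement, using a thread-local flag in place of the per-CPU recursion context (all names invented for illustration):

#include <stdio.h>

static __thread int in_handler;	/* per-thread recursion guard */

/* Inner helper: assumes the recursion guard is already held by the caller. */
static void emit_event(int id, long value)
{
	printf("event %d: %ld\n", id, value);
}

/* Outer helper: takes the guard, drops the event on recursion, then delegates. */
static void emit_event_guarded(int id, long value)
{
	if (in_handler)
		return;
	in_handler = 1;
	emit_event(id, value);
	in_handler = 0;
}

int main(void)
{
	emit_event_guarded(1, 42);
	return 0;
}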
@@ -6780,7 +6953,6 @@ skip_type:
6780 6953
6781 __perf_cpu_hrtimer_init(cpuctx, cpu); 6954 __perf_cpu_hrtimer_init(cpuctx, cpu);
6782 6955
6783 INIT_LIST_HEAD(&cpuctx->rotation_list);
6784 cpuctx->unique_pmu = pmu; 6956 cpuctx->unique_pmu = pmu;
6785 } 6957 }
6786 6958
@@ -6853,6 +7025,20 @@ void perf_pmu_unregister(struct pmu *pmu)
6853} 7025}
6854EXPORT_SYMBOL_GPL(perf_pmu_unregister); 7026EXPORT_SYMBOL_GPL(perf_pmu_unregister);
6855 7027
7028static int perf_try_init_event(struct pmu *pmu, struct perf_event *event)
7029{
7030 int ret;
7031
7032 if (!try_module_get(pmu->module))
7033 return -ENODEV;
7034 event->pmu = pmu;
7035 ret = pmu->event_init(event);
7036 if (ret)
7037 module_put(pmu->module);
7038
7039 return ret;
7040}
7041
6856struct pmu *perf_init_event(struct perf_event *event) 7042struct pmu *perf_init_event(struct perf_event *event)
6857{ 7043{
6858 struct pmu *pmu = NULL; 7044 struct pmu *pmu = NULL;
@@ -6865,24 +7051,14 @@ struct pmu *perf_init_event(struct perf_event *event)
6865 pmu = idr_find(&pmu_idr, event->attr.type); 7051 pmu = idr_find(&pmu_idr, event->attr.type);
6866 rcu_read_unlock(); 7052 rcu_read_unlock();
6867 if (pmu) { 7053 if (pmu) {
6868 if (!try_module_get(pmu->module)) { 7054 ret = perf_try_init_event(pmu, event);
6869 pmu = ERR_PTR(-ENODEV);
6870 goto unlock;
6871 }
6872 event->pmu = pmu;
6873 ret = pmu->event_init(event);
6874 if (ret) 7055 if (ret)
6875 pmu = ERR_PTR(ret); 7056 pmu = ERR_PTR(ret);
6876 goto unlock; 7057 goto unlock;
6877 } 7058 }
6878 7059
6879 list_for_each_entry_rcu(pmu, &pmus, entry) { 7060 list_for_each_entry_rcu(pmu, &pmus, entry) {
6880 if (!try_module_get(pmu->module)) { 7061 ret = perf_try_init_event(pmu, event);
6881 pmu = ERR_PTR(-ENODEV);
6882 goto unlock;
6883 }
6884 event->pmu = pmu;
6885 ret = pmu->event_init(event);
6886 if (!ret) 7062 if (!ret)
6887 goto unlock; 7063 goto unlock;
6888 7064
@@ -7246,6 +7422,15 @@ out:
7246 return ret; 7422 return ret;
7247} 7423}
7248 7424
7425static void mutex_lock_double(struct mutex *a, struct mutex *b)
7426{
7427 if (b < a)
7428 swap(a, b);
7429
7430 mutex_lock(a);
7431 mutex_lock_nested(b, SINGLE_DEPTH_NESTING);
7432}
7433
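mutex_lock_double() avoids an ABBA deadlock between the two contexts by always acquiring the pair in a fixed order, here lowest address first, so any two paths that take the same pair of locks agree on the ordering. The same idiom in userspace pthreads (illustrative only):

#include <pthread.h>

/* Lock two mutexes in a canonical (address) order to avoid ABBA deadlock. */
static void lock_double(pthread_mutex_t *a, pthread_mutex_t *b)
{
	if (b < a) {		/* compare addresses, take the lower one first */
		pthread_mutex_t *tmp = a;
		a = b;
		b = tmp;
	}
	pthread_mutex_lock(a);
	pthread_mutex_lock(b);
}

static void unlock_double(pthread_mutex_t *a, pthread_mutex_t *b)
{
	pthread_mutex_unlock(b);
	pthread_mutex_unlock(a);
}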
7249/** 7434/**
7250 * sys_perf_event_open - open a performance event, associate it to a task/cpu 7435 * sys_perf_event_open - open a performance event, associate it to a task/cpu
7251 * 7436 *
@@ -7261,7 +7446,7 @@ SYSCALL_DEFINE5(perf_event_open,
7261 struct perf_event *group_leader = NULL, *output_event = NULL; 7446 struct perf_event *group_leader = NULL, *output_event = NULL;
7262 struct perf_event *event, *sibling; 7447 struct perf_event *event, *sibling;
7263 struct perf_event_attr attr; 7448 struct perf_event_attr attr;
7264 struct perf_event_context *ctx; 7449 struct perf_event_context *ctx, *uninitialized_var(gctx);
7265 struct file *event_file = NULL; 7450 struct file *event_file = NULL;
7266 struct fd group = {NULL, 0}; 7451 struct fd group = {NULL, 0};
7267 struct task_struct *task = NULL; 7452 struct task_struct *task = NULL;
@@ -7459,43 +7644,68 @@ SYSCALL_DEFINE5(perf_event_open,
7459 } 7644 }
7460 7645
7461 if (move_group) { 7646 if (move_group) {
7462 struct perf_event_context *gctx = group_leader->ctx; 7647 gctx = group_leader->ctx;
7463
7464 mutex_lock(&gctx->mutex);
7465 perf_remove_from_context(group_leader, false);
7466 7648
7467 /* 7649 /*
7468 * Removing from the context ends up with disabled 7650 * See perf_event_ctx_lock() for comments on the details
7469 * event. What we want here is event in the initial 7651 * of swizzling perf_event::ctx.
7470 * startup state, ready to be add into new context.
7471 */ 7652 */
7472 perf_event__state_init(group_leader); 7653 mutex_lock_double(&gctx->mutex, &ctx->mutex);
7654
7655 perf_remove_from_context(group_leader, false);
7656
7473 list_for_each_entry(sibling, &group_leader->sibling_list, 7657 list_for_each_entry(sibling, &group_leader->sibling_list,
7474 group_entry) { 7658 group_entry) {
7475 perf_remove_from_context(sibling, false); 7659 perf_remove_from_context(sibling, false);
7476 perf_event__state_init(sibling);
7477 put_ctx(gctx); 7660 put_ctx(gctx);
7478 } 7661 }
7479 mutex_unlock(&gctx->mutex); 7662 } else {
7480 put_ctx(gctx); 7663 mutex_lock(&ctx->mutex);
7481 } 7664 }
7482 7665
7483 WARN_ON_ONCE(ctx->parent_ctx); 7666 WARN_ON_ONCE(ctx->parent_ctx);
7484 mutex_lock(&ctx->mutex);
7485 7667
7486 if (move_group) { 7668 if (move_group) {
7669 /*
7670 * Wait for everybody to stop referencing the events through
 7671 * the old lists, before installing them on the new lists.
7672 */
7487 synchronize_rcu(); 7673 synchronize_rcu();
7488 perf_install_in_context(ctx, group_leader, group_leader->cpu); 7674
7489 get_ctx(ctx); 7675 /*
7676 * Install the group siblings before the group leader.
7677 *
7678 * Because a group leader will try and install the entire group
 7679 * (through the sibling list, which is still intact), we can
7680 * end up with siblings installed in the wrong context.
7681 *
7682 * By installing siblings first we NO-OP because they're not
7683 * reachable through the group lists.
7684 */
7490 list_for_each_entry(sibling, &group_leader->sibling_list, 7685 list_for_each_entry(sibling, &group_leader->sibling_list,
7491 group_entry) { 7686 group_entry) {
7687 perf_event__state_init(sibling);
7492 perf_install_in_context(ctx, sibling, sibling->cpu); 7688 perf_install_in_context(ctx, sibling, sibling->cpu);
7493 get_ctx(ctx); 7689 get_ctx(ctx);
7494 } 7690 }
7691
7692 /*
 7693 * Removing from the context ends up with a disabled
 7694 * event. What we want here is the event in its initial
 7695 * startup state, ready to be added into the new context.
7696 */
7697 perf_event__state_init(group_leader);
7698 perf_install_in_context(ctx, group_leader, group_leader->cpu);
7699 get_ctx(ctx);
7495 } 7700 }
7496 7701
7497 perf_install_in_context(ctx, event, event->cpu); 7702 perf_install_in_context(ctx, event, event->cpu);
7498 perf_unpin_context(ctx); 7703 perf_unpin_context(ctx);
7704
7705 if (move_group) {
7706 mutex_unlock(&gctx->mutex);
7707 put_ctx(gctx);
7708 }
7499 mutex_unlock(&ctx->mutex); 7709 mutex_unlock(&ctx->mutex);
7500 7710
7501 put_online_cpus(); 7711 put_online_cpus();
@@ -7603,7 +7813,11 @@ void perf_pmu_migrate_context(struct pmu *pmu, int src_cpu, int dst_cpu)
7603 src_ctx = &per_cpu_ptr(pmu->pmu_cpu_context, src_cpu)->ctx; 7813 src_ctx = &per_cpu_ptr(pmu->pmu_cpu_context, src_cpu)->ctx;
7604 dst_ctx = &per_cpu_ptr(pmu->pmu_cpu_context, dst_cpu)->ctx; 7814 dst_ctx = &per_cpu_ptr(pmu->pmu_cpu_context, dst_cpu)->ctx;
7605 7815
7606 mutex_lock(&src_ctx->mutex); 7816 /*
7817 * See perf_event_ctx_lock() for comments on the details
7818 * of swizzling perf_event::ctx.
7819 */
7820 mutex_lock_double(&src_ctx->mutex, &dst_ctx->mutex);
7607 list_for_each_entry_safe(event, tmp, &src_ctx->event_list, 7821 list_for_each_entry_safe(event, tmp, &src_ctx->event_list,
7608 event_entry) { 7822 event_entry) {
7609 perf_remove_from_context(event, false); 7823 perf_remove_from_context(event, false);
@@ -7611,11 +7825,36 @@ void perf_pmu_migrate_context(struct pmu *pmu, int src_cpu, int dst_cpu)
7611 put_ctx(src_ctx); 7825 put_ctx(src_ctx);
7612 list_add(&event->migrate_entry, &events); 7826 list_add(&event->migrate_entry, &events);
7613 } 7827 }
7614 mutex_unlock(&src_ctx->mutex);
7615 7828
7829 /*
7830 * Wait for the events to quiesce before re-instating them.
7831 */
7616 synchronize_rcu(); 7832 synchronize_rcu();
7617 7833
7618 mutex_lock(&dst_ctx->mutex); 7834 /*
7835 * Re-instate events in 2 passes.
7836 *
 7837 * Skip over group leaders and only install siblings on this first
 7838 * pass; siblings will not get enabled without a leader, but a
 7839 * leader will enable its siblings, even if those are still on the
 7840 * old context.
7841 */
7842 list_for_each_entry_safe(event, tmp, &events, migrate_entry) {
7843 if (event->group_leader == event)
7844 continue;
7845
7846 list_del(&event->migrate_entry);
7847 if (event->state >= PERF_EVENT_STATE_OFF)
7848 event->state = PERF_EVENT_STATE_INACTIVE;
7849 account_event_cpu(event, dst_cpu);
7850 perf_install_in_context(dst_ctx, event, dst_cpu);
7851 get_ctx(dst_ctx);
7852 }
7853
7854 /*
 7855 * Once all the siblings are set up properly, install the group leaders
7856 * to make it go.
7857 */
7619 list_for_each_entry_safe(event, tmp, &events, migrate_entry) { 7858 list_for_each_entry_safe(event, tmp, &events, migrate_entry) {
7620 list_del(&event->migrate_entry); 7859 list_del(&event->migrate_entry);
7621 if (event->state >= PERF_EVENT_STATE_OFF) 7860 if (event->state >= PERF_EVENT_STATE_OFF)
@@ -7625,6 +7864,7 @@ void perf_pmu_migrate_context(struct pmu *pmu, int src_cpu, int dst_cpu)
7625 get_ctx(dst_ctx); 7864 get_ctx(dst_ctx);
7626 } 7865 }
7627 mutex_unlock(&dst_ctx->mutex); 7866 mutex_unlock(&dst_ctx->mutex);
7867 mutex_unlock(&src_ctx->mutex);
7628} 7868}
7629EXPORT_SYMBOL_GPL(perf_pmu_migrate_context); 7869EXPORT_SYMBOL_GPL(perf_pmu_migrate_context);
7630 7870
@@ -7811,14 +8051,19 @@ static void perf_free_event(struct perf_event *event,
7811 8051
7812 put_event(parent); 8052 put_event(parent);
7813 8053
8054 raw_spin_lock_irq(&ctx->lock);
7814 perf_group_detach(event); 8055 perf_group_detach(event);
7815 list_del_event(event, ctx); 8056 list_del_event(event, ctx);
8057 raw_spin_unlock_irq(&ctx->lock);
7816 free_event(event); 8058 free_event(event);
7817} 8059}
7818 8060
7819/* 8061/*
7820 * free an unexposed, unused context as created by inheritance by 8062 * Free an unexposed, unused context as created by inheritance by
7821 * perf_event_init_task below, used by fork() in case of fail. 8063 * perf_event_init_task below, used by fork() in case of fail.
8064 *
8065 * Not all locks are strictly required, but take them anyway to be nice and
8066 * help out with the lockdep assertions.
7822 */ 8067 */
7823void perf_event_free_task(struct task_struct *task) 8068void perf_event_free_task(struct task_struct *task)
7824{ 8069{
@@ -8137,7 +8382,7 @@ static void __init perf_event_init_all_cpus(void)
8137 for_each_possible_cpu(cpu) { 8382 for_each_possible_cpu(cpu) {
8138 swhash = &per_cpu(swevent_htable, cpu); 8383 swhash = &per_cpu(swevent_htable, cpu);
8139 mutex_init(&swhash->hlist_mutex); 8384 mutex_init(&swhash->hlist_mutex);
8140 INIT_LIST_HEAD(&per_cpu(rotation_list, cpu)); 8385 INIT_LIST_HEAD(&per_cpu(active_ctx_list, cpu));
8141 } 8386 }
8142} 8387}
8143 8388
@@ -8158,22 +8403,11 @@ static void perf_event_init_cpu(int cpu)
8158} 8403}
8159 8404
8160#if defined CONFIG_HOTPLUG_CPU || defined CONFIG_KEXEC 8405#if defined CONFIG_HOTPLUG_CPU || defined CONFIG_KEXEC
8161static void perf_pmu_rotate_stop(struct pmu *pmu)
8162{
8163 struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
8164
8165 WARN_ON(!irqs_disabled());
8166
8167 list_del_init(&cpuctx->rotation_list);
8168}
8169
8170static void __perf_event_exit_context(void *__info) 8406static void __perf_event_exit_context(void *__info)
8171{ 8407{
8172 struct remove_event re = { .detach_group = true }; 8408 struct remove_event re = { .detach_group = true };
8173 struct perf_event_context *ctx = __info; 8409 struct perf_event_context *ctx = __info;
8174 8410
8175 perf_pmu_rotate_stop(ctx->pmu);
8176
8177 rcu_read_lock(); 8411 rcu_read_lock();
8178 list_for_each_entry_rcu(re.event, &ctx->event_list, event_entry) 8412 list_for_each_entry_rcu(re.event, &ctx->event_list, event_entry)
8179 __perf_remove_from_context(&re); 8413 __perf_remove_from_context(&re);
@@ -8284,6 +8518,18 @@ void __init perf_event_init(void)
8284 != 1024); 8518 != 1024);
8285} 8519}
8286 8520
8521ssize_t perf_event_sysfs_show(struct device *dev, struct device_attribute *attr,
8522 char *page)
8523{
8524 struct perf_pmu_events_attr *pmu_attr =
8525 container_of(attr, struct perf_pmu_events_attr, attr);
8526
8527 if (pmu_attr->event_str)
8528 return sprintf(page, "%s\n", pmu_attr->event_str);
8529
8530 return 0;
8531}
8532
8287static int __init perf_event_sysfs_init(void) 8533static int __init perf_event_sysfs_init(void)
8288{ 8534{
8289 struct pmu *pmu; 8535 struct pmu *pmu;
diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c
index 146a5792b1d2..eadb95ce7aac 100644
--- a/kernel/events/ring_buffer.c
+++ b/kernel/events/ring_buffer.c
@@ -13,12 +13,13 @@
13#include <linux/vmalloc.h> 13#include <linux/vmalloc.h>
14#include <linux/slab.h> 14#include <linux/slab.h>
15#include <linux/circ_buf.h> 15#include <linux/circ_buf.h>
16#include <linux/poll.h>
16 17
17#include "internal.h" 18#include "internal.h"
18 19
19static void perf_output_wakeup(struct perf_output_handle *handle) 20static void perf_output_wakeup(struct perf_output_handle *handle)
20{ 21{
21 atomic_set(&handle->rb->poll, POLL_IN); 22 atomic_set(&handle->rb->poll, POLLIN);
22 23
23 handle->event->pending_wakeup = 1; 24 handle->event->pending_wakeup = 1;
24 irq_work_queue(&handle->event->pending); 25 irq_work_queue(&handle->event->pending);
diff --git a/kernel/exit.c b/kernel/exit.c
index 6806c55475ee..feff10bbb307 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -435,7 +435,8 @@ static void exit_mm(struct task_struct *tsk)
435 task_unlock(tsk); 435 task_unlock(tsk);
436 mm_update_next_owner(mm); 436 mm_update_next_owner(mm);
437 mmput(mm); 437 mmput(mm);
438 clear_thread_flag(TIF_MEMDIE); 438 if (test_thread_flag(TIF_MEMDIE))
439 unmark_oom_victim();
439} 440}
440 441
441static struct task_struct *find_alive_thread(struct task_struct *p) 442static struct task_struct *find_alive_thread(struct task_struct *p)
diff --git a/kernel/fork.c b/kernel/fork.c
index 4dc2ddade9f1..cf65139615a0 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -438,12 +438,8 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
438 atomic_inc(&mapping->i_mmap_writable); 438 atomic_inc(&mapping->i_mmap_writable);
439 flush_dcache_mmap_lock(mapping); 439 flush_dcache_mmap_lock(mapping);
440 /* insert tmp into the share list, just after mpnt */ 440 /* insert tmp into the share list, just after mpnt */
441 if (unlikely(tmp->vm_flags & VM_NONLINEAR)) 441 vma_interval_tree_insert_after(tmp, mpnt,
442 vma_nonlinear_insert(tmp, 442 &mapping->i_mmap);
443 &mapping->i_mmap_nonlinear);
444 else
445 vma_interval_tree_insert_after(tmp, mpnt,
446 &mapping->i_mmap);
447 flush_dcache_mmap_unlock(mapping); 443 flush_dcache_mmap_unlock(mapping);
448 i_mmap_unlock_write(mapping); 444 i_mmap_unlock_write(mapping);
449 } 445 }
@@ -559,6 +555,7 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p)
559 INIT_LIST_HEAD(&mm->mmlist); 555 INIT_LIST_HEAD(&mm->mmlist);
560 mm->core_state = NULL; 556 mm->core_state = NULL;
561 atomic_long_set(&mm->nr_ptes, 0); 557 atomic_long_set(&mm->nr_ptes, 0);
558 mm_nr_pmds_init(mm);
562 mm->map_count = 0; 559 mm->map_count = 0;
563 mm->locked_vm = 0; 560 mm->locked_vm = 0;
564 mm->pinned_vm = 0; 561 mm->pinned_vm = 0;
@@ -607,6 +604,14 @@ static void check_mm(struct mm_struct *mm)
607 printk(KERN_ALERT "BUG: Bad rss-counter state " 604 printk(KERN_ALERT "BUG: Bad rss-counter state "
608 "mm:%p idx:%d val:%ld\n", mm, i, x); 605 "mm:%p idx:%d val:%ld\n", mm, i, x);
609 } 606 }
607
608 if (atomic_long_read(&mm->nr_ptes))
609 pr_alert("BUG: non-zero nr_ptes on freeing mm: %ld\n",
610 atomic_long_read(&mm->nr_ptes));
611 if (mm_nr_pmds(mm))
612 pr_alert("BUG: non-zero nr_pmds on freeing mm: %ld\n",
613 mm_nr_pmds(mm));
614
610#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS 615#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS
611 VM_BUG_ON_MM(mm->pmd_huge_pte, mm); 616 VM_BUG_ON_MM(mm->pmd_huge_pte, mm);
612#endif 617#endif
diff --git a/kernel/futex.c b/kernel/futex.c
index 63678b573d61..2a5e3830e953 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -2217,7 +2217,7 @@ retry:
2217 if (!abs_time) 2217 if (!abs_time)
2218 goto out; 2218 goto out;
2219 2219
2220 restart = &current_thread_info()->restart_block; 2220 restart = &current->restart_block;
2221 restart->fn = futex_wait_restart; 2221 restart->fn = futex_wait_restart;
2222 restart->futex.uaddr = uaddr; 2222 restart->futex.uaddr = uaddr;
2223 restart->futex.val = val; 2223 restart->futex.val = val;
@@ -2258,7 +2258,7 @@ static long futex_wait_restart(struct restart_block *restart)
2258 * if there are waiters then it will block, it does PI, etc. (Due to 2258 * if there are waiters then it will block, it does PI, etc. (Due to
2259 * races the kernel might see a 0 value of the futex too.) 2259 * races the kernel might see a 0 value of the futex too.)
2260 */ 2260 */
2261static int futex_lock_pi(u32 __user *uaddr, unsigned int flags, int detect, 2261static int futex_lock_pi(u32 __user *uaddr, unsigned int flags,
2262 ktime_t *time, int trylock) 2262 ktime_t *time, int trylock)
2263{ 2263{
2264 struct hrtimer_sleeper timeout, *to = NULL; 2264 struct hrtimer_sleeper timeout, *to = NULL;
@@ -2953,11 +2953,11 @@ long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout,
2953 case FUTEX_WAKE_OP: 2953 case FUTEX_WAKE_OP:
2954 return futex_wake_op(uaddr, flags, uaddr2, val, val2, val3); 2954 return futex_wake_op(uaddr, flags, uaddr2, val, val2, val3);
2955 case FUTEX_LOCK_PI: 2955 case FUTEX_LOCK_PI:
2956 return futex_lock_pi(uaddr, flags, val, timeout, 0); 2956 return futex_lock_pi(uaddr, flags, timeout, 0);
2957 case FUTEX_UNLOCK_PI: 2957 case FUTEX_UNLOCK_PI:
2958 return futex_unlock_pi(uaddr, flags); 2958 return futex_unlock_pi(uaddr, flags);
2959 case FUTEX_TRYLOCK_PI: 2959 case FUTEX_TRYLOCK_PI:
2960 return futex_lock_pi(uaddr, flags, 0, timeout, 1); 2960 return futex_lock_pi(uaddr, flags, NULL, 1);
2961 case FUTEX_WAIT_REQUEUE_PI: 2961 case FUTEX_WAIT_REQUEUE_PI:
2962 val3 = FUTEX_BITSET_MATCH_ANY; 2962 val3 = FUTEX_BITSET_MATCH_ANY;
2963 return futex_wait_requeue_pi(uaddr, flags, val, timeout, val3, 2963 return futex_wait_requeue_pi(uaddr, flags, val, timeout, val3,
diff --git a/kernel/gcov/Makefile b/kernel/gcov/Makefile
index 52aa7e8de927..752d6486b67e 100644
--- a/kernel/gcov/Makefile
+++ b/kernel/gcov/Makefile
@@ -1,33 +1,7 @@
1ccflags-y := -DSRCTREE='"$(srctree)"' -DOBJTREE='"$(objtree)"' 1ccflags-y := -DSRCTREE='"$(srctree)"' -DOBJTREE='"$(objtree)"'
2 2
3# if-lt 3obj-y := base.o fs.o
4# Usage VAR := $(call if-lt, $(a), $(b)) 4obj-$(CONFIG_GCOV_FORMAT_3_4) += gcc_3_4.o
5# Returns 1 if (a < b) 5obj-$(CONFIG_GCOV_FORMAT_4_7) += gcc_4_7.o
6if-lt = $(shell [ $(1) -lt $(2) ] && echo 1) 6obj-$(CONFIG_GCOV_FORMAT_AUTODETECT) += $(call cc-ifversion, -lt, 0407, \
7 7 gcc_3_4.o, gcc_4_7.o)
8ifeq ($(CONFIG_GCOV_FORMAT_3_4),y)
9 cc-ver := 0304
10else ifeq ($(CONFIG_GCOV_FORMAT_4_7),y)
11 cc-ver := 0407
12else
13# Use cc-version if available, otherwise set 0
14#
15# scripts/Kbuild.include, which contains cc-version function, is not included
16# during make clean "make -f scripts/Makefile.clean obj=kernel/gcov"
17# Meaning cc-ver is empty causing if-lt test to fail with
18# "/bin/sh: line 0: [: -lt: unary operator expected" error mesage.
19# This has no affect on the clean phase, but the error message could be
20# confusing/annoying. So this dummy workaround sets cc-ver to zero if cc-version
21# is not available. We can probably move if-lt to Kbuild.include, so it's also
22# not defined during clean or to include Kbuild.include in
23# scripts/Makefile.clean. But the following workaround seems least invasive.
24 cc-ver := $(if $(call cc-version),$(call cc-version),0)
25endif
26
27obj-$(CONFIG_GCOV_KERNEL) := base.o fs.o
28
29ifeq ($(call if-lt, $(cc-ver), 0407),1)
30 obj-$(CONFIG_GCOV_KERNEL) += gcc_3_4.o
31else
32 obj-$(CONFIG_GCOV_KERNEL) += gcc_4_7.o
33endif
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 80692373abd6..196a06fbc122 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -243,6 +243,9 @@ int irq_set_affinity_hint(unsigned int irq, const struct cpumask *m)
243 return -EINVAL; 243 return -EINVAL;
244 desc->affinity_hint = m; 244 desc->affinity_hint = m;
245 irq_put_desc_unlock(desc, flags); 245 irq_put_desc_unlock(desc, flags);
 246 /* set the initial affinity to prevent every interrupt from being on CPU0 */
247 if (m)
248 __irq_set_affinity(irq, m, false);
246 return 0; 249 return 0;
247} 250}
248EXPORT_SYMBOL_GPL(irq_set_affinity_hint); 251EXPORT_SYMBOL_GPL(irq_set_affinity_hint);
diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c
index 9dc9bfd8a678..df2f4642d1e7 100644
--- a/kernel/irq/proc.c
+++ b/kernel/irq/proc.c
@@ -46,10 +46,9 @@ static int show_irq_affinity(int type, struct seq_file *m, void *v)
46 mask = desc->pending_mask; 46 mask = desc->pending_mask;
47#endif 47#endif
48 if (type) 48 if (type)
49 seq_cpumask_list(m, mask); 49 seq_printf(m, "%*pbl\n", cpumask_pr_args(mask));
50 else 50 else
51 seq_cpumask(m, mask); 51 seq_printf(m, "%*pb\n", cpumask_pr_args(mask));
52 seq_putc(m, '\n');
53 return 0; 52 return 0;
54} 53}
55 54
@@ -67,8 +66,7 @@ static int irq_affinity_hint_proc_show(struct seq_file *m, void *v)
67 cpumask_copy(mask, desc->affinity_hint); 66 cpumask_copy(mask, desc->affinity_hint);
68 raw_spin_unlock_irqrestore(&desc->lock, flags); 67 raw_spin_unlock_irqrestore(&desc->lock, flags);
69 68
70 seq_cpumask(m, mask); 69 seq_printf(m, "%*pb\n", cpumask_pr_args(mask));
71 seq_putc(m, '\n');
72 free_cpumask_var(mask); 70 free_cpumask_var(mask);
73 71
74 return 0; 72 return 0;
@@ -186,8 +184,7 @@ static const struct file_operations irq_affinity_list_proc_fops = {
186 184
187static int default_affinity_show(struct seq_file *m, void *v) 185static int default_affinity_show(struct seq_file *m, void *v)
188{ 186{
189 seq_cpumask(m, irq_default_affinity); 187 seq_printf(m, "%*pb\n", cpumask_pr_args(irq_default_affinity));
190 seq_putc(m, '\n');
191 return 0; 188 return 0;
192} 189}
193 190
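All three hunks in this file make the same substitution: the removed seq_cpumask()/seq_cpumask_list() helpers become the bitmap printf extensions, where %*pb prints the mask in hex and %*pbl prints it as a CPU list, with cpumask_pr_args() supplying the field width (nr_cpu_ids) and the bit pointer. A sketch of the new idiom (kernel context assumed; the function name is illustrative):

#include <linux/cpumask.h>
#include <linux/seq_file.h>

static int demo_affinity_show(struct seq_file *m, const struct cpumask *mask)
{
	seq_printf(m, "%*pb\n", cpumask_pr_args(mask));		/* e.g. "ff" */
	seq_printf(m, "%*pbl\n", cpumask_pr_args(mask));	/* e.g. "0-7" */
	return 0;
}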
diff --git a/kernel/kexec.c b/kernel/kexec.c
index 9a8a01abbaed..38c25b1f2fd5 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -444,7 +444,7 @@ arch_kexec_apply_relocations(const Elf_Ehdr *ehdr, Elf_Shdr *sechdrs,
444} 444}
445 445
446/* 446/*
447 * Free up memory used by kernel, initrd, and comand line. This is temporary 447 * Free up memory used by kernel, initrd, and command line. This is temporary
448 * memory allocation which is not needed any more after these buffers have 448 * memory allocation which is not needed any more after these buffers have
449 * been loaded into separate segments and have been copied elsewhere. 449 * been loaded into separate segments and have been copied elsewhere.
450 */ 450 */
@@ -856,8 +856,6 @@ static int kimage_set_destination(struct kimage *image,
856 856
857 destination &= PAGE_MASK; 857 destination &= PAGE_MASK;
858 result = kimage_add_entry(image, destination | IND_DESTINATION); 858 result = kimage_add_entry(image, destination | IND_DESTINATION);
859 if (result == 0)
860 image->destination = destination;
861 859
862 return result; 860 return result;
863} 861}
@@ -869,8 +867,6 @@ static int kimage_add_page(struct kimage *image, unsigned long page)
869 867
870 page &= PAGE_MASK; 868 page &= PAGE_MASK;
871 result = kimage_add_entry(image, page | IND_SOURCE); 869 result = kimage_add_entry(image, page | IND_SOURCE);
872 if (result == 0)
873 image->destination += PAGE_SIZE;
874 870
875 return result; 871 return result;
876} 872}
@@ -1288,19 +1284,22 @@ SYSCALL_DEFINE4(kexec_load, unsigned long, entry, unsigned long, nr_segments,
1288 if (nr_segments > 0) { 1284 if (nr_segments > 0) {
1289 unsigned long i; 1285 unsigned long i;
1290 1286
1291 /* Loading another kernel to reboot into */ 1287 if (flags & KEXEC_ON_CRASH) {
1292 if ((flags & KEXEC_ON_CRASH) == 0) 1288 /*
1293 result = kimage_alloc_init(&image, entry, nr_segments, 1289 * Loading another kernel to switch to if this one
1294 segments, flags); 1290 * crashes. Free any current crash dump kernel before
1295 /* Loading another kernel to switch to if this one crashes */
1296 else if (flags & KEXEC_ON_CRASH) {
1297 /* Free any current crash dump kernel before
1298 * we corrupt it. 1291 * we corrupt it.
1299 */ 1292 */
1293
1300 kimage_free(xchg(&kexec_crash_image, NULL)); 1294 kimage_free(xchg(&kexec_crash_image, NULL));
1301 result = kimage_alloc_init(&image, entry, nr_segments, 1295 result = kimage_alloc_init(&image, entry, nr_segments,
1302 segments, flags); 1296 segments, flags);
1303 crash_map_reserved_pages(); 1297 crash_map_reserved_pages();
1298 } else {
1299 /* Loading another kernel to reboot into. */
1300
1301 result = kimage_alloc_init(&image, entry, nr_segments,
1302 segments, flags);
1304 } 1303 }
1305 if (result) 1304 if (result)
1306 goto out; 1305 goto out;
@@ -2512,7 +2511,7 @@ static int kexec_apply_relocations(struct kimage *image)
2512 continue; 2511 continue;
2513 2512
2514 /* 2513 /*
2515 * Respective archicture needs to provide support for applying 2514 * Respective architecture needs to provide support for applying
2516 * relocations of type SHT_RELA/SHT_REL. 2515 * relocations of type SHT_RELA/SHT_REL.
2517 */ 2516 */
2518 if (sechdrs[i].sh_type == SHT_RELA) 2517 if (sechdrs[i].sh_type == SHT_RELA)
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index ee619929cf90..c90e417bb963 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -717,7 +717,7 @@ static void prepare_optimized_kprobe(struct kprobe *p)
717 struct optimized_kprobe *op; 717 struct optimized_kprobe *op;
718 718
719 op = container_of(p, struct optimized_kprobe, kp); 719 op = container_of(p, struct optimized_kprobe, kp);
720 arch_prepare_optimized_kprobe(op); 720 arch_prepare_optimized_kprobe(op, p);
721} 721}
722 722
723/* Allocate new optimized_kprobe and try to prepare optimized instructions */ 723/* Allocate new optimized_kprobe and try to prepare optimized instructions */
@@ -731,7 +731,7 @@ static struct kprobe *alloc_aggr_kprobe(struct kprobe *p)
731 731
732 INIT_LIST_HEAD(&op->list); 732 INIT_LIST_HEAD(&op->list);
733 op->kp.addr = p->addr; 733 op->kp.addr = p->addr;
734 arch_prepare_optimized_kprobe(op); 734 arch_prepare_optimized_kprobe(op, p);
735 735
736 return &op->kp; 736 return &op->kp;
737} 737}
@@ -869,7 +869,8 @@ static void __disarm_kprobe(struct kprobe *p, bool reopt)
869{ 869{
870 struct kprobe *_p; 870 struct kprobe *_p;
871 871
872 unoptimize_kprobe(p, false); /* Try to unoptimize */ 872 /* Try to unoptimize */
873 unoptimize_kprobe(p, kprobes_all_disarmed);
873 874
874 if (!kprobe_queued(p)) { 875 if (!kprobe_queued(p)) {
875 arch_disarm_kprobe(p); 876 arch_disarm_kprobe(p);
@@ -1571,7 +1572,13 @@ static struct kprobe *__disable_kprobe(struct kprobe *p)
1571 1572
1572 /* Try to disarm and disable this/parent probe */ 1573 /* Try to disarm and disable this/parent probe */
1573 if (p == orig_p || aggr_kprobe_disabled(orig_p)) { 1574 if (p == orig_p || aggr_kprobe_disabled(orig_p)) {
1574 disarm_kprobe(orig_p, true); 1575 /*
1576 * If kprobes_all_disarmed is set, orig_p
1577 * should have already been disarmed, so
 1578 * skip the unneeded disarming process.
1579 */
1580 if (!kprobes_all_disarmed)
1581 disarm_kprobe(orig_p, true);
1575 orig_p->flags |= KPROBE_FLAG_DISABLED; 1582 orig_p->flags |= KPROBE_FLAG_DISABLED;
1576 } 1583 }
1577 } 1584 }
@@ -2320,6 +2327,12 @@ static void arm_all_kprobes(void)
2320 if (!kprobes_all_disarmed) 2327 if (!kprobes_all_disarmed)
2321 goto already_enabled; 2328 goto already_enabled;
2322 2329
2330 /*
 2331 * optimize_kprobe(), which is called by arm_kprobe(), checks
 2332 * kprobes_all_disarmed, so set kprobes_all_disarmed before
 2333 * calling arm_kprobe().
2334 */
2335 kprobes_all_disarmed = false;
2323 /* Arming kprobes doesn't optimize kprobe itself */ 2336 /* Arming kprobes doesn't optimize kprobe itself */
2324 for (i = 0; i < KPROBE_TABLE_SIZE; i++) { 2337 for (i = 0; i < KPROBE_TABLE_SIZE; i++) {
2325 head = &kprobe_table[i]; 2338 head = &kprobe_table[i];
@@ -2328,7 +2341,6 @@ static void arm_all_kprobes(void)
2328 arm_kprobe(p); 2341 arm_kprobe(p);
2329 } 2342 }
2330 2343
2331 kprobes_all_disarmed = false;
2332 printk(KERN_INFO "Kprobes globally enabled\n"); 2344 printk(KERN_INFO "Kprobes globally enabled\n");
2333 2345
2334already_enabled: 2346already_enabled:
diff --git a/kernel/livepatch/Kconfig b/kernel/livepatch/Kconfig
new file mode 100644
index 000000000000..045022557936
--- /dev/null
+++ b/kernel/livepatch/Kconfig
@@ -0,0 +1,18 @@
1config HAVE_LIVEPATCH
2 bool
3 help
4 Arch supports kernel live patching
5
6config LIVEPATCH
7 bool "Kernel Live Patching"
8 depends on DYNAMIC_FTRACE_WITH_REGS
9 depends on MODULES
10 depends on SYSFS
11 depends on KALLSYMS_ALL
12 depends on HAVE_LIVEPATCH
13 help
14 Say Y here if you want to support kernel live patching.
15 This option has no runtime impact until a kernel "patch"
16 module uses the interface provided by this option to register
17 a patch, causing calls to patched functions to be redirected
18 to new function code contained in the patch module.
diff --git a/kernel/livepatch/Makefile b/kernel/livepatch/Makefile
new file mode 100644
index 000000000000..e8780c0901d9
--- /dev/null
+++ b/kernel/livepatch/Makefile
@@ -0,0 +1,3 @@
1obj-$(CONFIG_LIVEPATCH) += livepatch.o
2
3livepatch-objs := core.o
diff --git a/kernel/livepatch/core.c b/kernel/livepatch/core.c
new file mode 100644
index 000000000000..ff7f47d026ac
--- /dev/null
+++ b/kernel/livepatch/core.c
@@ -0,0 +1,1015 @@
1/*
2 * core.c - Kernel Live Patching Core
3 *
4 * Copyright (C) 2014 Seth Jennings <sjenning@redhat.com>
5 * Copyright (C) 2014 SUSE
6 *
7 * This program is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License
9 * as published by the Free Software Foundation; either version 2
10 * of the License, or (at your option) any later version.
11 *
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, see <http://www.gnu.org/licenses/>.
19 */
20
21#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
22
23#include <linux/module.h>
24#include <linux/kernel.h>
25#include <linux/mutex.h>
26#include <linux/slab.h>
27#include <linux/ftrace.h>
28#include <linux/list.h>
29#include <linux/kallsyms.h>
30#include <linux/livepatch.h>
31
32/**
33 * struct klp_ops - structure for tracking registered ftrace ops structs
34 *
35 * A single ftrace_ops is shared between all enabled replacement functions
36 * (klp_func structs) which have the same old_addr. This allows the switch
37 * between function versions to happen instantaneously by updating the klp_ops
38 * struct's func_stack list. The winner is the klp_func at the top of the
39 * func_stack (front of the list).
40 *
41 * @node: node for the global klp_ops list
42 * @func_stack: list head for the stack of klp_func's (active func is on top)
43 * @fops: registered ftrace ops struct
44 */
45struct klp_ops {
46 struct list_head node;
47 struct list_head func_stack;
48 struct ftrace_ops fops;
49};
50
51/*
52 * The klp_mutex protects the global lists and state transitions of any
53 * structure reachable from them. References to any structure must be obtained
54 * under mutex protection (except in klp_ftrace_handler(), which uses RCU to
55 * ensure it gets consistent data).
56 */
57static DEFINE_MUTEX(klp_mutex);
58
59static LIST_HEAD(klp_patches);
60static LIST_HEAD(klp_ops);
61
62static struct kobject *klp_root_kobj;
63
64static struct klp_ops *klp_find_ops(unsigned long old_addr)
65{
66 struct klp_ops *ops;
67 struct klp_func *func;
68
69 list_for_each_entry(ops, &klp_ops, node) {
70 func = list_first_entry(&ops->func_stack, struct klp_func,
71 stack_node);
72 if (func->old_addr == old_addr)
73 return ops;
74 }
75
76 return NULL;
77}
78
79static bool klp_is_module(struct klp_object *obj)
80{
81 return obj->name;
82}
83
84static bool klp_is_object_loaded(struct klp_object *obj)
85{
86 return !obj->name || obj->mod;
87}
88
89/* sets obj->mod if object is not vmlinux and module is found */
90static void klp_find_object_module(struct klp_object *obj)
91{
92 if (!klp_is_module(obj))
93 return;
94
95 mutex_lock(&module_mutex);
96 /*
97 * We don't need to take a reference on the module here because we have
98 * the klp_mutex, which is also taken by the module notifier. This
99 * prevents any module from unloading until we release the klp_mutex.
100 */
101 obj->mod = find_module(obj->name);
102 mutex_unlock(&module_mutex);
103}
104
105/* klp_mutex must be held by caller */
106static bool klp_is_patch_registered(struct klp_patch *patch)
107{
108 struct klp_patch *mypatch;
109
110 list_for_each_entry(mypatch, &klp_patches, list)
111 if (mypatch == patch)
112 return true;
113
114 return false;
115}
116
117static bool klp_initialized(void)
118{
119 return klp_root_kobj;
120}
121
122struct klp_find_arg {
123 const char *objname;
124 const char *name;
125 unsigned long addr;
126 /*
127 * If count == 0, the symbol was not found. If count == 1, a unique
128 * match was found and addr is set. If count > 1, there is
129 * unresolvable ambiguity among "count" number of symbols with the same
130 * name in the same object.
131 */
132 unsigned long count;
133};
134
135static int klp_find_callback(void *data, const char *name,
136 struct module *mod, unsigned long addr)
137{
138 struct klp_find_arg *args = data;
139
140 if ((mod && !args->objname) || (!mod && args->objname))
141 return 0;
142
143 if (strcmp(args->name, name))
144 return 0;
145
146 if (args->objname && strcmp(args->objname, mod->name))
147 return 0;
148
149 /*
150 * args->addr might be overwritten if another match is found
151 * but klp_find_object_symbol() handles this and only returns the
152 * addr if count == 1.
153 */
154 args->addr = addr;
155 args->count++;
156
157 return 0;
158}
159
160static int klp_find_object_symbol(const char *objname, const char *name,
161 unsigned long *addr)
162{
163 struct klp_find_arg args = {
164 .objname = objname,
165 .name = name,
166 .addr = 0,
167 .count = 0
168 };
169
170 kallsyms_on_each_symbol(klp_find_callback, &args);
171
172 if (args.count == 0)
173 pr_err("symbol '%s' not found in symbol table\n", name);
174 else if (args.count > 1)
175 pr_err("unresolvable ambiguity (%lu matches) on symbol '%s' in object '%s'\n",
176 args.count, name, objname);
177 else {
178 *addr = args.addr;
179 return 0;
180 }
181
182 *addr = 0;
183 return -EINVAL;
184}
185
186struct klp_verify_args {
187 const char *name;
188 const unsigned long addr;
189};
190
191static int klp_verify_callback(void *data, const char *name,
192 struct module *mod, unsigned long addr)
193{
194 struct klp_verify_args *args = data;
195
196 if (!mod &&
197 !strcmp(args->name, name) &&
198 args->addr == addr)
199 return 1;
200
201 return 0;
202}
203
204static int klp_verify_vmlinux_symbol(const char *name, unsigned long addr)
205{
206 struct klp_verify_args args = {
207 .name = name,
208 .addr = addr,
209 };
210
211 if (kallsyms_on_each_symbol(klp_verify_callback, &args))
212 return 0;
213
214 pr_err("symbol '%s' not found at specified address 0x%016lx, kernel mismatch?\n",
215 name, addr);
216 return -EINVAL;
217}
218
219static int klp_find_verify_func_addr(struct klp_object *obj,
220 struct klp_func *func)
221{
222 int ret;
223
224#if defined(CONFIG_RANDOMIZE_BASE)
225 /* KASLR is enabled, disregard old_addr from user */
226 func->old_addr = 0;
227#endif
228
229 if (!func->old_addr || klp_is_module(obj))
230 ret = klp_find_object_symbol(obj->name, func->old_name,
231 &func->old_addr);
232 else
233 ret = klp_verify_vmlinux_symbol(func->old_name,
234 func->old_addr);
235
236 return ret;
237}
238
239/*
240 * external symbols are located outside the parent object (where the parent
241 * object is either vmlinux or the kmod being patched).
242 */
243static int klp_find_external_symbol(struct module *pmod, const char *name,
244 unsigned long *addr)
245{
246 const struct kernel_symbol *sym;
247
248 /* first, check if it's an exported symbol */
249 preempt_disable();
250 sym = find_symbol(name, NULL, NULL, true, true);
251 preempt_enable();
252 if (sym) {
253 *addr = sym->value;
254 return 0;
255 }
256
257 /* otherwise check if it's in another .o within the patch module */
258 return klp_find_object_symbol(pmod->name, name, addr);
259}
260
261static int klp_write_object_relocations(struct module *pmod,
262 struct klp_object *obj)
263{
264 int ret;
265 struct klp_reloc *reloc;
266
267 if (WARN_ON(!klp_is_object_loaded(obj)))
268 return -EINVAL;
269
270 if (WARN_ON(!obj->relocs))
271 return -EINVAL;
272
273 for (reloc = obj->relocs; reloc->name; reloc++) {
274 if (!klp_is_module(obj)) {
275 ret = klp_verify_vmlinux_symbol(reloc->name,
276 reloc->val);
277 if (ret)
278 return ret;
279 } else {
280 /* module, reloc->val needs to be discovered */
281 if (reloc->external)
282 ret = klp_find_external_symbol(pmod,
283 reloc->name,
284 &reloc->val);
285 else
286 ret = klp_find_object_symbol(obj->mod->name,
287 reloc->name,
288 &reloc->val);
289 if (ret)
290 return ret;
291 }
292 ret = klp_write_module_reloc(pmod, reloc->type, reloc->loc,
293 reloc->val + reloc->addend);
294 if (ret) {
295 pr_err("relocation failed for symbol '%s' at 0x%016lx (%d)\n",
296 reloc->name, reloc->val, ret);
297 return ret;
298 }
299 }
300
301 return 0;
302}
303
304static void notrace klp_ftrace_handler(unsigned long ip,
305 unsigned long parent_ip,
306 struct ftrace_ops *fops,
307 struct pt_regs *regs)
308{
309 struct klp_ops *ops;
310 struct klp_func *func;
311
312 ops = container_of(fops, struct klp_ops, fops);
313
314 rcu_read_lock();
315 func = list_first_or_null_rcu(&ops->func_stack, struct klp_func,
316 stack_node);
317 rcu_read_unlock();
318
319 if (WARN_ON_ONCE(!func))
320 return;
321
322 klp_arch_set_pc(regs, (unsigned long)func->new_func);
323}
324
325static int klp_disable_func(struct klp_func *func)
326{
327 struct klp_ops *ops;
328 int ret;
329
330 if (WARN_ON(func->state != KLP_ENABLED))
331 return -EINVAL;
332
333 if (WARN_ON(!func->old_addr))
334 return -EINVAL;
335
336 ops = klp_find_ops(func->old_addr);
337 if (WARN_ON(!ops))
338 return -EINVAL;
339
340 if (list_is_singular(&ops->func_stack)) {
341 ret = unregister_ftrace_function(&ops->fops);
342 if (ret) {
343 pr_err("failed to unregister ftrace handler for function '%s' (%d)\n",
344 func->old_name, ret);
345 return ret;
346 }
347
348 ret = ftrace_set_filter_ip(&ops->fops, func->old_addr, 1, 0);
349 if (ret)
350 pr_warn("function unregister succeeded but failed to clear the filter\n");
351
352 list_del_rcu(&func->stack_node);
353 list_del(&ops->node);
354 kfree(ops);
355 } else {
356 list_del_rcu(&func->stack_node);
357 }
358
359 func->state = KLP_DISABLED;
360
361 return 0;
362}
363
364static int klp_enable_func(struct klp_func *func)
365{
366 struct klp_ops *ops;
367 int ret;
368
369 if (WARN_ON(!func->old_addr))
370 return -EINVAL;
371
372 if (WARN_ON(func->state != KLP_DISABLED))
373 return -EINVAL;
374
375 ops = klp_find_ops(func->old_addr);
376 if (!ops) {
377 ops = kzalloc(sizeof(*ops), GFP_KERNEL);
378 if (!ops)
379 return -ENOMEM;
380
381 ops->fops.func = klp_ftrace_handler;
382 ops->fops.flags = FTRACE_OPS_FL_SAVE_REGS |
383 FTRACE_OPS_FL_DYNAMIC |
384 FTRACE_OPS_FL_IPMODIFY;
385
386 list_add(&ops->node, &klp_ops);
387
388 INIT_LIST_HEAD(&ops->func_stack);
389 list_add_rcu(&func->stack_node, &ops->func_stack);
390
391 ret = ftrace_set_filter_ip(&ops->fops, func->old_addr, 0, 0);
392 if (ret) {
393 pr_err("failed to set ftrace filter for function '%s' (%d)\n",
394 func->old_name, ret);
395 goto err;
396 }
397
398 ret = register_ftrace_function(&ops->fops);
399 if (ret) {
400 pr_err("failed to register ftrace handler for function '%s' (%d)\n",
401 func->old_name, ret);
402 ftrace_set_filter_ip(&ops->fops, func->old_addr, 1, 0);
403 goto err;
404 }
405
406
407 } else {
408 list_add_rcu(&func->stack_node, &ops->func_stack);
409 }
410
411 func->state = KLP_ENABLED;
412
413 return 0;
414
415err:
416 list_del_rcu(&func->stack_node);
417 list_del(&ops->node);
418 kfree(ops);
419 return ret;
420}
421
422static int klp_disable_object(struct klp_object *obj)
423{
424 struct klp_func *func;
425 int ret;
426
427 for (func = obj->funcs; func->old_name; func++) {
428 if (func->state != KLP_ENABLED)
429 continue;
430
431 ret = klp_disable_func(func);
432 if (ret)
433 return ret;
434 }
435
436 obj->state = KLP_DISABLED;
437
438 return 0;
439}
440
441static int klp_enable_object(struct klp_object *obj)
442{
443 struct klp_func *func;
444 int ret;
445
446 if (WARN_ON(obj->state != KLP_DISABLED))
447 return -EINVAL;
448
449 if (WARN_ON(!klp_is_object_loaded(obj)))
450 return -EINVAL;
451
452 for (func = obj->funcs; func->old_name; func++) {
453 ret = klp_enable_func(func);
454 if (ret)
455 goto unregister;
456 }
457 obj->state = KLP_ENABLED;
458
459 return 0;
460
461unregister:
462 WARN_ON(klp_disable_object(obj));
463 return ret;
464}
465
466static int __klp_disable_patch(struct klp_patch *patch)
467{
468 struct klp_object *obj;
469 int ret;
470
471 /* enforce stacking: only the last enabled patch can be disabled */
472 if (!list_is_last(&patch->list, &klp_patches) &&
473 list_next_entry(patch, list)->state == KLP_ENABLED)
474 return -EBUSY;
475
476 pr_notice("disabling patch '%s'\n", patch->mod->name);
477
478 for (obj = patch->objs; obj->funcs; obj++) {
479 if (obj->state != KLP_ENABLED)
480 continue;
481
482 ret = klp_disable_object(obj);
483 if (ret)
484 return ret;
485 }
486
487 patch->state = KLP_DISABLED;
488
489 return 0;
490}
491
492/**
493 * klp_disable_patch() - disables a registered patch
494 * @patch: The registered, enabled patch to be disabled
495 *
496 * Unregisters the patched functions from ftrace.
497 *
498 * Return: 0 on success, otherwise error
499 */
500int klp_disable_patch(struct klp_patch *patch)
501{
502 int ret;
503
504 mutex_lock(&klp_mutex);
505
506 if (!klp_is_patch_registered(patch)) {
507 ret = -EINVAL;
508 goto err;
509 }
510
511 if (patch->state == KLP_DISABLED) {
512 ret = -EINVAL;
513 goto err;
514 }
515
516 ret = __klp_disable_patch(patch);
517
518err:
519 mutex_unlock(&klp_mutex);
520 return ret;
521}
522EXPORT_SYMBOL_GPL(klp_disable_patch);
523
524static int __klp_enable_patch(struct klp_patch *patch)
525{
526 struct klp_object *obj;
527 int ret;
528
529 if (WARN_ON(patch->state != KLP_DISABLED))
530 return -EINVAL;
531
532 /* enforce stacking: only the first disabled patch can be enabled */
533 if (patch->list.prev != &klp_patches &&
534 list_prev_entry(patch, list)->state == KLP_DISABLED)
535 return -EBUSY;
536
537 pr_notice_once("tainting kernel with TAINT_LIVEPATCH\n");
538 add_taint(TAINT_LIVEPATCH, LOCKDEP_STILL_OK);
539
540 pr_notice("enabling patch '%s'\n", patch->mod->name);
541
542 for (obj = patch->objs; obj->funcs; obj++) {
543 klp_find_object_module(obj);
544
545 if (!klp_is_object_loaded(obj))
546 continue;
547
548 ret = klp_enable_object(obj);
549 if (ret)
550 goto unregister;
551 }
552
553 patch->state = KLP_ENABLED;
554
555 return 0;
556
557unregister:
558 WARN_ON(__klp_disable_patch(patch));
559 return ret;
560}
561
562/**
563 * klp_enable_patch() - enables a registered patch
564 * @patch: The registered, disabled patch to be enabled
565 *
566 * Performs the needed symbol lookups and code relocations,
567 * then registers the patched functions with ftrace.
568 *
569 * Return: 0 on success, otherwise error
570 */
571int klp_enable_patch(struct klp_patch *patch)
572{
573 int ret;
574
575 mutex_lock(&klp_mutex);
576
577 if (!klp_is_patch_registered(patch)) {
578 ret = -EINVAL;
579 goto err;
580 }
581
582 ret = __klp_enable_patch(patch);
583
584err:
585 mutex_unlock(&klp_mutex);
586 return ret;
587}
588EXPORT_SYMBOL_GPL(klp_enable_patch);
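
For illustration, a hedged sketch of the init-time pairing (the demo_* helper is hypothetical): a patch module registers its patch first and only then enables it, unregistering again if enabling fails.

	#include <linux/kernel.h>
	#include <linux/livepatch.h>

	/* Hypothetical init-path helper for a patch module. */
	static int demo_register_and_enable(struct klp_patch *patch)
	{
		int ret;

		ret = klp_register_patch(patch);	/* sysfs entries created, patch starts disabled */
		if (ret)
			return ret;

		ret = klp_enable_patch(patch);		/* patched functions hooked via ftrace */
		if (ret)
			WARN_ON(klp_unregister_patch(patch));

		return ret;
	}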
589
590/*
591 * Sysfs Interface
592 *
593 * /sys/kernel/livepatch
594 * /sys/kernel/livepatch/<patch>
595 * /sys/kernel/livepatch/<patch>/enabled
596 * /sys/kernel/livepatch/<patch>/<object>
597 * /sys/kernel/livepatch/<patch>/<object>/<func>
598 */
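
The per-patch "enabled" attribute accepts "0" and "1" (see enabled_store() below). As a hedged userspace illustration, the hypothetical helper below simply writes to that file; it is not part of the kernel tree.

	/* Hypothetical userspace helper: toggle a patch via its sysfs file. */
	#include <fcntl.h>
	#include <stdio.h>
	#include <unistd.h>

	static int klp_set_enabled(const char *patch_name, int enable)
	{
		char path[128];
		int fd;
		ssize_t n;

		snprintf(path, sizeof(path),
			 "/sys/kernel/livepatch/%s/enabled", patch_name);

		fd = open(path, O_WRONLY);
		if (fd < 0)
			return -1;

		n = write(fd, enable ? "1" : "0", 1);
		close(fd);

		return n == 1 ? 0 : -1;
	}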
599
600static ssize_t enabled_store(struct kobject *kobj, struct kobj_attribute *attr,
601 const char *buf, size_t count)
602{
603 struct klp_patch *patch;
604 int ret;
605 unsigned long val;
606
607 ret = kstrtoul(buf, 10, &val);
608 if (ret)
609 return -EINVAL;
610
611 if (val != KLP_DISABLED && val != KLP_ENABLED)
612 return -EINVAL;
613
614 patch = container_of(kobj, struct klp_patch, kobj);
615
616 mutex_lock(&klp_mutex);
617
618 if (val == patch->state) {
619 /* already in requested state */
620 ret = -EINVAL;
621 goto err;
622 }
623
624 if (val == KLP_ENABLED) {
625 ret = __klp_enable_patch(patch);
626 if (ret)
627 goto err;
628 } else {
629 ret = __klp_disable_patch(patch);
630 if (ret)
631 goto err;
632 }
633
634 mutex_unlock(&klp_mutex);
635
636 return count;
637
638err:
639 mutex_unlock(&klp_mutex);
640 return ret;
641}
642
643static ssize_t enabled_show(struct kobject *kobj,
644 struct kobj_attribute *attr, char *buf)
645{
646 struct klp_patch *patch;
647
648 patch = container_of(kobj, struct klp_patch, kobj);
649 return snprintf(buf, PAGE_SIZE-1, "%d\n", patch->state);
650}
651
652static struct kobj_attribute enabled_kobj_attr = __ATTR_RW(enabled);
653static struct attribute *klp_patch_attrs[] = {
654 &enabled_kobj_attr.attr,
655 NULL
656};
657
658static void klp_kobj_release_patch(struct kobject *kobj)
659{
660 /*
661 * Once we have a consistency model we'll need to module_put() the
662 * patch module here. See klp_register_patch() for more details.
663 */
664}
665
666static struct kobj_type klp_ktype_patch = {
667 .release = klp_kobj_release_patch,
668 .sysfs_ops = &kobj_sysfs_ops,
669 .default_attrs = klp_patch_attrs,
670};
671
672static void klp_kobj_release_func(struct kobject *kobj)
673{
674}
675
676static struct kobj_type klp_ktype_func = {
677 .release = klp_kobj_release_func,
678 .sysfs_ops = &kobj_sysfs_ops,
679};
680
681/*
682 * Free all functions' kobjects in the array up to some limit. When limit is
683 * NULL, all kobjects are freed.
684 */
685static void klp_free_funcs_limited(struct klp_object *obj,
686 struct klp_func *limit)
687{
688 struct klp_func *func;
689
690 for (func = obj->funcs; func->old_name && func != limit; func++)
691 kobject_put(&func->kobj);
692}
693
694/* Clean up when a patched object is unloaded */
695static void klp_free_object_loaded(struct klp_object *obj)
696{
697 struct klp_func *func;
698
699 obj->mod = NULL;
700
701 for (func = obj->funcs; func->old_name; func++)
702 func->old_addr = 0;
703}
704
705/*
706 * Free all objects' kobjects in the array up to some limit. When limit is
707 * NULL, all kobjects are freed.
708 */
709static void klp_free_objects_limited(struct klp_patch *patch,
710 struct klp_object *limit)
711{
712 struct klp_object *obj;
713
714 for (obj = patch->objs; obj->funcs && obj != limit; obj++) {
715 klp_free_funcs_limited(obj, NULL);
716 kobject_put(obj->kobj);
717 }
718}
719
720static void klp_free_patch(struct klp_patch *patch)
721{
722 klp_free_objects_limited(patch, NULL);
723 if (!list_empty(&patch->list))
724 list_del(&patch->list);
725 kobject_put(&patch->kobj);
726}
727
728static int klp_init_func(struct klp_object *obj, struct klp_func *func)
729{
730 INIT_LIST_HEAD(&func->stack_node);
731 func->state = KLP_DISABLED;
732
733 return kobject_init_and_add(&func->kobj, &klp_ktype_func,
734 obj->kobj, func->old_name);
735}
736
737/* parts of the initialization that are done only when the object is loaded */
738static int klp_init_object_loaded(struct klp_patch *patch,
739 struct klp_object *obj)
740{
741 struct klp_func *func;
742 int ret;
743
744 if (obj->relocs) {
745 ret = klp_write_object_relocations(patch->mod, obj);
746 if (ret)
747 return ret;
748 }
749
750 for (func = obj->funcs; func->old_name; func++) {
751 ret = klp_find_verify_func_addr(obj, func);
752 if (ret)
753 return ret;
754 }
755
756 return 0;
757}
758
759static int klp_init_object(struct klp_patch *patch, struct klp_object *obj)
760{
761 struct klp_func *func;
762 int ret;
763 const char *name;
764
765 if (!obj->funcs)
766 return -EINVAL;
767
768 obj->state = KLP_DISABLED;
769
770 klp_find_object_module(obj);
771
772 name = klp_is_module(obj) ? obj->name : "vmlinux";
773 obj->kobj = kobject_create_and_add(name, &patch->kobj);
774 if (!obj->kobj)
775 return -ENOMEM;
776
777 for (func = obj->funcs; func->old_name; func++) {
778 ret = klp_init_func(obj, func);
779 if (ret)
780 goto free;
781 }
782
783 if (klp_is_object_loaded(obj)) {
784 ret = klp_init_object_loaded(patch, obj);
785 if (ret)
786 goto free;
787 }
788
789 return 0;
790
791free:
792 klp_free_funcs_limited(obj, func);
793 kobject_put(obj->kobj);
794 return ret;
795}
796
797static int klp_init_patch(struct klp_patch *patch)
798{
799 struct klp_object *obj;
800 int ret;
801
802 if (!patch->objs)
803 return -EINVAL;
804
805 mutex_lock(&klp_mutex);
806
807 patch->state = KLP_DISABLED;
808
809 ret = kobject_init_and_add(&patch->kobj, &klp_ktype_patch,
810 klp_root_kobj, patch->mod->name);
811 if (ret)
812 goto unlock;
813
814 for (obj = patch->objs; obj->funcs; obj++) {
815 ret = klp_init_object(patch, obj);
816 if (ret)
817 goto free;
818 }
819
820 list_add_tail(&patch->list, &klp_patches);
821
822 mutex_unlock(&klp_mutex);
823
824 return 0;
825
826free:
827 klp_free_objects_limited(patch, obj);
828 kobject_put(&patch->kobj);
829unlock:
830 mutex_unlock(&klp_mutex);
831 return ret;
832}
833
834/**
835 * klp_unregister_patch() - unregisters a patch
836 * @patch: Disabled patch to be unregistered
837 *
838 * Frees the data structures and removes the sysfs interface.
839 *
840 * Return: 0 on success, otherwise error
841 */
842int klp_unregister_patch(struct klp_patch *patch)
843{
844 int ret = 0;
845
846 mutex_lock(&klp_mutex);
847
848 if (!klp_is_patch_registered(patch)) {
849 ret = -EINVAL;
850 goto out;
851 }
852
853 if (patch->state == KLP_ENABLED) {
854 ret = -EBUSY;
855 goto out;
856 }
857
858 klp_free_patch(patch);
859
860out:
861 mutex_unlock(&klp_mutex);
862 return ret;
863}
864EXPORT_SYMBOL_GPL(klp_unregister_patch);
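
A hedged exit-path sketch (the demo_* helper is hypothetical and not part of this file): the patch must already be disabled, otherwise klp_unregister_patch() returns -EBUSY as shown above.

	#include <linux/kernel.h>
	#include <linux/livepatch.h>

	/* Hypothetical teardown helper for a patch module's exit path. */
	static void demo_unregister(struct klp_patch *patch)
	{
		/* Returns -EINVAL if the patch was already disabled via sysfs. */
		klp_disable_patch(patch);
		WARN_ON(klp_unregister_patch(patch));
	}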
865
866/**
867 * klp_register_patch() - registers a patch
868 * @patch: Patch to be registered
869 *
870 * Initializes the data structure associated with the patch and
871 * creates the sysfs interface.
872 *
873 * Return: 0 on success, otherwise error
874 */
875int klp_register_patch(struct klp_patch *patch)
876{
877 int ret;
878
879 if (!klp_initialized())
880 return -ENODEV;
881
882 if (!patch || !patch->mod)
883 return -EINVAL;
884
885 /*
886 * A reference is taken on the patch module to prevent it from being
887 * unloaded. Right now, we don't allow patch modules to unload since
888 * there is currently no method to determine if a thread is still
889 * running in the patched code contained in the patch module once
890 * the ftrace registration is successful.
891 */
892 if (!try_module_get(patch->mod))
893 return -ENODEV;
894
895 ret = klp_init_patch(patch);
896 if (ret)
897 module_put(patch->mod);
898
899 return ret;
900}
901EXPORT_SYMBOL_GPL(klp_register_patch);
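
To make the expected layout concrete, here is a hedged sketch of the structures a hypothetical patch module would hand to klp_register_patch(). None of the demo_* names exist in the tree, "demo_old_function" is a placeholder symbol, and the replacement must keep the original function's signature. The empty trailing entries terminate the arrays, matching the old_name/funcs termination checks used by the iteration loops in this file.

	#include <linux/kernel.h>
	#include <linux/module.h>
	#include <linux/livepatch.h>

	/* Replacement with the same signature as the (hypothetical) original. */
	static int demo_new_function(void)
	{
		pr_info("demo: live-patched implementation called\n");
		return 0;
	}

	static struct klp_func demo_funcs[] = {
		{
			.old_name = "demo_old_function",	/* placeholder symbol name */
			.new_func = demo_new_function,
		}, { }	/* old_name == NULL terminates the array */
	};

	static struct klp_object demo_objs[] = {
		{
			/* a NULL .name means the symbol is resolved in vmlinux */
			.funcs = demo_funcs,
		}, { }	/* funcs == NULL terminates the array */
	};

	static struct klp_patch demo_patch = {
		.mod = THIS_MODULE,	/* required: see the !patch->mod check above */
		.objs = demo_objs,
	};

Once registered, such a patch appears as /sys/kernel/livepatch/<module name>/ with a "vmlinux" object directory and one entry per patched function, mirroring the sysfs layout documented earlier in this file.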
902
903static void klp_module_notify_coming(struct klp_patch *patch,
904 struct klp_object *obj)
905{
906 struct module *pmod = patch->mod;
907 struct module *mod = obj->mod;
908 int ret;
909
910 ret = klp_init_object_loaded(patch, obj);
911 if (ret)
912 goto err;
913
914 if (patch->state == KLP_DISABLED)
915 return;
916
917 pr_notice("applying patch '%s' to loading module '%s'\n",
918 pmod->name, mod->name);
919
920 ret = klp_enable_object(obj);
921 if (!ret)
922 return;
923
924err:
925 pr_warn("failed to apply patch '%s' to module '%s' (%d)\n",
926 pmod->name, mod->name, ret);
927}
928
929static void klp_module_notify_going(struct klp_patch *patch,
930 struct klp_object *obj)
931{
932 struct module *pmod = patch->mod;
933 struct module *mod = obj->mod;
934 int ret;
935
936 if (patch->state == KLP_DISABLED)
937 goto disabled;
938
939 pr_notice("reverting patch '%s' on unloading module '%s'\n",
940 pmod->name, mod->name);
941
942 ret = klp_disable_object(obj);
943 if (ret)
944 pr_warn("failed to revert patch '%s' on module '%s' (%d)\n",
945 pmod->name, mod->name, ret);
946
947disabled:
948 klp_free_object_loaded(obj);
949}
950
951static int klp_module_notify(struct notifier_block *nb, unsigned long action,
952 void *data)
953{
954 struct module *mod = data;
955 struct klp_patch *patch;
956 struct klp_object *obj;
957
958 if (action != MODULE_STATE_COMING && action != MODULE_STATE_GOING)
959 return 0;
960
961 mutex_lock(&klp_mutex);
962
963 list_for_each_entry(patch, &klp_patches, list) {
964 for (obj = patch->objs; obj->funcs; obj++) {
965 if (!klp_is_module(obj) || strcmp(obj->name, mod->name))
966 continue;
967
968 if (action == MODULE_STATE_COMING) {
969 obj->mod = mod;
970 klp_module_notify_coming(patch, obj);
971 } else /* MODULE_STATE_GOING */
972 klp_module_notify_going(patch, obj);
973
974 break;
975 }
976 }
977
978 mutex_unlock(&klp_mutex);
979
980 return 0;
981}
982
983static struct notifier_block klp_module_nb = {
984 .notifier_call = klp_module_notify,
985 .priority = INT_MIN+1, /* called late but before ftrace notifier */
986};
987
988static int klp_init(void)
989{
990 int ret;
991
992 ret = klp_check_compiler_support();
993 if (ret) {
994 pr_info("Your compiler is too old; turning off.\n");
995 return -EINVAL;
996 }
997
998 ret = register_module_notifier(&klp_module_nb);
999 if (ret)
1000 return ret;
1001
1002 klp_root_kobj = kobject_create_and_add("livepatch", kernel_kobj);
1003 if (!klp_root_kobj) {
1004 ret = -ENOMEM;
1005 goto unregister;
1006 }
1007
1008 return 0;
1009
1010unregister:
1011 unregister_module_notifier(&klp_module_nb);
1012 return ret;
1013}
1014
1015module_init(klp_init);
diff --git a/kernel/locking/Makefile b/kernel/locking/Makefile
index 8541bfdfd232..de7a416cca2a 100644
--- a/kernel/locking/Makefile
+++ b/kernel/locking/Makefile
@@ -1,11 +1,11 @@
1 1
2obj-y += mutex.o semaphore.o rwsem.o mcs_spinlock.o 2obj-y += mutex.o semaphore.o rwsem.o
3 3
4ifdef CONFIG_FUNCTION_TRACER 4ifdef CONFIG_FUNCTION_TRACER
5CFLAGS_REMOVE_lockdep.o = -pg 5CFLAGS_REMOVE_lockdep.o = $(CC_FLAGS_FTRACE)
6CFLAGS_REMOVE_lockdep_proc.o = -pg 6CFLAGS_REMOVE_lockdep_proc.o = $(CC_FLAGS_FTRACE)
7CFLAGS_REMOVE_mutex-debug.o = -pg 7CFLAGS_REMOVE_mutex-debug.o = $(CC_FLAGS_FTRACE)
8CFLAGS_REMOVE_rtmutex-debug.o = -pg 8CFLAGS_REMOVE_rtmutex-debug.o = $(CC_FLAGS_FTRACE)
9endif 9endif
10 10
11obj-$(CONFIG_DEBUG_MUTEXES) += mutex-debug.o 11obj-$(CONFIG_DEBUG_MUTEXES) += mutex-debug.o
@@ -14,6 +14,7 @@ ifeq ($(CONFIG_PROC_FS),y)
14obj-$(CONFIG_LOCKDEP) += lockdep_proc.o 14obj-$(CONFIG_LOCKDEP) += lockdep_proc.o
15endif 15endif
16obj-$(CONFIG_SMP) += spinlock.o 16obj-$(CONFIG_SMP) += spinlock.o
17obj-$(CONFIG_LOCK_SPIN_ON_OWNER) += osq_lock.o
17obj-$(CONFIG_SMP) += lglock.o 18obj-$(CONFIG_SMP) += lglock.o
18obj-$(CONFIG_PROVE_LOCKING) += spinlock.o 19obj-$(CONFIG_PROVE_LOCKING) += spinlock.o
19obj-$(CONFIG_RT_MUTEXES) += rtmutex.o 20obj-$(CONFIG_RT_MUTEXES) += rtmutex.o
diff --git a/kernel/locking/mcs_spinlock.h b/kernel/locking/mcs_spinlock.h
index 4d60986fcbee..d1fe2ba5bac9 100644
--- a/kernel/locking/mcs_spinlock.h
+++ b/kernel/locking/mcs_spinlock.h
@@ -108,20 +108,4 @@ void mcs_spin_unlock(struct mcs_spinlock **lock, struct mcs_spinlock *node)
108 arch_mcs_spin_unlock_contended(&next->locked); 108 arch_mcs_spin_unlock_contended(&next->locked);
109} 109}
110 110
111/*
112 * Cancellable version of the MCS lock above.
113 *
114 * Intended for adaptive spinning of sleeping locks:
115 * mutex_lock()/rwsem_down_{read,write}() etc.
116 */
117
118struct optimistic_spin_node {
119 struct optimistic_spin_node *next, *prev;
120 int locked; /* 1 if lock acquired */
121 int cpu; /* encoded CPU # value */
122};
123
124extern bool osq_lock(struct optimistic_spin_queue *lock);
125extern void osq_unlock(struct optimistic_spin_queue *lock);
126
127#endif /* __LINUX_MCS_SPINLOCK_H */ 111#endif /* __LINUX_MCS_SPINLOCK_H */
diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c
index 454195194d4a..94674e5919cb 100644
--- a/kernel/locking/mutex.c
+++ b/kernel/locking/mutex.c
@@ -81,7 +81,7 @@ __visible void __sched __mutex_lock_slowpath(atomic_t *lock_count);
81 * The mutex must later on be released by the same task that 81 * The mutex must later on be released by the same task that
82 * acquired it. Recursive locking is not allowed. The task 82 * acquired it. Recursive locking is not allowed. The task
83 * may not exit without first unlocking the mutex. Also, kernel 83 * may not exit without first unlocking the mutex. Also, kernel
84 * memory where the mutex resides mutex must not be freed with 84 * memory where the mutex resides must not be freed with
85 * the mutex still locked. The mutex must first be initialized 85 * the mutex still locked. The mutex must first be initialized
86 * (or statically defined) before it can be locked. memset()-ing 86 * (or statically defined) before it can be locked. memset()-ing
87 * the mutex to 0 is not allowed. 87 * the mutex to 0 is not allowed.
@@ -147,7 +147,7 @@ static __always_inline void ww_mutex_lock_acquired(struct ww_mutex *ww,
147} 147}
148 148
149/* 149/*
150 * after acquiring lock with fastpath or when we lost out in contested 150 * After acquiring lock with fastpath or when we lost out in contested
151 * slowpath, set ctx and wake up any waiters so they can recheck. 151 * slowpath, set ctx and wake up any waiters so they can recheck.
152 * 152 *
153 * This function is never called when CONFIG_DEBUG_LOCK_ALLOC is set, 153 * This function is never called when CONFIG_DEBUG_LOCK_ALLOC is set,
@@ -191,19 +191,32 @@ ww_mutex_set_context_fastpath(struct ww_mutex *lock,
191 spin_unlock_mutex(&lock->base.wait_lock, flags); 191 spin_unlock_mutex(&lock->base.wait_lock, flags);
192} 192}
193 193
194
195#ifdef CONFIG_MUTEX_SPIN_ON_OWNER
196/* 194/*
197 * In order to avoid a stampede of mutex spinners from acquiring the mutex 195 * After acquiring lock in the slowpath set ctx and wake up any
198 * more or less simultaneously, the spinners need to acquire a MCS lock 196 * waiters so they can recheck.
199 * first before spinning on the owner field.
200 * 197 *
198 * Callers must hold the mutex wait_lock.
201 */ 199 */
200static __always_inline void
201ww_mutex_set_context_slowpath(struct ww_mutex *lock,
202 struct ww_acquire_ctx *ctx)
203{
204 struct mutex_waiter *cur;
202 205
203/* 206 ww_mutex_lock_acquired(lock, ctx);
204 * Mutex spinning code migrated from kernel/sched/core.c 207 lock->ctx = ctx;
205 */ 208
209 /*
210 * Give any possible sleeping processes the chance to wake up,
211 * so they can recheck if they have to back off.
212 */
213 list_for_each_entry(cur, &lock->base.wait_list, list) {
214 debug_mutex_wake_waiter(&lock->base, cur);
215 wake_up_process(cur->task);
216 }
217}
206 218
219#ifdef CONFIG_MUTEX_SPIN_ON_OWNER
207static inline bool owner_running(struct mutex *lock, struct task_struct *owner) 220static inline bool owner_running(struct mutex *lock, struct task_struct *owner)
208{ 221{
209 if (lock->owner != owner) 222 if (lock->owner != owner)
@@ -307,6 +320,11 @@ static bool mutex_optimistic_spin(struct mutex *lock,
307 if (!mutex_can_spin_on_owner(lock)) 320 if (!mutex_can_spin_on_owner(lock))
308 goto done; 321 goto done;
309 322
323 /*
324 * In order to avoid a stampede of mutex spinners trying to
325 * acquire the mutex all at once, the spinners need to take a
326 * MCS (queued) lock first before spinning on the owner field.
327 */
310 if (!osq_lock(&lock->osq)) 328 if (!osq_lock(&lock->osq))
311 goto done; 329 goto done;
312 330
@@ -469,7 +487,7 @@ void __sched ww_mutex_unlock(struct ww_mutex *lock)
469EXPORT_SYMBOL(ww_mutex_unlock); 487EXPORT_SYMBOL(ww_mutex_unlock);
470 488
471static inline int __sched 489static inline int __sched
472__mutex_lock_check_stamp(struct mutex *lock, struct ww_acquire_ctx *ctx) 490__ww_mutex_lock_check_stamp(struct mutex *lock, struct ww_acquire_ctx *ctx)
473{ 491{
474 struct ww_mutex *ww = container_of(lock, struct ww_mutex, base); 492 struct ww_mutex *ww = container_of(lock, struct ww_mutex, base);
475 struct ww_acquire_ctx *hold_ctx = ACCESS_ONCE(ww->ctx); 493 struct ww_acquire_ctx *hold_ctx = ACCESS_ONCE(ww->ctx);
@@ -557,7 +575,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
557 } 575 }
558 576
559 if (use_ww_ctx && ww_ctx->acquired > 0) { 577 if (use_ww_ctx && ww_ctx->acquired > 0) {
560 ret = __mutex_lock_check_stamp(lock, ww_ctx); 578 ret = __ww_mutex_lock_check_stamp(lock, ww_ctx);
561 if (ret) 579 if (ret)
562 goto err; 580 goto err;
563 } 581 }
@@ -569,6 +587,8 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
569 schedule_preempt_disabled(); 587 schedule_preempt_disabled();
570 spin_lock_mutex(&lock->wait_lock, flags); 588 spin_lock_mutex(&lock->wait_lock, flags);
571 } 589 }
590 __set_task_state(task, TASK_RUNNING);
591
572 mutex_remove_waiter(lock, &waiter, current_thread_info()); 592 mutex_remove_waiter(lock, &waiter, current_thread_info());
573 /* set it to 0 if there are no waiters left: */ 593 /* set it to 0 if there are no waiters left: */
574 if (likely(list_empty(&lock->wait_list))) 594 if (likely(list_empty(&lock->wait_list)))
@@ -582,23 +602,7 @@ skip_wait:
582 602
583 if (use_ww_ctx) { 603 if (use_ww_ctx) {
584 struct ww_mutex *ww = container_of(lock, struct ww_mutex, base); 604 struct ww_mutex *ww = container_of(lock, struct ww_mutex, base);
585 struct mutex_waiter *cur; 605 ww_mutex_set_context_slowpath(ww, ww_ctx);
586
587 /*
588 * This branch gets optimized out for the common case,
589 * and is only important for ww_mutex_lock.
590 */
591 ww_mutex_lock_acquired(ww, ww_ctx);
592 ww->ctx = ww_ctx;
593
594 /*
595 * Give any possible sleeping processes the chance to wake up,
596 * so they can recheck if they have to back off.
597 */
598 list_for_each_entry(cur, &lock->wait_list, list) {
599 debug_mutex_wake_waiter(lock, cur);
600 wake_up_process(cur->task);
601 }
602 } 606 }
603 607
604 spin_unlock_mutex(&lock->wait_lock, flags); 608 spin_unlock_mutex(&lock->wait_lock, flags);
diff --git a/kernel/locking/mcs_spinlock.c b/kernel/locking/osq_lock.c
index 9887a905a762..c112d00341b0 100644
--- a/kernel/locking/mcs_spinlock.c
+++ b/kernel/locking/osq_lock.c
@@ -1,8 +1,6 @@
1#include <linux/percpu.h> 1#include <linux/percpu.h>
2#include <linux/sched.h> 2#include <linux/sched.h>
3#include "mcs_spinlock.h" 3#include <linux/osq_lock.h>
4
5#ifdef CONFIG_SMP
6 4
7/* 5/*
8 * An MCS like lock especially tailored for optimistic spinning for sleeping 6 * An MCS like lock especially tailored for optimistic spinning for sleeping
@@ -111,7 +109,7 @@ bool osq_lock(struct optimistic_spin_queue *lock)
111 * cmpxchg in an attempt to undo our queueing. 109 * cmpxchg in an attempt to undo our queueing.
112 */ 110 */
113 111
114 while (!smp_load_acquire(&node->locked)) { 112 while (!ACCESS_ONCE(node->locked)) {
115 /* 113 /*
116 * If we need to reschedule bail... so we can block. 114 * If we need to reschedule bail... so we can block.
117 */ 115 */
@@ -203,6 +201,3 @@ void osq_unlock(struct optimistic_spin_queue *lock)
203 if (next) 201 if (next)
204 ACCESS_ONCE(next->locked) = 1; 202 ACCESS_ONCE(next->locked) = 1;
205} 203}
206
207#endif
208
diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c
index 7c98873a3077..3059bc2f022d 100644
--- a/kernel/locking/rtmutex.c
+++ b/kernel/locking/rtmutex.c
@@ -1130,6 +1130,7 @@ __rt_mutex_slowlock(struct rt_mutex *lock, int state,
1130 set_current_state(state); 1130 set_current_state(state);
1131 } 1131 }
1132 1132
1133 __set_current_state(TASK_RUNNING);
1133 return ret; 1134 return ret;
1134} 1135}
1135 1136
@@ -1188,10 +1189,9 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state,
1188 ret = task_blocks_on_rt_mutex(lock, &waiter, current, chwalk); 1189 ret = task_blocks_on_rt_mutex(lock, &waiter, current, chwalk);
1189 1190
1190 if (likely(!ret)) 1191 if (likely(!ret))
1192 /* sleep on the mutex */
1191 ret = __rt_mutex_slowlock(lock, state, timeout, &waiter); 1193 ret = __rt_mutex_slowlock(lock, state, timeout, &waiter);
1192 1194
1193 set_current_state(TASK_RUNNING);
1194
1195 if (unlikely(ret)) { 1195 if (unlikely(ret)) {
1196 remove_waiter(lock, &waiter); 1196 remove_waiter(lock, &waiter);
1197 rt_mutex_handle_deadlock(ret, chwalk, &waiter); 1197 rt_mutex_handle_deadlock(ret, chwalk, &waiter);
@@ -1626,10 +1626,9 @@ int rt_mutex_finish_proxy_lock(struct rt_mutex *lock,
1626 1626
1627 set_current_state(TASK_INTERRUPTIBLE); 1627 set_current_state(TASK_INTERRUPTIBLE);
1628 1628
1629 /* sleep on the mutex */
1629 ret = __rt_mutex_slowlock(lock, TASK_INTERRUPTIBLE, to, waiter); 1630 ret = __rt_mutex_slowlock(lock, TASK_INTERRUPTIBLE, to, waiter);
1630 1631
1631 set_current_state(TASK_RUNNING);
1632
1633 if (unlikely(ret)) 1632 if (unlikely(ret))
1634 remove_waiter(lock, waiter); 1633 remove_waiter(lock, waiter);
1635 1634
diff --git a/kernel/locking/rwsem-spinlock.c b/kernel/locking/rwsem-spinlock.c
index 2c93571162cb..2555ae15ec14 100644
--- a/kernel/locking/rwsem-spinlock.c
+++ b/kernel/locking/rwsem-spinlock.c
@@ -154,7 +154,7 @@ void __sched __down_read(struct rw_semaphore *sem)
154 set_task_state(tsk, TASK_UNINTERRUPTIBLE); 154 set_task_state(tsk, TASK_UNINTERRUPTIBLE);
155 } 155 }
156 156
157 tsk->state = TASK_RUNNING; 157 __set_task_state(tsk, TASK_RUNNING);
158 out: 158 out:
159 ; 159 ;
160} 160}
diff --git a/kernel/locking/rwsem-xadd.c b/kernel/locking/rwsem-xadd.c
index 7628c3fc37ca..2f7cc4076f50 100644
--- a/kernel/locking/rwsem-xadd.c
+++ b/kernel/locking/rwsem-xadd.c
@@ -242,8 +242,7 @@ struct rw_semaphore __sched *rwsem_down_read_failed(struct rw_semaphore *sem)
242 schedule(); 242 schedule();
243 } 243 }
244 244
245 tsk->state = TASK_RUNNING; 245 __set_task_state(tsk, TASK_RUNNING);
246
247 return sem; 246 return sem;
248} 247}
249EXPORT_SYMBOL(rwsem_down_read_failed); 248EXPORT_SYMBOL(rwsem_down_read_failed);
diff --git a/kernel/locking/spinlock.c b/kernel/locking/spinlock.c
index 4b082b5cac9e..db3ccb1dd614 100644
--- a/kernel/locking/spinlock.c
+++ b/kernel/locking/spinlock.c
@@ -363,6 +363,14 @@ void __lockfunc _raw_spin_lock_nested(raw_spinlock_t *lock, int subclass)
363} 363}
364EXPORT_SYMBOL(_raw_spin_lock_nested); 364EXPORT_SYMBOL(_raw_spin_lock_nested);
365 365
366void __lockfunc _raw_spin_lock_bh_nested(raw_spinlock_t *lock, int subclass)
367{
368 __local_bh_disable_ip(_RET_IP_, SOFTIRQ_LOCK_OFFSET);
369 spin_acquire(&lock->dep_map, subclass, 0, _RET_IP_);
370 LOCK_CONTENDED(lock, do_raw_spin_trylock, do_raw_spin_lock);
371}
372EXPORT_SYMBOL(_raw_spin_lock_bh_nested);
373
366unsigned long __lockfunc _raw_spin_lock_irqsave_nested(raw_spinlock_t *lock, 374unsigned long __lockfunc _raw_spin_lock_irqsave_nested(raw_spinlock_t *lock,
367 int subclass) 375 int subclass)
368{ 376{
diff --git a/kernel/module.c b/kernel/module.c
index d856e96a3cce..b34813f725e9 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -56,6 +56,7 @@
56#include <linux/async.h> 56#include <linux/async.h>
57#include <linux/percpu.h> 57#include <linux/percpu.h>
58#include <linux/kmemleak.h> 58#include <linux/kmemleak.h>
59#include <linux/kasan.h>
59#include <linux/jump_label.h> 60#include <linux/jump_label.h>
60#include <linux/pfn.h> 61#include <linux/pfn.h>
61#include <linux/bsearch.h> 62#include <linux/bsearch.h>
@@ -1225,6 +1226,12 @@ static const struct kernel_symbol *resolve_symbol(struct module *mod,
1225 const unsigned long *crc; 1226 const unsigned long *crc;
1226 int err; 1227 int err;
1227 1228
1229 /*
1230 * The module_mutex should not be a heavily contended lock;
1231 * if we get the occasional sleep here, we'll go an extra iteration
1232 * in the wait_event_interruptible(), which is harmless.
1233 */
1234 sched_annotate_sleep();
1228 mutex_lock(&module_mutex); 1235 mutex_lock(&module_mutex);
1229 sym = find_symbol(name, &owner, &crc, 1236 sym = find_symbol(name, &owner, &crc,
1230 !(mod->taints & (1 << TAINT_PROPRIETARY_MODULE)), true); 1237 !(mod->taints & (1 << TAINT_PROPRIETARY_MODULE)), true);
@@ -1807,6 +1814,7 @@ static void unset_module_init_ro_nx(struct module *mod) { }
1807void __weak module_memfree(void *module_region) 1814void __weak module_memfree(void *module_region)
1808{ 1815{
1809 vfree(module_region); 1816 vfree(module_region);
1817 kasan_module_free(module_region);
1810} 1818}
1811 1819
1812void __weak module_arch_cleanup(struct module *mod) 1820void __weak module_arch_cleanup(struct module *mod)
@@ -2978,6 +2986,12 @@ static bool finished_loading(const char *name)
2978 struct module *mod; 2986 struct module *mod;
2979 bool ret; 2987 bool ret;
2980 2988
2989 /*
2990 * The module_mutex should not be a heavily contended lock;
2991 * if we get the occasional sleep here, we'll go an extra iteration
2992 * in the wait_event_interruptible(), which is harmless.
2993 */
2994 sched_annotate_sleep();
2981 mutex_lock(&module_mutex); 2995 mutex_lock(&module_mutex);
2982 mod = find_module_all(name, strlen(name), true); 2996 mod = find_module_all(name, strlen(name), true);
2983 ret = !mod || mod->state == MODULE_STATE_LIVE 2997 ret = !mod || mod->state == MODULE_STATE_LIVE
@@ -3011,8 +3025,13 @@ static void do_free_init(struct rcu_head *head)
3011 kfree(m); 3025 kfree(m);
3012} 3026}
3013 3027
3014/* This is where the real work happens */ 3028/*
3015static int do_init_module(struct module *mod) 3029 * This is where the real work happens.
3030 *
3031 * Keep it uninlined to provide a reliable breakpoint target, e.g. for the gdb
3032 * helper command 'lx-symbols'.
3033 */
3034static noinline int do_init_module(struct module *mod)
3016{ 3035{
3017 int ret = 0; 3036 int ret = 0;
3018 struct mod_initfree *freeinit; 3037 struct mod_initfree *freeinit;
@@ -3120,32 +3139,6 @@ static int may_init_module(void)
3120} 3139}
3121 3140
3122/* 3141/*
3123 * Can't use wait_event_interruptible() because our condition
3124 * 'finished_loading()' contains a blocking primitive itself (mutex_lock).
3125 */
3126static int wait_finished_loading(struct module *mod)
3127{
3128 DEFINE_WAIT_FUNC(wait, woken_wake_function);
3129 int ret = 0;
3130
3131 add_wait_queue(&module_wq, &wait);
3132 for (;;) {
3133 if (finished_loading(mod->name))
3134 break;
3135
3136 if (signal_pending(current)) {
3137 ret = -ERESTARTSYS;
3138 break;
3139 }
3140
3141 wait_woken(&wait, TASK_INTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT);
3142 }
3143 remove_wait_queue(&module_wq, &wait);
3144
3145 return ret;
3146}
3147
3148/*
3149 * We try to place it in the list now to make sure it's unique before 3142 * We try to place it in the list now to make sure it's unique before
3150 * we dedicate too many resources. In particular, temporary percpu 3143 * we dedicate too many resources. In particular, temporary percpu
3151 * memory exhaustion. 3144 * memory exhaustion.
@@ -3165,8 +3158,8 @@ again:
3165 || old->state == MODULE_STATE_UNFORMED) { 3158 || old->state == MODULE_STATE_UNFORMED) {
3166 /* Wait in case it fails to load. */ 3159 /* Wait in case it fails to load. */
3167 mutex_unlock(&module_mutex); 3160 mutex_unlock(&module_mutex);
3168 3161 err = wait_event_interruptible(module_wq,
3169 err = wait_finished_loading(mod); 3162 finished_loading(mod->name));
3170 if (err) 3163 if (err)
3171 goto out_unlocked; 3164 goto out_unlocked;
3172 goto again; 3165 goto again;
@@ -3265,7 +3258,7 @@ static int load_module(struct load_info *info, const char __user *uargs,
3265 mod->sig_ok = info->sig_ok; 3258 mod->sig_ok = info->sig_ok;
3266 if (!mod->sig_ok) { 3259 if (!mod->sig_ok) {
3267 pr_notice_once("%s: module verification failed: signature " 3260 pr_notice_once("%s: module verification failed: signature "
3268 "and/or required key missing - tainting " 3261 "and/or required key missing - tainting "
3269 "kernel\n", mod->name); 3262 "kernel\n", mod->name);
3270 add_taint_module(mod, TAINT_UNSIGNED_MODULE, LOCKDEP_STILL_OK); 3263 add_taint_module(mod, TAINT_UNSIGNED_MODULE, LOCKDEP_STILL_OK);
3271 } 3264 }
@@ -3356,6 +3349,9 @@ static int load_module(struct load_info *info, const char __user *uargs,
3356 module_bug_cleanup(mod); 3349 module_bug_cleanup(mod);
3357 mutex_unlock(&module_mutex); 3350 mutex_unlock(&module_mutex);
3358 3351
3352 /* Free lock-classes: */
3353 lockdep_free_key_range(mod->module_core, mod->core_size);
3354
3359 /* we can't deallocate the module until we clear memory protection */ 3355 /* we can't deallocate the module until we clear memory protection */
3360 unset_module_init_ro_nx(mod); 3356 unset_module_init_ro_nx(mod);
3361 unset_module_core_ro_nx(mod); 3357 unset_module_core_ro_nx(mod);
diff --git a/kernel/notifier.c b/kernel/notifier.c
index 4803da6eab62..ae9fc7cc360e 100644
--- a/kernel/notifier.c
+++ b/kernel/notifier.c
@@ -402,6 +402,7 @@ int raw_notifier_call_chain(struct raw_notifier_head *nh,
402} 402}
403EXPORT_SYMBOL_GPL(raw_notifier_call_chain); 403EXPORT_SYMBOL_GPL(raw_notifier_call_chain);
404 404
405#ifdef CONFIG_SRCU
405/* 406/*
406 * SRCU notifier chain routines. Registration and unregistration 407 * SRCU notifier chain routines. Registration and unregistration
407 * use a mutex, and call_chain is synchronized by SRCU (no locks). 408 * use a mutex, and call_chain is synchronized by SRCU (no locks).
@@ -528,6 +529,8 @@ void srcu_init_notifier_head(struct srcu_notifier_head *nh)
528} 529}
529EXPORT_SYMBOL_GPL(srcu_init_notifier_head); 530EXPORT_SYMBOL_GPL(srcu_init_notifier_head);
530 531
532#endif /* CONFIG_SRCU */
533
531static ATOMIC_NOTIFIER_HEAD(die_chain); 534static ATOMIC_NOTIFIER_HEAD(die_chain);
532 535
533int notrace notify_die(enum die_val val, const char *str, 536int notrace notify_die(enum die_val val, const char *str,
diff --git a/kernel/padata.c b/kernel/padata.c
index 161402f0b517..b38bea9c466a 100644
--- a/kernel/padata.c
+++ b/kernel/padata.c
@@ -917,15 +917,10 @@ static ssize_t show_cpumask(struct padata_instance *pinst,
917 else 917 else
918 cpumask = pinst->cpumask.pcpu; 918 cpumask = pinst->cpumask.pcpu;
919 919
920 len = bitmap_scnprintf(buf, PAGE_SIZE, cpumask_bits(cpumask), 920 len = snprintf(buf, PAGE_SIZE, "%*pb\n",
921 nr_cpu_ids); 921 nr_cpu_ids, cpumask_bits(cpumask));
922 if (PAGE_SIZE - len < 2)
923 len = -EINVAL;
924 else
925 len += sprintf(buf + len, "\n");
926
927 mutex_unlock(&pinst->lock); 922 mutex_unlock(&pinst->lock);
928 return len; 923 return len < PAGE_SIZE ? len : -EINVAL;
929} 924}
930 925
931static ssize_t store_cpumask(struct padata_instance *pinst, 926static ssize_t store_cpumask(struct padata_instance *pinst,
diff --git a/kernel/panic.c b/kernel/panic.c
index 4d8d6f906dec..8136ad76e5fd 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -226,6 +226,7 @@ static const struct tnt tnts[] = {
226 { TAINT_OOT_MODULE, 'O', ' ' }, 226 { TAINT_OOT_MODULE, 'O', ' ' },
227 { TAINT_UNSIGNED_MODULE, 'E', ' ' }, 227 { TAINT_UNSIGNED_MODULE, 'E', ' ' },
228 { TAINT_SOFTLOCKUP, 'L', ' ' }, 228 { TAINT_SOFTLOCKUP, 'L', ' ' },
229 { TAINT_LIVEPATCH, 'K', ' ' },
229}; 230};
230 231
231/** 232/**
@@ -246,6 +247,7 @@ static const struct tnt tnts[] = {
246 * 'O' - Out-of-tree module has been loaded. 247 * 'O' - Out-of-tree module has been loaded.
247 * 'E' - Unsigned module has been loaded. 248 * 'E' - Unsigned module has been loaded.
248 * 'L' - A soft lockup has previously occurred. 249 * 'L' - A soft lockup has previously occurred.
250 * 'K' - Kernel has been live patched.
249 * 251 *
250 * The string is overwritten by the next call to print_tainted(). 252 * The string is overwritten by the next call to print_tainted().
251 */ 253 */
diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig
index 48b28d387c7f..7e01f78f0417 100644
--- a/kernel/power/Kconfig
+++ b/kernel/power/Kconfig
@@ -251,6 +251,7 @@ config APM_EMULATION
251 251
252config PM_OPP 252config PM_OPP
253 bool 253 bool
254 select SRCU
254 ---help--- 255 ---help---
255 SOCs have a standard set of tuples consisting of frequency and 256 SOCs have a standard set of tuples consisting of frequency and
256 voltage pairs that the device will support per voltage domain. This 257 voltage pairs that the device will support per voltage domain. This
diff --git a/kernel/power/process.c b/kernel/power/process.c
index 5a6ec8678b9a..564f786df470 100644
--- a/kernel/power/process.c
+++ b/kernel/power/process.c
@@ -84,8 +84,8 @@ static int try_to_freeze_tasks(bool user_only)
84 elapsed_msecs = elapsed_msecs64; 84 elapsed_msecs = elapsed_msecs64;
85 85
86 if (todo) { 86 if (todo) {
87 printk("\n"); 87 pr_cont("\n");
88 printk(KERN_ERR "Freezing of tasks %s after %d.%03d seconds " 88 pr_err("Freezing of tasks %s after %d.%03d seconds "
89 "(%d tasks refusing to freeze, wq_busy=%d):\n", 89 "(%d tasks refusing to freeze, wq_busy=%d):\n",
90 wakeup ? "aborted" : "failed", 90 wakeup ? "aborted" : "failed",
91 elapsed_msecs / 1000, elapsed_msecs % 1000, 91 elapsed_msecs / 1000, elapsed_msecs % 1000,
@@ -101,37 +101,13 @@ static int try_to_freeze_tasks(bool user_only)
101 read_unlock(&tasklist_lock); 101 read_unlock(&tasklist_lock);
102 } 102 }
103 } else { 103 } else {
104 printk("(elapsed %d.%03d seconds) ", elapsed_msecs / 1000, 104 pr_cont("(elapsed %d.%03d seconds) ", elapsed_msecs / 1000,
105 elapsed_msecs % 1000); 105 elapsed_msecs % 1000);
106 } 106 }
107 107
108 return todo ? -EBUSY : 0; 108 return todo ? -EBUSY : 0;
109} 109}
110 110
111static bool __check_frozen_processes(void)
112{
113 struct task_struct *g, *p;
114
115 for_each_process_thread(g, p)
116 if (p != current && !freezer_should_skip(p) && !frozen(p))
117 return false;
118
119 return true;
120}
121
122/*
123 * Returns true if all freezable tasks (except for current) are frozen already
124 */
125static bool check_frozen_processes(void)
126{
127 bool ret;
128
129 read_lock(&tasklist_lock);
130 ret = __check_frozen_processes();
131 read_unlock(&tasklist_lock);
132 return ret;
133}
134
135/** 111/**
136 * freeze_processes - Signal user space processes to enter the refrigerator. 112 * freeze_processes - Signal user space processes to enter the refrigerator.
137 * The current thread will not be frozen. The same process that calls 113 * The current thread will not be frozen. The same process that calls
@@ -142,7 +118,6 @@ static bool check_frozen_processes(void)
142int freeze_processes(void) 118int freeze_processes(void)
143{ 119{
144 int error; 120 int error;
145 int oom_kills_saved;
146 121
147 error = __usermodehelper_disable(UMH_FREEZING); 122 error = __usermodehelper_disable(UMH_FREEZING);
148 if (error) 123 if (error)
@@ -155,31 +130,24 @@ int freeze_processes(void)
155 atomic_inc(&system_freezing_cnt); 130 atomic_inc(&system_freezing_cnt);
156 131
157 pm_wakeup_clear(); 132 pm_wakeup_clear();
158 printk("Freezing user space processes ... "); 133 pr_info("Freezing user space processes ... ");
159 pm_freezing = true; 134 pm_freezing = true;
160 oom_kills_saved = oom_kills_count();
161 error = try_to_freeze_tasks(true); 135 error = try_to_freeze_tasks(true);
162 if (!error) { 136 if (!error) {
163 __usermodehelper_set_disable_depth(UMH_DISABLED); 137 __usermodehelper_set_disable_depth(UMH_DISABLED);
164 oom_killer_disable(); 138 pr_cont("done.");
165
166 /*
167 * There might have been an OOM kill while we were
168 * freezing tasks and the killed task might be still
169 * on the way out so we have to double check for race.
170 */
171 if (oom_kills_count() != oom_kills_saved &&
172 !check_frozen_processes()) {
173 __usermodehelper_set_disable_depth(UMH_ENABLED);
174 printk("OOM in progress.");
175 error = -EBUSY;
176 } else {
177 printk("done.");
178 }
179 } 139 }
180 printk("\n"); 140 pr_cont("\n");
181 BUG_ON(in_atomic()); 141 BUG_ON(in_atomic());
182 142
143 /*
 144 * Now that the whole userspace is frozen we need to disable
145 * the OOM killer to disallow any further interference with
146 * killable tasks.
147 */
148 if (!error && !oom_killer_disable())
149 error = -EBUSY;
150
183 if (error) 151 if (error)
184 thaw_processes(); 152 thaw_processes();
185 return error; 153 return error;
@@ -197,13 +165,14 @@ int freeze_kernel_threads(void)
197{ 165{
198 int error; 166 int error;
199 167
200 printk("Freezing remaining freezable tasks ... "); 168 pr_info("Freezing remaining freezable tasks ... ");
169
201 pm_nosig_freezing = true; 170 pm_nosig_freezing = true;
202 error = try_to_freeze_tasks(false); 171 error = try_to_freeze_tasks(false);
203 if (!error) 172 if (!error)
204 printk("done."); 173 pr_cont("done.");
205 174
206 printk("\n"); 175 pr_cont("\n");
207 BUG_ON(in_atomic()); 176 BUG_ON(in_atomic());
208 177
209 if (error) 178 if (error)
@@ -224,7 +193,7 @@ void thaw_processes(void)
224 193
225 oom_killer_enable(); 194 oom_killer_enable();
226 195
227 printk("Restarting tasks ... "); 196 pr_info("Restarting tasks ... ");
228 197
229 __usermodehelper_set_disable_depth(UMH_FREEZING); 198 __usermodehelper_set_disable_depth(UMH_FREEZING);
230 thaw_workqueues(); 199 thaw_workqueues();
@@ -243,7 +212,7 @@ void thaw_processes(void)
243 usermodehelper_enable(); 212 usermodehelper_enable();
244 213
245 schedule(); 214 schedule();
246 printk("done.\n"); 215 pr_cont("done.\n");
247 trace_suspend_resume(TPS("thaw_processes"), 0, false); 216 trace_suspend_resume(TPS("thaw_processes"), 0, false);
248} 217}
249 218
@@ -252,7 +221,7 @@ void thaw_kernel_threads(void)
252 struct task_struct *g, *p; 221 struct task_struct *g, *p;
253 222
254 pm_nosig_freezing = false; 223 pm_nosig_freezing = false;
255 printk("Restarting kernel threads ... "); 224 pr_info("Restarting kernel threads ... ");
256 225
257 thaw_workqueues(); 226 thaw_workqueues();
258 227
@@ -264,5 +233,5 @@ void thaw_kernel_threads(void)
264 read_unlock(&tasklist_lock); 233 read_unlock(&tasklist_lock);
265 234
266 schedule(); 235 schedule();
267 printk("done.\n"); 236 pr_cont("done.\n");
268} 237}
diff --git a/kernel/power/qos.c b/kernel/power/qos.c
index 5f4c006c4b1e..97b0df71303e 100644
--- a/kernel/power/qos.c
+++ b/kernel/power/qos.c
@@ -41,6 +41,8 @@
41#include <linux/platform_device.h> 41#include <linux/platform_device.h>
42#include <linux/init.h> 42#include <linux/init.h>
43#include <linux/kernel.h> 43#include <linux/kernel.h>
44#include <linux/debugfs.h>
45#include <linux/seq_file.h>
44 46
45#include <linux/uaccess.h> 47#include <linux/uaccess.h>
46#include <linux/export.h> 48#include <linux/export.h>
@@ -182,6 +184,81 @@ static inline void pm_qos_set_value(struct pm_qos_constraints *c, s32 value)
182 c->target_value = value; 184 c->target_value = value;
183} 185}
184 186
187static inline int pm_qos_get_value(struct pm_qos_constraints *c);
188static int pm_qos_dbg_show_requests(struct seq_file *s, void *unused)
189{
190 struct pm_qos_object *qos = (struct pm_qos_object *)s->private;
191 struct pm_qos_constraints *c;
192 struct pm_qos_request *req;
193 char *type;
194 unsigned long flags;
195 int tot_reqs = 0;
196 int active_reqs = 0;
197
198 if (IS_ERR_OR_NULL(qos)) {
199 pr_err("%s: bad qos param!\n", __func__);
200 return -EINVAL;
201 }
202 c = qos->constraints;
203 if (IS_ERR_OR_NULL(c)) {
204 pr_err("%s: Bad constraints on qos?\n", __func__);
205 return -EINVAL;
206 }
207
208 /* Lock to ensure we have a snapshot */
209 spin_lock_irqsave(&pm_qos_lock, flags);
210 if (plist_head_empty(&c->list)) {
211 seq_puts(s, "Empty!\n");
212 goto out;
213 }
214
215 switch (c->type) {
216 case PM_QOS_MIN:
217 type = "Minimum";
218 break;
219 case PM_QOS_MAX:
220 type = "Maximum";
221 break;
222 case PM_QOS_SUM:
223 type = "Sum";
224 break;
225 default:
226 type = "Unknown";
227 }
228
229 plist_for_each_entry(req, &c->list, node) {
230 char *state = "Default";
231
232 if ((req->node).prio != c->default_value) {
233 active_reqs++;
234 state = "Active";
235 }
236 tot_reqs++;
237 seq_printf(s, "%d: %d: %s\n", tot_reqs,
238 (req->node).prio, state);
239 }
240
241 seq_printf(s, "Type=%s, Value=%d, Requests: active=%d / total=%d\n",
242 type, pm_qos_get_value(c), active_reqs, tot_reqs);
243
244out:
245 spin_unlock_irqrestore(&pm_qos_lock, flags);
246 return 0;
247}
248
249static int pm_qos_dbg_open(struct inode *inode, struct file *file)
250{
251 return single_open(file, pm_qos_dbg_show_requests,
252 inode->i_private);
253}
254
255static const struct file_operations pm_qos_debug_fops = {
256 .open = pm_qos_dbg_open,
257 .read = seq_read,
258 .llseek = seq_lseek,
259 .release = single_release,
260};
261
185/** 262/**
186 * pm_qos_update_target - manages the constraints list and calls the notifiers 263 * pm_qos_update_target - manages the constraints list and calls the notifiers
187 * if needed 264 * if needed
@@ -509,12 +586,17 @@ int pm_qos_remove_notifier(int pm_qos_class, struct notifier_block *notifier)
509EXPORT_SYMBOL_GPL(pm_qos_remove_notifier); 586EXPORT_SYMBOL_GPL(pm_qos_remove_notifier);
510 587
511/* User space interface to PM QoS classes via misc devices */ 588/* User space interface to PM QoS classes via misc devices */
512static int register_pm_qos_misc(struct pm_qos_object *qos) 589static int register_pm_qos_misc(struct pm_qos_object *qos, struct dentry *d)
513{ 590{
514 qos->pm_qos_power_miscdev.minor = MISC_DYNAMIC_MINOR; 591 qos->pm_qos_power_miscdev.minor = MISC_DYNAMIC_MINOR;
515 qos->pm_qos_power_miscdev.name = qos->name; 592 qos->pm_qos_power_miscdev.name = qos->name;
516 qos->pm_qos_power_miscdev.fops = &pm_qos_power_fops; 593 qos->pm_qos_power_miscdev.fops = &pm_qos_power_fops;
517 594
595 if (d) {
596 (void)debugfs_create_file(qos->name, S_IRUGO, d,
597 (void *)qos, &pm_qos_debug_fops);
598 }
599
518 return misc_register(&qos->pm_qos_power_miscdev); 600 return misc_register(&qos->pm_qos_power_miscdev);
519} 601}
520 602
@@ -608,11 +690,16 @@ static int __init pm_qos_power_init(void)
608{ 690{
609 int ret = 0; 691 int ret = 0;
610 int i; 692 int i;
693 struct dentry *d;
611 694
612 BUILD_BUG_ON(ARRAY_SIZE(pm_qos_array) != PM_QOS_NUM_CLASSES); 695 BUILD_BUG_ON(ARRAY_SIZE(pm_qos_array) != PM_QOS_NUM_CLASSES);
613 696
697 d = debugfs_create_dir("pm_qos", NULL);
698 if (IS_ERR_OR_NULL(d))
699 d = NULL;
700
614 for (i = PM_QOS_CPU_DMA_LATENCY; i < PM_QOS_NUM_CLASSES; i++) { 701 for (i = PM_QOS_CPU_DMA_LATENCY; i < PM_QOS_NUM_CLASSES; i++) {
615 ret = register_pm_qos_misc(pm_qos_array[i]); 702 ret = register_pm_qos_misc(pm_qos_array[i], d);
616 if (ret < 0) { 703 if (ret < 0) {
617 printk(KERN_ERR "pm_qos_param: %s setup failed\n", 704 printk(KERN_ERR "pm_qos_param: %s setup failed\n",
618 pm_qos_array[i]->name); 705 pm_qos_array[i]->name);
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index 0c40c16174b4..c24d5a23bf93 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -1472,9 +1472,9 @@ static inline unsigned long preallocate_highmem_fraction(unsigned long nr_pages,
1472/** 1472/**
1473 * free_unnecessary_pages - Release preallocated pages not needed for the image 1473 * free_unnecessary_pages - Release preallocated pages not needed for the image
1474 */ 1474 */
1475static void free_unnecessary_pages(void) 1475static unsigned long free_unnecessary_pages(void)
1476{ 1476{
1477 unsigned long save, to_free_normal, to_free_highmem; 1477 unsigned long save, to_free_normal, to_free_highmem, free;
1478 1478
1479 save = count_data_pages(); 1479 save = count_data_pages();
1480 if (alloc_normal >= save) { 1480 if (alloc_normal >= save) {
@@ -1495,6 +1495,7 @@ static void free_unnecessary_pages(void)
1495 else 1495 else
1496 to_free_normal = 0; 1496 to_free_normal = 0;
1497 } 1497 }
1498 free = to_free_normal + to_free_highmem;
1498 1499
1499 memory_bm_position_reset(&copy_bm); 1500 memory_bm_position_reset(&copy_bm);
1500 1501
@@ -1518,6 +1519,8 @@ static void free_unnecessary_pages(void)
1518 swsusp_unset_page_free(page); 1519 swsusp_unset_page_free(page);
1519 __free_page(page); 1520 __free_page(page);
1520 } 1521 }
1522
1523 return free;
1521} 1524}
1522 1525
1523/** 1526/**
@@ -1707,7 +1710,7 @@ int hibernate_preallocate_memory(void)
1707 * pages in memory, but we have allocated more. Release the excessive 1710 * pages in memory, but we have allocated more. Release the excessive
1708 * ones now. 1711 * ones now.
1709 */ 1712 */
1710 free_unnecessary_pages(); 1713 pages -= free_unnecessary_pages();
1711 1714
1712 out: 1715 out:
1713 stop = ktime_get(); 1716 stop = ktime_get();
@@ -2310,8 +2313,6 @@ static inline void free_highmem_data(void)
2310 free_image_page(buffer, PG_UNSAFE_CLEAR); 2313 free_image_page(buffer, PG_UNSAFE_CLEAR);
2311} 2314}
2312#else 2315#else
2313static inline int get_safe_write_buffer(void) { return 0; }
2314
2315static unsigned int 2316static unsigned int
2316count_highmem_image_pages(struct memory_bitmap *bm) { return 0; } 2317count_highmem_image_pages(struct memory_bitmap *bm) { return 0; }
2317 2318
diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c
index c347e3ce3a55..b7d6b3a721b1 100644
--- a/kernel/power/suspend.c
+++ b/kernel/power/suspend.c
@@ -37,7 +37,9 @@ const char *pm_states[PM_SUSPEND_MAX];
37static const struct platform_suspend_ops *suspend_ops; 37static const struct platform_suspend_ops *suspend_ops;
38static const struct platform_freeze_ops *freeze_ops; 38static const struct platform_freeze_ops *freeze_ops;
39static DECLARE_WAIT_QUEUE_HEAD(suspend_freeze_wait_head); 39static DECLARE_WAIT_QUEUE_HEAD(suspend_freeze_wait_head);
40static bool suspend_freeze_wake; 40
41enum freeze_state __read_mostly suspend_freeze_state;
42static DEFINE_SPINLOCK(suspend_freeze_lock);
41 43
42void freeze_set_ops(const struct platform_freeze_ops *ops) 44void freeze_set_ops(const struct platform_freeze_ops *ops)
43{ 45{
@@ -48,22 +50,49 @@ void freeze_set_ops(const struct platform_freeze_ops *ops)
48 50
49static void freeze_begin(void) 51static void freeze_begin(void)
50{ 52{
51 suspend_freeze_wake = false; 53 suspend_freeze_state = FREEZE_STATE_NONE;
52} 54}
53 55
54static void freeze_enter(void) 56static void freeze_enter(void)
55{ 57{
56 cpuidle_use_deepest_state(true); 58 spin_lock_irq(&suspend_freeze_lock);
59 if (pm_wakeup_pending())
60 goto out;
61
62 suspend_freeze_state = FREEZE_STATE_ENTER;
63 spin_unlock_irq(&suspend_freeze_lock);
64
65 get_online_cpus();
57 cpuidle_resume(); 66 cpuidle_resume();
58 wait_event(suspend_freeze_wait_head, suspend_freeze_wake); 67
68 /* Push all the CPUs into the idle loop. */
69 wake_up_all_idle_cpus();
70 pr_debug("PM: suspend-to-idle\n");
71 /* Make the current CPU wait so it can enter the idle loop too. */
72 wait_event(suspend_freeze_wait_head,
73 suspend_freeze_state == FREEZE_STATE_WAKE);
74 pr_debug("PM: resume from suspend-to-idle\n");
75
59 cpuidle_pause(); 76 cpuidle_pause();
60 cpuidle_use_deepest_state(false); 77 put_online_cpus();
78
79 spin_lock_irq(&suspend_freeze_lock);
80
81 out:
82 suspend_freeze_state = FREEZE_STATE_NONE;
83 spin_unlock_irq(&suspend_freeze_lock);
61} 84}
62 85
63void freeze_wake(void) 86void freeze_wake(void)
64{ 87{
65 suspend_freeze_wake = true; 88 unsigned long flags;
66 wake_up(&suspend_freeze_wait_head); 89
90 spin_lock_irqsave(&suspend_freeze_lock, flags);
91 if (suspend_freeze_state > FREEZE_STATE_NONE) {
92 suspend_freeze_state = FREEZE_STATE_WAKE;
93 wake_up(&suspend_freeze_wait_head);
94 }
95 spin_unlock_irqrestore(&suspend_freeze_lock, flags);
67} 96}
68EXPORT_SYMBOL_GPL(freeze_wake); 97EXPORT_SYMBOL_GPL(freeze_wake);
69 98
diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index fae29e3ffbf0..01cfd69c54c6 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -935,8 +935,8 @@ static int __init ignore_loglevel_setup(char *str)
935 935
936early_param("ignore_loglevel", ignore_loglevel_setup); 936early_param("ignore_loglevel", ignore_loglevel_setup);
937module_param(ignore_loglevel, bool, S_IRUGO | S_IWUSR); 937module_param(ignore_loglevel, bool, S_IRUGO | S_IWUSR);
938MODULE_PARM_DESC(ignore_loglevel, "ignore loglevel setting, to" 938MODULE_PARM_DESC(ignore_loglevel,
939 "print all kernel messages to the console."); 939 "ignore loglevel setting (prints all kernel messages to the console)");
940 940
941#ifdef CONFIG_BOOT_PRINTK_DELAY 941#ifdef CONFIG_BOOT_PRINTK_DELAY
942 942
@@ -1419,16 +1419,16 @@ static void call_console_drivers(int level, const char *text, size_t len)
1419} 1419}
1420 1420
1421/* 1421/*
1422 * Zap console related locks when oopsing. Only zap at most once 1422 * Zap console related locks when oopsing.
1423 * every 10 seconds, to leave time for slow consoles to print a 1423 * To leave time for slow consoles to print a full oops,
1424 * full oops. 1424 * only zap at most once every 30 seconds.
1425 */ 1425 */
1426static void zap_locks(void) 1426static void zap_locks(void)
1427{ 1427{
1428 static unsigned long oops_timestamp; 1428 static unsigned long oops_timestamp;
1429 1429
1430 if (time_after_eq(jiffies, oops_timestamp) && 1430 if (time_after_eq(jiffies, oops_timestamp) &&
1431 !time_after(jiffies, oops_timestamp + 30 * HZ)) 1431 !time_after(jiffies, oops_timestamp + 30 * HZ))
1432 return; 1432 return;
1433 1433
1434 oops_timestamp = jiffies; 1434 oops_timestamp = jiffies;
diff --git a/kernel/profile.c b/kernel/profile.c
index 54bf5ba26420..a7bcd28d6e9f 100644
--- a/kernel/profile.c
+++ b/kernel/profile.c
@@ -422,8 +422,7 @@ void profile_tick(int type)
422 422
423static int prof_cpu_mask_proc_show(struct seq_file *m, void *v) 423static int prof_cpu_mask_proc_show(struct seq_file *m, void *v)
424{ 424{
425 seq_cpumask(m, prof_cpu_mask); 425 seq_printf(m, "%*pb\n", cpumask_pr_args(prof_cpu_mask));
426 seq_putc(m, '\n');
427 return 0; 426 return 0;
428} 427}
429 428
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 1eb9d90c3af9..227fec36b12a 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -1077,7 +1077,6 @@ int generic_ptrace_pokedata(struct task_struct *tsk, unsigned long addr,
1077} 1077}
1078 1078
1079#if defined CONFIG_COMPAT 1079#if defined CONFIG_COMPAT
1080#include <linux/compat.h>
1081 1080
1082int compat_ptrace_request(struct task_struct *child, compat_long_t request, 1081int compat_ptrace_request(struct task_struct *child, compat_long_t request,
1083 compat_ulong_t addr, compat_ulong_t data) 1082 compat_ulong_t addr, compat_ulong_t data)
diff --git a/kernel/rcu/Makefile b/kernel/rcu/Makefile
index e6fae503d1bc..50a808424b06 100644
--- a/kernel/rcu/Makefile
+++ b/kernel/rcu/Makefile
@@ -1,4 +1,5 @@
1obj-y += update.o srcu.o 1obj-y += update.o
2obj-$(CONFIG_SRCU) += srcu.o
2obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o 3obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o
3obj-$(CONFIG_TREE_RCU) += tree.o 4obj-$(CONFIG_TREE_RCU) += tree.o
4obj-$(CONFIG_PREEMPT_RCU) += tree.o 5obj-$(CONFIG_PREEMPT_RCU) += tree.o
diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h
index 07bb02eda844..80adef7d4c3d 100644
--- a/kernel/rcu/rcu.h
+++ b/kernel/rcu/rcu.h
@@ -137,4 +137,10 @@ int rcu_jiffies_till_stall_check(void);
137 137
138void rcu_early_boot_tests(void); 138void rcu_early_boot_tests(void);
139 139
140/*
141 * This function really isn't for public consumption, but RCU is special in
142 * that context switches can allow the state machine to make progress.
143 */
144extern void resched_cpu(int cpu);
145
140#endif /* __LINUX_RCU_H */ 146#endif /* __LINUX_RCU_H */
diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c
index 4d559baf06e0..30d42aa55d83 100644
--- a/kernel/rcu/rcutorture.c
+++ b/kernel/rcu/rcutorture.c
@@ -244,7 +244,8 @@ struct rcu_torture_ops {
244 int (*readlock)(void); 244 int (*readlock)(void);
245 void (*read_delay)(struct torture_random_state *rrsp); 245 void (*read_delay)(struct torture_random_state *rrsp);
246 void (*readunlock)(int idx); 246 void (*readunlock)(int idx);
247 int (*completed)(void); 247 unsigned long (*started)(void);
248 unsigned long (*completed)(void);
248 void (*deferred_free)(struct rcu_torture *p); 249 void (*deferred_free)(struct rcu_torture *p);
249 void (*sync)(void); 250 void (*sync)(void);
250 void (*exp_sync)(void); 251 void (*exp_sync)(void);
@@ -296,11 +297,6 @@ static void rcu_torture_read_unlock(int idx) __releases(RCU)
296 rcu_read_unlock(); 297 rcu_read_unlock();
297} 298}
298 299
299static int rcu_torture_completed(void)
300{
301 return rcu_batches_completed();
302}
303
304/* 300/*
305 * Update callback in the pipe. This should be invoked after a grace period. 301 * Update callback in the pipe. This should be invoked after a grace period.
306 */ 302 */
@@ -356,7 +352,7 @@ rcu_torture_cb(struct rcu_head *p)
356 cur_ops->deferred_free(rp); 352 cur_ops->deferred_free(rp);
357} 353}
358 354
359static int rcu_no_completed(void) 355static unsigned long rcu_no_completed(void)
360{ 356{
361 return 0; 357 return 0;
362} 358}
@@ -377,7 +373,8 @@ static struct rcu_torture_ops rcu_ops = {
377 .readlock = rcu_torture_read_lock, 373 .readlock = rcu_torture_read_lock,
378 .read_delay = rcu_read_delay, 374 .read_delay = rcu_read_delay,
379 .readunlock = rcu_torture_read_unlock, 375 .readunlock = rcu_torture_read_unlock,
380 .completed = rcu_torture_completed, 376 .started = rcu_batches_started,
377 .completed = rcu_batches_completed,
381 .deferred_free = rcu_torture_deferred_free, 378 .deferred_free = rcu_torture_deferred_free,
382 .sync = synchronize_rcu, 379 .sync = synchronize_rcu,
383 .exp_sync = synchronize_rcu_expedited, 380 .exp_sync = synchronize_rcu_expedited,
@@ -407,11 +404,6 @@ static void rcu_bh_torture_read_unlock(int idx) __releases(RCU_BH)
407 rcu_read_unlock_bh(); 404 rcu_read_unlock_bh();
408} 405}
409 406
410static int rcu_bh_torture_completed(void)
411{
412 return rcu_batches_completed_bh();
413}
414
415static void rcu_bh_torture_deferred_free(struct rcu_torture *p) 407static void rcu_bh_torture_deferred_free(struct rcu_torture *p)
416{ 408{
417 call_rcu_bh(&p->rtort_rcu, rcu_torture_cb); 409 call_rcu_bh(&p->rtort_rcu, rcu_torture_cb);
@@ -423,7 +415,8 @@ static struct rcu_torture_ops rcu_bh_ops = {
423 .readlock = rcu_bh_torture_read_lock, 415 .readlock = rcu_bh_torture_read_lock,
424 .read_delay = rcu_read_delay, /* just reuse rcu's version. */ 416 .read_delay = rcu_read_delay, /* just reuse rcu's version. */
425 .readunlock = rcu_bh_torture_read_unlock, 417 .readunlock = rcu_bh_torture_read_unlock,
426 .completed = rcu_bh_torture_completed, 418 .started = rcu_batches_started_bh,
419 .completed = rcu_batches_completed_bh,
427 .deferred_free = rcu_bh_torture_deferred_free, 420 .deferred_free = rcu_bh_torture_deferred_free,
428 .sync = synchronize_rcu_bh, 421 .sync = synchronize_rcu_bh,
429 .exp_sync = synchronize_rcu_bh_expedited, 422 .exp_sync = synchronize_rcu_bh_expedited,
@@ -466,6 +459,7 @@ static struct rcu_torture_ops rcu_busted_ops = {
466 .readlock = rcu_torture_read_lock, 459 .readlock = rcu_torture_read_lock,
467 .read_delay = rcu_read_delay, /* just reuse rcu's version. */ 460 .read_delay = rcu_read_delay, /* just reuse rcu's version. */
468 .readunlock = rcu_torture_read_unlock, 461 .readunlock = rcu_torture_read_unlock,
462 .started = rcu_no_completed,
469 .completed = rcu_no_completed, 463 .completed = rcu_no_completed,
470 .deferred_free = rcu_busted_torture_deferred_free, 464 .deferred_free = rcu_busted_torture_deferred_free,
471 .sync = synchronize_rcu_busted, 465 .sync = synchronize_rcu_busted,
@@ -510,7 +504,7 @@ static void srcu_torture_read_unlock(int idx) __releases(&srcu_ctl)
510 srcu_read_unlock(&srcu_ctl, idx); 504 srcu_read_unlock(&srcu_ctl, idx);
511} 505}
512 506
513static int srcu_torture_completed(void) 507static unsigned long srcu_torture_completed(void)
514{ 508{
515 return srcu_batches_completed(&srcu_ctl); 509 return srcu_batches_completed(&srcu_ctl);
516} 510}
@@ -564,6 +558,7 @@ static struct rcu_torture_ops srcu_ops = {
564 .readlock = srcu_torture_read_lock, 558 .readlock = srcu_torture_read_lock,
565 .read_delay = srcu_read_delay, 559 .read_delay = srcu_read_delay,
566 .readunlock = srcu_torture_read_unlock, 560 .readunlock = srcu_torture_read_unlock,
561 .started = NULL,
567 .completed = srcu_torture_completed, 562 .completed = srcu_torture_completed,
568 .deferred_free = srcu_torture_deferred_free, 563 .deferred_free = srcu_torture_deferred_free,
569 .sync = srcu_torture_synchronize, 564 .sync = srcu_torture_synchronize,
@@ -600,7 +595,8 @@ static struct rcu_torture_ops sched_ops = {
600 .readlock = sched_torture_read_lock, 595 .readlock = sched_torture_read_lock,
601 .read_delay = rcu_read_delay, /* just reuse rcu's version. */ 596 .read_delay = rcu_read_delay, /* just reuse rcu's version. */
602 .readunlock = sched_torture_read_unlock, 597 .readunlock = sched_torture_read_unlock,
603 .completed = rcu_no_completed, 598 .started = rcu_batches_started_sched,
599 .completed = rcu_batches_completed_sched,
604 .deferred_free = rcu_sched_torture_deferred_free, 600 .deferred_free = rcu_sched_torture_deferred_free,
605 .sync = synchronize_sched, 601 .sync = synchronize_sched,
606 .exp_sync = synchronize_sched_expedited, 602 .exp_sync = synchronize_sched_expedited,
@@ -638,6 +634,7 @@ static struct rcu_torture_ops tasks_ops = {
638 .readlock = tasks_torture_read_lock, 634 .readlock = tasks_torture_read_lock,
639 .read_delay = rcu_read_delay, /* just reuse rcu's version. */ 635 .read_delay = rcu_read_delay, /* just reuse rcu's version. */
640 .readunlock = tasks_torture_read_unlock, 636 .readunlock = tasks_torture_read_unlock,
637 .started = rcu_no_completed,
641 .completed = rcu_no_completed, 638 .completed = rcu_no_completed,
642 .deferred_free = rcu_tasks_torture_deferred_free, 639 .deferred_free = rcu_tasks_torture_deferred_free,
643 .sync = synchronize_rcu_tasks, 640 .sync = synchronize_rcu_tasks,
@@ -1015,8 +1012,8 @@ static void rcutorture_trace_dump(void)
1015static void rcu_torture_timer(unsigned long unused) 1012static void rcu_torture_timer(unsigned long unused)
1016{ 1013{
1017 int idx; 1014 int idx;
1018 int completed; 1015 unsigned long started;
1019 int completed_end; 1016 unsigned long completed;
1020 static DEFINE_TORTURE_RANDOM(rand); 1017 static DEFINE_TORTURE_RANDOM(rand);
1021 static DEFINE_SPINLOCK(rand_lock); 1018 static DEFINE_SPINLOCK(rand_lock);
1022 struct rcu_torture *p; 1019 struct rcu_torture *p;
@@ -1024,7 +1021,10 @@ static void rcu_torture_timer(unsigned long unused)
1024 unsigned long long ts; 1021 unsigned long long ts;
1025 1022
1026 idx = cur_ops->readlock(); 1023 idx = cur_ops->readlock();
1027 completed = cur_ops->completed(); 1024 if (cur_ops->started)
1025 started = cur_ops->started();
1026 else
1027 started = cur_ops->completed();
1028 ts = rcu_trace_clock_local(); 1028 ts = rcu_trace_clock_local();
1029 p = rcu_dereference_check(rcu_torture_current, 1029 p = rcu_dereference_check(rcu_torture_current,
1030 rcu_read_lock_bh_held() || 1030 rcu_read_lock_bh_held() ||
@@ -1047,14 +1047,16 @@ static void rcu_torture_timer(unsigned long unused)
1047 /* Should not happen, but... */ 1047 /* Should not happen, but... */
1048 pipe_count = RCU_TORTURE_PIPE_LEN; 1048 pipe_count = RCU_TORTURE_PIPE_LEN;
1049 } 1049 }
1050 completed_end = cur_ops->completed(); 1050 completed = cur_ops->completed();
1051 if (pipe_count > 1) { 1051 if (pipe_count > 1) {
1052 do_trace_rcu_torture_read(cur_ops->name, &p->rtort_rcu, ts, 1052 do_trace_rcu_torture_read(cur_ops->name, &p->rtort_rcu, ts,
1053 completed, completed_end); 1053 started, completed);
1054 rcutorture_trace_dump(); 1054 rcutorture_trace_dump();
1055 } 1055 }
1056 __this_cpu_inc(rcu_torture_count[pipe_count]); 1056 __this_cpu_inc(rcu_torture_count[pipe_count]);
1057 completed = completed_end - completed; 1057 completed = completed - started;
1058 if (cur_ops->started)
1059 completed++;
1058 if (completed > RCU_TORTURE_PIPE_LEN) { 1060 if (completed > RCU_TORTURE_PIPE_LEN) {
1059 /* Should not happen, but... */ 1061 /* Should not happen, but... */
1060 completed = RCU_TORTURE_PIPE_LEN; 1062 completed = RCU_TORTURE_PIPE_LEN;
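
A minimal user-space sketch of the accounting used by rcu_torture_timer() above (plain C, not the kernel code; gp_started, gp_completed and the batches_*() helpers are made-up stand-ins for cur_ops->started() and cur_ops->completed()). The reader snapshots the started count before its critical section, snapshots the completed count afterwards, and clamps the difference to the pipe length; the extra increment reflects that the started count can run one batch ahead of the completed count while a grace period is in flight.

#include <stdio.h>

#define PIPE_LEN 10                        /* stand-in for RCU_TORTURE_PIPE_LEN */

static unsigned long gp_started;           /* batches started so far */
static unsigned long gp_completed;         /* batches completed so far */

static unsigned long batches_started(void)   { return gp_started; }
static unsigned long batches_completed(void) { return gp_completed; }

int main(void)
{
        unsigned long started, completed;

        gp_started = 4;
        gp_completed = 3;                  /* one grace period in flight */

        started = batches_started();       /* snapshot before the "read side" */

        gp_started += 3;                   /* grace periods elapse meanwhile */
        gp_completed += 3;

        completed = batches_completed();   /* snapshot after the "read side" */

        completed = completed - started;   /* batches spanning the reader */
        completed++;                       /* only when a ->started() hook exists */
        if (completed > PIPE_LEN)          /* clamp, as rcutorture does */
                completed = PIPE_LEN;
        printf("batches spanning the reader: %lu\n", completed);
        return 0;
}
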
@@ -1073,8 +1075,8 @@ static void rcu_torture_timer(unsigned long unused)
1073static int 1075static int
1074rcu_torture_reader(void *arg) 1076rcu_torture_reader(void *arg)
1075{ 1077{
1076 int completed; 1078 unsigned long started;
1077 int completed_end; 1079 unsigned long completed;
1078 int idx; 1080 int idx;
1079 DEFINE_TORTURE_RANDOM(rand); 1081 DEFINE_TORTURE_RANDOM(rand);
1080 struct rcu_torture *p; 1082 struct rcu_torture *p;
@@ -1093,7 +1095,10 @@ rcu_torture_reader(void *arg)
1093 mod_timer(&t, jiffies + 1); 1095 mod_timer(&t, jiffies + 1);
1094 } 1096 }
1095 idx = cur_ops->readlock(); 1097 idx = cur_ops->readlock();
1096 completed = cur_ops->completed(); 1098 if (cur_ops->started)
1099 started = cur_ops->started();
1100 else
1101 started = cur_ops->completed();
1097 ts = rcu_trace_clock_local(); 1102 ts = rcu_trace_clock_local();
1098 p = rcu_dereference_check(rcu_torture_current, 1103 p = rcu_dereference_check(rcu_torture_current,
1099 rcu_read_lock_bh_held() || 1104 rcu_read_lock_bh_held() ||
@@ -1114,14 +1119,16 @@ rcu_torture_reader(void *arg)
1114 /* Should not happen, but... */ 1119 /* Should not happen, but... */
1115 pipe_count = RCU_TORTURE_PIPE_LEN; 1120 pipe_count = RCU_TORTURE_PIPE_LEN;
1116 } 1121 }
1117 completed_end = cur_ops->completed(); 1122 completed = cur_ops->completed();
1118 if (pipe_count > 1) { 1123 if (pipe_count > 1) {
1119 do_trace_rcu_torture_read(cur_ops->name, &p->rtort_rcu, 1124 do_trace_rcu_torture_read(cur_ops->name, &p->rtort_rcu,
1120 ts, completed, completed_end); 1125 ts, started, completed);
1121 rcutorture_trace_dump(); 1126 rcutorture_trace_dump();
1122 } 1127 }
1123 __this_cpu_inc(rcu_torture_count[pipe_count]); 1128 __this_cpu_inc(rcu_torture_count[pipe_count]);
1124 completed = completed_end - completed; 1129 completed = completed - started;
1130 if (cur_ops->started)
1131 completed++;
1125 if (completed > RCU_TORTURE_PIPE_LEN) { 1132 if (completed > RCU_TORTURE_PIPE_LEN) {
1126 /* Should not happen, but... */ 1133 /* Should not happen, but... */
1127 completed = RCU_TORTURE_PIPE_LEN; 1134 completed = RCU_TORTURE_PIPE_LEN;
@@ -1420,6 +1427,9 @@ static int rcu_torture_barrier(void *arg)
1420 cur_ops->cb_barrier(); /* Implies smp_mb() for wait_event(). */ 1427 cur_ops->cb_barrier(); /* Implies smp_mb() for wait_event(). */
1421 if (atomic_read(&barrier_cbs_invoked) != n_barrier_cbs) { 1428 if (atomic_read(&barrier_cbs_invoked) != n_barrier_cbs) {
1422 n_rcu_torture_barrier_error++; 1429 n_rcu_torture_barrier_error++;
1430 pr_err("barrier_cbs_invoked = %d, n_barrier_cbs = %d\n",
1431 atomic_read(&barrier_cbs_invoked),
1432 n_barrier_cbs);
1423 WARN_ON_ONCE(1); 1433 WARN_ON_ONCE(1);
1424 } 1434 }
1425 n_barrier_successes++; 1435 n_barrier_successes++;
diff --git a/kernel/rcu/srcu.c b/kernel/rcu/srcu.c
index e037f3eb2f7b..445bf8ffe3fb 100644
--- a/kernel/rcu/srcu.c
+++ b/kernel/rcu/srcu.c
@@ -546,7 +546,7 @@ EXPORT_SYMBOL_GPL(srcu_barrier);
546 * Report the number of batches, correlated with, but not necessarily 546 * Report the number of batches, correlated with, but not necessarily
547 * precisely the same as, the number of grace periods that have elapsed. 547 * precisely the same as, the number of grace periods that have elapsed.
548 */ 548 */
549long srcu_batches_completed(struct srcu_struct *sp) 549unsigned long srcu_batches_completed(struct srcu_struct *sp)
550{ 550{
551 return sp->completed; 551 return sp->completed;
552} 552}
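
The switch from long to unsigned long matters because these counters are compared with modular arithmetic. A small stand-alone C illustration (ULONG_CMP_GE() below mirrors the kernel macro in rcupdate.h): unsigned subtraction still yields the right distance even when the counter wraps, and the half-range comparison still orders the two snapshots correctly.

#include <limits.h>
#include <stdio.h>

/* Wrap-safe "a >= b", mirroring the kernel's ULONG_CMP_GE(). */
#define ULONG_CMP_GE(a, b)      (ULONG_MAX / 2 >= (a) - (b))

int main(void)
{
        unsigned long snap = ULONG_MAX - 1;     /* counter about to wrap */
        unsigned long cur = snap + 3;           /* wrapped around to 1 */

        /* Unsigned subtraction still gives the number of batches elapsed. */
        printf("batches elapsed: %lu\n", cur - snap);           /* 3 */

        /* And the comparison still sees cur as "at or after" snap. */
        printf("cur >= snap: %d\n", ULONG_CMP_GE(cur, snap));   /* 1 */
        return 0;
}
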
diff --git a/kernel/rcu/tiny.c b/kernel/rcu/tiny.c
index 0db5649f8817..cc9ceca7bde1 100644
--- a/kernel/rcu/tiny.c
+++ b/kernel/rcu/tiny.c
@@ -47,54 +47,14 @@ static void __call_rcu(struct rcu_head *head,
47 void (*func)(struct rcu_head *rcu), 47 void (*func)(struct rcu_head *rcu),
48 struct rcu_ctrlblk *rcp); 48 struct rcu_ctrlblk *rcp);
49 49
50static long long rcu_dynticks_nesting = DYNTICK_TASK_EXIT_IDLE;
51
52#include "tiny_plugin.h" 50#include "tiny_plugin.h"
53 51
54/* Common code for rcu_idle_enter() and rcu_irq_exit(), see kernel/rcu/tree.c. */
55static void rcu_idle_enter_common(long long newval)
56{
57 if (newval) {
58 RCU_TRACE(trace_rcu_dyntick(TPS("--="),
59 rcu_dynticks_nesting, newval));
60 rcu_dynticks_nesting = newval;
61 return;
62 }
63 RCU_TRACE(trace_rcu_dyntick(TPS("Start"),
64 rcu_dynticks_nesting, newval));
65 if (IS_ENABLED(CONFIG_RCU_TRACE) && !is_idle_task(current)) {
66 struct task_struct *idle __maybe_unused = idle_task(smp_processor_id());
67
68 RCU_TRACE(trace_rcu_dyntick(TPS("Entry error: not idle task"),
69 rcu_dynticks_nesting, newval));
70 ftrace_dump(DUMP_ALL);
71 WARN_ONCE(1, "Current pid: %d comm: %s / Idle pid: %d comm: %s",
72 current->pid, current->comm,
73 idle->pid, idle->comm); /* must be idle task! */
74 }
75 rcu_sched_qs(); /* implies rcu_bh_inc() */
76 barrier();
77 rcu_dynticks_nesting = newval;
78}
79
80/* 52/*
81 * Enter idle, which is an extended quiescent state if we have fully 53 * Enter idle, which is an extended quiescent state if we have fully
82 * entered that mode (i.e., if the new value of dynticks_nesting is zero). 54 * entered that mode.
83 */ 55 */
84void rcu_idle_enter(void) 56void rcu_idle_enter(void)
85{ 57{
86 unsigned long flags;
87 long long newval;
88
89 local_irq_save(flags);
90 WARN_ON_ONCE((rcu_dynticks_nesting & DYNTICK_TASK_NEST_MASK) == 0);
91 if ((rcu_dynticks_nesting & DYNTICK_TASK_NEST_MASK) ==
92 DYNTICK_TASK_NEST_VALUE)
93 newval = 0;
94 else
95 newval = rcu_dynticks_nesting - DYNTICK_TASK_NEST_VALUE;
96 rcu_idle_enter_common(newval);
97 local_irq_restore(flags);
98} 58}
99EXPORT_SYMBOL_GPL(rcu_idle_enter); 59EXPORT_SYMBOL_GPL(rcu_idle_enter);
100 60
@@ -103,55 +63,14 @@ EXPORT_SYMBOL_GPL(rcu_idle_enter);
103 */ 63 */
104void rcu_irq_exit(void) 64void rcu_irq_exit(void)
105{ 65{
106 unsigned long flags;
107 long long newval;
108
109 local_irq_save(flags);
110 newval = rcu_dynticks_nesting - 1;
111 WARN_ON_ONCE(newval < 0);
112 rcu_idle_enter_common(newval);
113 local_irq_restore(flags);
114} 66}
115EXPORT_SYMBOL_GPL(rcu_irq_exit); 67EXPORT_SYMBOL_GPL(rcu_irq_exit);
116 68
117/* Common code for rcu_idle_exit() and rcu_irq_enter(), see kernel/rcu/tree.c. */
118static void rcu_idle_exit_common(long long oldval)
119{
120 if (oldval) {
121 RCU_TRACE(trace_rcu_dyntick(TPS("++="),
122 oldval, rcu_dynticks_nesting));
123 return;
124 }
125 RCU_TRACE(trace_rcu_dyntick(TPS("End"), oldval, rcu_dynticks_nesting));
126 if (IS_ENABLED(CONFIG_RCU_TRACE) && !is_idle_task(current)) {
127 struct task_struct *idle __maybe_unused = idle_task(smp_processor_id());
128
129 RCU_TRACE(trace_rcu_dyntick(TPS("Exit error: not idle task"),
130 oldval, rcu_dynticks_nesting));
131 ftrace_dump(DUMP_ALL);
132 WARN_ONCE(1, "Current pid: %d comm: %s / Idle pid: %d comm: %s",
133 current->pid, current->comm,
134 idle->pid, idle->comm); /* must be idle task! */
135 }
136}
137
138/* 69/*
139 * Exit idle, so that we are no longer in an extended quiescent state. 70 * Exit idle, so that we are no longer in an extended quiescent state.
140 */ 71 */
141void rcu_idle_exit(void) 72void rcu_idle_exit(void)
142{ 73{
143 unsigned long flags;
144 long long oldval;
145
146 local_irq_save(flags);
147 oldval = rcu_dynticks_nesting;
148 WARN_ON_ONCE(rcu_dynticks_nesting < 0);
149 if (rcu_dynticks_nesting & DYNTICK_TASK_NEST_MASK)
150 rcu_dynticks_nesting += DYNTICK_TASK_NEST_VALUE;
151 else
152 rcu_dynticks_nesting = DYNTICK_TASK_EXIT_IDLE;
153 rcu_idle_exit_common(oldval);
154 local_irq_restore(flags);
155} 74}
156EXPORT_SYMBOL_GPL(rcu_idle_exit); 75EXPORT_SYMBOL_GPL(rcu_idle_exit);
157 76
@@ -160,15 +79,6 @@ EXPORT_SYMBOL_GPL(rcu_idle_exit);
160 */ 79 */
161void rcu_irq_enter(void) 80void rcu_irq_enter(void)
162{ 81{
163 unsigned long flags;
164 long long oldval;
165
166 local_irq_save(flags);
167 oldval = rcu_dynticks_nesting;
168 rcu_dynticks_nesting++;
169 WARN_ON_ONCE(rcu_dynticks_nesting == 0);
170 rcu_idle_exit_common(oldval);
171 local_irq_restore(flags);
172} 82}
173EXPORT_SYMBOL_GPL(rcu_irq_enter); 83EXPORT_SYMBOL_GPL(rcu_irq_enter);
174 84
@@ -179,23 +89,13 @@ EXPORT_SYMBOL_GPL(rcu_irq_enter);
179 */ 89 */
180bool notrace __rcu_is_watching(void) 90bool notrace __rcu_is_watching(void)
181{ 91{
182 return rcu_dynticks_nesting; 92 return true;
183} 93}
184EXPORT_SYMBOL(__rcu_is_watching); 94EXPORT_SYMBOL(__rcu_is_watching);
185 95
186#endif /* defined(CONFIG_DEBUG_LOCK_ALLOC) || defined(CONFIG_RCU_TRACE) */ 96#endif /* defined(CONFIG_DEBUG_LOCK_ALLOC) || defined(CONFIG_RCU_TRACE) */
187 97
188/* 98/*
189 * Test whether the current CPU was interrupted from idle. Nested
190 * interrupts don't count, we must be running at the first interrupt
191 * level.
192 */
193static int rcu_is_cpu_rrupt_from_idle(void)
194{
195 return rcu_dynticks_nesting <= 1;
196}
197
198/*
199 * Helper function for rcu_sched_qs() and rcu_bh_qs(). 99 * Helper function for rcu_sched_qs() and rcu_bh_qs().
200 * Also irqs are disabled to avoid confusion due to interrupt handlers 100 * Also irqs are disabled to avoid confusion due to interrupt handlers
201 * invoking call_rcu(). 101 * invoking call_rcu().
@@ -250,7 +150,7 @@ void rcu_bh_qs(void)
250void rcu_check_callbacks(int user) 150void rcu_check_callbacks(int user)
251{ 151{
252 RCU_TRACE(check_cpu_stalls()); 152 RCU_TRACE(check_cpu_stalls());
253 if (user || rcu_is_cpu_rrupt_from_idle()) 153 if (user)
254 rcu_sched_qs(); 154 rcu_sched_qs();
255 else if (!in_softirq()) 155 else if (!in_softirq())
256 rcu_bh_qs(); 156 rcu_bh_qs();
@@ -357,6 +257,11 @@ static void __call_rcu(struct rcu_head *head,
357 rcp->curtail = &head->next; 257 rcp->curtail = &head->next;
358 RCU_TRACE(rcp->qlen++); 258 RCU_TRACE(rcp->qlen++);
359 local_irq_restore(flags); 259 local_irq_restore(flags);
260
261 if (unlikely(is_idle_task(current))) {
262 /* force scheduling for rcu_sched_qs() */
263 resched_cpu(0);
264 }
360} 265}
361 266
362/* 267/*
@@ -383,6 +288,8 @@ EXPORT_SYMBOL_GPL(call_rcu_bh);
383void __init rcu_init(void) 288void __init rcu_init(void)
384{ 289{
385 open_softirq(RCU_SOFTIRQ, rcu_process_callbacks); 290 open_softirq(RCU_SOFTIRQ, rcu_process_callbacks);
291 RCU_TRACE(reset_cpu_stall_ticks(&rcu_sched_ctrlblk));
292 RCU_TRACE(reset_cpu_stall_ticks(&rcu_bh_ctrlblk));
386 293
387 rcu_early_boot_tests(); 294 rcu_early_boot_tests();
388} 295}
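
With Tiny RCU's dyntick-idle bookkeeping gone, the scheduling-clock path above reduces to two checks, and callbacks queued from the idle task instead force a reschedule so the quiescent state is reported through the scheduler. A plain C sketch of that decision (rcu_sched_qs() and rcu_bh_qs() are stub stand-ins here, not the kernel functions):

#include <stdbool.h>
#include <stdio.h>

static void rcu_sched_qs(void) { puts("rcu_sched quiescent state"); }
static void rcu_bh_qs(void)    { puts("rcu_bh quiescent state"); }

/* Shape of the post-patch rcu_check_callbacks(): no idle-interrupt test. */
static void check_callbacks(bool user, bool in_softirq)
{
        if (user)
                rcu_sched_qs();
        else if (!in_softirq)
                rcu_bh_qs();
}

int main(void)
{
        check_callbacks(true, false);   /* tick from user mode */
        check_callbacks(false, false);  /* tick from kernel, outside softirq */
        check_callbacks(false, true);   /* tick from softirq: no QS reported */
        return 0;
}
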
diff --git a/kernel/rcu/tiny_plugin.h b/kernel/rcu/tiny_plugin.h
index 858c56569127..f94e209a10d6 100644
--- a/kernel/rcu/tiny_plugin.h
+++ b/kernel/rcu/tiny_plugin.h
@@ -145,17 +145,16 @@ static void check_cpu_stall(struct rcu_ctrlblk *rcp)
145 rcp->ticks_this_gp++; 145 rcp->ticks_this_gp++;
146 j = jiffies; 146 j = jiffies;
147 js = ACCESS_ONCE(rcp->jiffies_stall); 147 js = ACCESS_ONCE(rcp->jiffies_stall);
148 if (*rcp->curtail && ULONG_CMP_GE(j, js)) { 148 if (rcp->rcucblist && ULONG_CMP_GE(j, js)) {
149 pr_err("INFO: %s stall on CPU (%lu ticks this GP) idle=%llx (t=%lu jiffies q=%ld)\n", 149 pr_err("INFO: %s stall on CPU (%lu ticks this GP) idle=%llx (t=%lu jiffies q=%ld)\n",
150 rcp->name, rcp->ticks_this_gp, rcu_dynticks_nesting, 150 rcp->name, rcp->ticks_this_gp, DYNTICK_TASK_EXIT_IDLE,
151 jiffies - rcp->gp_start, rcp->qlen); 151 jiffies - rcp->gp_start, rcp->qlen);
152 dump_stack(); 152 dump_stack();
153 }
154 if (*rcp->curtail && ULONG_CMP_GE(j, js))
155 ACCESS_ONCE(rcp->jiffies_stall) = jiffies + 153 ACCESS_ONCE(rcp->jiffies_stall) = jiffies +
156 3 * rcu_jiffies_till_stall_check() + 3; 154 3 * rcu_jiffies_till_stall_check() + 3;
157 else if (ULONG_CMP_GE(j, js)) 155 } else if (ULONG_CMP_GE(j, js)) {
158 ACCESS_ONCE(rcp->jiffies_stall) = jiffies + rcu_jiffies_till_stall_check(); 156 ACCESS_ONCE(rcp->jiffies_stall) = jiffies + rcu_jiffies_till_stall_check();
157 }
159} 158}
160 159
161static void reset_cpu_stall_ticks(struct rcu_ctrlblk *rcp) 160static void reset_cpu_stall_ticks(struct rcu_ctrlblk *rcp)
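
The restructured stall check above folds what used to be two tests of the same deadline into one if/else: when callbacks are queued (rcp->rcucblist) and the deadline has passed, print the warning and push the next deadline well out; when the deadline has passed with no callbacks, just re-arm it. A compact stand-alone sketch of that control flow (the variables and delay constant are made-up stand-ins, and the kernel uses ULONG_CMP_GE() rather than a plain >= for wrap safety):

#include <stdio.h>

static unsigned long jiffies;           /* fake clock */
static unsigned long jiffies_stall;     /* when to complain next */
static int have_callbacks;              /* stand-in for rcp->rcucblist */

static unsigned long stall_check_delay(void) { return 21 * 1000; }

static void check_cpu_stall(void)
{
        if (have_callbacks && jiffies >= jiffies_stall) {
                printf("INFO: stall on CPU (t=%lu)\n", jiffies);
                jiffies_stall = jiffies + 3 * stall_check_delay() + 3;
        } else if (jiffies >= jiffies_stall) {
                jiffies_stall = jiffies + stall_check_delay();
        }
}

int main(void)
{
        jiffies_stall = 100;
        jiffies = 200;
        have_callbacks = 1;
        check_cpu_stall();              /* complains and re-arms far out */
        have_callbacks = 0;
        jiffies = jiffies_stall + 1;
        check_cpu_stall();              /* silent re-arm */
        printf("next check at %lu\n", jiffies_stall);
        return 0;
}
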
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 7680fc275036..48d640ca1a05 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -156,6 +156,10 @@ static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu)
156static void invoke_rcu_core(void); 156static void invoke_rcu_core(void);
157static void invoke_rcu_callbacks(struct rcu_state *rsp, struct rcu_data *rdp); 157static void invoke_rcu_callbacks(struct rcu_state *rsp, struct rcu_data *rdp);
158 158
159/* rcuc/rcub kthread realtime priority */
160static int kthread_prio = CONFIG_RCU_KTHREAD_PRIO;
161module_param(kthread_prio, int, 0644);
162
159/* 163/*
160 * Track the rcutorture test sequence number and the update version 164 * Track the rcutorture test sequence number and the update version
161 * number within a given test. The rcutorture_testseq is incremented 165 * number within a given test. The rcutorture_testseq is incremented
@@ -215,6 +219,9 @@ static DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks) = {
215#endif /* #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */ 219#endif /* #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */
216}; 220};
217 221
222DEFINE_PER_CPU_SHARED_ALIGNED(unsigned long, rcu_qs_ctr);
223EXPORT_PER_CPU_SYMBOL_GPL(rcu_qs_ctr);
224
218/* 225/*
219 * Let the RCU core know that this CPU has gone through the scheduler, 226 * Let the RCU core know that this CPU has gone through the scheduler,
220 * which is a quiescent state. This is called when the need for a 227 * which is a quiescent state. This is called when the need for a
@@ -284,6 +291,22 @@ void rcu_note_context_switch(void)
284} 291}
285EXPORT_SYMBOL_GPL(rcu_note_context_switch); 292EXPORT_SYMBOL_GPL(rcu_note_context_switch);
286 293
294/*
 295 * Register a quiescent state for all RCU flavors. If there is an
296 * emergency, invoke rcu_momentary_dyntick_idle() to do a heavy-weight
297 * dyntick-idle quiescent state visible to other CPUs (but only for those
 298 * RCU flavors in desperate need of a quiescent state, which will normally
299 * be none of them). Either way, do a lightweight quiescent state for
300 * all RCU flavors.
301 */
302void rcu_all_qs(void)
303{
304 if (unlikely(raw_cpu_read(rcu_sched_qs_mask)))
305 rcu_momentary_dyntick_idle();
306 this_cpu_inc(rcu_qs_ctr);
307}
308EXPORT_SYMBOL_GPL(rcu_all_qs);
309
287static long blimit = 10; /* Maximum callbacks per rcu_do_batch. */ 310static long blimit = 10; /* Maximum callbacks per rcu_do_batch. */
288static long qhimark = 10000; /* If this many pending, ignore blimit. */ 311static long qhimark = 10000; /* If this many pending, ignore blimit. */
289static long qlowmark = 100; /* Once only this many pending, use blimit. */ 312static long qlowmark = 100; /* Once only this many pending, use blimit. */
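
A user-space sketch of the rcu_all_qs() fast path added above (the per-CPU arrays stand in for rcu_qs_ctr and rcu_sched_qs_mask, and rcu_momentary_dyntick_idle() is reduced to a print): the common case is a single per-CPU counter increment, and the heavyweight quiescent state is only forced when some flavor has flagged this CPU as holding up a grace period.

#include <stdio.h>

#define NR_CPUS 4

static unsigned long qs_ctr[NR_CPUS];   /* stand-in for per-CPU rcu_qs_ctr */
static unsigned long qs_mask[NR_CPUS];  /* stand-in for rcu_sched_qs_mask */

static void momentary_dyntick_idle(int cpu)
{
        printf("cpu%d: heavy-weight quiescent state\n", cpu);
        qs_mask[cpu] = 0;
}

/* Sketch of rcu_all_qs(): cheap counter bump, rare expensive fallback. */
static void all_qs(int cpu)
{
        if (qs_mask[cpu])               /* unlikely() in the kernel */
                momentary_dyntick_idle(cpu);
        qs_ctr[cpu]++;                  /* this_cpu_inc(rcu_qs_ctr) */
}

int main(void)
{
        all_qs(0);                      /* fast path */
        qs_mask[1] = 1;                 /* pretend cpu1 is holding up a GP */
        all_qs(1);                      /* slow path once */
        all_qs(1);                      /* back to the fast path */
        printf("cpu0 ctr=%lu, cpu1 ctr=%lu\n", qs_ctr[0], qs_ctr[1]);
        return 0;
}
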
@@ -315,18 +338,54 @@ static void force_quiescent_state(struct rcu_state *rsp);
315static int rcu_pending(void); 338static int rcu_pending(void);
316 339
317/* 340/*
318 * Return the number of RCU-sched batches processed thus far for debug & stats. 341 * Return the number of RCU batches started thus far for debug & stats.
342 */
343unsigned long rcu_batches_started(void)
344{
345 return rcu_state_p->gpnum;
346}
347EXPORT_SYMBOL_GPL(rcu_batches_started);
348
349/*
350 * Return the number of RCU-sched batches started thus far for debug & stats.
351 */
352unsigned long rcu_batches_started_sched(void)
353{
354 return rcu_sched_state.gpnum;
355}
356EXPORT_SYMBOL_GPL(rcu_batches_started_sched);
357
358/*
359 * Return the number of RCU BH batches started thus far for debug & stats.
319 */ 360 */
320long rcu_batches_completed_sched(void) 361unsigned long rcu_batches_started_bh(void)
362{
363 return rcu_bh_state.gpnum;
364}
365EXPORT_SYMBOL_GPL(rcu_batches_started_bh);
366
367/*
368 * Return the number of RCU batches completed thus far for debug & stats.
369 */
370unsigned long rcu_batches_completed(void)
371{
372 return rcu_state_p->completed;
373}
374EXPORT_SYMBOL_GPL(rcu_batches_completed);
375
376/*
377 * Return the number of RCU-sched batches completed thus far for debug & stats.
378 */
379unsigned long rcu_batches_completed_sched(void)
321{ 380{
322 return rcu_sched_state.completed; 381 return rcu_sched_state.completed;
323} 382}
324EXPORT_SYMBOL_GPL(rcu_batches_completed_sched); 383EXPORT_SYMBOL_GPL(rcu_batches_completed_sched);
325 384
326/* 385/*
327 * Return the number of RCU BH batches processed thus far for debug & stats. 386 * Return the number of RCU BH batches completed thus far for debug & stats.
328 */ 387 */
329long rcu_batches_completed_bh(void) 388unsigned long rcu_batches_completed_bh(void)
330{ 389{
331 return rcu_bh_state.completed; 390 return rcu_bh_state.completed;
332} 391}
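
The started/completed pairs exported above simply read the ->gpnum and ->completed fields of the corresponding rcu_state: ->gpnum is bumped when a grace period starts and ->completed when it ends, so the started count is normally equal to the completed count or one ahead of it. A tiny illustration of that invariant (plain C; struct gp_state is a made-up stand-in, not struct rcu_state):

#include <stdio.h>

struct gp_state {
        unsigned long gpnum;            /* grace periods started */
        unsigned long completed;        /* grace periods completed */
};

static void gp_start(struct gp_state *s) { s->gpnum++; }
static void gp_end(struct gp_state *s)   { s->completed++; }

int main(void)
{
        struct gp_state s = { 0, 0 };

        gp_start(&s);
        printf("started=%lu completed=%lu (GP in flight)\n",
               s.gpnum, s.completed);
        gp_end(&s);
        printf("started=%lu completed=%lu (idle)\n",
               s.gpnum, s.completed);
        return 0;
}
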
@@ -759,39 +818,71 @@ void rcu_irq_enter(void)
759/** 818/**
760 * rcu_nmi_enter - inform RCU of entry to NMI context 819 * rcu_nmi_enter - inform RCU of entry to NMI context
761 * 820 *
762 * If the CPU was idle with dynamic ticks active, and there is no 821 * If the CPU was idle from RCU's viewpoint, update rdtp->dynticks and
763 * irq handler running, this updates rdtp->dynticks_nmi to let the 822 * rdtp->dynticks_nmi_nesting to let the RCU grace-period handling know
764 * RCU grace-period handling know that the CPU is active. 823 * that the CPU is active. This implementation permits nested NMIs, as
824 * long as the nesting level does not overflow an int. (You will probably
825 * run out of stack space first.)
765 */ 826 */
766void rcu_nmi_enter(void) 827void rcu_nmi_enter(void)
767{ 828{
768 struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks); 829 struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks);
830 int incby = 2;
769 831
770 if (rdtp->dynticks_nmi_nesting == 0 && 832 /* Complain about underflow. */
771 (atomic_read(&rdtp->dynticks) & 0x1)) 833 WARN_ON_ONCE(rdtp->dynticks_nmi_nesting < 0);
772 return; 834
773 rdtp->dynticks_nmi_nesting++; 835 /*
774 smp_mb__before_atomic(); /* Force delay from prior write. */ 836 * If idle from RCU viewpoint, atomically increment ->dynticks
775 atomic_inc(&rdtp->dynticks); 837 * to mark non-idle and increment ->dynticks_nmi_nesting by one.
776 /* CPUs seeing atomic_inc() must see later RCU read-side crit sects */ 838 * Otherwise, increment ->dynticks_nmi_nesting by two. This means
777 smp_mb__after_atomic(); /* See above. */ 839 * if ->dynticks_nmi_nesting is equal to one, we are guaranteed
778 WARN_ON_ONCE(!(atomic_read(&rdtp->dynticks) & 0x1)); 840 * to be in the outermost NMI handler that interrupted an RCU-idle
841 * period (observation due to Andy Lutomirski).
842 */
843 if (!(atomic_read(&rdtp->dynticks) & 0x1)) {
844 smp_mb__before_atomic(); /* Force delay from prior write. */
845 atomic_inc(&rdtp->dynticks);
846 /* atomic_inc() before later RCU read-side crit sects */
847 smp_mb__after_atomic(); /* See above. */
848 WARN_ON_ONCE(!(atomic_read(&rdtp->dynticks) & 0x1));
849 incby = 1;
850 }
851 rdtp->dynticks_nmi_nesting += incby;
852 barrier();
779} 853}
780 854
781/** 855/**
782 * rcu_nmi_exit - inform RCU of exit from NMI context 856 * rcu_nmi_exit - inform RCU of exit from NMI context
783 * 857 *
784 * If the CPU was idle with dynamic ticks active, and there is no 858 * If we are returning from the outermost NMI handler that interrupted an
785 * irq handler running, this updates rdtp->dynticks_nmi to let the 859 * RCU-idle period, update rdtp->dynticks and rdtp->dynticks_nmi_nesting
786 * RCU grace-period handling know that the CPU is no longer active. 860 * to let the RCU grace-period handling know that the CPU is back to
861 * being RCU-idle.
787 */ 862 */
788void rcu_nmi_exit(void) 863void rcu_nmi_exit(void)
789{ 864{
790 struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks); 865 struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks);
791 866
792 if (rdtp->dynticks_nmi_nesting == 0 || 867 /*
793 --rdtp->dynticks_nmi_nesting != 0) 868 * Check for ->dynticks_nmi_nesting underflow and bad ->dynticks.
869 * (We are exiting an NMI handler, so RCU better be paying attention
870 * to us!)
871 */
872 WARN_ON_ONCE(rdtp->dynticks_nmi_nesting <= 0);
873 WARN_ON_ONCE(!(atomic_read(&rdtp->dynticks) & 0x1));
874
875 /*
876 * If the nesting level is not 1, the CPU wasn't RCU-idle, so
877 * leave it in non-RCU-idle state.
878 */
879 if (rdtp->dynticks_nmi_nesting != 1) {
880 rdtp->dynticks_nmi_nesting -= 2;
794 return; 881 return;
882 }
883
884 /* This NMI interrupted an RCU-idle CPU, restore RCU-idleness. */
885 rdtp->dynticks_nmi_nesting = 0;
795 /* CPUs seeing atomic_inc() must see prior RCU read-side crit sects */ 886 /* CPUs seeing atomic_inc() must see prior RCU read-side crit sects */
796 smp_mb__before_atomic(); /* See above. */ 887 smp_mb__before_atomic(); /* See above. */
797 atomic_inc(&rdtp->dynticks); 888 atomic_inc(&rdtp->dynticks);
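
A stand-alone C sketch of the nesting scheme described in the new rcu_nmi_enter()/rcu_nmi_exit() comments (the two variables stand in for rdtp->dynticks, whose low bit means non-idle, and rdtp->dynticks_nmi_nesting; the atomics and memory barriers are omitted). Entering from RCU-idle adds one so that a nesting level of exactly one marks the outermost NMI over an idle period; every other entry adds two, and exit subtracts two unless it is that outermost handler.

#include <stdio.h>

static int dynticks;            /* low bit set => CPU non-idle to RCU */
static int nmi_nesting;         /* stand-in for ->dynticks_nmi_nesting */

static void nmi_enter(void)
{
        int incby = 2;

        if (!(dynticks & 0x1)) {        /* CPU was RCU-idle */
                dynticks++;             /* mark it non-idle */
                incby = 1;              /* nesting == 1 now means "outermost" */
        }
        nmi_nesting += incby;
}

static void nmi_exit(void)
{
        if (nmi_nesting != 1) {         /* nested, or didn't interrupt idle */
                nmi_nesting -= 2;
                return;
        }
        nmi_nesting = 0;                /* outermost NMI over idle: */
        dynticks++;                     /* back to RCU-idle (even value) */
}

int main(void)
{
        nmi_enter();    /* NMI interrupts an idle CPU: nesting = 1 */
        nmi_enter();    /* a second NMI nests inside it: nesting = 3 */
        nmi_exit();     /* nesting back to 1 */
        nmi_exit();     /* nesting 0, dynticks even again */
        printf("nesting=%d dynticks=%d (even => idle)\n",
               nmi_nesting, dynticks);
        return 0;
}
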
@@ -898,17 +989,14 @@ static int dyntick_save_progress_counter(struct rcu_data *rdp,
898 trace_rcu_fqs(rdp->rsp->name, rdp->gpnum, rdp->cpu, TPS("dti")); 989 trace_rcu_fqs(rdp->rsp->name, rdp->gpnum, rdp->cpu, TPS("dti"));
899 return 1; 990 return 1;
900 } else { 991 } else {
992 if (ULONG_CMP_LT(ACCESS_ONCE(rdp->gpnum) + ULONG_MAX / 4,
993 rdp->mynode->gpnum))
994 ACCESS_ONCE(rdp->gpwrap) = true;
901 return 0; 995 return 0;
902 } 996 }
903} 997}
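
The ->gpwrap flag set above guards against a CPU that has been idle for so long that its recorded ->gpnum can no longer be compared meaningfully with its rcu_node's current grace-period number. A small C illustration of the test (ULONG_CMP_LT() below mirrors the kernel macro; the quarter-range threshold leaves plenty of margin before a real wraparound):

#include <limits.h>
#include <stdio.h>

/* Wrap-safe "a < b", mirroring the kernel's ULONG_CMP_LT(). */
#define ULONG_CMP_LT(a, b)      (ULONG_MAX / 2 < (a) - (b))

int main(void)
{
        unsigned long rdp_gpnum = 1000;         /* CPU's last-seen GP number */
        unsigned long rnp_gpnum;                /* rcu_node's current GP number */
        int gpwrap;

        /* CPU kept up: the lag is small, so no wrap is suspected. */
        rnp_gpnum = rdp_gpnum + 5;
        gpwrap = ULONG_CMP_LT(rdp_gpnum + ULONG_MAX / 4, rnp_gpnum);
        printf("small lag: gpwrap=%d\n", gpwrap);       /* 0 */

        /* CPU slept through more than a quarter of the counter space. */
        rnp_gpnum = rdp_gpnum + ULONG_MAX / 4 + 2;
        gpwrap = ULONG_CMP_LT(rdp_gpnum + ULONG_MAX / 4, rnp_gpnum);
        printf("huge lag:  gpwrap=%d\n", gpwrap);       /* 1 */
        return 0;
}
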
904 998
905/* 999/*
906 * This function really isn't for public consumption, but RCU is special in
907 * that context switches can allow the state machine to make progress.
908 */
909extern void resched_cpu(int cpu);
910
911/*
912 * Return true if the specified CPU has passed through a quiescent 1000 * Return true if the specified CPU has passed through a quiescent
913 * state by virtue of being in or having passed through an dynticks 1001 * state by virtue of being in or having passed through an dynticks
914 * idle state since the last call to dyntick_save_progress_counter() 1002 * idle state since the last call to dyntick_save_progress_counter()
@@ -1011,6 +1099,22 @@ static void record_gp_stall_check_time(struct rcu_state *rsp)
1011 j1 = rcu_jiffies_till_stall_check(); 1099 j1 = rcu_jiffies_till_stall_check();
1012 ACCESS_ONCE(rsp->jiffies_stall) = j + j1; 1100 ACCESS_ONCE(rsp->jiffies_stall) = j + j1;
1013 rsp->jiffies_resched = j + j1 / 2; 1101 rsp->jiffies_resched = j + j1 / 2;
1102 rsp->n_force_qs_gpstart = ACCESS_ONCE(rsp->n_force_qs);
1103}
1104
1105/*
1106 * Complain about starvation of grace-period kthread.
1107 */
1108static void rcu_check_gp_kthread_starvation(struct rcu_state *rsp)
1109{
1110 unsigned long gpa;
1111 unsigned long j;
1112
1113 j = jiffies;
1114 gpa = ACCESS_ONCE(rsp->gp_activity);
1115 if (j - gpa > 2 * HZ)
1116 pr_err("%s kthread starved for %ld jiffies!\n",
1117 rsp->name, j - gpa);
1014} 1118}
1015 1119
1016/* 1120/*
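
A stand-alone sketch of the starvation check added above (jiffies, gp_activity and HZ here are plain variables and a constant standing in for the kernel's; the real code reads rsp->gp_activity, which the grace-period kthread refreshes at each step later in this patch): if the kthread has not touched its activity timestamp for more than two seconds' worth of jiffies, complain.

#include <stdio.h>

#define HZ 1000                         /* ticks per second; config-dependent */

static unsigned long jiffies;           /* fake clock */
static unsigned long gp_activity;       /* last time the GP kthread did work */

static void check_gp_kthread_starvation(const char *name)
{
        unsigned long j = jiffies;
        unsigned long gpa = gp_activity;

        if (j - gpa > 2 * HZ)
                printf("%s kthread starved for %lu jiffies!\n",
                       name, j - gpa);
}

int main(void)
{
        gp_activity = 100;              /* kthread last ran at jiffy 100 */
        jiffies = 100 + 2 * HZ;         /* exactly two seconds: still quiet */
        check_gp_kthread_starvation("rcu_sched");
        jiffies++;                      /* one more tick: now it complains */
        check_gp_kthread_starvation("rcu_sched");
        return 0;
}
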
@@ -1033,11 +1137,13 @@ static void rcu_dump_cpu_stacks(struct rcu_state *rsp)
1033 } 1137 }
1034} 1138}
1035 1139
1036static void print_other_cpu_stall(struct rcu_state *rsp) 1140static void print_other_cpu_stall(struct rcu_state *rsp, unsigned long gpnum)
1037{ 1141{
1038 int cpu; 1142 int cpu;
1039 long delta; 1143 long delta;
1040 unsigned long flags; 1144 unsigned long flags;
1145 unsigned long gpa;
1146 unsigned long j;
1041 int ndetected = 0; 1147 int ndetected = 0;
1042 struct rcu_node *rnp = rcu_get_root(rsp); 1148 struct rcu_node *rnp = rcu_get_root(rsp);
1043 long totqlen = 0; 1149 long totqlen = 0;
@@ -1075,30 +1181,34 @@ static void print_other_cpu_stall(struct rcu_state *rsp)
1075 raw_spin_unlock_irqrestore(&rnp->lock, flags); 1181 raw_spin_unlock_irqrestore(&rnp->lock, flags);
1076 } 1182 }
1077 1183
1078 /*
1079 * Now rat on any tasks that got kicked up to the root rcu_node
1080 * due to CPU offlining.
1081 */
1082 rnp = rcu_get_root(rsp);
1083 raw_spin_lock_irqsave(&rnp->lock, flags);
1084 ndetected += rcu_print_task_stall(rnp);
1085 raw_spin_unlock_irqrestore(&rnp->lock, flags);
1086
1087 print_cpu_stall_info_end(); 1184 print_cpu_stall_info_end();
1088 for_each_possible_cpu(cpu) 1185 for_each_possible_cpu(cpu)
1089 totqlen += per_cpu_ptr(rsp->rda, cpu)->qlen; 1186 totqlen += per_cpu_ptr(rsp->rda, cpu)->qlen;
1090 pr_cont("(detected by %d, t=%ld jiffies, g=%ld, c=%ld, q=%lu)\n", 1187 pr_cont("(detected by %d, t=%ld jiffies, g=%ld, c=%ld, q=%lu)\n",
1091 smp_processor_id(), (long)(jiffies - rsp->gp_start), 1188 smp_processor_id(), (long)(jiffies - rsp->gp_start),
1092 (long)rsp->gpnum, (long)rsp->completed, totqlen); 1189 (long)rsp->gpnum, (long)rsp->completed, totqlen);
1093 if (ndetected == 0) 1190 if (ndetected) {
1094 pr_err("INFO: Stall ended before state dump start\n");
1095 else
1096 rcu_dump_cpu_stacks(rsp); 1191 rcu_dump_cpu_stacks(rsp);
1192 } else {
1193 if (ACCESS_ONCE(rsp->gpnum) != gpnum ||
1194 ACCESS_ONCE(rsp->completed) == gpnum) {
1195 pr_err("INFO: Stall ended before state dump start\n");
1196 } else {
1197 j = jiffies;
1198 gpa = ACCESS_ONCE(rsp->gp_activity);
1199 pr_err("All QSes seen, last %s kthread activity %ld (%ld-%ld), jiffies_till_next_fqs=%ld\n",
1200 rsp->name, j - gpa, j, gpa,
1201 jiffies_till_next_fqs);
1202 /* In this case, the current CPU might be at fault. */
1203 sched_show_task(current);
1204 }
1205 }
1097 1206
1098 /* Complain about tasks blocking the grace period. */ 1207 /* Complain about tasks blocking the grace period. */
1099
1100 rcu_print_detail_task_stall(rsp); 1208 rcu_print_detail_task_stall(rsp);
1101 1209
1210 rcu_check_gp_kthread_starvation(rsp);
1211
1102 force_quiescent_state(rsp); /* Kick them all. */ 1212 force_quiescent_state(rsp); /* Kick them all. */
1103} 1213}
1104 1214
@@ -1123,6 +1233,9 @@ static void print_cpu_stall(struct rcu_state *rsp)
1123 pr_cont(" (t=%lu jiffies g=%ld c=%ld q=%lu)\n", 1233 pr_cont(" (t=%lu jiffies g=%ld c=%ld q=%lu)\n",
1124 jiffies - rsp->gp_start, 1234 jiffies - rsp->gp_start,
1125 (long)rsp->gpnum, (long)rsp->completed, totqlen); 1235 (long)rsp->gpnum, (long)rsp->completed, totqlen);
1236
1237 rcu_check_gp_kthread_starvation(rsp);
1238
1126 rcu_dump_cpu_stacks(rsp); 1239 rcu_dump_cpu_stacks(rsp);
1127 1240
1128 raw_spin_lock_irqsave(&rnp->lock, flags); 1241 raw_spin_lock_irqsave(&rnp->lock, flags);
@@ -1193,7 +1306,7 @@ static void check_cpu_stall(struct rcu_state *rsp, struct rcu_data *rdp)
1193 ULONG_CMP_GE(j, js + RCU_STALL_RAT_DELAY)) { 1306 ULONG_CMP_GE(j, js + RCU_STALL_RAT_DELAY)) {
1194 1307
1195 /* They had a few time units to dump stack, so complain. */ 1308 /* They had a few time units to dump stack, so complain. */
1196 print_other_cpu_stall(rsp); 1309 print_other_cpu_stall(rsp, gpnum);
1197 } 1310 }
1198} 1311}
1199 1312
@@ -1530,7 +1643,8 @@ static bool __note_gp_changes(struct rcu_state *rsp, struct rcu_node *rnp,
1530 bool ret; 1643 bool ret;
1531 1644
1532 /* Handle the ends of any preceding grace periods first. */ 1645 /* Handle the ends of any preceding grace periods first. */
1533 if (rdp->completed == rnp->completed) { 1646 if (rdp->completed == rnp->completed &&
1647 !unlikely(ACCESS_ONCE(rdp->gpwrap))) {
1534 1648
1535 /* No grace period end, so just accelerate recent callbacks. */ 1649 /* No grace period end, so just accelerate recent callbacks. */
1536 ret = rcu_accelerate_cbs(rsp, rnp, rdp); 1650 ret = rcu_accelerate_cbs(rsp, rnp, rdp);
@@ -1545,7 +1659,7 @@ static bool __note_gp_changes(struct rcu_state *rsp, struct rcu_node *rnp,
1545 trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("cpuend")); 1659 trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("cpuend"));
1546 } 1660 }
1547 1661
1548 if (rdp->gpnum != rnp->gpnum) { 1662 if (rdp->gpnum != rnp->gpnum || unlikely(ACCESS_ONCE(rdp->gpwrap))) {
1549 /* 1663 /*
1550 * If the current grace period is waiting for this CPU, 1664 * If the current grace period is waiting for this CPU,
1551 * set up to detect a quiescent state, otherwise don't 1665 * set up to detect a quiescent state, otherwise don't
@@ -1554,8 +1668,10 @@ static bool __note_gp_changes(struct rcu_state *rsp, struct rcu_node *rnp,
1554 rdp->gpnum = rnp->gpnum; 1668 rdp->gpnum = rnp->gpnum;
1555 trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("cpustart")); 1669 trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("cpustart"));
1556 rdp->passed_quiesce = 0; 1670 rdp->passed_quiesce = 0;
1671 rdp->rcu_qs_ctr_snap = __this_cpu_read(rcu_qs_ctr);
1557 rdp->qs_pending = !!(rnp->qsmask & rdp->grpmask); 1672 rdp->qs_pending = !!(rnp->qsmask & rdp->grpmask);
1558 zero_cpu_stall_ticks(rdp); 1673 zero_cpu_stall_ticks(rdp);
1674 ACCESS_ONCE(rdp->gpwrap) = false;
1559 } 1675 }
1560 return ret; 1676 return ret;
1561} 1677}
@@ -1569,7 +1685,8 @@ static void note_gp_changes(struct rcu_state *rsp, struct rcu_data *rdp)
1569 local_irq_save(flags); 1685 local_irq_save(flags);
1570 rnp = rdp->mynode; 1686 rnp = rdp->mynode;
1571 if ((rdp->gpnum == ACCESS_ONCE(rnp->gpnum) && 1687 if ((rdp->gpnum == ACCESS_ONCE(rnp->gpnum) &&
1572 rdp->completed == ACCESS_ONCE(rnp->completed)) || /* w/out lock. */ 1688 rdp->completed == ACCESS_ONCE(rnp->completed) &&
1689 !unlikely(ACCESS_ONCE(rdp->gpwrap))) || /* w/out lock. */
1573 !raw_spin_trylock(&rnp->lock)) { /* irqs already off, so later. */ 1690 !raw_spin_trylock(&rnp->lock)) { /* irqs already off, so later. */
1574 local_irq_restore(flags); 1691 local_irq_restore(flags);
1575 return; 1692 return;
@@ -1589,6 +1706,7 @@ static int rcu_gp_init(struct rcu_state *rsp)
1589 struct rcu_data *rdp; 1706 struct rcu_data *rdp;
1590 struct rcu_node *rnp = rcu_get_root(rsp); 1707 struct rcu_node *rnp = rcu_get_root(rsp);
1591 1708
1709 ACCESS_ONCE(rsp->gp_activity) = jiffies;
1592 rcu_bind_gp_kthread(); 1710 rcu_bind_gp_kthread();
1593 raw_spin_lock_irq(&rnp->lock); 1711 raw_spin_lock_irq(&rnp->lock);
1594 smp_mb__after_unlock_lock(); 1712 smp_mb__after_unlock_lock();
@@ -1649,6 +1767,7 @@ static int rcu_gp_init(struct rcu_state *rsp)
1649 rnp->grphi, rnp->qsmask); 1767 rnp->grphi, rnp->qsmask);
1650 raw_spin_unlock_irq(&rnp->lock); 1768 raw_spin_unlock_irq(&rnp->lock);
1651 cond_resched_rcu_qs(); 1769 cond_resched_rcu_qs();
1770 ACCESS_ONCE(rsp->gp_activity) = jiffies;
1652 } 1771 }
1653 1772
1654 mutex_unlock(&rsp->onoff_mutex); 1773 mutex_unlock(&rsp->onoff_mutex);
@@ -1665,6 +1784,7 @@ static int rcu_gp_fqs(struct rcu_state *rsp, int fqs_state_in)
1665 unsigned long maxj; 1784 unsigned long maxj;
1666 struct rcu_node *rnp = rcu_get_root(rsp); 1785 struct rcu_node *rnp = rcu_get_root(rsp);
1667 1786
1787 ACCESS_ONCE(rsp->gp_activity) = jiffies;
1668 rsp->n_force_qs++; 1788 rsp->n_force_qs++;
1669 if (fqs_state == RCU_SAVE_DYNTICK) { 1789 if (fqs_state == RCU_SAVE_DYNTICK) {
1670 /* Collect dyntick-idle snapshots. */ 1790 /* Collect dyntick-idle snapshots. */
@@ -1703,6 +1823,7 @@ static void rcu_gp_cleanup(struct rcu_state *rsp)
1703 struct rcu_data *rdp; 1823 struct rcu_data *rdp;
1704 struct rcu_node *rnp = rcu_get_root(rsp); 1824 struct rcu_node *rnp = rcu_get_root(rsp);
1705 1825
1826 ACCESS_ONCE(rsp->gp_activity) = jiffies;
1706 raw_spin_lock_irq(&rnp->lock); 1827 raw_spin_lock_irq(&rnp->lock);
1707 smp_mb__after_unlock_lock(); 1828 smp_mb__after_unlock_lock();
1708 gp_duration = jiffies - rsp->gp_start; 1829 gp_duration = jiffies - rsp->gp_start;
@@ -1739,6 +1860,7 @@ static void rcu_gp_cleanup(struct rcu_state *rsp)
1739 nocb += rcu_future_gp_cleanup(rsp, rnp); 1860 nocb += rcu_future_gp_cleanup(rsp, rnp);
1740 raw_spin_unlock_irq(&rnp->lock); 1861 raw_spin_unlock_irq(&rnp->lock);
1741 cond_resched_rcu_qs(); 1862 cond_resched_rcu_qs();
1863 ACCESS_ONCE(rsp->gp_activity) = jiffies;
1742 } 1864 }
1743 rnp = rcu_get_root(rsp); 1865 rnp = rcu_get_root(rsp);
1744 raw_spin_lock_irq(&rnp->lock); 1866 raw_spin_lock_irq(&rnp->lock);
@@ -1788,6 +1910,7 @@ static int __noreturn rcu_gp_kthread(void *arg)
1788 if (rcu_gp_init(rsp)) 1910 if (rcu_gp_init(rsp))
1789 break; 1911 break;
1790 cond_resched_rcu_qs(); 1912 cond_resched_rcu_qs();
1913 ACCESS_ONCE(rsp->gp_activity) = jiffies;
1791 WARN_ON(signal_pending(current)); 1914 WARN_ON(signal_pending(current));
1792 trace_rcu_grace_period(rsp->name, 1915 trace_rcu_grace_period(rsp->name,
1793 ACCESS_ONCE(rsp->gpnum), 1916 ACCESS_ONCE(rsp->gpnum),
@@ -1831,9 +1954,11 @@ static int __noreturn rcu_gp_kthread(void *arg)
1831 ACCESS_ONCE(rsp->gpnum), 1954 ACCESS_ONCE(rsp->gpnum),
1832 TPS("fqsend")); 1955 TPS("fqsend"));
1833 cond_resched_rcu_qs(); 1956 cond_resched_rcu_qs();
1957 ACCESS_ONCE(rsp->gp_activity) = jiffies;
1834 } else { 1958 } else {
1835 /* Deal with stray signal. */ 1959 /* Deal with stray signal. */
1836 cond_resched_rcu_qs(); 1960 cond_resched_rcu_qs();
1961 ACCESS_ONCE(rsp->gp_activity) = jiffies;
1837 WARN_ON(signal_pending(current)); 1962 WARN_ON(signal_pending(current));
1838 trace_rcu_grace_period(rsp->name, 1963 trace_rcu_grace_period(rsp->name,
1839 ACCESS_ONCE(rsp->gpnum), 1964 ACCESS_ONCE(rsp->gpnum),
@@ -2010,8 +2135,10 @@ rcu_report_qs_rdp(int cpu, struct rcu_state *rsp, struct rcu_data *rdp)
2010 rnp = rdp->mynode; 2135 rnp = rdp->mynode;
2011 raw_spin_lock_irqsave(&rnp->lock, flags); 2136 raw_spin_lock_irqsave(&rnp->lock, flags);
2012 smp_mb__after_unlock_lock(); 2137 smp_mb__after_unlock_lock();
2013 if (rdp->passed_quiesce == 0 || rdp->gpnum != rnp->gpnum || 2138 if ((rdp->passed_quiesce == 0 &&
2014 rnp->completed == rnp->gpnum) { 2139 rdp->rcu_qs_ctr_snap == __this_cpu_read(rcu_qs_ctr)) ||
2140 rdp->gpnum != rnp->gpnum || rnp->completed == rnp->gpnum ||
2141 rdp->gpwrap) {
2015 2142
2016 /* 2143 /*
2017 * The grace period in which this quiescent state was 2144 * The grace period in which this quiescent state was
@@ -2020,6 +2147,7 @@ rcu_report_qs_rdp(int cpu, struct rcu_state *rsp, struct rcu_data *rdp)
2020 * within the current grace period. 2147 * within the current grace period.
2021 */ 2148 */
2022 rdp->passed_quiesce = 0; /* need qs for new gp. */ 2149 rdp->passed_quiesce = 0; /* need qs for new gp. */
2150 rdp->rcu_qs_ctr_snap = __this_cpu_read(rcu_qs_ctr);
2023 raw_spin_unlock_irqrestore(&rnp->lock, flags); 2151 raw_spin_unlock_irqrestore(&rnp->lock, flags);
2024 return; 2152 return;
2025 } 2153 }
@@ -2064,7 +2192,8 @@ rcu_check_quiescent_state(struct rcu_state *rsp, struct rcu_data *rdp)
2064 * Was there a quiescent state since the beginning of the grace 2192 * Was there a quiescent state since the beginning of the grace
2065 * period? If no, then exit and wait for the next call. 2193 * period? If no, then exit and wait for the next call.
2066 */ 2194 */
2067 if (!rdp->passed_quiesce) 2195 if (!rdp->passed_quiesce &&
2196 rdp->rcu_qs_ctr_snap == __this_cpu_read(rcu_qs_ctr))
2068 return; 2197 return;
2069 2198
2070 /* 2199 /*
@@ -2195,6 +2324,46 @@ static void rcu_cleanup_dying_cpu(struct rcu_state *rsp)
2195} 2324}
2196 2325
2197/* 2326/*
2327 * All CPUs for the specified rcu_node structure have gone offline,
2328 * and all tasks that were preempted within an RCU read-side critical
2329 * section while running on one of those CPUs have since exited their RCU
2330 * read-side critical section. Some other CPU is reporting this fact with
2331 * the specified rcu_node structure's ->lock held and interrupts disabled.
2332 * This function therefore goes up the tree of rcu_node structures,
2333 * clearing the corresponding bits in the ->qsmaskinit fields. Note that
2334 * the leaf rcu_node structure's ->qsmaskinit field has already been
 2335 * updated.
2336 *
2337 * This function does check that the specified rcu_node structure has
2338 * all CPUs offline and no blocked tasks, so it is OK to invoke it
2339 * prematurely. That said, invoking it after the fact will cost you
2340 * a needless lock acquisition. So once it has done its work, don't
2341 * invoke it again.
2342 */
2343static void rcu_cleanup_dead_rnp(struct rcu_node *rnp_leaf)
2344{
2345 long mask;
2346 struct rcu_node *rnp = rnp_leaf;
2347
2348 if (rnp->qsmaskinit || rcu_preempt_has_tasks(rnp))
2349 return;
2350 for (;;) {
2351 mask = rnp->grpmask;
2352 rnp = rnp->parent;
2353 if (!rnp)
2354 break;
2355 raw_spin_lock(&rnp->lock); /* irqs already disabled. */
2356 smp_mb__after_unlock_lock(); /* GP memory ordering. */
2357 rnp->qsmaskinit &= ~mask;
2358 if (rnp->qsmaskinit) {
2359 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
2360 return;
2361 }
2362 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
2363 }
2364}
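
A user-space sketch of the propagation done by rcu_cleanup_dead_rnp() above, with the locking, memory ordering and rcu_preempt_has_tasks() check stripped out (struct node is a made-up two-level stand-in for the rcu_node tree): once a leaf has nothing left to report, its bit is cleared in each ancestor's ->qsmaskinit, stopping at the first ancestor that still has other populated subtrees.

#include <stdio.h>

struct node {
        unsigned long qsmaskinit;       /* children/CPUs still accounted for */
        unsigned long grpmask;          /* this node's bit in its parent */
        struct node *parent;
};

static struct node root  = { .qsmaskinit = 0x3 };
static struct node leaf0 = { .grpmask = 0x1, .parent = &root };
static struct node leaf1 = { .grpmask = 0x2, .parent = &root };

static void cleanup_dead_rnp(struct node *rnp_leaf)
{
        struct node *rnp = rnp_leaf;
        unsigned long mask;

        if (rnp->qsmaskinit)            /* leaf not empty yet: nothing to do */
                return;
        for (;;) {
                mask = rnp->grpmask;
                rnp = rnp->parent;
                if (!rnp)
                        break;          /* cleared all the way to the root */
                rnp->qsmaskinit &= ~mask;
                if (rnp->qsmaskinit)    /* other subtrees still populated */
                        return;
        }
}

int main(void)
{
        (void)leaf1;                    /* still has CPUs; bit 0x2 stays set */
        leaf0.qsmaskinit = 0;           /* last CPU under leaf0 went offline */
        cleanup_dead_rnp(&leaf0);
        printf("root qsmaskinit is now 0x%lx\n", root.qsmaskinit);      /* 0x2 */
        return 0;
}
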
2365
2366/*
2198 * The CPU has been completely removed, and some other CPU is reporting 2367 * The CPU has been completely removed, and some other CPU is reporting
2199 * this fact from process context. Do the remainder of the cleanup, 2368 * this fact from process context. Do the remainder of the cleanup,
2200 * including orphaning the outgoing CPU's RCU callbacks, and also 2369 * including orphaning the outgoing CPU's RCU callbacks, and also
@@ -2204,8 +2373,6 @@ static void rcu_cleanup_dying_cpu(struct rcu_state *rsp)
2204static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp) 2373static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp)
2205{ 2374{
2206 unsigned long flags; 2375 unsigned long flags;
2207 unsigned long mask;
2208 int need_report = 0;
2209 struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); 2376 struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu);
2210 struct rcu_node *rnp = rdp->mynode; /* Outgoing CPU's rdp & rnp. */ 2377 struct rcu_node *rnp = rdp->mynode; /* Outgoing CPU's rdp & rnp. */
2211 2378
@@ -2219,40 +2386,15 @@ static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp)
2219 /* Orphan the dead CPU's callbacks, and adopt them if appropriate. */ 2386 /* Orphan the dead CPU's callbacks, and adopt them if appropriate. */
2220 rcu_send_cbs_to_orphanage(cpu, rsp, rnp, rdp); 2387 rcu_send_cbs_to_orphanage(cpu, rsp, rnp, rdp);
2221 rcu_adopt_orphan_cbs(rsp, flags); 2388 rcu_adopt_orphan_cbs(rsp, flags);
2389 raw_spin_unlock_irqrestore(&rsp->orphan_lock, flags);
2222 2390
2223 /* Remove the outgoing CPU from the masks in the rcu_node hierarchy. */ 2391 /* Remove outgoing CPU from mask in the leaf rcu_node structure. */
2224 mask = rdp->grpmask; /* rnp->grplo is constant. */ 2392 raw_spin_lock_irqsave(&rnp->lock, flags);
2225 do { 2393 smp_mb__after_unlock_lock(); /* Enforce GP memory-order guarantee. */
2226 raw_spin_lock(&rnp->lock); /* irqs already disabled. */ 2394 rnp->qsmaskinit &= ~rdp->grpmask;
2227 smp_mb__after_unlock_lock(); 2395 if (rnp->qsmaskinit == 0 && !rcu_preempt_has_tasks(rnp))
2228 rnp->qsmaskinit &= ~mask; 2396 rcu_cleanup_dead_rnp(rnp);
2229 if (rnp->qsmaskinit != 0) { 2397 rcu_report_qs_rnp(rdp->grpmask, rsp, rnp, flags); /* Rlses rnp->lock. */
2230 if (rnp != rdp->mynode)
2231 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
2232 break;
2233 }
2234 if (rnp == rdp->mynode)
2235 need_report = rcu_preempt_offline_tasks(rsp, rnp, rdp);
2236 else
2237 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
2238 mask = rnp->grpmask;
2239 rnp = rnp->parent;
2240 } while (rnp != NULL);
2241
2242 /*
2243 * We still hold the leaf rcu_node structure lock here, and
2244 * irqs are still disabled. The reason for this subterfuge is
2245 * because invoking rcu_report_unblock_qs_rnp() with ->orphan_lock
2246 * held leads to deadlock.
2247 */
2248 raw_spin_unlock(&rsp->orphan_lock); /* irqs remain disabled. */
2249 rnp = rdp->mynode;
2250 if (need_report & RCU_OFL_TASKS_NORM_GP)
2251 rcu_report_unblock_qs_rnp(rnp, flags);
2252 else
2253 raw_spin_unlock_irqrestore(&rnp->lock, flags);
2254 if (need_report & RCU_OFL_TASKS_EXP_GP)
2255 rcu_report_exp_rnp(rsp, rnp, true);
2256 WARN_ONCE(rdp->qlen != 0 || rdp->nxtlist != NULL, 2398 WARN_ONCE(rdp->qlen != 0 || rdp->nxtlist != NULL,
2257 "rcu_cleanup_dead_cpu: Callbacks on offline CPU %d: qlen=%lu, nxtlist=%p\n", 2399 "rcu_cleanup_dead_cpu: Callbacks on offline CPU %d: qlen=%lu, nxtlist=%p\n",
2258 cpu, rdp->qlen, rdp->nxtlist); 2400 cpu, rdp->qlen, rdp->nxtlist);
@@ -2268,6 +2410,10 @@ static void rcu_cleanup_dying_cpu(struct rcu_state *rsp)
2268{ 2410{
2269} 2411}
2270 2412
2413static void __maybe_unused rcu_cleanup_dead_rnp(struct rcu_node *rnp_leaf)
2414{
2415}
2416
2271static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp) 2417static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp)
2272{ 2418{
2273} 2419}
@@ -2464,12 +2610,6 @@ static void force_qs_rnp(struct rcu_state *rsp,
2464 } 2610 }
2465 raw_spin_unlock_irqrestore(&rnp->lock, flags); 2611 raw_spin_unlock_irqrestore(&rnp->lock, flags);
2466 } 2612 }
2467 rnp = rcu_get_root(rsp);
2468 if (rnp->qsmask == 0) {
2469 raw_spin_lock_irqsave(&rnp->lock, flags);
2470 smp_mb__after_unlock_lock();
2471 rcu_initiate_boost(rnp, flags); /* releases rnp->lock. */
2472 }
2473} 2613}
2474 2614
2475/* 2615/*
@@ -2569,7 +2709,7 @@ static void rcu_process_callbacks(struct softirq_action *unused)
2569 * Schedule RCU callback invocation. If the specified type of RCU 2709 * Schedule RCU callback invocation. If the specified type of RCU
2570 * does not support RCU priority boosting, just do a direct call, 2710 * does not support RCU priority boosting, just do a direct call,
2571 * otherwise wake up the per-CPU kernel kthread. Note that because we 2711 * otherwise wake up the per-CPU kernel kthread. Note that because we
2572 * are running on the current CPU with interrupts disabled, the 2712 * are running on the current CPU with softirqs disabled, the
2573 * rcu_cpu_kthread_task cannot disappear out from under us. 2713 * rcu_cpu_kthread_task cannot disappear out from under us.
2574 */ 2714 */
2575static void invoke_rcu_callbacks(struct rcu_state *rsp, struct rcu_data *rdp) 2715static void invoke_rcu_callbacks(struct rcu_state *rsp, struct rcu_data *rdp)
@@ -3109,9 +3249,12 @@ static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp)
3109 3249
3110 /* Is the RCU core waiting for a quiescent state from this CPU? */ 3250 /* Is the RCU core waiting for a quiescent state from this CPU? */
3111 if (rcu_scheduler_fully_active && 3251 if (rcu_scheduler_fully_active &&
3112 rdp->qs_pending && !rdp->passed_quiesce) { 3252 rdp->qs_pending && !rdp->passed_quiesce &&
3253 rdp->rcu_qs_ctr_snap == __this_cpu_read(rcu_qs_ctr)) {
3113 rdp->n_rp_qs_pending++; 3254 rdp->n_rp_qs_pending++;
3114 } else if (rdp->qs_pending && rdp->passed_quiesce) { 3255 } else if (rdp->qs_pending &&
3256 (rdp->passed_quiesce ||
3257 rdp->rcu_qs_ctr_snap != __this_cpu_read(rcu_qs_ctr))) {
3115 rdp->n_rp_report_qs++; 3258 rdp->n_rp_report_qs++;
3116 return 1; 3259 return 1;
3117 } 3260 }
@@ -3135,7 +3278,8 @@ static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp)
3135 } 3278 }
3136 3279
3137 /* Has a new RCU grace period started? */ 3280 /* Has a new RCU grace period started? */
3138 if (ACCESS_ONCE(rnp->gpnum) != rdp->gpnum) { /* outside lock */ 3281 if (ACCESS_ONCE(rnp->gpnum) != rdp->gpnum ||
3282 unlikely(ACCESS_ONCE(rdp->gpwrap))) { /* outside lock */
3139 rdp->n_rp_gp_started++; 3283 rdp->n_rp_gp_started++;
3140 return 1; 3284 return 1;
3141 } 3285 }
@@ -3318,6 +3462,7 @@ static void _rcu_barrier(struct rcu_state *rsp)
3318 } else { 3462 } else {
3319 _rcu_barrier_trace(rsp, "OnlineNoCB", cpu, 3463 _rcu_barrier_trace(rsp, "OnlineNoCB", cpu,
3320 rsp->n_barrier_done); 3464 rsp->n_barrier_done);
3465 smp_mb__before_atomic();
3321 atomic_inc(&rsp->barrier_cpu_count); 3466 atomic_inc(&rsp->barrier_cpu_count);
3322 __call_rcu(&rdp->barrier_head, 3467 __call_rcu(&rdp->barrier_head,
3323 rcu_barrier_callback, rsp, cpu, 0); 3468 rcu_barrier_callback, rsp, cpu, 0);
@@ -3385,9 +3530,6 @@ rcu_boot_init_percpu_data(int cpu, struct rcu_state *rsp)
3385 /* Set up local state, ensuring consistent view of global state. */ 3530 /* Set up local state, ensuring consistent view of global state. */
3386 raw_spin_lock_irqsave(&rnp->lock, flags); 3531 raw_spin_lock_irqsave(&rnp->lock, flags);
3387 rdp->grpmask = 1UL << (cpu - rdp->mynode->grplo); 3532 rdp->grpmask = 1UL << (cpu - rdp->mynode->grplo);
3388 init_callback_list(rdp);
3389 rdp->qlen_lazy = 0;
3390 ACCESS_ONCE(rdp->qlen) = 0;
3391 rdp->dynticks = &per_cpu(rcu_dynticks, cpu); 3533 rdp->dynticks = &per_cpu(rcu_dynticks, cpu);
3392 WARN_ON_ONCE(rdp->dynticks->dynticks_nesting != DYNTICK_TASK_EXIT_IDLE); 3534 WARN_ON_ONCE(rdp->dynticks->dynticks_nesting != DYNTICK_TASK_EXIT_IDLE);
3393 WARN_ON_ONCE(atomic_read(&rdp->dynticks->dynticks) != 1); 3535 WARN_ON_ONCE(atomic_read(&rdp->dynticks->dynticks) != 1);
@@ -3444,6 +3586,7 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp)
3444 rdp->gpnum = rnp->completed; 3586 rdp->gpnum = rnp->completed;
3445 rdp->completed = rnp->completed; 3587 rdp->completed = rnp->completed;
3446 rdp->passed_quiesce = 0; 3588 rdp->passed_quiesce = 0;
3589 rdp->rcu_qs_ctr_snap = __this_cpu_read(rcu_qs_ctr);
3447 rdp->qs_pending = 0; 3590 rdp->qs_pending = 0;
3448 trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("cpuonl")); 3591 trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("cpuonl"));
3449 } 3592 }
@@ -3535,17 +3678,35 @@ static int rcu_pm_notify(struct notifier_block *self,
3535static int __init rcu_spawn_gp_kthread(void) 3678static int __init rcu_spawn_gp_kthread(void)
3536{ 3679{
3537 unsigned long flags; 3680 unsigned long flags;
3681 int kthread_prio_in = kthread_prio;
3538 struct rcu_node *rnp; 3682 struct rcu_node *rnp;
3539 struct rcu_state *rsp; 3683 struct rcu_state *rsp;
3684 struct sched_param sp;
3540 struct task_struct *t; 3685 struct task_struct *t;
3541 3686
3687 /* Force priority into range. */
3688 if (IS_ENABLED(CONFIG_RCU_BOOST) && kthread_prio < 1)
3689 kthread_prio = 1;
3690 else if (kthread_prio < 0)
3691 kthread_prio = 0;
3692 else if (kthread_prio > 99)
3693 kthread_prio = 99;
3694 if (kthread_prio != kthread_prio_in)
3695 pr_alert("rcu_spawn_gp_kthread(): Limited prio to %d from %d\n",
3696 kthread_prio, kthread_prio_in);
3697
3542 rcu_scheduler_fully_active = 1; 3698 rcu_scheduler_fully_active = 1;
3543 for_each_rcu_flavor(rsp) { 3699 for_each_rcu_flavor(rsp) {
3544 t = kthread_run(rcu_gp_kthread, rsp, "%s", rsp->name); 3700 t = kthread_create(rcu_gp_kthread, rsp, "%s", rsp->name);
3545 BUG_ON(IS_ERR(t)); 3701 BUG_ON(IS_ERR(t));
3546 rnp = rcu_get_root(rsp); 3702 rnp = rcu_get_root(rsp);
3547 raw_spin_lock_irqsave(&rnp->lock, flags); 3703 raw_spin_lock_irqsave(&rnp->lock, flags);
3548 rsp->gp_kthread = t; 3704 rsp->gp_kthread = t;
3705 if (kthread_prio) {
3706 sp.sched_priority = kthread_prio;
3707 sched_setscheduler_nocheck(t, SCHED_FIFO, &sp);
3708 }
3709 wake_up_process(t);
3549 raw_spin_unlock_irqrestore(&rnp->lock, flags); 3710 raw_spin_unlock_irqrestore(&rnp->lock, flags);
3550 } 3711 }
3551 rcu_spawn_nocb_kthreads(); 3712 rcu_spawn_nocb_kthreads();
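
Two things happen in the hunk above: the new kthread_prio module parameter is clamped into a sane SCHED_FIFO range, and the grace-period kthreads are now created with kthread_create() and woken only after their priority has been set, instead of being launched immediately by kthread_run(). A small sketch of just the clamping arithmetic (plain C; the function name is made up):

#include <stdio.h>

/* Clamp a requested priority the way rcu_spawn_gp_kthread() does. */
static int clamp_kthread_prio(int prio, int rcu_boost)
{
        if (rcu_boost && prio < 1)
                return 1;       /* boosting needs a real-time priority */
        if (prio < 0)
                return 0;       /* 0 means "leave at normal priority" */
        if (prio > 99)
                return 99;      /* SCHED_FIFO priorities top out at 99 */
        return prio;
}

int main(void)
{
        printf("%d %d %d %d\n",
               clamp_kthread_prio(-5, 0),       /* -> 0  */
               clamp_kthread_prio(0, 1),        /* -> 1  */
               clamp_kthread_prio(42, 0),       /* -> 42 */
               clamp_kthread_prio(150, 0));     /* -> 99 */
        return 0;
}

Creating the kthread stopped and waking it only after sched_setscheduler_nocheck() closes the window during which it could briefly run at the default priority.
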
diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h
index 8e7b1843896e..119de399eb2f 100644
--- a/kernel/rcu/tree.h
+++ b/kernel/rcu/tree.h
@@ -27,7 +27,6 @@
27#include <linux/threads.h> 27#include <linux/threads.h>
28#include <linux/cpumask.h> 28#include <linux/cpumask.h>
29#include <linux/seqlock.h> 29#include <linux/seqlock.h>
30#include <linux/irq_work.h>
31 30
32/* 31/*
33 * Define shape of hierarchy based on NR_CPUS, CONFIG_RCU_FANOUT, and 32 * Define shape of hierarchy based on NR_CPUS, CONFIG_RCU_FANOUT, and
@@ -172,11 +171,6 @@ struct rcu_node {
172 /* queued on this rcu_node structure that */ 171 /* queued on this rcu_node structure that */
173 /* are blocking the current grace period, */ 172 /* are blocking the current grace period, */
174 /* there can be no such task. */ 173 /* there can be no such task. */
175 struct completion boost_completion;
176 /* Used to ensure that the rt_mutex used */
177 /* to carry out the boosting is fully */
178 /* released with no future boostee accesses */
179 /* before that rt_mutex is re-initialized. */
180 struct rt_mutex boost_mtx; 174 struct rt_mutex boost_mtx;
181 /* Used only for the priority-boosting */ 175 /* Used only for the priority-boosting */
182 /* side effect, not as a lock. */ 176 /* side effect, not as a lock. */
@@ -257,9 +251,12 @@ struct rcu_data {
257 /* in order to detect GP end. */ 251 /* in order to detect GP end. */
258 unsigned long gpnum; /* Highest gp number that this CPU */ 252 unsigned long gpnum; /* Highest gp number that this CPU */
259 /* is aware of having started. */ 253 /* is aware of having started. */
254 unsigned long rcu_qs_ctr_snap;/* Snapshot of rcu_qs_ctr to check */
255 /* for rcu_all_qs() invocations. */
260 bool passed_quiesce; /* User-mode/idle loop etc. */ 256 bool passed_quiesce; /* User-mode/idle loop etc. */
261 bool qs_pending; /* Core waits for quiesc state. */ 257 bool qs_pending; /* Core waits for quiesc state. */
262 bool beenonline; /* CPU online at least once. */ 258 bool beenonline; /* CPU online at least once. */
259 bool gpwrap; /* Possible gpnum/completed wrap. */
263 struct rcu_node *mynode; /* This CPU's leaf of hierarchy */ 260 struct rcu_node *mynode; /* This CPU's leaf of hierarchy */
264 unsigned long grpmask; /* Mask to apply to leaf qsmask. */ 261 unsigned long grpmask; /* Mask to apply to leaf qsmask. */
265#ifdef CONFIG_RCU_CPU_STALL_INFO 262#ifdef CONFIG_RCU_CPU_STALL_INFO
@@ -340,14 +337,10 @@ struct rcu_data {
340#ifdef CONFIG_RCU_NOCB_CPU 337#ifdef CONFIG_RCU_NOCB_CPU
341 struct rcu_head *nocb_head; /* CBs waiting for kthread. */ 338 struct rcu_head *nocb_head; /* CBs waiting for kthread. */
342 struct rcu_head **nocb_tail; 339 struct rcu_head **nocb_tail;
343 atomic_long_t nocb_q_count; /* # CBs waiting for kthread */ 340 atomic_long_t nocb_q_count; /* # CBs waiting for nocb */
344 atomic_long_t nocb_q_count_lazy; /* (approximate). */ 341 atomic_long_t nocb_q_count_lazy; /* invocation (all stages). */
345 struct rcu_head *nocb_follower_head; /* CBs ready to invoke. */ 342 struct rcu_head *nocb_follower_head; /* CBs ready to invoke. */
346 struct rcu_head **nocb_follower_tail; 343 struct rcu_head **nocb_follower_tail;
347 atomic_long_t nocb_follower_count; /* # CBs ready to invoke. */
348 atomic_long_t nocb_follower_count_lazy; /* (approximate). */
349 int nocb_p_count; /* # CBs being invoked by kthread */
350 int nocb_p_count_lazy; /* (approximate). */
351 wait_queue_head_t nocb_wq; /* For nocb kthreads to sleep on. */ 344 wait_queue_head_t nocb_wq; /* For nocb kthreads to sleep on. */
352 struct task_struct *nocb_kthread; 345 struct task_struct *nocb_kthread;
353 int nocb_defer_wakeup; /* Defer wakeup of nocb_kthread. */ 346 int nocb_defer_wakeup; /* Defer wakeup of nocb_kthread. */
@@ -356,8 +349,6 @@ struct rcu_data {
356 struct rcu_head *nocb_gp_head ____cacheline_internodealigned_in_smp; 349 struct rcu_head *nocb_gp_head ____cacheline_internodealigned_in_smp;
357 /* CBs waiting for GP. */ 350 /* CBs waiting for GP. */
358 struct rcu_head **nocb_gp_tail; 351 struct rcu_head **nocb_gp_tail;
359 long nocb_gp_count;
360 long nocb_gp_count_lazy;
361 bool nocb_leader_sleep; /* Is the nocb leader thread asleep? */ 352 bool nocb_leader_sleep; /* Is the nocb leader thread asleep? */
362 struct rcu_data *nocb_next_follower; 353 struct rcu_data *nocb_next_follower;
363 /* Next follower in wakeup chain. */ 354 /* Next follower in wakeup chain. */
@@ -488,10 +479,14 @@ struct rcu_state {
488 /* due to no GP active. */ 479 /* due to no GP active. */
489 unsigned long gp_start; /* Time at which GP started, */ 480 unsigned long gp_start; /* Time at which GP started, */
490 /* but in jiffies. */ 481 /* but in jiffies. */
482 unsigned long gp_activity; /* Time of last GP kthread */
483 /* activity in jiffies. */
491 unsigned long jiffies_stall; /* Time at which to check */ 484 unsigned long jiffies_stall; /* Time at which to check */
492 /* for CPU stalls. */ 485 /* for CPU stalls. */
493 unsigned long jiffies_resched; /* Time at which to resched */ 486 unsigned long jiffies_resched; /* Time at which to resched */
494 /* a reluctant CPU. */ 487 /* a reluctant CPU. */
488 unsigned long n_force_qs_gpstart; /* Snapshot of n_force_qs at */
489 /* GP start. */
495 unsigned long gp_max; /* Maximum GP duration in */ 490 unsigned long gp_max; /* Maximum GP duration in */
496 /* jiffies. */ 491 /* jiffies. */
497 const char *name; /* Name of structure. */ 492 const char *name; /* Name of structure. */
@@ -514,13 +509,6 @@ extern struct list_head rcu_struct_flavors;
514#define for_each_rcu_flavor(rsp) \ 509#define for_each_rcu_flavor(rsp) \
515 list_for_each_entry((rsp), &rcu_struct_flavors, flavors) 510 list_for_each_entry((rsp), &rcu_struct_flavors, flavors)
516 511
517/* Return values for rcu_preempt_offline_tasks(). */
518
519#define RCU_OFL_TASKS_NORM_GP 0x1 /* Tasks blocking normal */
520 /* GP were moved to root. */
521#define RCU_OFL_TASKS_EXP_GP 0x2 /* Tasks blocking expedited */
522 /* GP were moved to root. */
523
524/* 512/*
525 * RCU implementation internal declarations: 513 * RCU implementation internal declarations:
526 */ 514 */
@@ -546,27 +534,16 @@ DECLARE_PER_CPU(char, rcu_cpu_has_work);
546 534
547/* Forward declarations for rcutree_plugin.h */ 535/* Forward declarations for rcutree_plugin.h */
548static void rcu_bootup_announce(void); 536static void rcu_bootup_announce(void);
549long rcu_batches_completed(void);
550static void rcu_preempt_note_context_switch(void); 537static void rcu_preempt_note_context_switch(void);
551static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp); 538static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp);
552#ifdef CONFIG_HOTPLUG_CPU 539#ifdef CONFIG_HOTPLUG_CPU
553static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp, 540static bool rcu_preempt_has_tasks(struct rcu_node *rnp);
554 unsigned long flags);
555#endif /* #ifdef CONFIG_HOTPLUG_CPU */ 541#endif /* #ifdef CONFIG_HOTPLUG_CPU */
556static void rcu_print_detail_task_stall(struct rcu_state *rsp); 542static void rcu_print_detail_task_stall(struct rcu_state *rsp);
557static int rcu_print_task_stall(struct rcu_node *rnp); 543static int rcu_print_task_stall(struct rcu_node *rnp);
558static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp); 544static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp);
559#ifdef CONFIG_HOTPLUG_CPU
560static int rcu_preempt_offline_tasks(struct rcu_state *rsp,
561 struct rcu_node *rnp,
562 struct rcu_data *rdp);
563#endif /* #ifdef CONFIG_HOTPLUG_CPU */
564static void rcu_preempt_check_callbacks(void); 545static void rcu_preempt_check_callbacks(void);
565void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu)); 546void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu));
566#if defined(CONFIG_HOTPLUG_CPU) || defined(CONFIG_PREEMPT_RCU)
567static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp,
568 bool wake);
569#endif /* #if defined(CONFIG_HOTPLUG_CPU) || defined(CONFIG_PREEMPT_RCU) */
570static void __init __rcu_init_preempt(void); 547static void __init __rcu_init_preempt(void);
571static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags); 548static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags);
572static void rcu_preempt_boost_start_gp(struct rcu_node *rnp); 549static void rcu_preempt_boost_start_gp(struct rcu_node *rnp);
@@ -622,24 +599,15 @@ static void rcu_dynticks_task_exit(void);
622#endif /* #ifndef RCU_TREE_NONCORE */ 599#endif /* #ifndef RCU_TREE_NONCORE */
623 600
624#ifdef CONFIG_RCU_TRACE 601#ifdef CONFIG_RCU_TRACE
625#ifdef CONFIG_RCU_NOCB_CPU 602/* Read out queue lengths for tracing. */
626/* Sum up queue lengths for tracing. */
627static inline void rcu_nocb_q_lengths(struct rcu_data *rdp, long *ql, long *qll) 603static inline void rcu_nocb_q_lengths(struct rcu_data *rdp, long *ql, long *qll)
628{ 604{
629 *ql = atomic_long_read(&rdp->nocb_q_count) + 605#ifdef CONFIG_RCU_NOCB_CPU
630 rdp->nocb_p_count + 606 *ql = atomic_long_read(&rdp->nocb_q_count);
631 atomic_long_read(&rdp->nocb_follower_count) + 607 *qll = atomic_long_read(&rdp->nocb_q_count_lazy);
632 rdp->nocb_p_count + rdp->nocb_gp_count;
633 *qll = atomic_long_read(&rdp->nocb_q_count_lazy) +
634 rdp->nocb_p_count_lazy +
635 atomic_long_read(&rdp->nocb_follower_count_lazy) +
636 rdp->nocb_p_count_lazy + rdp->nocb_gp_count_lazy;
637}
638#else /* #ifdef CONFIG_RCU_NOCB_CPU */ 608#else /* #ifdef CONFIG_RCU_NOCB_CPU */
639static inline void rcu_nocb_q_lengths(struct rcu_data *rdp, long *ql, long *qll)
640{
641 *ql = 0; 609 *ql = 0;
642 *qll = 0; 610 *qll = 0;
643}
644#endif /* #else #ifdef CONFIG_RCU_NOCB_CPU */ 611#endif /* #else #ifdef CONFIG_RCU_NOCB_CPU */
612}
645#endif /* #ifdef CONFIG_RCU_TRACE */ 613#endif /* #ifdef CONFIG_RCU_TRACE */
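
The tree.h changes above collapse the separate per-stage no-CBs counters (->nocb_gp_count, ->nocb_follower_count, ->nocb_p_count and their lazy variants) into the single ->nocb_q_count/->nocb_q_count_lazy pair, adjusted at enqueue time and again after callbacks have been invoked, so rcu_nocb_q_lengths() only has to read two atomics instead of summing several per-stage fields. A minimal userspace sketch of that accounting shape, using C11 atomics; the function names here (enqueue_cbs, invoked_cbs, trace_q_lengths) are illustrative, not kernel APIs.

#include <stdatomic.h>
#include <stdio.h>

/* Single counters covering all stages, as in the patched rcu_data. */
static atomic_long q_count;      /* callbacks awaiting invocation (all stages) */
static atomic_long q_count_lazy; /* lazy subset of the above */

static void enqueue_cbs(long n, long n_lazy)
{
	atomic_fetch_add(&q_count, n);
	atomic_fetch_add(&q_count_lazy, n_lazy);
}

static void invoked_cbs(long n, long n_lazy)
{
	/* Subtract only after the callbacks have actually run. */
	atomic_fetch_add(&q_count, -n);
	atomic_fetch_add(&q_count_lazy, -n_lazy);
}

static void trace_q_lengths(long *ql, long *qll)
{
	/* One read per counter; no summing over pipeline stages. */
	*ql = atomic_load(&q_count);
	*qll = atomic_load(&q_count_lazy);
}

int main(void)
{
	long ql, qll;

	enqueue_cbs(5, 2);
	invoked_cbs(3, 1);
	trace_q_lengths(&ql, &qll);
	printf("queued=%ld lazy=%ld\n", ql, qll); /* queued=2 lazy=1 */
	return 0;
}
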
diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
index 3ec85cb5d544..0d7bbe3095ad 100644
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -34,10 +34,6 @@
34 34
35#include "../locking/rtmutex_common.h" 35#include "../locking/rtmutex_common.h"
36 36
37/* rcuc/rcub kthread realtime priority */
38static int kthread_prio = CONFIG_RCU_KTHREAD_PRIO;
39module_param(kthread_prio, int, 0644);
40
41/* 37/*
42 * Control variables for per-CPU and per-rcu_node kthreads. These 38 * Control variables for per-CPU and per-rcu_node kthreads. These
43 * handle all flavors of RCU. 39 * handle all flavors of RCU.
@@ -53,7 +49,6 @@ DEFINE_PER_CPU(char, rcu_cpu_has_work);
53static cpumask_var_t rcu_nocb_mask; /* CPUs to have callbacks offloaded. */ 49static cpumask_var_t rcu_nocb_mask; /* CPUs to have callbacks offloaded. */
54static bool have_rcu_nocb_mask; /* Was rcu_nocb_mask allocated? */ 50static bool have_rcu_nocb_mask; /* Was rcu_nocb_mask allocated? */
55static bool __read_mostly rcu_nocb_poll; /* Offload kthreads are to poll. */ 51static bool __read_mostly rcu_nocb_poll; /* Offload kthreads are to poll. */
56static char __initdata nocb_buf[NR_CPUS * 5];
57#endif /* #ifdef CONFIG_RCU_NOCB_CPU */ 52#endif /* #ifdef CONFIG_RCU_NOCB_CPU */
58 53
59/* 54/*
@@ -103,6 +98,8 @@ RCU_STATE_INITIALIZER(rcu_preempt, 'p', call_rcu);
103static struct rcu_state *rcu_state_p = &rcu_preempt_state; 98static struct rcu_state *rcu_state_p = &rcu_preempt_state;
104 99
105static int rcu_preempted_readers_exp(struct rcu_node *rnp); 100static int rcu_preempted_readers_exp(struct rcu_node *rnp);
101static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp,
102 bool wake);
106 103
107/* 104/*
108 * Tell them what RCU they are running. 105 * Tell them what RCU they are running.
@@ -114,25 +111,6 @@ static void __init rcu_bootup_announce(void)
114} 111}
115 112
116/* 113/*
117 * Return the number of RCU-preempt batches processed thus far
118 * for debug and statistics.
119 */
120static long rcu_batches_completed_preempt(void)
121{
122 return rcu_preempt_state.completed;
123}
124EXPORT_SYMBOL_GPL(rcu_batches_completed_preempt);
125
126/*
127 * Return the number of RCU batches processed thus far for debug & stats.
128 */
129long rcu_batches_completed(void)
130{
131 return rcu_batches_completed_preempt();
132}
133EXPORT_SYMBOL_GPL(rcu_batches_completed);
134
135/*
136 * Record a preemptible-RCU quiescent state for the specified CPU. Note 114 * Record a preemptible-RCU quiescent state for the specified CPU. Note
137 * that this just means that the task currently running on the CPU is 115 * that this just means that the task currently running on the CPU is
138 * not in a quiescent state. There might be any number of tasks blocked 116 * not in a quiescent state. There might be any number of tasks blocked
@@ -307,15 +285,25 @@ static struct list_head *rcu_next_node_entry(struct task_struct *t,
307} 285}
308 286
309/* 287/*
288 * Return true if the specified rcu_node structure has tasks that were
289 * preempted within an RCU read-side critical section.
290 */
291static bool rcu_preempt_has_tasks(struct rcu_node *rnp)
292{
293 return !list_empty(&rnp->blkd_tasks);
294}
295
296/*
310 * Handle special cases during rcu_read_unlock(), such as needing to 297 * Handle special cases during rcu_read_unlock(), such as needing to
311 * notify RCU core processing or task having blocked during the RCU 298 * notify RCU core processing or task having blocked during the RCU
312 * read-side critical section. 299 * read-side critical section.
313 */ 300 */
314void rcu_read_unlock_special(struct task_struct *t) 301void rcu_read_unlock_special(struct task_struct *t)
315{ 302{
316 int empty; 303 bool empty;
317 int empty_exp; 304 bool empty_exp;
318 int empty_exp_now; 305 bool empty_norm;
306 bool empty_exp_now;
319 unsigned long flags; 307 unsigned long flags;
320 struct list_head *np; 308 struct list_head *np;
321#ifdef CONFIG_RCU_BOOST 309#ifdef CONFIG_RCU_BOOST
@@ -367,7 +355,8 @@ void rcu_read_unlock_special(struct task_struct *t)
367 break; 355 break;
368 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ 356 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
369 } 357 }
370 empty = !rcu_preempt_blocked_readers_cgp(rnp); 358 empty = !rcu_preempt_has_tasks(rnp);
359 empty_norm = !rcu_preempt_blocked_readers_cgp(rnp);
371 empty_exp = !rcu_preempted_readers_exp(rnp); 360 empty_exp = !rcu_preempted_readers_exp(rnp);
372 smp_mb(); /* ensure expedited fastpath sees end of RCU c-s. */ 361 smp_mb(); /* ensure expedited fastpath sees end of RCU c-s. */
373 np = rcu_next_node_entry(t, rnp); 362 np = rcu_next_node_entry(t, rnp);
@@ -387,13 +376,21 @@ void rcu_read_unlock_special(struct task_struct *t)
387#endif /* #ifdef CONFIG_RCU_BOOST */ 376#endif /* #ifdef CONFIG_RCU_BOOST */
388 377
389 /* 378 /*
379 * If this was the last task on the list, go see if we
380 * need to propagate ->qsmaskinit bit clearing up the
381 * rcu_node tree.
382 */
383 if (!empty && !rcu_preempt_has_tasks(rnp))
384 rcu_cleanup_dead_rnp(rnp);
385
386 /*
390 * If this was the last task on the current list, and if 387 * If this was the last task on the current list, and if
391 * we aren't waiting on any CPUs, report the quiescent state. 388 * we aren't waiting on any CPUs, report the quiescent state.
392 * Note that rcu_report_unblock_qs_rnp() releases rnp->lock, 389 * Note that rcu_report_unblock_qs_rnp() releases rnp->lock,
393 * so we must take a snapshot of the expedited state. 390 * so we must take a snapshot of the expedited state.
394 */ 391 */
395 empty_exp_now = !rcu_preempted_readers_exp(rnp); 392 empty_exp_now = !rcu_preempted_readers_exp(rnp);
396 if (!empty && !rcu_preempt_blocked_readers_cgp(rnp)) { 393 if (!empty_norm && !rcu_preempt_blocked_readers_cgp(rnp)) {
397 trace_rcu_quiescent_state_report(TPS("preempt_rcu"), 394 trace_rcu_quiescent_state_report(TPS("preempt_rcu"),
398 rnp->gpnum, 395 rnp->gpnum,
399 0, rnp->qsmask, 396 0, rnp->qsmask,
@@ -408,10 +405,8 @@ void rcu_read_unlock_special(struct task_struct *t)
408 405
409#ifdef CONFIG_RCU_BOOST 406#ifdef CONFIG_RCU_BOOST
410 /* Unboost if we were boosted. */ 407 /* Unboost if we were boosted. */
411 if (drop_boost_mutex) { 408 if (drop_boost_mutex)
412 rt_mutex_unlock(&rnp->boost_mtx); 409 rt_mutex_unlock(&rnp->boost_mtx);
413 complete(&rnp->boost_completion);
414 }
415#endif /* #ifdef CONFIG_RCU_BOOST */ 410#endif /* #ifdef CONFIG_RCU_BOOST */
416 411
417 /* 412 /*
@@ -519,99 +514,13 @@ static int rcu_print_task_stall(struct rcu_node *rnp)
519static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp) 514static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp)
520{ 515{
521 WARN_ON_ONCE(rcu_preempt_blocked_readers_cgp(rnp)); 516 WARN_ON_ONCE(rcu_preempt_blocked_readers_cgp(rnp));
522 if (!list_empty(&rnp->blkd_tasks)) 517 if (rcu_preempt_has_tasks(rnp))
523 rnp->gp_tasks = rnp->blkd_tasks.next; 518 rnp->gp_tasks = rnp->blkd_tasks.next;
524 WARN_ON_ONCE(rnp->qsmask); 519 WARN_ON_ONCE(rnp->qsmask);
525} 520}
526 521
527#ifdef CONFIG_HOTPLUG_CPU 522#ifdef CONFIG_HOTPLUG_CPU
528 523
529/*
530 * Handle tasklist migration for case in which all CPUs covered by the
531 * specified rcu_node have gone offline. Move them up to the root
532 * rcu_node. The reason for not just moving them to the immediate
533 * parent is to remove the need for rcu_read_unlock_special() to
534 * make more than two attempts to acquire the target rcu_node's lock.
535 * Returns true if there were tasks blocking the current RCU grace
536 * period.
537 *
538 * Returns 1 if there was previously a task blocking the current grace
539 * period on the specified rcu_node structure.
540 *
541 * The caller must hold rnp->lock with irqs disabled.
542 */
543static int rcu_preempt_offline_tasks(struct rcu_state *rsp,
544 struct rcu_node *rnp,
545 struct rcu_data *rdp)
546{
547 struct list_head *lp;
548 struct list_head *lp_root;
549 int retval = 0;
550 struct rcu_node *rnp_root = rcu_get_root(rsp);
551 struct task_struct *t;
552
553 if (rnp == rnp_root) {
554 WARN_ONCE(1, "Last CPU thought to be offlined?");
555 return 0; /* Shouldn't happen: at least one CPU online. */
556 }
557
558 /* If we are on an internal node, complain bitterly. */
559 WARN_ON_ONCE(rnp != rdp->mynode);
560
561 /*
562 * Move tasks up to root rcu_node. Don't try to get fancy for
563 * this corner-case operation -- just put this node's tasks
564 * at the head of the root node's list, and update the root node's
565 * ->gp_tasks and ->exp_tasks pointers to those of this node's,
566 * if non-NULL. This might result in waiting for more tasks than
567 * absolutely necessary, but this is a good performance/complexity
568 * tradeoff.
569 */
570 if (rcu_preempt_blocked_readers_cgp(rnp) && rnp->qsmask == 0)
571 retval |= RCU_OFL_TASKS_NORM_GP;
572 if (rcu_preempted_readers_exp(rnp))
573 retval |= RCU_OFL_TASKS_EXP_GP;
574 lp = &rnp->blkd_tasks;
575 lp_root = &rnp_root->blkd_tasks;
576 while (!list_empty(lp)) {
577 t = list_entry(lp->next, typeof(*t), rcu_node_entry);
578 raw_spin_lock(&rnp_root->lock); /* irqs already disabled */
579 smp_mb__after_unlock_lock();
580 list_del(&t->rcu_node_entry);
581 t->rcu_blocked_node = rnp_root;
582 list_add(&t->rcu_node_entry, lp_root);
583 if (&t->rcu_node_entry == rnp->gp_tasks)
584 rnp_root->gp_tasks = rnp->gp_tasks;
585 if (&t->rcu_node_entry == rnp->exp_tasks)
586 rnp_root->exp_tasks = rnp->exp_tasks;
587#ifdef CONFIG_RCU_BOOST
588 if (&t->rcu_node_entry == rnp->boost_tasks)
589 rnp_root->boost_tasks = rnp->boost_tasks;
590#endif /* #ifdef CONFIG_RCU_BOOST */
591 raw_spin_unlock(&rnp_root->lock); /* irqs still disabled */
592 }
593
594 rnp->gp_tasks = NULL;
595 rnp->exp_tasks = NULL;
596#ifdef CONFIG_RCU_BOOST
597 rnp->boost_tasks = NULL;
598 /*
599 * In case root is being boosted and leaf was not. Make sure
600 * that we boost the tasks blocking the current grace period
601 * in this case.
602 */
603 raw_spin_lock(&rnp_root->lock); /* irqs already disabled */
604 smp_mb__after_unlock_lock();
605 if (rnp_root->boost_tasks != NULL &&
606 rnp_root->boost_tasks != rnp_root->gp_tasks &&
607 rnp_root->boost_tasks != rnp_root->exp_tasks)
608 rnp_root->boost_tasks = rnp_root->gp_tasks;
609 raw_spin_unlock(&rnp_root->lock); /* irqs still disabled */
610#endif /* #ifdef CONFIG_RCU_BOOST */
611
612 return retval;
613}
614
615#endif /* #ifdef CONFIG_HOTPLUG_CPU */ 524#endif /* #ifdef CONFIG_HOTPLUG_CPU */
616 525
617/* 526/*
@@ -771,7 +680,7 @@ sync_rcu_preempt_exp_init(struct rcu_state *rsp, struct rcu_node *rnp)
771 680
772 raw_spin_lock_irqsave(&rnp->lock, flags); 681 raw_spin_lock_irqsave(&rnp->lock, flags);
773 smp_mb__after_unlock_lock(); 682 smp_mb__after_unlock_lock();
774 if (list_empty(&rnp->blkd_tasks)) { 683 if (!rcu_preempt_has_tasks(rnp)) {
775 raw_spin_unlock_irqrestore(&rnp->lock, flags); 684 raw_spin_unlock_irqrestore(&rnp->lock, flags);
776 } else { 685 } else {
777 rnp->exp_tasks = rnp->blkd_tasks.next; 686 rnp->exp_tasks = rnp->blkd_tasks.next;
@@ -933,15 +842,6 @@ static void __init rcu_bootup_announce(void)
933} 842}
934 843
935/* 844/*
936 * Return the number of RCU batches processed thus far for debug & stats.
937 */
938long rcu_batches_completed(void)
939{
940 return rcu_batches_completed_sched();
941}
942EXPORT_SYMBOL_GPL(rcu_batches_completed);
943
944/*
945 * Because preemptible RCU does not exist, we never have to check for 845 * Because preemptible RCU does not exist, we never have to check for
946 * CPUs being in quiescent states. 846 * CPUs being in quiescent states.
947 */ 847 */
@@ -960,11 +860,12 @@ static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp)
960 860
961#ifdef CONFIG_HOTPLUG_CPU 861#ifdef CONFIG_HOTPLUG_CPU
962 862
963/* Because preemptible RCU does not exist, no quieting of tasks. */ 863/*
964static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp, unsigned long flags) 864 * Because there is no preemptible RCU, there can be no readers blocked.
965 __releases(rnp->lock) 865 */
866static bool rcu_preempt_has_tasks(struct rcu_node *rnp)
966{ 867{
967 raw_spin_unlock_irqrestore(&rnp->lock, flags); 868 return false;
968} 869}
969 870
970#endif /* #ifdef CONFIG_HOTPLUG_CPU */ 871#endif /* #ifdef CONFIG_HOTPLUG_CPU */
@@ -996,23 +897,6 @@ static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp)
996 WARN_ON_ONCE(rnp->qsmask); 897 WARN_ON_ONCE(rnp->qsmask);
997} 898}
998 899
999#ifdef CONFIG_HOTPLUG_CPU
1000
1001/*
1002 * Because preemptible RCU does not exist, it never needs to migrate
1003 * tasks that were blocked within RCU read-side critical sections, and
1004 * such non-existent tasks cannot possibly have been blocking the current
1005 * grace period.
1006 */
1007static int rcu_preempt_offline_tasks(struct rcu_state *rsp,
1008 struct rcu_node *rnp,
1009 struct rcu_data *rdp)
1010{
1011 return 0;
1012}
1013
1014#endif /* #ifdef CONFIG_HOTPLUG_CPU */
1015
1016/* 900/*
1017 * Because preemptible RCU does not exist, it never has any callbacks 901 * Because preemptible RCU does not exist, it never has any callbacks
1018 * to check. 902 * to check.
@@ -1031,20 +915,6 @@ void synchronize_rcu_expedited(void)
1031} 915}
1032EXPORT_SYMBOL_GPL(synchronize_rcu_expedited); 916EXPORT_SYMBOL_GPL(synchronize_rcu_expedited);
1033 917
1034#ifdef CONFIG_HOTPLUG_CPU
1035
1036/*
1037 * Because preemptible RCU does not exist, there is never any need to
1038 * report on tasks preempted in RCU read-side critical sections during
1039 * expedited RCU grace periods.
1040 */
1041static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp,
1042 bool wake)
1043{
1044}
1045
1046#endif /* #ifdef CONFIG_HOTPLUG_CPU */
1047
1048/* 918/*
1049 * Because preemptible RCU does not exist, rcu_barrier() is just 919 * Because preemptible RCU does not exist, rcu_barrier() is just
1050 * another name for rcu_barrier_sched(). 920 * another name for rcu_barrier_sched().
@@ -1080,7 +950,7 @@ void exit_rcu(void)
1080 950
1081static void rcu_initiate_boost_trace(struct rcu_node *rnp) 951static void rcu_initiate_boost_trace(struct rcu_node *rnp)
1082{ 952{
1083 if (list_empty(&rnp->blkd_tasks)) 953 if (!rcu_preempt_has_tasks(rnp))
1084 rnp->n_balk_blkd_tasks++; 954 rnp->n_balk_blkd_tasks++;
1085 else if (rnp->exp_tasks == NULL && rnp->gp_tasks == NULL) 955 else if (rnp->exp_tasks == NULL && rnp->gp_tasks == NULL)
1086 rnp->n_balk_exp_gp_tasks++; 956 rnp->n_balk_exp_gp_tasks++;
@@ -1127,7 +997,8 @@ static int rcu_boost(struct rcu_node *rnp)
1127 struct task_struct *t; 997 struct task_struct *t;
1128 struct list_head *tb; 998 struct list_head *tb;
1129 999
1130 if (rnp->exp_tasks == NULL && rnp->boost_tasks == NULL) 1000 if (ACCESS_ONCE(rnp->exp_tasks) == NULL &&
1001 ACCESS_ONCE(rnp->boost_tasks) == NULL)
1131 return 0; /* Nothing left to boost. */ 1002 return 0; /* Nothing left to boost. */
1132 1003
1133 raw_spin_lock_irqsave(&rnp->lock, flags); 1004 raw_spin_lock_irqsave(&rnp->lock, flags);
@@ -1175,15 +1046,11 @@ static int rcu_boost(struct rcu_node *rnp)
1175 */ 1046 */
1176 t = container_of(tb, struct task_struct, rcu_node_entry); 1047 t = container_of(tb, struct task_struct, rcu_node_entry);
1177 rt_mutex_init_proxy_locked(&rnp->boost_mtx, t); 1048 rt_mutex_init_proxy_locked(&rnp->boost_mtx, t);
1178 init_completion(&rnp->boost_completion);
1179 raw_spin_unlock_irqrestore(&rnp->lock, flags); 1049 raw_spin_unlock_irqrestore(&rnp->lock, flags);
1180 /* Lock only for side effect: boosts task t's priority. */ 1050 /* Lock only for side effect: boosts task t's priority. */
1181 rt_mutex_lock(&rnp->boost_mtx); 1051 rt_mutex_lock(&rnp->boost_mtx);
1182 rt_mutex_unlock(&rnp->boost_mtx); /* Then keep lockdep happy. */ 1052 rt_mutex_unlock(&rnp->boost_mtx); /* Then keep lockdep happy. */
1183 1053
1184 /* Wait for boostee to be done w/boost_mtx before reinitializing. */
1185 wait_for_completion(&rnp->boost_completion);
1186
1187 return ACCESS_ONCE(rnp->exp_tasks) != NULL || 1054 return ACCESS_ONCE(rnp->exp_tasks) != NULL ||
1188 ACCESS_ONCE(rnp->boost_tasks) != NULL; 1055 ACCESS_ONCE(rnp->boost_tasks) != NULL;
1189} 1056}
@@ -1416,12 +1283,8 @@ static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu)
1416 for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++, mask >>= 1) 1283 for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++, mask >>= 1)
1417 if ((mask & 0x1) && cpu != outgoingcpu) 1284 if ((mask & 0x1) && cpu != outgoingcpu)
1418 cpumask_set_cpu(cpu, cm); 1285 cpumask_set_cpu(cpu, cm);
1419 if (cpumask_weight(cm) == 0) { 1286 if (cpumask_weight(cm) == 0)
1420 cpumask_setall(cm); 1287 cpumask_setall(cm);
1421 for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++)
1422 cpumask_clear_cpu(cpu, cm);
1423 WARN_ON_ONCE(cpumask_weight(cm) == 0);
1424 }
1425 set_cpus_allowed_ptr(t, cm); 1288 set_cpus_allowed_ptr(t, cm);
1426 free_cpumask_var(cm); 1289 free_cpumask_var(cm);
1427} 1290}
@@ -1446,12 +1309,8 @@ static void __init rcu_spawn_boost_kthreads(void)
1446 for_each_possible_cpu(cpu) 1309 for_each_possible_cpu(cpu)
1447 per_cpu(rcu_cpu_has_work, cpu) = 0; 1310 per_cpu(rcu_cpu_has_work, cpu) = 0;
1448 BUG_ON(smpboot_register_percpu_thread(&rcu_cpu_thread_spec)); 1311 BUG_ON(smpboot_register_percpu_thread(&rcu_cpu_thread_spec));
1449 rnp = rcu_get_root(rcu_state_p); 1312 rcu_for_each_leaf_node(rcu_state_p, rnp)
1450 (void)rcu_spawn_one_boost_kthread(rcu_state_p, rnp); 1313 (void)rcu_spawn_one_boost_kthread(rcu_state_p, rnp);
1451 if (NUM_RCU_NODES > 1) {
1452 rcu_for_each_leaf_node(rcu_state_p, rnp)
1453 (void)rcu_spawn_one_boost_kthread(rcu_state_p, rnp);
1454 }
1455} 1314}
1456 1315
1457static void rcu_prepare_kthreads(int cpu) 1316static void rcu_prepare_kthreads(int cpu)
@@ -1605,7 +1464,8 @@ static bool __maybe_unused rcu_try_advance_all_cbs(void)
1605 * completed since we last checked and there are 1464 * completed since we last checked and there are
1606 * callbacks not yet ready to invoke. 1465 * callbacks not yet ready to invoke.
1607 */ 1466 */
1608 if (rdp->completed != rnp->completed && 1467 if ((rdp->completed != rnp->completed ||
1468 unlikely(ACCESS_ONCE(rdp->gpwrap))) &&
1609 rdp->nxttail[RCU_DONE_TAIL] != rdp->nxttail[RCU_NEXT_TAIL]) 1469 rdp->nxttail[RCU_DONE_TAIL] != rdp->nxttail[RCU_NEXT_TAIL])
1610 note_gp_changes(rsp, rdp); 1470 note_gp_changes(rsp, rdp);
1611 1471
@@ -1898,11 +1758,12 @@ static void print_cpu_stall_info(struct rcu_state *rsp, int cpu)
1898 ticks_value = rsp->gpnum - rdp->gpnum; 1758 ticks_value = rsp->gpnum - rdp->gpnum;
1899 } 1759 }
1900 print_cpu_stall_fast_no_hz(fast_no_hz, cpu); 1760 print_cpu_stall_fast_no_hz(fast_no_hz, cpu);
1901 pr_err("\t%d: (%lu %s) idle=%03x/%llx/%d softirq=%u/%u %s\n", 1761 pr_err("\t%d: (%lu %s) idle=%03x/%llx/%d softirq=%u/%u fqs=%ld %s\n",
1902 cpu, ticks_value, ticks_title, 1762 cpu, ticks_value, ticks_title,
1903 atomic_read(&rdtp->dynticks) & 0xfff, 1763 atomic_read(&rdtp->dynticks) & 0xfff,
1904 rdtp->dynticks_nesting, rdtp->dynticks_nmi_nesting, 1764 rdtp->dynticks_nesting, rdtp->dynticks_nmi_nesting,
1905 rdp->softirq_snap, kstat_softirqs_cpu(RCU_SOFTIRQ, cpu), 1765 rdp->softirq_snap, kstat_softirqs_cpu(RCU_SOFTIRQ, cpu),
1766 ACCESS_ONCE(rsp->n_force_qs) - rsp->n_force_qs_gpstart,
1906 fast_no_hz); 1767 fast_no_hz);
1907} 1768}
1908 1769
@@ -2056,9 +1917,26 @@ static void wake_nocb_leader(struct rcu_data *rdp, bool force)
2056static bool rcu_nocb_cpu_needs_barrier(struct rcu_state *rsp, int cpu) 1917static bool rcu_nocb_cpu_needs_barrier(struct rcu_state *rsp, int cpu)
2057{ 1918{
2058 struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); 1919 struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu);
1920 unsigned long ret;
1921#ifdef CONFIG_PROVE_RCU
2059 struct rcu_head *rhp; 1922 struct rcu_head *rhp;
1923#endif /* #ifdef CONFIG_PROVE_RCU */
2060 1924
2061 /* No-CBs CPUs might have callbacks on any of three lists. */ 1925 /*
1926 * Check count of all no-CBs callbacks awaiting invocation.
1927 * There needs to be a barrier before this function is called,
1928 * but associated with a prior determination that no more
1929 * callbacks would be posted. In the worst case, the first
1930 * barrier in _rcu_barrier() suffices (but the caller cannot
1931 * necessarily rely on this, not a substitute for the caller
1932 * getting the concurrency design right!). There must also be
1933 * a barrier between the following load and posting of a callback
1934 * (if a callback is in fact needed). This is associated with an
1935 * atomic_inc() in the caller.
1936 */
1937 ret = atomic_long_read(&rdp->nocb_q_count);
1938
1939#ifdef CONFIG_PROVE_RCU
2062 rhp = ACCESS_ONCE(rdp->nocb_head); 1940 rhp = ACCESS_ONCE(rdp->nocb_head);
2063 if (!rhp) 1941 if (!rhp)
2064 rhp = ACCESS_ONCE(rdp->nocb_gp_head); 1942 rhp = ACCESS_ONCE(rdp->nocb_gp_head);
@@ -2072,8 +1950,9 @@ static bool rcu_nocb_cpu_needs_barrier(struct rcu_state *rsp, int cpu)
2072 cpu, rhp->func); 1950 cpu, rhp->func);
2073 WARN_ON_ONCE(1); 1951 WARN_ON_ONCE(1);
2074 } 1952 }
1953#endif /* #ifdef CONFIG_PROVE_RCU */
2075 1954
2076 return !!rhp; 1955 return !!ret;
2077} 1956}
2078 1957
2079/* 1958/*
@@ -2095,9 +1974,10 @@ static void __call_rcu_nocb_enqueue(struct rcu_data *rdp,
2095 struct task_struct *t; 1974 struct task_struct *t;
2096 1975
2097 /* Enqueue the callback on the nocb list and update counts. */ 1976 /* Enqueue the callback on the nocb list and update counts. */
1977 atomic_long_add(rhcount, &rdp->nocb_q_count);
1978 /* rcu_barrier() relies on ->nocb_q_count add before xchg. */
2098 old_rhpp = xchg(&rdp->nocb_tail, rhtp); 1979 old_rhpp = xchg(&rdp->nocb_tail, rhtp);
2099 ACCESS_ONCE(*old_rhpp) = rhp; 1980 ACCESS_ONCE(*old_rhpp) = rhp;
2100 atomic_long_add(rhcount, &rdp->nocb_q_count);
2101 atomic_long_add(rhcount_lazy, &rdp->nocb_q_count_lazy); 1981 atomic_long_add(rhcount_lazy, &rdp->nocb_q_count_lazy);
2102 smp_mb__after_atomic(); /* Store *old_rhpp before _wake test. */ 1982 smp_mb__after_atomic(); /* Store *old_rhpp before _wake test. */
2103 1983
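
The reordered enqueue above (the ->nocb_q_count add now sits before the tail-pointer xchg, with a comment noting that rcu_barrier() depends on it) pairs with the count-based check added to rcu_nocb_cpu_needs_barrier(): once a callback has been published, the counter is already non-zero, so a reader that sees zero may conclude there is nothing to wait for, given the caller's own ordering obligations spelled out in the comment. A deliberately simplified, single-slot sketch of that ordering in C11; enqueue() and needs_barrier() are stand-in names, not the kernel functions.

#include <stdatomic.h>

struct cb { struct cb *next; };

static atomic_long nocb_q_count;
static _Atomic(struct cb *) nocb_head;

/* Producer: count first, then publish -- mirrors the reordered enqueue. */
static void enqueue(struct cb *rhp)
{
	atomic_fetch_add(&nocb_q_count, 1);  /* A: count add ...        */
	atomic_exchange(&nocb_head, rhp);    /* B: ... before publish   */
}

/*
 * Barrier-side check: if no further callback will be posted and the count
 * reads zero, there is nothing to wait for.  Because A is ordered before
 * B, any published callback is already reflected in the count.
 */
static int needs_barrier(void)
{
	return atomic_load(&nocb_q_count) != 0;
}

int main(void)
{
	static struct cb one;

	enqueue(&one);
	return !needs_barrier(); /* exits 0: the count already covers &one */
}
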
@@ -2288,9 +2168,6 @@ wait_again:
2288 /* Move callbacks to wait-for-GP list, which is empty. */ 2168 /* Move callbacks to wait-for-GP list, which is empty. */
2289 ACCESS_ONCE(rdp->nocb_head) = NULL; 2169 ACCESS_ONCE(rdp->nocb_head) = NULL;
2290 rdp->nocb_gp_tail = xchg(&rdp->nocb_tail, &rdp->nocb_head); 2170 rdp->nocb_gp_tail = xchg(&rdp->nocb_tail, &rdp->nocb_head);
2291 rdp->nocb_gp_count = atomic_long_xchg(&rdp->nocb_q_count, 0);
2292 rdp->nocb_gp_count_lazy =
2293 atomic_long_xchg(&rdp->nocb_q_count_lazy, 0);
2294 gotcbs = true; 2171 gotcbs = true;
2295 } 2172 }
2296 2173
@@ -2338,9 +2215,6 @@ wait_again:
2338 /* Append callbacks to follower's "done" list. */ 2215 /* Append callbacks to follower's "done" list. */
2339 tail = xchg(&rdp->nocb_follower_tail, rdp->nocb_gp_tail); 2216 tail = xchg(&rdp->nocb_follower_tail, rdp->nocb_gp_tail);
2340 *tail = rdp->nocb_gp_head; 2217 *tail = rdp->nocb_gp_head;
2341 atomic_long_add(rdp->nocb_gp_count, &rdp->nocb_follower_count);
2342 atomic_long_add(rdp->nocb_gp_count_lazy,
2343 &rdp->nocb_follower_count_lazy);
2344 smp_mb__after_atomic(); /* Store *tail before wakeup. */ 2218 smp_mb__after_atomic(); /* Store *tail before wakeup. */
2345 if (rdp != my_rdp && tail == &rdp->nocb_follower_head) { 2219 if (rdp != my_rdp && tail == &rdp->nocb_follower_head) {
2346 /* 2220 /*
@@ -2415,13 +2289,11 @@ static int rcu_nocb_kthread(void *arg)
2415 trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, "WokeNonEmpty"); 2289 trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, "WokeNonEmpty");
2416 ACCESS_ONCE(rdp->nocb_follower_head) = NULL; 2290 ACCESS_ONCE(rdp->nocb_follower_head) = NULL;
2417 tail = xchg(&rdp->nocb_follower_tail, &rdp->nocb_follower_head); 2291 tail = xchg(&rdp->nocb_follower_tail, &rdp->nocb_follower_head);
2418 c = atomic_long_xchg(&rdp->nocb_follower_count, 0);
2419 cl = atomic_long_xchg(&rdp->nocb_follower_count_lazy, 0);
2420 rdp->nocb_p_count += c;
2421 rdp->nocb_p_count_lazy += cl;
2422 2292
2423 /* Each pass through the following loop invokes a callback. */ 2293 /* Each pass through the following loop invokes a callback. */
2424 trace_rcu_batch_start(rdp->rsp->name, cl, c, -1); 2294 trace_rcu_batch_start(rdp->rsp->name,
2295 atomic_long_read(&rdp->nocb_q_count_lazy),
2296 atomic_long_read(&rdp->nocb_q_count), -1);
2425 c = cl = 0; 2297 c = cl = 0;
2426 while (list) { 2298 while (list) {
2427 next = list->next; 2299 next = list->next;
@@ -2443,9 +2315,9 @@ static int rcu_nocb_kthread(void *arg)
2443 list = next; 2315 list = next;
2444 } 2316 }
2445 trace_rcu_batch_end(rdp->rsp->name, c, !!list, 0, 0, 1); 2317 trace_rcu_batch_end(rdp->rsp->name, c, !!list, 0, 0, 1);
2446 ACCESS_ONCE(rdp->nocb_p_count) = rdp->nocb_p_count - c; 2318 smp_mb__before_atomic(); /* _add after CB invocation. */
2447 ACCESS_ONCE(rdp->nocb_p_count_lazy) = 2319 atomic_long_add(-c, &rdp->nocb_q_count);
2448 rdp->nocb_p_count_lazy - cl; 2320 atomic_long_add(-cl, &rdp->nocb_q_count_lazy);
2449 rdp->n_nocbs_invoked += c; 2321 rdp->n_nocbs_invoked += c;
2450 } 2322 }
2451 return 0; 2323 return 0;
@@ -2513,8 +2385,8 @@ void __init rcu_init_nohz(void)
2513 cpumask_and(rcu_nocb_mask, cpu_possible_mask, 2385 cpumask_and(rcu_nocb_mask, cpu_possible_mask,
2514 rcu_nocb_mask); 2386 rcu_nocb_mask);
2515 } 2387 }
2516 cpulist_scnprintf(nocb_buf, sizeof(nocb_buf), rcu_nocb_mask); 2388 pr_info("\tOffload RCU callbacks from CPUs: %*pbl.\n",
2517 pr_info("\tOffload RCU callbacks from CPUs: %s.\n", nocb_buf); 2389 cpumask_pr_args(rcu_nocb_mask));
2518 if (rcu_nocb_poll) 2390 if (rcu_nocb_poll)
2519 pr_info("\tPoll for callbacks from no-CBs CPUs.\n"); 2391 pr_info("\tPoll for callbacks from no-CBs CPUs.\n");
2520 2392
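
The pr_info() change above drops the NR_CPUS-sized nocb_buf and cpulist_scnprintf() in favour of the "%*pbl" format together with cpumask_pr_args(), which prints the mask directly as a range list such as "0-3,7". The userspace snippet below only illustrates that output format for a small bitmask; it is not the printk implementation.

#include <stdio.h>

/* Print a CPU bitmask as a range list such as "0-3,7", the format the
 * "%*pbl" printk specifier produces for cpumask_pr_args(). */
static void print_cpulist(unsigned long mask, int nbits)
{
	int cpu = 0, first = 1;

	while (cpu < nbits) {
		if (!(mask & (1UL << cpu))) {
			cpu++;
			continue;
		}
		int start = cpu;
		while (cpu + 1 < nbits && (mask & (1UL << (cpu + 1))))
			cpu++;
		printf("%s%d", first ? "" : ",", start);
		if (cpu > start)
			printf("-%d", cpu);
		first = 0;
		cpu++;
	}
	putchar('\n');
}

int main(void)
{
	print_cpulist(0x8F, 8); /* prints "0-3,7" */
	return 0;
}
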
diff --git a/kernel/rcu/tree_trace.c b/kernel/rcu/tree_trace.c
index 5cdc62e1beeb..fbb6240509ea 100644
--- a/kernel/rcu/tree_trace.c
+++ b/kernel/rcu/tree_trace.c
@@ -46,6 +46,8 @@
46#define RCU_TREE_NONCORE 46#define RCU_TREE_NONCORE
47#include "tree.h" 47#include "tree.h"
48 48
49DECLARE_PER_CPU_SHARED_ALIGNED(unsigned long, rcu_qs_ctr);
50
49static int r_open(struct inode *inode, struct file *file, 51static int r_open(struct inode *inode, struct file *file,
50 const struct seq_operations *op) 52 const struct seq_operations *op)
51{ 53{
@@ -115,11 +117,13 @@ static void print_one_rcu_data(struct seq_file *m, struct rcu_data *rdp)
115 117
116 if (!rdp->beenonline) 118 if (!rdp->beenonline)
117 return; 119 return;
118 seq_printf(m, "%3d%cc=%ld g=%ld pq=%d qp=%d", 120 seq_printf(m, "%3d%cc=%ld g=%ld pq=%d/%d qp=%d",
119 rdp->cpu, 121 rdp->cpu,
120 cpu_is_offline(rdp->cpu) ? '!' : ' ', 122 cpu_is_offline(rdp->cpu) ? '!' : ' ',
121 ulong2long(rdp->completed), ulong2long(rdp->gpnum), 123 ulong2long(rdp->completed), ulong2long(rdp->gpnum),
122 rdp->passed_quiesce, rdp->qs_pending); 124 rdp->passed_quiesce,
125 rdp->rcu_qs_ctr_snap == per_cpu(rcu_qs_ctr, rdp->cpu),
126 rdp->qs_pending);
123 seq_printf(m, " dt=%d/%llx/%d df=%lu", 127 seq_printf(m, " dt=%d/%llx/%d df=%lu",
124 atomic_read(&rdp->dynticks->dynticks), 128 atomic_read(&rdp->dynticks->dynticks),
125 rdp->dynticks->dynticks_nesting, 129 rdp->dynticks->dynticks_nesting,
diff --git a/kernel/resource.c b/kernel/resource.c
index 0bcebffc4e77..19f2357dfda3 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -22,6 +22,7 @@
22#include <linux/device.h> 22#include <linux/device.h>
23#include <linux/pfn.h> 23#include <linux/pfn.h>
24#include <linux/mm.h> 24#include <linux/mm.h>
25#include <linux/resource_ext.h>
25#include <asm/io.h> 26#include <asm/io.h>
26 27
27 28
@@ -1529,6 +1530,30 @@ int iomem_is_exclusive(u64 addr)
1529 return err; 1530 return err;
1530} 1531}
1531 1532
1533struct resource_entry *resource_list_create_entry(struct resource *res,
1534 size_t extra_size)
1535{
1536 struct resource_entry *entry;
1537
1538 entry = kzalloc(sizeof(*entry) + extra_size, GFP_KERNEL);
1539 if (entry) {
1540 INIT_LIST_HEAD(&entry->node);
1541 entry->res = res ? res : &entry->__res;
1542 }
1543
1544 return entry;
1545}
1546EXPORT_SYMBOL(resource_list_create_entry);
1547
1548void resource_list_free(struct list_head *head)
1549{
1550 struct resource_entry *entry, *tmp;
1551
1552 list_for_each_entry_safe(entry, tmp, head, node)
1553 resource_list_destroy_entry(entry);
1554}
1555EXPORT_SYMBOL(resource_list_free);
1556
1532static int __init strict_iomem(char *str) 1557static int __init strict_iomem(char *str)
1533{ 1558{
1534 if (strstr(str, "relaxed")) 1559 if (strstr(str, "relaxed"))
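
resource_list_create_entry() allocates a list node (optionally with extra trailing space) that either wraps a caller-supplied struct resource or falls back to its embedded __res, and resource_list_free() tears a whole list down via resource_list_destroy_entry(). Below is a kernel-style usage sketch, not a standalone program: the caller, the window array and the error handling are assumptions made for illustration, using only the helpers from the hunk plus the standard list API.

/* Sketch only: assumes <linux/resource_ext.h> and <linux/list.h>. */
static int build_window_list(struct list_head *head, struct resource *windows,
			     int nr_windows)
{
	int i;

	for (i = 0; i < nr_windows; i++) {
		struct resource_entry *entry;

		/* No extra per-entry data needed, so extra_size is 0. */
		entry = resource_list_create_entry(&windows[i], 0);
		if (!entry) {
			resource_list_free(head); /* frees what was added */
			return -ENOMEM;
		}
		list_add_tail(&entry->node, head);
	}
	return 0;
}
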
diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile
index ab32b7b0db5c..46be87024875 100644
--- a/kernel/sched/Makefile
+++ b/kernel/sched/Makefile
@@ -1,5 +1,5 @@
1ifdef CONFIG_FUNCTION_TRACER 1ifdef CONFIG_FUNCTION_TRACER
2CFLAGS_REMOVE_clock.o = -pg 2CFLAGS_REMOVE_clock.o = $(CC_FLAGS_FTRACE)
3endif 3endif
4 4
5ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y) 5ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y)
diff --git a/kernel/sched/clock.c b/kernel/sched/clock.c
index c27e4f8f4879..c0a205101c23 100644
--- a/kernel/sched/clock.c
+++ b/kernel/sched/clock.c
@@ -420,3 +420,16 @@ u64 local_clock(void)
420 420
421EXPORT_SYMBOL_GPL(cpu_clock); 421EXPORT_SYMBOL_GPL(cpu_clock);
422EXPORT_SYMBOL_GPL(local_clock); 422EXPORT_SYMBOL_GPL(local_clock);
423
424/*
425 * Running clock - returns the time that has elapsed while a guest has been
426 * running.
427 * On a guest this value should be local_clock minus the time the guest was
428 * suspended by the hypervisor (for any reason).
429 * On bare metal this function should return the same as local_clock.
430 * Architectures and sub-architectures can override this.
431 */
432u64 __weak running_clock(void)
433{
434 return local_clock();
435}
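
Because running_clock() is defined __weak, an architecture (for instance a paravirtualized guest that wants to subtract time spent suspended by the hypervisor) can provide a strong definition and the generic local_clock() fallback drops out at link time. A userspace illustration of that weak-symbol mechanism, assuming GCC or Clang; the clock value is made up.

#include <stdio.h>
#include <stdint.h>

/* Default, overridable definition -- analogous to the __weak running_clock(). */
__attribute__((weak)) uint64_t running_clock(void)
{
	return 1000; /* stand-in for local_clock() */
}

/*
 * If another translation unit defines a non-weak running_clock() -- say,
 * one that subtracts hypervisor suspend time -- the linker picks that one
 * and this default silently drops out.
 */
int main(void)
{
	printf("running_clock() = %llu\n", (unsigned long long)running_clock());
	return 0;
}
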
diff --git a/kernel/sched/completion.c b/kernel/sched/completion.c
index 607f852b4d04..7052d3fd4e7b 100644
--- a/kernel/sched/completion.c
+++ b/kernel/sched/completion.c
@@ -268,6 +268,15 @@ bool try_wait_for_completion(struct completion *x)
268 unsigned long flags; 268 unsigned long flags;
269 int ret = 1; 269 int ret = 1;
270 270
271 /*
272 * Since x->done will need to be locked only
273 * in the non-blocking case, we check x->done
274 * first without taking the lock so we can
275 * return early in the blocking case.
276 */
277 if (!ACCESS_ONCE(x->done))
278 return 0;
279
271 spin_lock_irqsave(&x->wait.lock, flags); 280 spin_lock_irqsave(&x->wait.lock, flags);
272 if (!x->done) 281 if (!x->done)
273 ret = 0; 282 ret = 0;
@@ -288,13 +297,6 @@ EXPORT_SYMBOL(try_wait_for_completion);
288 */ 297 */
289bool completion_done(struct completion *x) 298bool completion_done(struct completion *x)
290{ 299{
291 unsigned long flags; 300 return !!ACCESS_ONCE(x->done);
292 int ret = 1;
293
294 spin_lock_irqsave(&x->wait.lock, flags);
295 if (!x->done)
296 ret = 0;
297 spin_unlock_irqrestore(&x->wait.lock, flags);
298 return ret;
299} 301}
300EXPORT_SYMBOL(completion_done); 302EXPORT_SYMBOL(completion_done);
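
Both completion hunks above add a lock-free read of x->done: try_wait_for_completion() returns early when nothing has completed, and completion_done() becomes a single ACCESS_ONCE() load with no spinlock at all. The sketch below shows that fast-path shape with a C11 atomic standing in for the spinlock-protected counter; is_done() and try_consume() are illustrative names, not the kernel API.

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

static atomic_uint done_count;

/* Fast path: a plain load is enough to answer "not done yet". */
static bool is_done(void)
{
	return atomic_load(&done_count) != 0;
}

/* Consuming a completion still needs an atomic read-modify-write. */
static bool try_consume(void)
{
	unsigned int old = atomic_load(&done_count);

	while (old != 0) {
		if (atomic_compare_exchange_weak(&done_count, &old, old - 1))
			return true;  /* claimed one completion */
	}
	return false;                 /* nothing to consume, no lock taken */
}

int main(void)
{
	printf("before: is_done=%d\n", is_done());       /* 0 */
	atomic_fetch_add(&done_count, 1);
	printf("after complete: is_done=%d\n", is_done()); /* 1 */
	printf("consume: %d\n", try_consume());          /* 1 */
	printf("is_done now: %d\n", is_done());          /* 0 */
	return 0;
}
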
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 5eab11d4b747..13049aac05a6 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -119,7 +119,9 @@ void update_rq_clock(struct rq *rq)
119{ 119{
120 s64 delta; 120 s64 delta;
121 121
122 if (rq->skip_clock_update > 0) 122 lockdep_assert_held(&rq->lock);
123
124 if (rq->clock_skip_update & RQCF_ACT_SKIP)
123 return; 125 return;
124 126
125 delta = sched_clock_cpu(cpu_of(rq)) - rq->clock; 127 delta = sched_clock_cpu(cpu_of(rq)) - rq->clock;
@@ -490,6 +492,11 @@ static __init void init_hrtick(void)
490 */ 492 */
491void hrtick_start(struct rq *rq, u64 delay) 493void hrtick_start(struct rq *rq, u64 delay)
492{ 494{
495 /*
496 * Don't schedule slices shorter than 10000ns, that just
497 * doesn't make sense. Rely on vruntime for fairness.
498 */
499 delay = max_t(u64, delay, 10000LL);
493 __hrtimer_start_range_ns(&rq->hrtick_timer, ns_to_ktime(delay), 0, 500 __hrtimer_start_range_ns(&rq->hrtick_timer, ns_to_ktime(delay), 0,
494 HRTIMER_MODE_REL_PINNED, 0); 501 HRTIMER_MODE_REL_PINNED, 0);
495} 502}
@@ -1046,7 +1053,7 @@ void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
1046 * this case, we can save a useless back to back clock update. 1053 * this case, we can save a useless back to back clock update.
1047 */ 1054 */
1048 if (task_on_rq_queued(rq->curr) && test_tsk_need_resched(rq->curr)) 1055 if (task_on_rq_queued(rq->curr) && test_tsk_need_resched(rq->curr))
1049 rq->skip_clock_update = 1; 1056 rq_clock_skip_update(rq, true);
1050} 1057}
1051 1058
1052#ifdef CONFIG_SMP 1059#ifdef CONFIG_SMP
@@ -1082,7 +1089,7 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
1082 if (p->sched_class->migrate_task_rq) 1089 if (p->sched_class->migrate_task_rq)
1083 p->sched_class->migrate_task_rq(p, new_cpu); 1090 p->sched_class->migrate_task_rq(p, new_cpu);
1084 p->se.nr_migrations++; 1091 p->se.nr_migrations++;
1085 perf_sw_event(PERF_COUNT_SW_CPU_MIGRATIONS, 1, NULL, 0); 1092 perf_sw_event_sched(PERF_COUNT_SW_CPU_MIGRATIONS, 1, 0);
1086 } 1093 }
1087 1094
1088 __set_task_cpu(p, new_cpu); 1095 __set_task_cpu(p, new_cpu);
@@ -1836,6 +1843,9 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
1836 p->se.prev_sum_exec_runtime = 0; 1843 p->se.prev_sum_exec_runtime = 0;
1837 p->se.nr_migrations = 0; 1844 p->se.nr_migrations = 0;
1838 p->se.vruntime = 0; 1845 p->se.vruntime = 0;
1846#ifdef CONFIG_SMP
1847 p->se.avg.decay_count = 0;
1848#endif
1839 INIT_LIST_HEAD(&p->se.group_node); 1849 INIT_LIST_HEAD(&p->se.group_node);
1840 1850
1841#ifdef CONFIG_SCHEDSTATS 1851#ifdef CONFIG_SCHEDSTATS
@@ -2755,6 +2765,10 @@ again:
2755 * - explicit schedule() call 2765 * - explicit schedule() call
2756 * - return from syscall or exception to user-space 2766 * - return from syscall or exception to user-space
2757 * - return from interrupt-handler to user-space 2767 * - return from interrupt-handler to user-space
2768 *
2769 * WARNING: all callers must re-check need_resched() afterward and reschedule
2770 * accordingly in case an event triggered the need for rescheduling (such as
2771 * an interrupt waking up a task) while preemption was disabled in __schedule().
2758 */ 2772 */
2759static void __sched __schedule(void) 2773static void __sched __schedule(void)
2760{ 2774{
@@ -2763,7 +2777,6 @@ static void __sched __schedule(void)
2763 struct rq *rq; 2777 struct rq *rq;
2764 int cpu; 2778 int cpu;
2765 2779
2766need_resched:
2767 preempt_disable(); 2780 preempt_disable();
2768 cpu = smp_processor_id(); 2781 cpu = smp_processor_id();
2769 rq = cpu_rq(cpu); 2782 rq = cpu_rq(cpu);
@@ -2783,6 +2796,8 @@ need_resched:
2783 smp_mb__before_spinlock(); 2796 smp_mb__before_spinlock();
2784 raw_spin_lock_irq(&rq->lock); 2797 raw_spin_lock_irq(&rq->lock);
2785 2798
2799 rq->clock_skip_update <<= 1; /* promote REQ to ACT */
2800
2786 switch_count = &prev->nivcsw; 2801 switch_count = &prev->nivcsw;
2787 if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) { 2802 if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) {
2788 if (unlikely(signal_pending_state(prev->state, prev))) { 2803 if (unlikely(signal_pending_state(prev->state, prev))) {
@@ -2807,13 +2822,13 @@ need_resched:
2807 switch_count = &prev->nvcsw; 2822 switch_count = &prev->nvcsw;
2808 } 2823 }
2809 2824
2810 if (task_on_rq_queued(prev) || rq->skip_clock_update < 0) 2825 if (task_on_rq_queued(prev))
2811 update_rq_clock(rq); 2826 update_rq_clock(rq);
2812 2827
2813 next = pick_next_task(rq, prev); 2828 next = pick_next_task(rq, prev);
2814 clear_tsk_need_resched(prev); 2829 clear_tsk_need_resched(prev);
2815 clear_preempt_need_resched(); 2830 clear_preempt_need_resched();
2816 rq->skip_clock_update = 0; 2831 rq->clock_skip_update = 0;
2817 2832
2818 if (likely(prev != next)) { 2833 if (likely(prev != next)) {
2819 rq->nr_switches++; 2834 rq->nr_switches++;
@@ -2828,8 +2843,6 @@ need_resched:
2828 post_schedule(rq); 2843 post_schedule(rq);
2829 2844
2830 sched_preempt_enable_no_resched(); 2845 sched_preempt_enable_no_resched();
2831 if (need_resched())
2832 goto need_resched;
2833} 2846}
2834 2847
2835static inline void sched_submit_work(struct task_struct *tsk) 2848static inline void sched_submit_work(struct task_struct *tsk)
@@ -2849,7 +2862,9 @@ asmlinkage __visible void __sched schedule(void)
2849 struct task_struct *tsk = current; 2862 struct task_struct *tsk = current;
2850 2863
2851 sched_submit_work(tsk); 2864 sched_submit_work(tsk);
2852 __schedule(); 2865 do {
2866 __schedule();
2867 } while (need_resched());
2853} 2868}
2854EXPORT_SYMBOL(schedule); 2869EXPORT_SYMBOL(schedule);
2855 2870
@@ -2884,6 +2899,21 @@ void __sched schedule_preempt_disabled(void)
2884 preempt_disable(); 2899 preempt_disable();
2885} 2900}
2886 2901
2902static void preempt_schedule_common(void)
2903{
2904 do {
2905 __preempt_count_add(PREEMPT_ACTIVE);
2906 __schedule();
2907 __preempt_count_sub(PREEMPT_ACTIVE);
2908
2909 /*
2910 * Check again in case we missed a preemption opportunity
2911 * between schedule and now.
2912 */
2913 barrier();
2914 } while (need_resched());
2915}
2916
2887#ifdef CONFIG_PREEMPT 2917#ifdef CONFIG_PREEMPT
2888/* 2918/*
2889 * this is the entry point to schedule() from in-kernel preemption 2919 * this is the entry point to schedule() from in-kernel preemption
@@ -2899,17 +2929,7 @@ asmlinkage __visible void __sched notrace preempt_schedule(void)
2899 if (likely(!preemptible())) 2929 if (likely(!preemptible()))
2900 return; 2930 return;
2901 2931
2902 do { 2932 preempt_schedule_common();
2903 __preempt_count_add(PREEMPT_ACTIVE);
2904 __schedule();
2905 __preempt_count_sub(PREEMPT_ACTIVE);
2906
2907 /*
2908 * Check again in case we missed a preemption opportunity
2909 * between schedule and now.
2910 */
2911 barrier();
2912 } while (need_resched());
2913} 2933}
2914NOKPROBE_SYMBOL(preempt_schedule); 2934NOKPROBE_SYMBOL(preempt_schedule);
2915EXPORT_SYMBOL(preempt_schedule); 2935EXPORT_SYMBOL(preempt_schedule);
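
With the need_resched goto removed from __schedule(), the recheck now lives in the callers: schedule() loops over __schedule(), and the new preempt_schedule_common() wraps the PREEMPT_ACTIVE dance in the same do/while, which is exactly what the added WARNING comment demands of any caller. A stripped-down sketch of that control flow; the flag and schedule_once() are stand-ins, not scheduler internals.

#include <stdatomic.h>
#include <stdio.h>

static atomic_bool need_resched_flag;

static int need_resched(void)
{
	return atomic_load(&need_resched_flag);
}

/* Stand-in for one pass of __schedule(): clears the flag, which an
 * "interrupt" may re-set while the switch was in progress. */
static void schedule_once(int wakeups_left)
{
	atomic_store(&need_resched_flag, 0);
	if (wakeups_left > 0)   /* pretend a wakeup raced with this pass */
		atomic_store(&need_resched_flag, 1);
	printf("one pass through __schedule()\n");
}

/* Mirrors schedule()/preempt_schedule_common(): loop until no pass
 * re-set the flag, instead of a goto inside __schedule() itself. */
static void schedule(void)
{
	int wakeups_left = 2;

	do {
		schedule_once(wakeups_left--);
	} while (need_resched());
}

int main(void)
{
	atomic_store(&need_resched_flag, 1);
	schedule(); /* prints three passes */
	return 0;
}
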
@@ -3405,6 +3425,20 @@ static bool check_same_owner(struct task_struct *p)
3405 return match; 3425 return match;
3406} 3426}
3407 3427
3428static bool dl_param_changed(struct task_struct *p,
3429 const struct sched_attr *attr)
3430{
3431 struct sched_dl_entity *dl_se = &p->dl;
3432
3433 if (dl_se->dl_runtime != attr->sched_runtime ||
3434 dl_se->dl_deadline != attr->sched_deadline ||
3435 dl_se->dl_period != attr->sched_period ||
3436 dl_se->flags != attr->sched_flags)
3437 return true;
3438
3439 return false;
3440}
3441
3408static int __sched_setscheduler(struct task_struct *p, 3442static int __sched_setscheduler(struct task_struct *p,
3409 const struct sched_attr *attr, 3443 const struct sched_attr *attr,
3410 bool user) 3444 bool user)
@@ -3533,7 +3567,7 @@ recheck:
3533 goto change; 3567 goto change;
3534 if (rt_policy(policy) && attr->sched_priority != p->rt_priority) 3568 if (rt_policy(policy) && attr->sched_priority != p->rt_priority)
3535 goto change; 3569 goto change;
3536 if (dl_policy(policy)) 3570 if (dl_policy(policy) && dl_param_changed(p, attr))
3537 goto change; 3571 goto change;
3538 3572
3539 p->sched_reset_on_fork = reset_on_fork; 3573 p->sched_reset_on_fork = reset_on_fork;
@@ -4225,17 +4259,10 @@ SYSCALL_DEFINE0(sched_yield)
4225 return 0; 4259 return 0;
4226} 4260}
4227 4261
4228static void __cond_resched(void)
4229{
4230 __preempt_count_add(PREEMPT_ACTIVE);
4231 __schedule();
4232 __preempt_count_sub(PREEMPT_ACTIVE);
4233}
4234
4235int __sched _cond_resched(void) 4262int __sched _cond_resched(void)
4236{ 4263{
4237 if (should_resched()) { 4264 if (should_resched()) {
4238 __cond_resched(); 4265 preempt_schedule_common();
4239 return 1; 4266 return 1;
4240 } 4267 }
4241 return 0; 4268 return 0;
@@ -4260,7 +4287,7 @@ int __cond_resched_lock(spinlock_t *lock)
4260 if (spin_needbreak(lock) || resched) { 4287 if (spin_needbreak(lock) || resched) {
4261 spin_unlock(lock); 4288 spin_unlock(lock);
4262 if (resched) 4289 if (resched)
4263 __cond_resched(); 4290 preempt_schedule_common();
4264 else 4291 else
4265 cpu_relax(); 4292 cpu_relax();
4266 ret = 1; 4293 ret = 1;
@@ -4276,7 +4303,7 @@ int __sched __cond_resched_softirq(void)
4276 4303
4277 if (should_resched()) { 4304 if (should_resched()) {
4278 local_bh_enable(); 4305 local_bh_enable();
4279 __cond_resched(); 4306 preempt_schedule_common();
4280 local_bh_disable(); 4307 local_bh_disable();
4281 return 1; 4308 return 1;
4282 } 4309 }
@@ -4531,9 +4558,10 @@ void sched_show_task(struct task_struct *p)
4531{ 4558{
4532 unsigned long free = 0; 4559 unsigned long free = 0;
4533 int ppid; 4560 int ppid;
4534 unsigned state; 4561 unsigned long state = p->state;
4535 4562
4536 state = p->state ? __ffs(p->state) + 1 : 0; 4563 if (state)
4564 state = __ffs(state) + 1;
4537 printk(KERN_INFO "%-15.15s %c", p->comm, 4565 printk(KERN_INFO "%-15.15s %c", p->comm,
4538 state < sizeof(stat_nam) - 1 ? stat_nam[state] : '?'); 4566 state < sizeof(stat_nam) - 1 ? stat_nam[state] : '?');
4539#if BITS_PER_LONG == 32 4567#if BITS_PER_LONG == 32
@@ -4766,7 +4794,7 @@ static struct rq *move_queued_task(struct task_struct *p, int new_cpu)
4766 4794
4767void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask) 4795void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
4768{ 4796{
4769 if (p->sched_class && p->sched_class->set_cpus_allowed) 4797 if (p->sched_class->set_cpus_allowed)
4770 p->sched_class->set_cpus_allowed(p, new_mask); 4798 p->sched_class->set_cpus_allowed(p, new_mask);
4771 4799
4772 cpumask_copy(&p->cpus_allowed, new_mask); 4800 cpumask_copy(&p->cpus_allowed, new_mask);
@@ -5434,9 +5462,7 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
5434 struct cpumask *groupmask) 5462 struct cpumask *groupmask)
5435{ 5463{
5436 struct sched_group *group = sd->groups; 5464 struct sched_group *group = sd->groups;
5437 char str[256];
5438 5465
5439 cpulist_scnprintf(str, sizeof(str), sched_domain_span(sd));
5440 cpumask_clear(groupmask); 5466 cpumask_clear(groupmask);
5441 5467
5442 printk(KERN_DEBUG "%*s domain %d: ", level, "", level); 5468 printk(KERN_DEBUG "%*s domain %d: ", level, "", level);
@@ -5449,7 +5475,8 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
5449 return -1; 5475 return -1;
5450 } 5476 }
5451 5477
5452 printk(KERN_CONT "span %s level %s\n", str, sd->name); 5478 printk(KERN_CONT "span %*pbl level %s\n",
5479 cpumask_pr_args(sched_domain_span(sd)), sd->name);
5453 5480
5454 if (!cpumask_test_cpu(cpu, sched_domain_span(sd))) { 5481 if (!cpumask_test_cpu(cpu, sched_domain_span(sd))) {
5455 printk(KERN_ERR "ERROR: domain->span does not contain " 5482 printk(KERN_ERR "ERROR: domain->span does not contain "
@@ -5494,9 +5521,8 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
5494 5521
5495 cpumask_or(groupmask, groupmask, sched_group_cpus(group)); 5522 cpumask_or(groupmask, groupmask, sched_group_cpus(group));
5496 5523
5497 cpulist_scnprintf(str, sizeof(str), sched_group_cpus(group)); 5524 printk(KERN_CONT " %*pbl",
5498 5525 cpumask_pr_args(sched_group_cpus(group)));
5499 printk(KERN_CONT " %s", str);
5500 if (group->sgc->capacity != SCHED_CAPACITY_SCALE) { 5526 if (group->sgc->capacity != SCHED_CAPACITY_SCALE) {
5501 printk(KERN_CONT " (cpu_capacity = %d)", 5527 printk(KERN_CONT " (cpu_capacity = %d)",
5502 group->sgc->capacity); 5528 group->sgc->capacity);
@@ -7276,6 +7302,11 @@ void __init sched_init(void)
7276 enter_lazy_tlb(&init_mm, current); 7302 enter_lazy_tlb(&init_mm, current);
7277 7303
7278 /* 7304 /*
7305 * During early bootup we pretend to be a normal task:
7306 */
7307 current->sched_class = &fair_sched_class;
7308
7309 /*
7279 * Make us the idle thread. Technically, schedule() should not be 7310 * Make us the idle thread. Technically, schedule() should not be
7280 * called from this thread, however somewhere below it might be, 7311 * called from this thread, however somewhere below it might be,
7281 * but because we are the idle thread, we just pick up running again 7312 * but because we are the idle thread, we just pick up running again
@@ -7285,11 +7316,6 @@ void __init sched_init(void)
7285 7316
7286 calc_load_update = jiffies + LOAD_FREQ; 7317 calc_load_update = jiffies + LOAD_FREQ;
7287 7318
7288 /*
7289 * During early bootup we pretend to be a normal task:
7290 */
7291 current->sched_class = &fair_sched_class;
7292
7293#ifdef CONFIG_SMP 7319#ifdef CONFIG_SMP
7294 zalloc_cpumask_var(&sched_domains_tmpmask, GFP_NOWAIT); 7320 zalloc_cpumask_var(&sched_domains_tmpmask, GFP_NOWAIT);
7295 /* May be allocated at isolcpus cmdline parse time */ 7321 /* May be allocated at isolcpus cmdline parse time */
@@ -7350,6 +7376,9 @@ void ___might_sleep(const char *file, int line, int preempt_offset)
7350 in_atomic(), irqs_disabled(), 7376 in_atomic(), irqs_disabled(),
7351 current->pid, current->comm); 7377 current->pid, current->comm);
7352 7378
7379 if (task_stack_end_corrupted(current))
7380 printk(KERN_EMERG "Thread overran stack, or stack corrupted\n");
7381
7353 debug_show_held_locks(current); 7382 debug_show_held_locks(current);
7354 if (irqs_disabled()) 7383 if (irqs_disabled())
7355 print_irqtrace_events(current); 7384 print_irqtrace_events(current);
diff --git a/kernel/sched/cpudeadline.c b/kernel/sched/cpudeadline.c
index 539ca3ce071b..c6acb07466bb 100644
--- a/kernel/sched/cpudeadline.c
+++ b/kernel/sched/cpudeadline.c
@@ -107,7 +107,8 @@ int cpudl_find(struct cpudl *cp, struct task_struct *p,
107 int best_cpu = -1; 107 int best_cpu = -1;
108 const struct sched_dl_entity *dl_se = &p->dl; 108 const struct sched_dl_entity *dl_se = &p->dl;
109 109
110 if (later_mask && cpumask_and(later_mask, later_mask, cp->free_cpus)) { 110 if (later_mask &&
111 cpumask_and(later_mask, cp->free_cpus, &p->cpus_allowed)) {
111 best_cpu = cpumask_any(later_mask); 112 best_cpu = cpumask_any(later_mask);
112 goto out; 113 goto out;
113 } else if (cpumask_test_cpu(cpudl_maximum(cp), &p->cpus_allowed) && 114 } else if (cpumask_test_cpu(cpudl_maximum(cp), &p->cpus_allowed) &&
@@ -186,6 +187,26 @@ out:
186} 187}
187 188
188/* 189/*
190 * cpudl_set_freecpu - Set the cpudl.free_cpus
191 * @cp: the cpudl max-heap context
192 * @cpu: rd attached cpu
193 */
194void cpudl_set_freecpu(struct cpudl *cp, int cpu)
195{
196 cpumask_set_cpu(cpu, cp->free_cpus);
197}
198
199/*
200 * cpudl_clear_freecpu - Clear the cpudl.free_cpus
201 * @cp: the cpudl max-heap context
202 * @cpu: rd attached cpu
203 */
204void cpudl_clear_freecpu(struct cpudl *cp, int cpu)
205{
206 cpumask_clear_cpu(cpu, cp->free_cpus);
207}
208
209/*
189 * cpudl_init - initialize the cpudl structure 210 * cpudl_init - initialize the cpudl structure
190 * @cp: the cpudl max-heap context 211 * @cp: the cpudl max-heap context
191 */ 212 */
@@ -203,7 +224,7 @@ int cpudl_init(struct cpudl *cp)
203 if (!cp->elements) 224 if (!cp->elements)
204 return -ENOMEM; 225 return -ENOMEM;
205 226
206 if (!alloc_cpumask_var(&cp->free_cpus, GFP_KERNEL)) { 227 if (!zalloc_cpumask_var(&cp->free_cpus, GFP_KERNEL)) {
207 kfree(cp->elements); 228 kfree(cp->elements);
208 return -ENOMEM; 229 return -ENOMEM;
209 } 230 }
@@ -211,8 +232,6 @@ int cpudl_init(struct cpudl *cp)
211 for_each_possible_cpu(i) 232 for_each_possible_cpu(i)
212 cp->elements[i].idx = IDX_INVALID; 233 cp->elements[i].idx = IDX_INVALID;
213 234
214 cpumask_setall(cp->free_cpus);
215
216 return 0; 235 return 0;
217} 236}
218 237
diff --git a/kernel/sched/cpudeadline.h b/kernel/sched/cpudeadline.h
index 020039bd1326..1a0a6ef2fbe1 100644
--- a/kernel/sched/cpudeadline.h
+++ b/kernel/sched/cpudeadline.h
@@ -24,6 +24,8 @@ int cpudl_find(struct cpudl *cp, struct task_struct *p,
24 struct cpumask *later_mask); 24 struct cpumask *later_mask);
25void cpudl_set(struct cpudl *cp, int cpu, u64 dl, int is_valid); 25void cpudl_set(struct cpudl *cp, int cpu, u64 dl, int is_valid);
26int cpudl_init(struct cpudl *cp); 26int cpudl_init(struct cpudl *cp);
27void cpudl_set_freecpu(struct cpudl *cp, int cpu);
28void cpudl_clear_freecpu(struct cpudl *cp, int cpu);
27void cpudl_cleanup(struct cpudl *cp); 29void cpudl_cleanup(struct cpudl *cp);
28#endif /* CONFIG_SMP */ 30#endif /* CONFIG_SMP */
29 31
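
The cpudeadline changes keep free_cpus as an explicitly maintained bitmap: rq_online_dl() and rq_offline_dl() call the new cpudl_set_freecpu()/cpudl_clear_freecpu(), and cpudl_find() now just ANDs that bitmap with the task's affinity instead of rebuilding a candidate mask from rd->span each time. A small userspace sketch of that lookup, with an unsigned long standing in for the cpumasks and deliberately simplified signatures.

#include <stdio.h>

static unsigned long free_cpus; /* set bit = CPU free from the DL point of view */

static void cpudl_set_freecpu(int cpu)   { free_cpus |=  (1UL << cpu); }
static void cpudl_clear_freecpu(int cpu) { free_cpus &= ~(1UL << cpu); }

/* Return any free CPU the task may run on, or -1 if none. */
static int cpudl_find(unsigned long cpus_allowed)
{
	unsigned long later = free_cpus & cpus_allowed;
	int cpu;

	if (!later)
		return -1;
	for (cpu = 0; cpu < (int)(sizeof(unsigned long) * 8); cpu++)
		if (later & (1UL << cpu))
			return cpu;
	return -1;
}

int main(void)
{
	cpudl_set_freecpu(1);
	cpudl_set_freecpu(3);
	cpudl_clear_freecpu(1);
	printf("best cpu = %d\n", cpudl_find(0xFUL)); /* prints 3 */
	return 0;
}
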
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index 726470d47f87..a027799ae130 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -350,6 +350,11 @@ static void replenish_dl_entity(struct sched_dl_entity *dl_se,
350 dl_se->deadline = rq_clock(rq) + pi_se->dl_deadline; 350 dl_se->deadline = rq_clock(rq) + pi_se->dl_deadline;
351 dl_se->runtime = pi_se->dl_runtime; 351 dl_se->runtime = pi_se->dl_runtime;
352 } 352 }
353
354 if (dl_se->dl_yielded)
355 dl_se->dl_yielded = 0;
356 if (dl_se->dl_throttled)
357 dl_se->dl_throttled = 0;
353} 358}
354 359
355/* 360/*
@@ -536,23 +541,19 @@ again:
536 541
537 sched_clock_tick(); 542 sched_clock_tick();
538 update_rq_clock(rq); 543 update_rq_clock(rq);
539 dl_se->dl_throttled = 0; 544 enqueue_task_dl(rq, p, ENQUEUE_REPLENISH);
540 dl_se->dl_yielded = 0; 545 if (dl_task(rq->curr))
541 if (task_on_rq_queued(p)) { 546 check_preempt_curr_dl(rq, p, 0);
542 enqueue_task_dl(rq, p, ENQUEUE_REPLENISH); 547 else
543 if (dl_task(rq->curr)) 548 resched_curr(rq);
544 check_preempt_curr_dl(rq, p, 0);
545 else
546 resched_curr(rq);
547#ifdef CONFIG_SMP 549#ifdef CONFIG_SMP
548 /* 550 /*
549 * Queueing this task back might have overloaded rq, 551 * Queueing this task back might have overloaded rq,
550 * check if we need to kick someone away. 552 * check if we need to kick someone away.
551 */ 553 */
552 if (has_pushable_dl_tasks(rq)) 554 if (has_pushable_dl_tasks(rq))
553 push_dl_task(rq); 555 push_dl_task(rq);
554#endif 556#endif
555 }
556unlock: 557unlock:
557 raw_spin_unlock(&rq->lock); 558 raw_spin_unlock(&rq->lock);
558 559
@@ -613,10 +614,9 @@ static void update_curr_dl(struct rq *rq)
613 614
614 dl_se->runtime -= dl_se->dl_yielded ? 0 : delta_exec; 615 dl_se->runtime -= dl_se->dl_yielded ? 0 : delta_exec;
615 if (dl_runtime_exceeded(rq, dl_se)) { 616 if (dl_runtime_exceeded(rq, dl_se)) {
617 dl_se->dl_throttled = 1;
616 __dequeue_task_dl(rq, curr, 0); 618 __dequeue_task_dl(rq, curr, 0);
617 if (likely(start_dl_timer(dl_se, curr->dl.dl_boosted))) 619 if (unlikely(!start_dl_timer(dl_se, curr->dl.dl_boosted)))
618 dl_se->dl_throttled = 1;
619 else
620 enqueue_task_dl(rq, curr, ENQUEUE_REPLENISH); 620 enqueue_task_dl(rq, curr, ENQUEUE_REPLENISH);
621 621
622 if (!is_leftmost(curr, &rq->dl)) 622 if (!is_leftmost(curr, &rq->dl))
@@ -853,7 +853,7 @@ static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags)
853 * its rq, the bandwidth timer callback (which clearly has not 853 * its rq, the bandwidth timer callback (which clearly has not
854 * run yet) will take care of this. 854 * run yet) will take care of this.
855 */ 855 */
856 if (p->dl.dl_throttled) 856 if (p->dl.dl_throttled && !(flags & ENQUEUE_REPLENISH))
857 return; 857 return;
858 858
859 enqueue_dl_entity(&p->dl, pi_se, flags); 859 enqueue_dl_entity(&p->dl, pi_se, flags);
@@ -1073,7 +1073,13 @@ static void task_tick_dl(struct rq *rq, struct task_struct *p, int queued)
1073{ 1073{
1074 update_curr_dl(rq); 1074 update_curr_dl(rq);
1075 1075
1076 if (hrtick_enabled(rq) && queued && p->dl.runtime > 0) 1076 /*
1077 * Even when we have runtime, update_curr_dl() might have resulted in us
1078 * not being the leftmost task anymore. In that case NEED_RESCHED will
1079 * be set and schedule() will start a new hrtick for the next task.
1080 */
1081 if (hrtick_enabled(rq) && queued && p->dl.runtime > 0 &&
1082 is_leftmost(p, &rq->dl))
1077 start_hrtick_dl(rq, p); 1083 start_hrtick_dl(rq, p);
1078} 1084}
1079 1085
@@ -1166,9 +1172,6 @@ static int find_later_rq(struct task_struct *task)
1166 * We have to consider system topology and task affinity 1172 * We have to consider system topology and task affinity
1167 * first, then we can look for a suitable cpu. 1173 * first, then we can look for a suitable cpu.
1168 */ 1174 */
1169 cpumask_copy(later_mask, task_rq(task)->rd->span);
1170 cpumask_and(later_mask, later_mask, cpu_active_mask);
1171 cpumask_and(later_mask, later_mask, &task->cpus_allowed);
1172 best_cpu = cpudl_find(&task_rq(task)->rd->cpudl, 1175 best_cpu = cpudl_find(&task_rq(task)->rd->cpudl,
1173 task, later_mask); 1176 task, later_mask);
1174 if (best_cpu == -1) 1177 if (best_cpu == -1)
@@ -1563,6 +1566,7 @@ static void rq_online_dl(struct rq *rq)
1563 if (rq->dl.overloaded) 1566 if (rq->dl.overloaded)
1564 dl_set_overload(rq); 1567 dl_set_overload(rq);
1565 1568
1569 cpudl_set_freecpu(&rq->rd->cpudl, rq->cpu);
1566 if (rq->dl.dl_nr_running > 0) 1570 if (rq->dl.dl_nr_running > 0)
1567 cpudl_set(&rq->rd->cpudl, rq->cpu, rq->dl.earliest_dl.curr, 1); 1571 cpudl_set(&rq->rd->cpudl, rq->cpu, rq->dl.earliest_dl.curr, 1);
1568} 1572}
@@ -1574,6 +1578,7 @@ static void rq_offline_dl(struct rq *rq)
1574 dl_clear_overload(rq); 1578 dl_clear_overload(rq);
1575 1579
1576 cpudl_set(&rq->rd->cpudl, rq->cpu, 0, 0); 1580 cpudl_set(&rq->rd->cpudl, rq->cpu, 0, 0);
1581 cpudl_clear_freecpu(&rq->rd->cpudl, rq->cpu);
1577} 1582}
1578 1583
1579void init_sched_dl_class(void) 1584void init_sched_dl_class(void)
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 92cc52001e74..8baaf858d25c 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -305,6 +305,7 @@ do { \
305 PN(next_balance); 305 PN(next_balance);
306 SEQ_printf(m, " .%-30s: %ld\n", "curr->pid", (long)(task_pid_nr(rq->curr))); 306 SEQ_printf(m, " .%-30s: %ld\n", "curr->pid", (long)(task_pid_nr(rq->curr)));
307 PN(clock); 307 PN(clock);
308 PN(clock_task);
308 P(cpu_load[0]); 309 P(cpu_load[0]);
309 P(cpu_load[1]); 310 P(cpu_load[1]);
310 P(cpu_load[2]); 311 P(cpu_load[2]);
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index fe331fc391f5..7ce18f3c097a 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -676,7 +676,6 @@ void init_task_runnable_average(struct task_struct *p)
676{ 676{
677 u32 slice; 677 u32 slice;
678 678
679 p->se.avg.decay_count = 0;
680 slice = sched_slice(task_cfs_rq(p), &p->se) >> 10; 679 slice = sched_slice(task_cfs_rq(p), &p->se) >> 10;
681 p->se.avg.runnable_avg_sum = slice; 680 p->se.avg.runnable_avg_sum = slice;
682 p->se.avg.runnable_avg_period = slice; 681 p->se.avg.runnable_avg_period = slice;
@@ -2574,11 +2573,11 @@ static inline u64 __synchronize_entity_decay(struct sched_entity *se)
2574 u64 decays = atomic64_read(&cfs_rq->decay_counter); 2573 u64 decays = atomic64_read(&cfs_rq->decay_counter);
2575 2574
2576 decays -= se->avg.decay_count; 2575 decays -= se->avg.decay_count;
2576 se->avg.decay_count = 0;
2577 if (!decays) 2577 if (!decays)
2578 return 0; 2578 return 0;
2579 2579
2580 se->avg.load_avg_contrib = decay_load(se->avg.load_avg_contrib, decays); 2580 se->avg.load_avg_contrib = decay_load(se->avg.load_avg_contrib, decays);
2581 se->avg.decay_count = 0;
2582 2581
2583 return decays; 2582 return decays;
2584} 2583}
@@ -5157,7 +5156,7 @@ static void yield_task_fair(struct rq *rq)
5157 * so we don't do microscopic update in schedule() 5156 * so we don't do microscopic update in schedule()
5158 * and double the fastpath cost. 5157 * and double the fastpath cost.
5159 */ 5158 */
5160 rq->skip_clock_update = 1; 5159 rq_clock_skip_update(rq, true);
5161 } 5160 }
5162 5161
5163 set_skip_buddy(se); 5162 set_skip_buddy(se);
@@ -5949,8 +5948,8 @@ static unsigned long scale_rt_capacity(int cpu)
5949 */ 5948 */
5950 age_stamp = ACCESS_ONCE(rq->age_stamp); 5949 age_stamp = ACCESS_ONCE(rq->age_stamp);
5951 avg = ACCESS_ONCE(rq->rt_avg); 5950 avg = ACCESS_ONCE(rq->rt_avg);
5951 delta = __rq_clock_broken(rq) - age_stamp;
5952 5952
5953 delta = rq_clock(rq) - age_stamp;
5954 if (unlikely(delta < 0)) 5953 if (unlikely(delta < 0))
5955 delta = 0; 5954 delta = 0;
5956 5955
diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c
index c47fce75e666..94b2d7b88a27 100644
--- a/kernel/sched/idle.c
+++ b/kernel/sched/idle.c
@@ -7,6 +7,7 @@
7#include <linux/tick.h> 7#include <linux/tick.h>
8#include <linux/mm.h> 8#include <linux/mm.h>
9#include <linux/stackprotector.h> 9#include <linux/stackprotector.h>
10#include <linux/suspend.h>
10 11
11#include <asm/tlb.h> 12#include <asm/tlb.h>
12 13
@@ -47,7 +48,8 @@ static inline int cpu_idle_poll(void)
47 rcu_idle_enter(); 48 rcu_idle_enter();
48 trace_cpu_idle_rcuidle(0, smp_processor_id()); 49 trace_cpu_idle_rcuidle(0, smp_processor_id());
49 local_irq_enable(); 50 local_irq_enable();
50 while (!tif_need_resched()) 51 while (!tif_need_resched() &&
52 (cpu_idle_force_poll || tick_check_broadcast_expired()))
51 cpu_relax(); 53 cpu_relax();
52 trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, smp_processor_id()); 54 trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, smp_processor_id());
53 rcu_idle_exit(); 55 rcu_idle_exit();
@@ -104,6 +106,21 @@ static void cpuidle_idle_call(void)
104 rcu_idle_enter(); 106 rcu_idle_enter();
105 107
106 /* 108 /*
109 * Suspend-to-idle ("freeze") is a system state in which all user space
110 * has been frozen, all I/O devices have been suspended and the only
111 * activity happens here and in interrupts (if any). In that case bypass
112 * the cpuidle governor and go straight for the deepest idle state
113 * available. Possibly also suspend the local tick and the entire
114 * timekeeping to prevent timer interrupts from kicking us out of idle
115 * until a proper wakeup interrupt happens.

116 */
117 if (idle_should_freeze()) {
118 cpuidle_enter_freeze();
119 local_irq_enable();
120 goto exit_idle;
121 }
122
123 /*
107 * Ask the cpuidle framework to choose a convenient idle state. 124 * Ask the cpuidle framework to choose a convenient idle state.
108 * Fall back to the default arch idle method on errors. 125 * Fall back to the default arch idle method on errors.
109 */ 126 */
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index ee15f5a0d1c1..f4d4b077eba0 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -831,11 +831,14 @@ static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun)
831 enqueue = 1; 831 enqueue = 1;
832 832
833 /* 833 /*
834 * Force a clock update if the CPU was idle, 834 * When we're idle and a woken (rt) task is
835 * lest wakeup -> unthrottle time accumulate. 835 * throttled check_preempt_curr() will set
836 * skip_update and the time between the wakeup
837 * and this unthrottle will get accounted as
838 * 'runtime'.
836 */ 839 */
837 if (rt_rq->rt_nr_running && rq->curr == rq->idle) 840 if (rt_rq->rt_nr_running && rq->curr == rq->idle)
838 rq->skip_clock_update = -1; 841 rq_clock_skip_update(rq, false);
839 } 842 }
840 if (rt_rq->rt_time || rt_rq->rt_nr_running) 843 if (rt_rq->rt_time || rt_rq->rt_nr_running)
841 idle = 0; 844 idle = 0;
@@ -1337,7 +1340,12 @@ select_task_rq_rt(struct task_struct *p, int cpu, int sd_flag, int flags)
1337 curr->prio <= p->prio)) { 1340 curr->prio <= p->prio)) {
1338 int target = find_lowest_rq(p); 1341 int target = find_lowest_rq(p);
1339 1342
1340 if (target != -1) 1343 /*
1344 * Don't bother moving it if the destination CPU is
1345 * not running a lower priority task.
1346 */
1347 if (target != -1 &&
1348 p->prio < cpu_rq(target)->rt.highest_prio.curr)
1341 cpu = target; 1349 cpu = target;
1342 } 1350 }
1343 rcu_read_unlock(); 1351 rcu_read_unlock();
@@ -1614,6 +1622,16 @@ static struct rq *find_lock_lowest_rq(struct task_struct *task, struct rq *rq)
1614 1622
1615 lowest_rq = cpu_rq(cpu); 1623 lowest_rq = cpu_rq(cpu);
1616 1624
1625 if (lowest_rq->rt.highest_prio.curr <= task->prio) {
1626 /*
1627 * Target rq has tasks of equal or higher priority,
1628 * retrying does not release any lock and is unlikely
1629 * to yield a different result.
1630 */
1631 lowest_rq = NULL;
1632 break;
1633 }
1634
1617 /* if the prio of this runqueue changed, try again */ 1635 /* if the prio of this runqueue changed, try again */
1618 if (double_lock_balance(rq, lowest_rq)) { 1636 if (double_lock_balance(rq, lowest_rq)) {
1619 /* 1637 /*
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 9a2a45c970e7..0870db23d79c 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -558,8 +558,6 @@ struct rq {
558#ifdef CONFIG_NO_HZ_FULL 558#ifdef CONFIG_NO_HZ_FULL
559 unsigned long last_sched_tick; 559 unsigned long last_sched_tick;
560#endif 560#endif
561 int skip_clock_update;
562
563 /* capture load from *all* tasks on this cpu: */ 561 /* capture load from *all* tasks on this cpu: */
564 struct load_weight load; 562 struct load_weight load;
565 unsigned long nr_load_updates; 563 unsigned long nr_load_updates;
@@ -588,6 +586,7 @@ struct rq {
588 unsigned long next_balance; 586 unsigned long next_balance;
589 struct mm_struct *prev_mm; 587 struct mm_struct *prev_mm;
590 588
589 unsigned int clock_skip_update;
591 u64 clock; 590 u64 clock;
592 u64 clock_task; 591 u64 clock_task;
593 592
@@ -687,16 +686,35 @@ DECLARE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
687#define cpu_curr(cpu) (cpu_rq(cpu)->curr) 686#define cpu_curr(cpu) (cpu_rq(cpu)->curr)
688#define raw_rq() raw_cpu_ptr(&runqueues) 687#define raw_rq() raw_cpu_ptr(&runqueues)
689 688
689static inline u64 __rq_clock_broken(struct rq *rq)
690{
691 return ACCESS_ONCE(rq->clock);
692}
693
690static inline u64 rq_clock(struct rq *rq) 694static inline u64 rq_clock(struct rq *rq)
691{ 695{
696 lockdep_assert_held(&rq->lock);
692 return rq->clock; 697 return rq->clock;
693} 698}
694 699
695static inline u64 rq_clock_task(struct rq *rq) 700static inline u64 rq_clock_task(struct rq *rq)
696{ 701{
702 lockdep_assert_held(&rq->lock);
697 return rq->clock_task; 703 return rq->clock_task;
698} 704}
699 705
706#define RQCF_REQ_SKIP 0x01
707#define RQCF_ACT_SKIP 0x02
708
709static inline void rq_clock_skip_update(struct rq *rq, bool skip)
710{
711 lockdep_assert_held(&rq->lock);
712 if (skip)
713 rq->clock_skip_update |= RQCF_REQ_SKIP;
714 else
715 rq->clock_skip_update &= ~RQCF_REQ_SKIP;
716}
717
700#ifdef CONFIG_NUMA 718#ifdef CONFIG_NUMA
701enum numa_topology_type { 719enum numa_topology_type {
702 NUMA_DIRECT, 720 NUMA_DIRECT,
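The new rq_clock_skip_update() keeps a request bit in clock_skip_update instead of the old tri-state integer. A small sketch of the same two-flag scheme; the RQCF_* names come from the hunk above, the rest (rq_model, the promote step) is invented for illustration:

#include <stdio.h>

#define RQCF_REQ_SKIP 0x01   /* someone asked to skip the next clock update */
#define RQCF_ACT_SKIP 0x02   /* the skip is actually in effect */

struct rq_model { unsigned int clock_skip_update; unsigned long long clock; };

static void rq_clock_skip_update(struct rq_model *rq, int skip)
{
	if (skip)
		rq->clock_skip_update |= RQCF_REQ_SKIP;
	else
		rq->clock_skip_update &= ~RQCF_REQ_SKIP;
}

/* roughly what the clock update can now do: honour an active skip,
 * otherwise advance the clock */
static void update_rq_clock(struct rq_model *rq, unsigned long long now)
{
	if (rq->clock_skip_update & RQCF_ACT_SKIP)
		return;
	rq->clock = now;
}

int main(void)
{
	struct rq_model rq = { 0, 0 };

	rq_clock_skip_update(&rq, 1);   /* request a skip */
	rq.clock_skip_update <<= 1;     /* promote REQ to ACT, roughly what the
	                                 * scheduler core does around __schedule() */
	update_rq_clock(&rq, 12345);
	printf("clock=%llu (update skipped)\n", rq.clock);
	return 0;
}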
diff --git a/kernel/sched/stats.c b/kernel/sched/stats.c
index a476bea17fbc..87e2c9f0c33e 100644
--- a/kernel/sched/stats.c
+++ b/kernel/sched/stats.c
@@ -15,11 +15,6 @@
15static int show_schedstat(struct seq_file *seq, void *v) 15static int show_schedstat(struct seq_file *seq, void *v)
16{ 16{
17 int cpu; 17 int cpu;
18 int mask_len = DIV_ROUND_UP(NR_CPUS, 32) * 9;
19 char *mask_str = kmalloc(mask_len, GFP_KERNEL);
20
21 if (mask_str == NULL)
22 return -ENOMEM;
23 18
24 if (v == (void *)1) { 19 if (v == (void *)1) {
25 seq_printf(seq, "version %d\n", SCHEDSTAT_VERSION); 20 seq_printf(seq, "version %d\n", SCHEDSTAT_VERSION);
@@ -50,9 +45,8 @@ static int show_schedstat(struct seq_file *seq, void *v)
50 for_each_domain(cpu, sd) { 45 for_each_domain(cpu, sd) {
51 enum cpu_idle_type itype; 46 enum cpu_idle_type itype;
52 47
53 cpumask_scnprintf(mask_str, mask_len, 48 seq_printf(seq, "domain%d %*pb", dcount++,
54 sched_domain_span(sd)); 49 cpumask_pr_args(sched_domain_span(sd)));
55 seq_printf(seq, "domain%d %s", dcount++, mask_str);
56 for (itype = CPU_IDLE; itype < CPU_MAX_IDLE_TYPES; 50 for (itype = CPU_IDLE; itype < CPU_MAX_IDLE_TYPES;
57 itype++) { 51 itype++) {
58 seq_printf(seq, " %u %u %u %u %u %u %u %u", 52 seq_printf(seq, " %u %u %u %u %u %u %u %u",
@@ -76,7 +70,6 @@ static int show_schedstat(struct seq_file *seq, void *v)
76 rcu_read_unlock(); 70 rcu_read_unlock();
77#endif 71#endif
78 } 72 }
79 kfree(mask_str);
80 return 0; 73 return 0;
81} 74}
82 75
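show_schedstat() now prints the domain span with the "%*pb" bitmap specifier instead of a kmalloc'd scratch buffer. That specifier is kernel-internal (vsprintf's bitmap extension); a userspace approximation of its output for a bitmap whose size is a multiple of 32 — 32-bit hex chunks, most significant first, separated by commas — might look like this (illustrative only):

#include <stdio.h>
#include <stdint.h>

static void print_bitmap32(const uint32_t *words, int nwords)
{
	int i;

	/* most significant 32-bit chunk first, comma-separated */
	for (i = nwords - 1; i >= 0; i--)
		printf("%08x%s", words[i], i ? "," : "\n");
}

int main(void)
{
	/* CPUs 0-47 set out of 64 */
	uint32_t mask[2] = { 0xffffffffu, 0x0000ffffu };

	print_bitmap32(mask, 2);   /* prints: 0000ffff,ffffffff */
	return 0;
}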
diff --git a/kernel/seccomp.c b/kernel/seccomp.c
index 4ef9687ac115..4f44028943e6 100644
--- a/kernel/seccomp.c
+++ b/kernel/seccomp.c
@@ -629,7 +629,9 @@ static u32 __seccomp_phase1_filter(int this_syscall, struct seccomp_data *sd)
629 629
630 switch (action) { 630 switch (action) {
631 case SECCOMP_RET_ERRNO: 631 case SECCOMP_RET_ERRNO:
632 /* Set the low-order 16-bits as a errno. */ 632 /* Set low-order bits as an errno, capped at MAX_ERRNO. */
633 if (data > MAX_ERRNO)
634 data = MAX_ERRNO;
633 syscall_set_return_value(current, task_pt_regs(current), 635 syscall_set_return_value(current, task_pt_regs(current),
634 -data, 0); 636 -data, 0);
635 goto skip; 637 goto skip;
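Capping the filter-supplied value at MAX_ERRNO matters because the kernel encodes errors as small negative numbers (1..4095); a larger value negated would fall outside the error range and be misinterpreted by IS_ERR()-style checks. A tiny standalone illustration of the fixed path — MAX_ERRNO is 4095 in the kernel, the helper name is made up:

#include <stdio.h>
#include <stdint.h>

#define MAX_ERRNO 4095   /* same bound the kernel uses for IS_ERR_VALUE() */

/* Model of the SECCOMP_RET_ERRNO path: clamp the filter's data field before
 * negating it into a syscall return value. */
static long seccomp_errno_retval(uint32_t data)
{
	if (data > MAX_ERRNO)
		data = MAX_ERRNO;
	return -(long)data;
}

int main(void)
{
	printf("%ld\n", seccomp_errno_retval(1));       /* -1 (EPERM)       */
	printf("%ld\n", seccomp_errno_retval(0xffff));  /* clamped to -4095 */
	return 0;
}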
diff --git a/kernel/signal.c b/kernel/signal.c
index 16a305295256..a390499943e4 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -2501,7 +2501,7 @@ EXPORT_SYMBOL(unblock_all_signals);
2501 */ 2501 */
2502SYSCALL_DEFINE0(restart_syscall) 2502SYSCALL_DEFINE0(restart_syscall)
2503{ 2503{
2504 struct restart_block *restart = &current_thread_info()->restart_block; 2504 struct restart_block *restart = &current->restart_block;
2505 return restart->fn(restart); 2505 return restart->fn(restart);
2506} 2506}
2507 2507
@@ -3550,7 +3550,7 @@ SYSCALL_DEFINE2(signal, int, sig, __sighandler_t, handler)
3550SYSCALL_DEFINE0(pause) 3550SYSCALL_DEFINE0(pause)
3551{ 3551{
3552 while (!signal_pending(current)) { 3552 while (!signal_pending(current)) {
3553 current->state = TASK_INTERRUPTIBLE; 3553 __set_current_state(TASK_INTERRUPTIBLE);
3554 schedule(); 3554 schedule();
3555 } 3555 }
3556 return -ERESTARTNOHAND; 3556 return -ERESTARTNOHAND;
@@ -3563,7 +3563,7 @@ int sigsuspend(sigset_t *set)
3563 current->saved_sigmask = current->blocked; 3563 current->saved_sigmask = current->blocked;
3564 set_current_blocked(set); 3564 set_current_blocked(set);
3565 3565
3566 current->state = TASK_INTERRUPTIBLE; 3566 __set_current_state(TASK_INTERRUPTIBLE);
3567 schedule(); 3567 schedule();
3568 set_restore_sigmask(); 3568 set_restore_sigmask();
3569 return -ERESTARTNOHAND; 3569 return -ERESTARTNOHAND;
diff --git a/kernel/softirq.c b/kernel/softirq.c
index 501baa9ac1be..479e4436f787 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -114,8 +114,12 @@ void __local_bh_disable_ip(unsigned long ip, unsigned int cnt)
114 trace_softirqs_off(ip); 114 trace_softirqs_off(ip);
115 raw_local_irq_restore(flags); 115 raw_local_irq_restore(flags);
116 116
117 if (preempt_count() == cnt) 117 if (preempt_count() == cnt) {
118#ifdef CONFIG_DEBUG_PREEMPT
119 current->preempt_disable_ip = get_parent_ip(CALLER_ADDR1);
120#endif
118 trace_preempt_off(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1)); 121 trace_preempt_off(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1));
122 }
119} 123}
120EXPORT_SYMBOL(__local_bh_disable_ip); 124EXPORT_SYMBOL(__local_bh_disable_ip);
121#endif /* CONFIG_TRACE_IRQFLAGS */ 125#endif /* CONFIG_TRACE_IRQFLAGS */
@@ -656,9 +660,8 @@ static void run_ksoftirqd(unsigned int cpu)
656 * in the task stack here. 660 * in the task stack here.
657 */ 661 */
658 __do_softirq(); 662 __do_softirq();
659 rcu_note_context_switch();
660 local_irq_enable(); 663 local_irq_enable();
661 cond_resched(); 664 cond_resched_rcu_qs();
662 return; 665 return;
663 } 666 }
664 local_irq_enable(); 667 local_irq_enable();
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 137c7f69b264..88ea2d6e0031 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -1248,7 +1248,6 @@ static struct ctl_table vm_table[] = {
1248 .maxlen = sizeof(unsigned long), 1248 .maxlen = sizeof(unsigned long),
1249 .mode = 0644, 1249 .mode = 0644,
1250 .proc_handler = hugetlb_sysctl_handler, 1250 .proc_handler = hugetlb_sysctl_handler,
1251 .extra1 = &zero,
1252 }, 1251 },
1253#ifdef CONFIG_NUMA 1252#ifdef CONFIG_NUMA
1254 { 1253 {
@@ -1257,7 +1256,6 @@ static struct ctl_table vm_table[] = {
1257 .maxlen = sizeof(unsigned long), 1256 .maxlen = sizeof(unsigned long),
1258 .mode = 0644, 1257 .mode = 0644,
1259 .proc_handler = &hugetlb_mempolicy_sysctl_handler, 1258 .proc_handler = &hugetlb_mempolicy_sysctl_handler,
1260 .extra1 = &zero,
1261 }, 1259 },
1262#endif 1260#endif
1263 { 1261 {
@@ -1280,7 +1278,6 @@ static struct ctl_table vm_table[] = {
1280 .maxlen = sizeof(unsigned long), 1278 .maxlen = sizeof(unsigned long),
1281 .mode = 0644, 1279 .mode = 0644,
1282 .proc_handler = hugetlb_overcommit_handler, 1280 .proc_handler = hugetlb_overcommit_handler,
1283 .extra1 = &zero,
1284 }, 1281 },
1285#endif 1282#endif
1286 { 1283 {
diff --git a/kernel/taskstats.c b/kernel/taskstats.c
index 670fff88a961..21f82c29c914 100644
--- a/kernel/taskstats.c
+++ b/kernel/taskstats.c
@@ -111,13 +111,8 @@ static int send_reply(struct sk_buff *skb, struct genl_info *info)
111{ 111{
112 struct genlmsghdr *genlhdr = nlmsg_data(nlmsg_hdr(skb)); 112 struct genlmsghdr *genlhdr = nlmsg_data(nlmsg_hdr(skb));
113 void *reply = genlmsg_data(genlhdr); 113 void *reply = genlmsg_data(genlhdr);
114 int rc;
115 114
116 rc = genlmsg_end(skb, reply); 115 genlmsg_end(skb, reply);
117 if (rc < 0) {
118 nlmsg_free(skb);
119 return rc;
120 }
121 116
122 return genlmsg_reply(skb, info); 117 return genlmsg_reply(skb, info);
123} 118}
@@ -134,11 +129,7 @@ static void send_cpu_listeners(struct sk_buff *skb,
134 void *reply = genlmsg_data(genlhdr); 129 void *reply = genlmsg_data(genlhdr);
135 int rc, delcount = 0; 130 int rc, delcount = 0;
136 131
137 rc = genlmsg_end(skb, reply); 132 genlmsg_end(skb, reply);
138 if (rc < 0) {
139 nlmsg_free(skb);
140 return;
141 }
142 133
143 rc = 0; 134 rc = 0;
144 down_read(&listeners->sem); 135 down_read(&listeners->sem);
diff --git a/kernel/time/Makefile b/kernel/time/Makefile
index f622cf28628a..c09c07817d7a 100644
--- a/kernel/time/Makefile
+++ b/kernel/time/Makefile
@@ -1,6 +1,6 @@
1obj-y += time.o timer.o hrtimer.o itimer.o posix-timers.o posix-cpu-timers.o 1obj-y += time.o timer.o hrtimer.o itimer.o posix-timers.o posix-cpu-timers.o
2obj-y += timekeeping.o ntp.o clocksource.o jiffies.o timer_list.o 2obj-y += timekeeping.o ntp.o clocksource.o jiffies.o timer_list.o
3obj-y += timeconv.o posix-clock.o alarmtimer.o 3obj-y += timeconv.o timecounter.o posix-clock.o alarmtimer.o
4 4
5obj-$(CONFIG_GENERIC_CLOCKEVENTS_BUILD) += clockevents.o 5obj-$(CONFIG_GENERIC_CLOCKEVENTS_BUILD) += clockevents.o
6obj-$(CONFIG_GENERIC_CLOCKEVENTS) += tick-common.o 6obj-$(CONFIG_GENERIC_CLOCKEVENTS) += tick-common.o
diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c
index a7077d3ae52f..1b001ed1edb9 100644
--- a/kernel/time/alarmtimer.c
+++ b/kernel/time/alarmtimer.c
@@ -788,7 +788,7 @@ static int alarm_timer_nsleep(const clockid_t which_clock, int flags,
788 goto out; 788 goto out;
789 } 789 }
790 790
791 restart = &current_thread_info()->restart_block; 791 restart = &current->restart_block;
792 restart->fn = alarm_timer_nsleep_restart; 792 restart->fn = alarm_timer_nsleep_restart;
793 restart->nanosleep.clockid = type; 793 restart->nanosleep.clockid = type;
794 restart->nanosleep.expires = exp.tv64; 794 restart->nanosleep.expires = exp.tv64;
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index b79f39bda7e1..4892352f0e49 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -34,82 +34,6 @@
34#include "tick-internal.h" 34#include "tick-internal.h"
35#include "timekeeping_internal.h" 35#include "timekeeping_internal.h"
36 36
37void timecounter_init(struct timecounter *tc,
38 const struct cyclecounter *cc,
39 u64 start_tstamp)
40{
41 tc->cc = cc;
42 tc->cycle_last = cc->read(cc);
43 tc->nsec = start_tstamp;
44}
45EXPORT_SYMBOL_GPL(timecounter_init);
46
47/**
48 * timecounter_read_delta - get nanoseconds since last call of this function
49 * @tc: Pointer to time counter
50 *
51 * When the underlying cycle counter runs over, this will be handled
52 * correctly as long as it does not run over more than once between
53 * calls.
54 *
55 * The first call to this function for a new time counter initializes
56 * the time tracking and returns an undefined result.
57 */
58static u64 timecounter_read_delta(struct timecounter *tc)
59{
60 cycle_t cycle_now, cycle_delta;
61 u64 ns_offset;
62
63 /* read cycle counter: */
64 cycle_now = tc->cc->read(tc->cc);
65
66 /* calculate the delta since the last timecounter_read_delta(): */
67 cycle_delta = (cycle_now - tc->cycle_last) & tc->cc->mask;
68
69 /* convert to nanoseconds: */
70 ns_offset = cyclecounter_cyc2ns(tc->cc, cycle_delta);
71
72 /* update time stamp of timecounter_read_delta() call: */
73 tc->cycle_last = cycle_now;
74
75 return ns_offset;
76}
77
78u64 timecounter_read(struct timecounter *tc)
79{
80 u64 nsec;
81
82 /* increment time by nanoseconds since last call */
83 nsec = timecounter_read_delta(tc);
84 nsec += tc->nsec;
85 tc->nsec = nsec;
86
87 return nsec;
88}
89EXPORT_SYMBOL_GPL(timecounter_read);
90
91u64 timecounter_cyc2time(struct timecounter *tc,
92 cycle_t cycle_tstamp)
93{
94 u64 cycle_delta = (cycle_tstamp - tc->cycle_last) & tc->cc->mask;
95 u64 nsec;
96
97 /*
98 * Instead of always treating cycle_tstamp as more recent
99 * than tc->cycle_last, detect when it is too far in the
100 * future and treat it as old time stamp instead.
101 */
102 if (cycle_delta > tc->cc->mask / 2) {
103 cycle_delta = (tc->cycle_last - cycle_tstamp) & tc->cc->mask;
104 nsec = tc->nsec - cyclecounter_cyc2ns(tc->cc, cycle_delta);
105 } else {
106 nsec = cyclecounter_cyc2ns(tc->cc, cycle_delta) + tc->nsec;
107 }
108
109 return nsec;
110}
111EXPORT_SYMBOL_GPL(timecounter_cyc2time);
112
113/** 37/**
114 * clocks_calc_mult_shift - calculate mult/shift factors for scaled math of clocks 38 * clocks_calc_mult_shift - calculate mult/shift factors for scaled math of clocks
115 * @mult: pointer to mult variable 39 * @mult: pointer to mult variable
diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c
index d8c724cda37b..bee0c1f78091 100644
--- a/kernel/time/hrtimer.c
+++ b/kernel/time/hrtimer.c
@@ -266,7 +266,7 @@ lock_hrtimer_base(const struct hrtimer *timer, unsigned long *flags)
266/* 266/*
267 * Divide a ktime value by a nanosecond value 267 * Divide a ktime value by a nanosecond value
268 */ 268 */
269u64 ktime_divns(const ktime_t kt, s64 div) 269u64 __ktime_divns(const ktime_t kt, s64 div)
270{ 270{
271 u64 dclc; 271 u64 dclc;
272 int sft = 0; 272 int sft = 0;
@@ -282,7 +282,7 @@ u64 ktime_divns(const ktime_t kt, s64 div)
282 282
283 return dclc; 283 return dclc;
284} 284}
285EXPORT_SYMBOL_GPL(ktime_divns); 285EXPORT_SYMBOL_GPL(__ktime_divns);
286#endif /* BITS_PER_LONG >= 64 */ 286#endif /* BITS_PER_LONG >= 64 */
287 287
288/* 288/*
@@ -440,6 +440,37 @@ static inline void debug_deactivate(struct hrtimer *timer)
440 trace_hrtimer_cancel(timer); 440 trace_hrtimer_cancel(timer);
441} 441}
442 442
443#if defined(CONFIG_NO_HZ_COMMON) || defined(CONFIG_HIGH_RES_TIMERS)
444static ktime_t __hrtimer_get_next_event(struct hrtimer_cpu_base *cpu_base)
445{
446 struct hrtimer_clock_base *base = cpu_base->clock_base;
447 ktime_t expires, expires_next = { .tv64 = KTIME_MAX };
448 int i;
449
450 for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++, base++) {
451 struct timerqueue_node *next;
452 struct hrtimer *timer;
453
454 next = timerqueue_getnext(&base->active);
455 if (!next)
456 continue;
457
458 timer = container_of(next, struct hrtimer, node);
459 expires = ktime_sub(hrtimer_get_expires(timer), base->offset);
460 if (expires.tv64 < expires_next.tv64)
461 expires_next = expires;
462 }
463 /*
464 * clock_was_set() might have changed base->offset of any of
465 * the clock bases so the result might be negative. Fix it up
466 * to prevent a false positive in clockevents_program_event().
467 */
468 if (expires_next.tv64 < 0)
469 expires_next.tv64 = 0;
470 return expires_next;
471}
472#endif
473
443/* High resolution timer related functions */ 474/* High resolution timer related functions */
444#ifdef CONFIG_HIGH_RES_TIMERS 475#ifdef CONFIG_HIGH_RES_TIMERS
445 476
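__hrtimer_get_next_event() above factors out the "earliest expiry across all clock bases" scan so the interrupt path and the reprogram path can share it. A freestanding sketch of the same scan with invented types (clock_base, signed-nanosecond expiries standing in for ktime_t):

#include <stdio.h>
#include <stdint.h>

#define KTIME_MAX INT64_MAX

/* Illustrative stand-in for an hrtimer clock base: the soonest armed timer
 * (absolute, in that base's time domain) and the base's offset from the
 * monotonic clock. A base with nothing queued uses next_expiry = KTIME_MAX. */
struct clock_base {
	int64_t next_expiry;
	int64_t offset;
};

static int64_t next_event(const struct clock_base *base, int n)
{
	int64_t expires_next = KTIME_MAX;
	int i;

	for (i = 0; i < n; i++) {
		int64_t expires;

		if (base[i].next_expiry == KTIME_MAX)   /* nothing queued here */
			continue;
		expires = base[i].next_expiry - base[i].offset;
		if (expires < expires_next)
			expires_next = expires;
	}
	/* offsets may have moved (clock_was_set()); never report a negative
	 * expiry or the clockevent programming would trip over it */
	if (expires_next < 0)
		expires_next = 0;
	return expires_next;
}

int main(void)
{
	struct clock_base bases[3] = {
		{ 1000000, 0 }, { KTIME_MAX, 0 }, { 900000, 50000 },
	};

	printf("next event in %lld ns\n", (long long)next_event(bases, 3));
	return 0;
}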
@@ -488,32 +519,7 @@ static inline int hrtimer_hres_active(void)
488static void 519static void
489hrtimer_force_reprogram(struct hrtimer_cpu_base *cpu_base, int skip_equal) 520hrtimer_force_reprogram(struct hrtimer_cpu_base *cpu_base, int skip_equal)
490{ 521{
491 int i; 522 ktime_t expires_next = __hrtimer_get_next_event(cpu_base);
492 struct hrtimer_clock_base *base = cpu_base->clock_base;
493 ktime_t expires, expires_next;
494
495 expires_next.tv64 = KTIME_MAX;
496
497 for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++, base++) {
498 struct hrtimer *timer;
499 struct timerqueue_node *next;
500
501 next = timerqueue_getnext(&base->active);
502 if (!next)
503 continue;
504 timer = container_of(next, struct hrtimer, node);
505
506 expires = ktime_sub(hrtimer_get_expires(timer), base->offset);
507 /*
508 * clock_was_set() has changed base->offset so the
509 * result might be negative. Fix it up to prevent a
510 * false positive in clockevents_program_event()
511 */
512 if (expires.tv64 < 0)
513 expires.tv64 = 0;
514 if (expires.tv64 < expires_next.tv64)
515 expires_next = expires;
516 }
517 523
518 if (skip_equal && expires_next.tv64 == cpu_base->expires_next.tv64) 524 if (skip_equal && expires_next.tv64 == cpu_base->expires_next.tv64)
519 return; 525 return;
@@ -587,6 +593,15 @@ static int hrtimer_reprogram(struct hrtimer *timer,
587 return 0; 593 return 0;
588 594
589 /* 595 /*
596 * When the target cpu of the timer is currently executing
597 * hrtimer_interrupt(), then we do not touch the clock event
598 * device. hrtimer_interrupt() will reevaluate all clock bases
599 * before reprogramming the device.
600 */
601 if (cpu_base->in_hrtirq)
602 return 0;
603
604 /*
590 * If a hang was detected in the last timer interrupt then we 605 * If a hang was detected in the last timer interrupt then we
591 * do not schedule a timer which is earlier than the expiry 606 * do not schedule a timer which is earlier than the expiry
592 * which we enforced in the hang detection. We want the system 607 * which we enforced in the hang detection. We want the system
@@ -1104,29 +1119,14 @@ EXPORT_SYMBOL_GPL(hrtimer_get_remaining);
1104ktime_t hrtimer_get_next_event(void) 1119ktime_t hrtimer_get_next_event(void)
1105{ 1120{
1106 struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases); 1121 struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases);
1107 struct hrtimer_clock_base *base = cpu_base->clock_base; 1122 ktime_t mindelta = { .tv64 = KTIME_MAX };
1108 ktime_t delta, mindelta = { .tv64 = KTIME_MAX };
1109 unsigned long flags; 1123 unsigned long flags;
1110 int i;
1111 1124
1112 raw_spin_lock_irqsave(&cpu_base->lock, flags); 1125 raw_spin_lock_irqsave(&cpu_base->lock, flags);
1113 1126
1114 if (!hrtimer_hres_active()) { 1127 if (!hrtimer_hres_active())
1115 for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++, base++) { 1128 mindelta = ktime_sub(__hrtimer_get_next_event(cpu_base),
1116 struct hrtimer *timer; 1129 ktime_get());
1117 struct timerqueue_node *next;
1118
1119 next = timerqueue_getnext(&base->active);
1120 if (!next)
1121 continue;
1122
1123 timer = container_of(next, struct hrtimer, node);
1124 delta.tv64 = hrtimer_get_expires_tv64(timer);
1125 delta = ktime_sub(delta, base->get_time());
1126 if (delta.tv64 < mindelta.tv64)
1127 mindelta.tv64 = delta.tv64;
1128 }
1129 }
1130 1130
1131 raw_spin_unlock_irqrestore(&cpu_base->lock, flags); 1131 raw_spin_unlock_irqrestore(&cpu_base->lock, flags);
1132 1132
@@ -1253,7 +1253,7 @@ void hrtimer_interrupt(struct clock_event_device *dev)
1253 raw_spin_lock(&cpu_base->lock); 1253 raw_spin_lock(&cpu_base->lock);
1254 entry_time = now = hrtimer_update_base(cpu_base); 1254 entry_time = now = hrtimer_update_base(cpu_base);
1255retry: 1255retry:
1256 expires_next.tv64 = KTIME_MAX; 1256 cpu_base->in_hrtirq = 1;
1257 /* 1257 /*
1258 * We set expires_next to KTIME_MAX here with cpu_base->lock 1258 * We set expires_next to KTIME_MAX here with cpu_base->lock
1259 * held to prevent that a timer is enqueued in our queue via 1259 * held to prevent that a timer is enqueued in our queue via
@@ -1291,28 +1291,20 @@ retry:
1291 * are right-of a not yet expired timer, because that 1291 * are right-of a not yet expired timer, because that
1292 * timer will have to trigger a wakeup anyway. 1292 * timer will have to trigger a wakeup anyway.
1293 */ 1293 */
1294 1294 if (basenow.tv64 < hrtimer_get_softexpires_tv64(timer))
1295 if (basenow.tv64 < hrtimer_get_softexpires_tv64(timer)) {
1296 ktime_t expires;
1297
1298 expires = ktime_sub(hrtimer_get_expires(timer),
1299 base->offset);
1300 if (expires.tv64 < 0)
1301 expires.tv64 = KTIME_MAX;
1302 if (expires.tv64 < expires_next.tv64)
1303 expires_next = expires;
1304 break; 1295 break;
1305 }
1306 1296
1307 __run_hrtimer(timer, &basenow); 1297 __run_hrtimer(timer, &basenow);
1308 } 1298 }
1309 } 1299 }
1310 1300 /* Reevaluate the clock bases for the next expiry */
1301 expires_next = __hrtimer_get_next_event(cpu_base);
1311 /* 1302 /*
1312 * Store the new expiry value so the migration code can verify 1303 * Store the new expiry value so the migration code can verify
1313 * against it. 1304 * against it.
1314 */ 1305 */
1315 cpu_base->expires_next = expires_next; 1306 cpu_base->expires_next = expires_next;
1307 cpu_base->in_hrtirq = 0;
1316 raw_spin_unlock(&cpu_base->lock); 1308 raw_spin_unlock(&cpu_base->lock);
1317 1309
1318 /* Reprogramming necessary ? */ 1310 /* Reprogramming necessary ? */
@@ -1591,7 +1583,7 @@ long hrtimer_nanosleep(struct timespec *rqtp, struct timespec __user *rmtp,
1591 goto out; 1583 goto out;
1592 } 1584 }
1593 1585
1594 restart = &current_thread_info()->restart_block; 1586 restart = &current->restart_block;
1595 restart->fn = hrtimer_nanosleep_restart; 1587 restart->fn = hrtimer_nanosleep_restart;
1596 restart->nanosleep.clockid = t.timer.base->clockid; 1588 restart->nanosleep.clockid = t.timer.base->clockid;
1597 restart->nanosleep.rmtp = rmtp; 1589 restart->nanosleep.rmtp = rmtp;
diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c
index 28bf91c60a0b..4b585e0fdd22 100644
--- a/kernel/time/ntp.c
+++ b/kernel/time/ntp.c
@@ -488,13 +488,13 @@ static void sync_cmos_clock(struct work_struct *work)
488 488
489 getnstimeofday64(&now); 489 getnstimeofday64(&now);
490 if (abs(now.tv_nsec - (NSEC_PER_SEC / 2)) <= tick_nsec * 5) { 490 if (abs(now.tv_nsec - (NSEC_PER_SEC / 2)) <= tick_nsec * 5) {
491 struct timespec adjust = timespec64_to_timespec(now); 491 struct timespec64 adjust = now;
492 492
493 fail = -ENODEV; 493 fail = -ENODEV;
494 if (persistent_clock_is_local) 494 if (persistent_clock_is_local)
495 adjust.tv_sec -= (sys_tz.tz_minuteswest * 60); 495 adjust.tv_sec -= (sys_tz.tz_minuteswest * 60);
496#ifdef CONFIG_GENERIC_CMOS_UPDATE 496#ifdef CONFIG_GENERIC_CMOS_UPDATE
497 fail = update_persistent_clock(adjust); 497 fail = update_persistent_clock(timespec64_to_timespec(adjust));
498#endif 498#endif
499#ifdef CONFIG_RTC_SYSTOHC 499#ifdef CONFIG_RTC_SYSTOHC
500 if (fail == -ENODEV) 500 if (fail == -ENODEV)
diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c
index a16b67859e2a..0075da74abf0 100644
--- a/kernel/time/posix-cpu-timers.c
+++ b/kernel/time/posix-cpu-timers.c
@@ -1334,8 +1334,7 @@ static long posix_cpu_nsleep_restart(struct restart_block *restart_block);
1334static int posix_cpu_nsleep(const clockid_t which_clock, int flags, 1334static int posix_cpu_nsleep(const clockid_t which_clock, int flags,
1335 struct timespec *rqtp, struct timespec __user *rmtp) 1335 struct timespec *rqtp, struct timespec __user *rmtp)
1336{ 1336{
1337 struct restart_block *restart_block = 1337 struct restart_block *restart_block = &current->restart_block;
1338 &current_thread_info()->restart_block;
1339 struct itimerspec it; 1338 struct itimerspec it;
1340 int error; 1339 int error;
1341 1340
diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c
index 7efeedf53ebd..f7c515595b42 100644
--- a/kernel/time/tick-common.c
+++ b/kernel/time/tick-common.c
@@ -394,6 +394,56 @@ void tick_resume(void)
394 } 394 }
395} 395}
396 396
397static DEFINE_RAW_SPINLOCK(tick_freeze_lock);
398static unsigned int tick_freeze_depth;
399
400/**
401 * tick_freeze - Suspend the local tick and (possibly) timekeeping.
402 *
403 * Check if this is the last online CPU executing the function and if so,
404 * suspend timekeeping. Otherwise suspend the local tick.
405 *
406 * Call with interrupts disabled. Must be balanced with %tick_unfreeze().
407 * Interrupts must not be enabled before the subsequent %tick_unfreeze().
408 */
409void tick_freeze(void)
410{
411 raw_spin_lock(&tick_freeze_lock);
412
413 tick_freeze_depth++;
414 if (tick_freeze_depth == num_online_cpus()) {
415 timekeeping_suspend();
416 } else {
417 tick_suspend();
418 tick_suspend_broadcast();
419 }
420
421 raw_spin_unlock(&tick_freeze_lock);
422}
423
424/**
425 * tick_unfreeze - Resume the local tick and (possibly) timekeeping.
426 *
427 * Check if this is the first CPU executing the function and if so, resume
428 * timekeeping. Otherwise resume the local tick.
429 *
430 * Call with interrupts disabled. Must be balanced with %tick_freeze().
431 * Interrupts must not be enabled after the preceding %tick_freeze().
432 */
433void tick_unfreeze(void)
434{
435 raw_spin_lock(&tick_freeze_lock);
436
437 if (tick_freeze_depth == num_online_cpus())
438 timekeeping_resume();
439 else
440 tick_resume();
441
442 tick_freeze_depth--;
443
444 raw_spin_unlock(&tick_freeze_lock);
445}
446
397/** 447/**
398 * tick_init - initialize the tick control 448 * tick_init - initialize the tick control
399 */ 449 */
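tick_freeze()/tick_unfreeze() count CPUs under a raw spinlock so that only the last CPU to freeze suspends timekeeping and only the first to unfreeze resumes it. The same bookkeeping, modeled in plain C with a counter; a pthread mutex stands in for the raw spinlock and the *_local()/*_global() hooks are invented:

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t freeze_lock = PTHREAD_MUTEX_INITIALIZER;
static unsigned int freeze_depth;
static const unsigned int num_cpus = 4;   /* stands in for num_online_cpus() */

static void suspend_local(int cpu)  { printf("cpu%d: local tick suspended\n", cpu); }
static void resume_local(int cpu)   { printf("cpu%d: local tick resumed\n", cpu); }
static void suspend_global(void)    { printf("timekeeping suspended\n"); }
static void resume_global(void)     { printf("timekeeping resumed\n"); }

static void tick_freeze(int cpu)
{
	pthread_mutex_lock(&freeze_lock);
	if (++freeze_depth == num_cpus)
		suspend_global();      /* last CPU in: stop the global clock */
	else
		suspend_local(cpu);    /* everyone else: only their own tick */
	pthread_mutex_unlock(&freeze_lock);
}

static void tick_unfreeze(int cpu)
{
	pthread_mutex_lock(&freeze_lock);
	if (freeze_depth == num_cpus)
		resume_global();       /* first CPU out restarts timekeeping */
	else
		resume_local(cpu);
	freeze_depth--;
	pthread_mutex_unlock(&freeze_lock);
}

int main(void)
{
	int cpu;

	for (cpu = 0; cpu < (int)num_cpus; cpu++)
		tick_freeze(cpu);
	for (cpu = 0; cpu < (int)num_cpus; cpu++)
		tick_unfreeze(cpu);
	return 0;
}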
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 1363d58f07e9..a4c4edac4528 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -326,13 +326,6 @@ static int tick_nohz_cpu_down_callback(struct notifier_block *nfb,
326 return NOTIFY_OK; 326 return NOTIFY_OK;
327} 327}
328 328
329/*
330 * Worst case string length in chunks of CPU range seems 2 steps
331 * separations: 0,2,4,6,...
332 * This is NR_CPUS + sizeof('\0')
333 */
334static char __initdata nohz_full_buf[NR_CPUS + 1];
335
336static int tick_nohz_init_all(void) 329static int tick_nohz_init_all(void)
337{ 330{
338 int err = -1; 331 int err = -1;
@@ -393,8 +386,8 @@ void __init tick_nohz_init(void)
393 context_tracking_cpu_set(cpu); 386 context_tracking_cpu_set(cpu);
394 387
395 cpu_notifier(tick_nohz_cpu_down_callback, 0); 388 cpu_notifier(tick_nohz_cpu_down_callback, 0);
396 cpulist_scnprintf(nohz_full_buf, sizeof(nohz_full_buf), tick_nohz_full_mask); 389 pr_info("NO_HZ: Full dynticks CPUs: %*pbl.\n",
397 pr_info("NO_HZ: Full dynticks CPUs: %s.\n", nohz_full_buf); 390 cpumask_pr_args(tick_nohz_full_mask));
398} 391}
399#endif 392#endif
400 393
diff --git a/kernel/time/timecounter.c b/kernel/time/timecounter.c
new file mode 100644
index 000000000000..4687b3104bae
--- /dev/null
+++ b/kernel/time/timecounter.c
@@ -0,0 +1,112 @@
1/*
2 * linux/kernel/time/timecounter.c
3 *
4 * based on code that migrated away from
5 * linux/kernel/time/clocksource.c
6 *
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation; either version 2 of the License, or
10 * (at your option) any later version.
11 *
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
16 */
17
18#include <linux/export.h>
19#include <linux/timecounter.h>
20
21void timecounter_init(struct timecounter *tc,
22 const struct cyclecounter *cc,
23 u64 start_tstamp)
24{
25 tc->cc = cc;
26 tc->cycle_last = cc->read(cc);
27 tc->nsec = start_tstamp;
28 tc->mask = (1ULL << cc->shift) - 1;
29 tc->frac = 0;
30}
31EXPORT_SYMBOL_GPL(timecounter_init);
32
33/**
34 * timecounter_read_delta - get nanoseconds since last call of this function
35 * @tc: Pointer to time counter
36 *
37 * When the underlying cycle counter runs over, this will be handled
38 * correctly as long as it does not run over more than once between
39 * calls.
40 *
41 * The first call to this function for a new time counter initializes
42 * the time tracking and returns an undefined result.
43 */
44static u64 timecounter_read_delta(struct timecounter *tc)
45{
46 cycle_t cycle_now, cycle_delta;
47 u64 ns_offset;
48
49 /* read cycle counter: */
50 cycle_now = tc->cc->read(tc->cc);
51
52 /* calculate the delta since the last timecounter_read_delta(): */
53 cycle_delta = (cycle_now - tc->cycle_last) & tc->cc->mask;
54
55 /* convert to nanoseconds: */
56 ns_offset = cyclecounter_cyc2ns(tc->cc, cycle_delta,
57 tc->mask, &tc->frac);
58
59 /* update time stamp of timecounter_read_delta() call: */
60 tc->cycle_last = cycle_now;
61
62 return ns_offset;
63}
64
65u64 timecounter_read(struct timecounter *tc)
66{
67 u64 nsec;
68
69 /* increment time by nanoseconds since last call */
70 nsec = timecounter_read_delta(tc);
71 nsec += tc->nsec;
72 tc->nsec = nsec;
73
74 return nsec;
75}
76EXPORT_SYMBOL_GPL(timecounter_read);
77
78/*
79 * This is like cyclecounter_cyc2ns(), but it is used for computing a
80 * time previous to the time stored in the cycle counter.
81 */
82static u64 cc_cyc2ns_backwards(const struct cyclecounter *cc,
83 cycle_t cycles, u64 mask, u64 frac)
84{
85 u64 ns = (u64) cycles;
86
87 ns = ((ns * cc->mult) - frac) >> cc->shift;
88
89 return ns;
90}
91
92u64 timecounter_cyc2time(struct timecounter *tc,
93 cycle_t cycle_tstamp)
94{
95 u64 delta = (cycle_tstamp - tc->cycle_last) & tc->cc->mask;
96 u64 nsec = tc->nsec, frac = tc->frac;
97
98 /*
99 * Instead of always treating cycle_tstamp as more recent
100 * than tc->cycle_last, detect when it is too far in the
101 * future and treat it as old time stamp instead.
102 */
103 if (delta > tc->cc->mask / 2) {
104 delta = (tc->cycle_last - cycle_tstamp) & tc->cc->mask;
105 nsec -= cc_cyc2ns_backwards(tc->cc, delta, tc->mask, frac);
106 } else {
107 nsec += cyclecounter_cyc2ns(tc->cc, delta, tc->mask, &frac);
108 }
109
110 return nsec;
111}
112EXPORT_SYMBOL_GPL(timecounter_cyc2time);
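The relocated timecounter now carries a fractional-nanosecond remainder (tc->frac, masked by tc->mask) so that repeated small deltas no longer each round down and silently lose time. A self-contained model of that multiply-shift conversion with the fraction carried across calls; the mult/shift values are arbitrary and the function name only mirrors the kernel's cyclecounter_cyc2ns():

#include <stdio.h>
#include <stdint.h>

/* ns = (cycles * mult) >> shift, keeping the bits that would otherwise be
 * shifted away in *frac so they contribute to the next conversion. */
static uint64_t cyc2ns(uint64_t cycles, uint32_t mult, uint32_t shift,
		       uint64_t mask, uint64_t *frac)
{
	uint64_t ns = cycles * mult + *frac;

	*frac = ns & mask;          /* remainder below the shift's granularity */
	return ns >> shift;
}

int main(void)
{
	/* pretend 3 cycles ~= 1 ns: mult / 2^shift = 341 / 1024 */
	uint32_t mult = 341, shift = 10;
	uint64_t mask = (1ULL << shift) - 1, frac = 0;
	uint64_t total = 0;
	int i;

	for (i = 0; i < 3000; i++)
		total += cyc2ns(1, mult, shift, mask, &frac);

	/* without the fraction every 1-cycle delta would truncate to 0 ns;
	 * with it the accumulated time comes out at 999 ns as expected */
	printf("accumulated %llu ns\n", (unsigned long long)total);
	return 0;
}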
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index 6a931852082f..91db94136c10 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -230,9 +230,7 @@ static inline s64 timekeeping_get_ns_raw(struct timekeeper *tk)
230 230
231/** 231/**
232 * update_fast_timekeeper - Update the fast and NMI safe monotonic timekeeper. 232 * update_fast_timekeeper - Update the fast and NMI safe monotonic timekeeper.
233 * @tk: The timekeeper from which we take the update 233 * @tkr: Timekeeping readout base from which we take the update
234 * @tkf: The fast timekeeper to update
235 * @tbase: The time base for the fast timekeeper (mono/raw)
236 * 234 *
237 * We want to use this from any context including NMI and tracing / 235 * We want to use this from any context including NMI and tracing /
238 * instrumenting the timekeeping code itself. 236 * instrumenting the timekeeping code itself.
@@ -244,11 +242,11 @@ static inline s64 timekeeping_get_ns_raw(struct timekeeper *tk)
244 * smp_wmb(); <- Ensure that the last base[1] update is visible 242 * smp_wmb(); <- Ensure that the last base[1] update is visible
245 * tkf->seq++; 243 * tkf->seq++;
246 * smp_wmb(); <- Ensure that the seqcount update is visible 244 * smp_wmb(); <- Ensure that the seqcount update is visible
247 * update(tkf->base[0], tk); 245 * update(tkf->base[0], tkr);
248 * smp_wmb(); <- Ensure that the base[0] update is visible 246 * smp_wmb(); <- Ensure that the base[0] update is visible
249 * tkf->seq++; 247 * tkf->seq++;
250 * smp_wmb(); <- Ensure that the seqcount update is visible 248 * smp_wmb(); <- Ensure that the seqcount update is visible
251 * update(tkf->base[1], tk); 249 * update(tkf->base[1], tkr);
252 * 250 *
253 * The reader side does: 251 * The reader side does:
254 * 252 *
@@ -269,7 +267,7 @@ static inline s64 timekeeping_get_ns_raw(struct timekeeper *tk)
269 * slightly wrong timestamp (a few nanoseconds). See 267 * slightly wrong timestamp (a few nanoseconds). See
270 * @ktime_get_mono_fast_ns. 268 * @ktime_get_mono_fast_ns.
271 */ 269 */
272static void update_fast_timekeeper(struct timekeeper *tk) 270static void update_fast_timekeeper(struct tk_read_base *tkr)
273{ 271{
274 struct tk_read_base *base = tk_fast_mono.base; 272 struct tk_read_base *base = tk_fast_mono.base;
275 273
@@ -277,7 +275,7 @@ static void update_fast_timekeeper(struct timekeeper *tk)
277 raw_write_seqcount_latch(&tk_fast_mono.seq); 275 raw_write_seqcount_latch(&tk_fast_mono.seq);
278 276
279 /* Update base[0] */ 277 /* Update base[0] */
280 memcpy(base, &tk->tkr, sizeof(*base)); 278 memcpy(base, tkr, sizeof(*base));
281 279
282 /* Force readers back to base[0] */ 280 /* Force readers back to base[0] */
283 raw_write_seqcount_latch(&tk_fast_mono.seq); 281 raw_write_seqcount_latch(&tk_fast_mono.seq);
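The dual-copy update in update_fast_timekeeper() follows the seqcount-latch pattern described in the comment above: the writer bumps the sequence, rewrites base[0], bumps again, rewrites base[1], and lock-free readers pick the copy selected by the low bit of the sequence. A reduced, single-threaded userspace model of that bookkeeping (C11 atomics stand in for the kernel's seqcount and barriers; a faithful concurrent version needs the kernel's exact ordering):

#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

struct base { uint64_t cycle_last, nsec; };

static _Atomic unsigned int seq;
static struct base bases[2];

/* writer: always update the copy readers are NOT currently using */
static void latch_update(const struct base *newb)
{
	atomic_fetch_add_explicit(&seq, 1, memory_order_release); /* odd: readers use base[1] */
	bases[0] = *newb;
	atomic_fetch_add_explicit(&seq, 1, memory_order_release); /* even: readers use base[0] */
	bases[1] = *newb;
}

/* reader: never blocks, retries if the sequence moved under it */
static struct base latch_read(void)
{
	unsigned int s;
	struct base b;

	do {
		s = atomic_load_explicit(&seq, memory_order_acquire);
		b = bases[s & 1];
	} while (atomic_load_explicit(&seq, memory_order_acquire) != s);
	return b;
}

int main(void)
{
	struct base b = { 123, 456 };

	latch_update(&b);
	b = latch_read();
	printf("cycle_last=%llu nsec=%llu\n",
	       (unsigned long long)b.cycle_last, (unsigned long long)b.nsec);
	return 0;
}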
@@ -334,6 +332,35 @@ u64 notrace ktime_get_mono_fast_ns(void)
334} 332}
335EXPORT_SYMBOL_GPL(ktime_get_mono_fast_ns); 333EXPORT_SYMBOL_GPL(ktime_get_mono_fast_ns);
336 334
335/* Suspend-time cycles value for halted fast timekeeper. */
336static cycle_t cycles_at_suspend;
337
338static cycle_t dummy_clock_read(struct clocksource *cs)
339{
340 return cycles_at_suspend;
341}
342
343/**
344 * halt_fast_timekeeper - Prevent fast timekeeper from accessing clocksource.
345 * @tk: Timekeeper to snapshot.
346 *
347 * It generally is unsafe to access the clocksource after timekeeping has been
348 * suspended, so take a snapshot of the readout base of @tk and use it as the
349 * fast timekeeper's readout base while suspended. It will return the same
350 * number of cycles every time until timekeeping is resumed at which time the
351 * proper readout base for the fast timekeeper will be restored automatically.
352 */
353static void halt_fast_timekeeper(struct timekeeper *tk)
354{
355 static struct tk_read_base tkr_dummy;
356 struct tk_read_base *tkr = &tk->tkr;
357
358 memcpy(&tkr_dummy, tkr, sizeof(tkr_dummy));
359 cycles_at_suspend = tkr->read(tkr->clock);
360 tkr_dummy.read = dummy_clock_read;
361 update_fast_timekeeper(&tkr_dummy);
362}
363
337#ifdef CONFIG_GENERIC_TIME_VSYSCALL_OLD 364#ifdef CONFIG_GENERIC_TIME_VSYSCALL_OLD
338 365
339static inline void update_vsyscall(struct timekeeper *tk) 366static inline void update_vsyscall(struct timekeeper *tk)
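halt_fast_timekeeper() works by snapshotting the readout base and pointing its read hook at dummy_clock_read(), which keeps returning the cycle value captured at suspend time. The trick is just a function-pointer swap; a compact illustration with invented names:

#include <stdio.h>
#include <stdint.h>

struct readout_base {
	uint64_t (*read)(void);   /* how to sample the clock hardware */
	uint64_t base_ns;
};

static uint64_t hw_counter;                   /* pretend hardware counter */
static uint64_t hw_read(void)     { return hw_counter; }

static uint64_t cycles_at_suspend;
static uint64_t frozen_read(void) { return cycles_at_suspend; }

static struct readout_base live = { hw_read, 0 };
static struct readout_base active;            /* what "fast" readers actually use */

static void halt_fast_clock(void)
{
	/* snapshot the live base, then make its read hook return a constant
	 * so readers no longer touch (possibly suspended) hardware */
	active = live;
	cycles_at_suspend = live.read();
	active.read = frozen_read;
}

int main(void)
{
	active = live;
	hw_counter = 100;
	printf("before suspend:  %llu\n", (unsigned long long)active.read());
	halt_fast_clock();
	hw_counter = 999;   /* the hardware moves on (or disappears) ... */
	printf("while suspended: %llu\n", (unsigned long long)active.read()); /* still 100 */
	return 0;
}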
@@ -462,7 +489,7 @@ static void timekeeping_update(struct timekeeper *tk, unsigned int action)
462 memcpy(&shadow_timekeeper, &tk_core.timekeeper, 489 memcpy(&shadow_timekeeper, &tk_core.timekeeper,
463 sizeof(tk_core.timekeeper)); 490 sizeof(tk_core.timekeeper));
464 491
465 update_fast_timekeeper(tk); 492 update_fast_timekeeper(&tk->tkr);
466} 493}
467 494
468/** 495/**
@@ -1170,7 +1197,7 @@ void timekeeping_inject_sleeptime64(struct timespec64 *delta)
1170 * xtime/wall_to_monotonic/jiffies/etc are 1197 * xtime/wall_to_monotonic/jiffies/etc are
1171 * still managed by arch specific suspend/resume code. 1198 * still managed by arch specific suspend/resume code.
1172 */ 1199 */
1173static void timekeeping_resume(void) 1200void timekeeping_resume(void)
1174{ 1201{
1175 struct timekeeper *tk = &tk_core.timekeeper; 1202 struct timekeeper *tk = &tk_core.timekeeper;
1176 struct clocksource *clock = tk->tkr.clock; 1203 struct clocksource *clock = tk->tkr.clock;
@@ -1251,7 +1278,7 @@ static void timekeeping_resume(void)
1251 hrtimers_resume(); 1278 hrtimers_resume();
1252} 1279}
1253 1280
1254static int timekeeping_suspend(void) 1281int timekeeping_suspend(void)
1255{ 1282{
1256 struct timekeeper *tk = &tk_core.timekeeper; 1283 struct timekeeper *tk = &tk_core.timekeeper;
1257 unsigned long flags; 1284 unsigned long flags;
@@ -1296,6 +1323,7 @@ static int timekeeping_suspend(void)
1296 } 1323 }
1297 1324
1298 timekeeping_update(tk, TK_MIRROR); 1325 timekeeping_update(tk, TK_MIRROR);
1326 halt_fast_timekeeper(tk);
1299 write_seqcount_end(&tk_core.seq); 1327 write_seqcount_end(&tk_core.seq);
1300 raw_spin_unlock_irqrestore(&timekeeper_lock, flags); 1328 raw_spin_unlock_irqrestore(&timekeeper_lock, flags);
1301 1329
@@ -1659,24 +1687,24 @@ out:
1659} 1687}
1660 1688
1661/** 1689/**
1662 * getboottime - Return the real time of system boot. 1690 * getboottime64 - Return the real time of system boot.
1663 * @ts: pointer to the timespec to be set 1691 * @ts: pointer to the timespec64 to be set
1664 * 1692 *
1665 * Returns the wall-time of boot in a timespec. 1693 * Returns the wall-time of boot in a timespec64.
1666 * 1694 *
1667 * This is based on the wall_to_monotonic offset and the total suspend 1695 * This is based on the wall_to_monotonic offset and the total suspend
1668 * time. Calls to settimeofday will affect the value returned (which 1696 * time. Calls to settimeofday will affect the value returned (which
1669 * basically means that however wrong your real time clock is at boot time, 1697 * basically means that however wrong your real time clock is at boot time,
1670 * you get the right time here). 1698 * you get the right time here).
1671 */ 1699 */
1672void getboottime(struct timespec *ts) 1700void getboottime64(struct timespec64 *ts)
1673{ 1701{
1674 struct timekeeper *tk = &tk_core.timekeeper; 1702 struct timekeeper *tk = &tk_core.timekeeper;
1675 ktime_t t = ktime_sub(tk->offs_real, tk->offs_boot); 1703 ktime_t t = ktime_sub(tk->offs_real, tk->offs_boot);
1676 1704
1677 *ts = ktime_to_timespec(t); 1705 *ts = ktime_to_timespec64(t);
1678} 1706}
1679EXPORT_SYMBOL_GPL(getboottime); 1707EXPORT_SYMBOL_GPL(getboottime64);
1680 1708
1681unsigned long get_seconds(void) 1709unsigned long get_seconds(void)
1682{ 1710{
diff --git a/kernel/time/timekeeping.h b/kernel/time/timekeeping.h
index adc1fc98bde3..1d91416055d5 100644
--- a/kernel/time/timekeeping.h
+++ b/kernel/time/timekeeping.h
@@ -16,5 +16,7 @@ extern int timekeeping_inject_offset(struct timespec *ts);
16extern s32 timekeeping_get_tai_offset(void); 16extern s32 timekeeping_get_tai_offset(void);
17extern void timekeeping_set_tai_offset(s32 tai_offset); 17extern void timekeeping_set_tai_offset(s32 tai_offset);
18extern void timekeeping_clocktai(struct timespec *ts); 18extern void timekeeping_clocktai(struct timespec *ts);
19extern int timekeeping_suspend(void);
20extern void timekeeping_resume(void);
19 21
20#endif 22#endif
diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile
index 979ccde26720..98f26588255e 100644
--- a/kernel/trace/Makefile
+++ b/kernel/trace/Makefile
@@ -3,11 +3,11 @@
3 3
4ifdef CONFIG_FUNCTION_TRACER 4ifdef CONFIG_FUNCTION_TRACER
5ORIG_CFLAGS := $(KBUILD_CFLAGS) 5ORIG_CFLAGS := $(KBUILD_CFLAGS)
6KBUILD_CFLAGS = $(subst -pg,,$(ORIG_CFLAGS)) 6KBUILD_CFLAGS = $(subst $(CC_FLAGS_FTRACE),,$(ORIG_CFLAGS))
7 7
8ifdef CONFIG_FTRACE_SELFTEST 8ifdef CONFIG_FTRACE_SELFTEST
9# selftest needs instrumentation 9# selftest needs instrumentation
10CFLAGS_trace_selftest_dynamic.o = -pg 10CFLAGS_trace_selftest_dynamic.o = $(CC_FLAGS_FTRACE)
11obj-y += trace_selftest_dynamic.o 11obj-y += trace_selftest_dynamic.o
12endif 12endif
13endif 13endif
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 224e768bdc73..45e5cb143d17 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -5456,7 +5456,7 @@ static __init int ftrace_init_debugfs(void)
5456 struct dentry *d_tracer; 5456 struct dentry *d_tracer;
5457 5457
5458 d_tracer = tracing_init_dentry(); 5458 d_tracer = tracing_init_dentry();
5459 if (!d_tracer) 5459 if (IS_ERR(d_tracer))
5460 return 0; 5460 return 0;
5461 5461
5462 ftrace_init_dyn_debugfs(d_tracer); 5462 ftrace_init_dyn_debugfs(d_tracer);
diff --git a/kernel/trace/power-traces.c b/kernel/trace/power-traces.c
index 1c71382b283d..eb4220a132ec 100644
--- a/kernel/trace/power-traces.c
+++ b/kernel/trace/power-traces.c
@@ -13,5 +13,6 @@
13#define CREATE_TRACE_POINTS 13#define CREATE_TRACE_POINTS
14#include <trace/events/power.h> 14#include <trace/events/power.h>
15 15
16EXPORT_TRACEPOINT_SYMBOL_GPL(suspend_resume);
16EXPORT_TRACEPOINT_SYMBOL_GPL(cpu_idle); 17EXPORT_TRACEPOINT_SYMBOL_GPL(cpu_idle);
17 18
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 7a4104cb95cb..5040d44fe5a3 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -9,7 +9,6 @@
9#include <linux/trace_seq.h> 9#include <linux/trace_seq.h>
10#include <linux/spinlock.h> 10#include <linux/spinlock.h>
11#include <linux/irq_work.h> 11#include <linux/irq_work.h>
12#include <linux/debugfs.h>
13#include <linux/uaccess.h> 12#include <linux/uaccess.h>
14#include <linux/hardirq.h> 13#include <linux/hardirq.h>
15#include <linux/kthread.h> /* for self test */ 14#include <linux/kthread.h> /* for self test */
@@ -23,7 +22,6 @@
23#include <linux/hash.h> 22#include <linux/hash.h>
24#include <linux/list.h> 23#include <linux/list.h>
25#include <linux/cpu.h> 24#include <linux/cpu.h>
26#include <linux/fs.h>
27 25
28#include <asm/local.h> 26#include <asm/local.h>
29 27
@@ -447,7 +445,10 @@ int ring_buffer_print_page_header(struct trace_seq *s)
447struct rb_irq_work { 445struct rb_irq_work {
448 struct irq_work work; 446 struct irq_work work;
449 wait_queue_head_t waiters; 447 wait_queue_head_t waiters;
448 wait_queue_head_t full_waiters;
450 bool waiters_pending; 449 bool waiters_pending;
450 bool full_waiters_pending;
451 bool wakeup_full;
451}; 452};
452 453
453/* 454/*
@@ -529,6 +530,10 @@ static void rb_wake_up_waiters(struct irq_work *work)
529 struct rb_irq_work *rbwork = container_of(work, struct rb_irq_work, work); 530 struct rb_irq_work *rbwork = container_of(work, struct rb_irq_work, work);
530 531
531 wake_up_all(&rbwork->waiters); 532 wake_up_all(&rbwork->waiters);
533 if (rbwork->wakeup_full) {
534 rbwork->wakeup_full = false;
535 wake_up_all(&rbwork->full_waiters);
536 }
532} 537}
533 538
534/** 539/**
@@ -553,9 +558,11 @@ int ring_buffer_wait(struct ring_buffer *buffer, int cpu, bool full)
553 * data in any cpu buffer, or a specific buffer, put the 558 * data in any cpu buffer, or a specific buffer, put the
554 * caller on the appropriate wait queue. 559 * caller on the appropriate wait queue.
555 */ 560 */
556 if (cpu == RING_BUFFER_ALL_CPUS) 561 if (cpu == RING_BUFFER_ALL_CPUS) {
557 work = &buffer->irq_work; 562 work = &buffer->irq_work;
558 else { 563 /* Full only makes sense on per cpu reads */
564 full = false;
565 } else {
559 if (!cpumask_test_cpu(cpu, buffer->cpumask)) 566 if (!cpumask_test_cpu(cpu, buffer->cpumask))
560 return -ENODEV; 567 return -ENODEV;
561 cpu_buffer = buffer->buffers[cpu]; 568 cpu_buffer = buffer->buffers[cpu];
@@ -564,7 +571,10 @@ int ring_buffer_wait(struct ring_buffer *buffer, int cpu, bool full)
564 571
565 572
566 while (true) { 573 while (true) {
567 prepare_to_wait(&work->waiters, &wait, TASK_INTERRUPTIBLE); 574 if (full)
575 prepare_to_wait(&work->full_waiters, &wait, TASK_INTERRUPTIBLE);
576 else
577 prepare_to_wait(&work->waiters, &wait, TASK_INTERRUPTIBLE);
568 578
569 /* 579 /*
570 * The events can happen in critical sections where 580 * The events can happen in critical sections where
@@ -586,7 +596,10 @@ int ring_buffer_wait(struct ring_buffer *buffer, int cpu, bool full)
586 * that is necessary is that the wake up happens after 596 * that is necessary is that the wake up happens after
587 * a task has been queued. It's OK for spurious wake ups. 597 * a task has been queued. It's OK for spurious wake ups.
588 */ 598 */
589 work->waiters_pending = true; 599 if (full)
600 work->full_waiters_pending = true;
601 else
602 work->waiters_pending = true;
590 603
591 if (signal_pending(current)) { 604 if (signal_pending(current)) {
592 ret = -EINTR; 605 ret = -EINTR;
@@ -615,7 +628,10 @@ int ring_buffer_wait(struct ring_buffer *buffer, int cpu, bool full)
615 schedule(); 628 schedule();
616 } 629 }
617 630
618 finish_wait(&work->waiters, &wait); 631 if (full)
632 finish_wait(&work->full_waiters, &wait);
633 else
634 finish_wait(&work->waiters, &wait);
619 635
620 return ret; 636 return ret;
621} 637}
@@ -1230,6 +1246,7 @@ rb_allocate_cpu_buffer(struct ring_buffer *buffer, int nr_pages, int cpu)
1230 init_completion(&cpu_buffer->update_done); 1246 init_completion(&cpu_buffer->update_done);
1231 init_irq_work(&cpu_buffer->irq_work.work, rb_wake_up_waiters); 1247 init_irq_work(&cpu_buffer->irq_work.work, rb_wake_up_waiters);
1232 init_waitqueue_head(&cpu_buffer->irq_work.waiters); 1248 init_waitqueue_head(&cpu_buffer->irq_work.waiters);
1249 init_waitqueue_head(&cpu_buffer->irq_work.full_waiters);
1233 1250
1234 bpage = kzalloc_node(ALIGN(sizeof(*bpage), cache_line_size()), 1251 bpage = kzalloc_node(ALIGN(sizeof(*bpage), cache_line_size()),
1235 GFP_KERNEL, cpu_to_node(cpu)); 1252 GFP_KERNEL, cpu_to_node(cpu));
@@ -2801,6 +2818,8 @@ static void rb_commit(struct ring_buffer_per_cpu *cpu_buffer,
2801static __always_inline void 2818static __always_inline void
2802rb_wakeups(struct ring_buffer *buffer, struct ring_buffer_per_cpu *cpu_buffer) 2819rb_wakeups(struct ring_buffer *buffer, struct ring_buffer_per_cpu *cpu_buffer)
2803{ 2820{
2821 bool pagebusy;
2822
2804 if (buffer->irq_work.waiters_pending) { 2823 if (buffer->irq_work.waiters_pending) {
2805 buffer->irq_work.waiters_pending = false; 2824 buffer->irq_work.waiters_pending = false;
2806 /* irq_work_queue() supplies it's own memory barriers */ 2825 /* irq_work_queue() supplies it's own memory barriers */
@@ -2812,6 +2831,15 @@ rb_wakeups(struct ring_buffer *buffer, struct ring_buffer_per_cpu *cpu_buffer)
2812 /* irq_work_queue() supplies it's own memory barriers */ 2831 /* irq_work_queue() supplies it's own memory barriers */
2813 irq_work_queue(&cpu_buffer->irq_work.work); 2832 irq_work_queue(&cpu_buffer->irq_work.work);
2814 } 2833 }
2834
2835 pagebusy = cpu_buffer->reader_page == cpu_buffer->commit_page;
2836
2837 if (!pagebusy && cpu_buffer->irq_work.full_waiters_pending) {
2838 cpu_buffer->irq_work.wakeup_full = true;
2839 cpu_buffer->irq_work.full_waiters_pending = false;
2840 /* irq_work_queue() supplies it's own memory barriers */
2841 irq_work_queue(&cpu_buffer->irq_work.work);
2842 }
2815} 2843}
2816 2844
2817/** 2845/**
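The writer-side rb_wakeups() now also checks whether a "full page" waiter is pending and, once the reader page is no longer the commit page, flags wakeup_full and queues the irq_work that wakes the separate full_waiters queue. The decision itself is plain bookkeeping; a stripped-down model with invented struct and helper names, the irq_work replaced by a print:

#include <stdbool.h>
#include <stdio.h>

struct rb_work_model {
	bool waiters_pending;        /* someone waits for "any data"      */
	bool full_waiters_pending;   /* someone waits for a full page     */
	bool wakeup_full;            /* tell the wakeup handler which one */
};

static void queue_wakeup(struct rb_work_model *w)
{
	/* stands in for irq_work_queue(); the real handler then calls
	 * wake_up_all() on the matching wait queue(s) */
	printf("wakeup queued (full=%d)\n", w->wakeup_full);
}

static void writer_commit(struct rb_work_model *w, bool page_still_in_use)
{
	if (w->waiters_pending) {
		w->waiters_pending = false;
		queue_wakeup(w);
	}
	/* only poke the "full" waiters once the reader page has been handed
	 * off, i.e. it is no longer the page being committed to */
	if (!page_still_in_use && w->full_waiters_pending) {
		w->wakeup_full = true;
		w->full_waiters_pending = false;
		queue_wakeup(w);
	}
}

int main(void)
{
	struct rb_work_model w = { false, true, false };

	writer_commit(&w, true);    /* reader page == commit page: no wakeup yet */
	writer_commit(&w, false);   /* page handed off: full waiters get woken   */
	return 0;
}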
diff --git a/kernel/trace/ring_buffer_benchmark.c b/kernel/trace/ring_buffer_benchmark.c
index 3f9e328c30b5..13d945c0d03f 100644
--- a/kernel/trace/ring_buffer_benchmark.c
+++ b/kernel/trace/ring_buffer_benchmark.c
@@ -7,7 +7,7 @@
7#include <linux/completion.h> 7#include <linux/completion.h>
8#include <linux/kthread.h> 8#include <linux/kthread.h>
9#include <linux/module.h> 9#include <linux/module.h>
10#include <linux/time.h> 10#include <linux/ktime.h>
11#include <asm/local.h> 11#include <asm/local.h>
12 12
13struct rb_page { 13struct rb_page {
@@ -17,7 +17,7 @@ struct rb_page {
17}; 17};
18 18
19/* run time and sleep time in seconds */ 19/* run time and sleep time in seconds */
20#define RUN_TIME 10 20#define RUN_TIME 10ULL
21#define SLEEP_TIME 10 21#define SLEEP_TIME 10
22 22
23/* number of events for writer to wake up the reader */ 23/* number of events for writer to wake up the reader */
@@ -212,8 +212,7 @@ static void ring_buffer_consumer(void)
212 212
213static void ring_buffer_producer(void) 213static void ring_buffer_producer(void)
214{ 214{
215 struct timeval start_tv; 215 ktime_t start_time, end_time, timeout;
216 struct timeval end_tv;
217 unsigned long long time; 216 unsigned long long time;
218 unsigned long long entries; 217 unsigned long long entries;
219 unsigned long long overruns; 218 unsigned long long overruns;
@@ -227,7 +226,8 @@ static void ring_buffer_producer(void)
227 * make the system stall) 226 * make the system stall)
228 */ 227 */
229 trace_printk("Starting ring buffer hammer\n"); 228 trace_printk("Starting ring buffer hammer\n");
230 do_gettimeofday(&start_tv); 229 start_time = ktime_get();
230 timeout = ktime_add_ns(start_time, RUN_TIME * NSEC_PER_SEC);
231 do { 231 do {
232 struct ring_buffer_event *event; 232 struct ring_buffer_event *event;
233 int *entry; 233 int *entry;
@@ -244,7 +244,7 @@ static void ring_buffer_producer(void)
244 ring_buffer_unlock_commit(buffer, event); 244 ring_buffer_unlock_commit(buffer, event);
245 } 245 }
246 } 246 }
247 do_gettimeofday(&end_tv); 247 end_time = ktime_get();
248 248
249 cnt++; 249 cnt++;
250 if (consumer && !(cnt % wakeup_interval)) 250 if (consumer && !(cnt % wakeup_interval))
@@ -264,7 +264,7 @@ static void ring_buffer_producer(void)
264 cond_resched(); 264 cond_resched();
265#endif 265#endif
266 266
267 } while (end_tv.tv_sec < (start_tv.tv_sec + RUN_TIME) && !kill_test); 267 } while (ktime_before(end_time, timeout) && !kill_test);
268 trace_printk("End ring buffer hammer\n"); 268 trace_printk("End ring buffer hammer\n");
269 269
270 if (consumer) { 270 if (consumer) {
@@ -280,9 +280,7 @@ static void ring_buffer_producer(void)
280 wait_for_completion(&read_done); 280 wait_for_completion(&read_done);
281 } 281 }
282 282
283 time = end_tv.tv_sec - start_tv.tv_sec; 283 time = ktime_us_delta(end_time, start_time);
284 time *= USEC_PER_SEC;
285 time += (long long)((long)end_tv.tv_usec - (long)start_tv.tv_usec);
286 284
287 entries = ring_buffer_entries(buffer); 285 entries = ring_buffer_entries(buffer);
288 overruns = ring_buffer_overruns(buffer); 286 overruns = ring_buffer_overruns(buffer);
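Switching the benchmark from do_gettimeofday() to ktime_get() gives it a monotonic clock and lets the elapsed time come from a single ktime_us_delta() call instead of hand-rolled timeval arithmetic. The userspace equivalent of that pattern uses clock_gettime(CLOCK_MONOTONIC); the loop body below is a placeholder, not the ring-buffer hammer:

#include <stdio.h>
#include <stdint.h>
#include <time.h>

#define RUN_TIME_NS 100000000ULL   /* 100 ms for the demo; the benchmark runs 10 s */

static uint64_t mono_ns(void)
{
	struct timespec ts;

	clock_gettime(CLOCK_MONOTONIC, &ts);
	return (uint64_t)ts.tv_sec * 1000000000ULL + ts.tv_nsec;
}

int main(void)
{
	uint64_t start = mono_ns();
	uint64_t timeout = start + RUN_TIME_NS;   /* ktime_add_ns() analogue */
	uint64_t end, iterations = 0;

	do {
		iterations++;                     /* stands in for the buffer writes */
		end = mono_ns();
	} while (end < timeout);                  /* ktime_before() analogue */

	/* ktime_us_delta() analogue: elapsed time in microseconds */
	printf("%llu iterations in %llu us\n",
	       (unsigned long long)iterations,
	       (unsigned long long)((end - start) / 1000));
	return 0;
}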
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 4a9079b9f082..62c6506d663f 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -2036,7 +2036,8 @@ void trace_printk_init_buffers(void)
2036 2036
2037 /* trace_printk() is for debug use only. Don't use it in production. */ 2037 /* trace_printk() is for debug use only. Don't use it in production. */
2038 2038
2039 pr_warning("\n**********************************************************\n"); 2039 pr_warning("\n");
2040 pr_warning("**********************************************************\n");
2040 pr_warning("** NOTICE NOTICE NOTICE NOTICE NOTICE NOTICE NOTICE **\n"); 2041 pr_warning("** NOTICE NOTICE NOTICE NOTICE NOTICE NOTICE NOTICE **\n");
2041 pr_warning("** **\n"); 2042 pr_warning("** **\n");
2042 pr_warning("** trace_printk() being used. Allocating extra memory. **\n"); 2043 pr_warning("** trace_printk() being used. Allocating extra memory. **\n");
@@ -3352,12 +3353,12 @@ tracing_cpumask_read(struct file *filp, char __user *ubuf,
3352 3353
3353 mutex_lock(&tracing_cpumask_update_lock); 3354 mutex_lock(&tracing_cpumask_update_lock);
3354 3355
3355 len = cpumask_scnprintf(mask_str, count, tr->tracing_cpumask); 3356 len = snprintf(mask_str, count, "%*pb\n",
3356 if (count - len < 2) { 3357 cpumask_pr_args(tr->tracing_cpumask));
3358 if (len >= count) {
3357 count = -EINVAL; 3359 count = -EINVAL;
3358 goto out_err; 3360 goto out_err;
3359 } 3361 }
3360 len += sprintf(mask_str + len, "\n");
3361 count = simple_read_from_buffer(ubuf, count, ppos, mask_str, NR_CPUS+1); 3362 count = simple_read_from_buffer(ubuf, count, ppos, mask_str, NR_CPUS+1);
3362 3363
3363out_err: 3364out_err:
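The cpumask is now rendered with the "%*pb" bitmap format instead of cpumask_scnprintf(); cpumask_pr_args(mask) expands to the pair "nr_cpu_ids, cpumask_bits(mask)". A short sketch of the idiom (buffer sizing is illustrative):

        char buf[NR_CPUS + 2];          /* sizing is illustrative */
        int len;

        len = snprintf(buf, sizeof(buf), "%*pb\n",
                       cpumask_pr_args(cpu_online_mask));
        /* len >= sizeof(buf) means the mask did not fit, as checked above */

Since snprintf() emits the trailing newline in the same call, the separate sprintf(mask_str + len, "\n") and the "count - len < 2" headroom check are no longer needed.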
@@ -4140,6 +4141,12 @@ static int tracing_set_tracer(struct trace_array *tr, const char *buf)
4140 goto out; 4141 goto out;
4141 } 4142 }
4142 4143
4144 /* If trace pipe files are being read, we can't change the tracer */
4145 if (tr->current_trace->ref) {
4146 ret = -EBUSY;
4147 goto out;
4148 }
4149
4143 trace_branch_disable(); 4150 trace_branch_disable();
4144 4151
4145 tr->current_trace->enabled--; 4152 tr->current_trace->enabled--;
@@ -4326,17 +4333,7 @@ static int tracing_open_pipe(struct inode *inode, struct file *filp)
4326 } 4333 }
4327 4334
4328 trace_seq_init(&iter->seq); 4335 trace_seq_init(&iter->seq);
4329 4336 iter->trace = tr->current_trace;
4330 /*
4331 * We make a copy of the current tracer to avoid concurrent
4332 * changes on it while we are reading.
4333 */
4334 iter->trace = kmalloc(sizeof(*iter->trace), GFP_KERNEL);
4335 if (!iter->trace) {
4336 ret = -ENOMEM;
4337 goto fail;
4338 }
4339 *iter->trace = *tr->current_trace;
4340 4337
4341 if (!alloc_cpumask_var(&iter->started, GFP_KERNEL)) { 4338 if (!alloc_cpumask_var(&iter->started, GFP_KERNEL)) {
4342 ret = -ENOMEM; 4339 ret = -ENOMEM;
@@ -4363,6 +4360,8 @@ static int tracing_open_pipe(struct inode *inode, struct file *filp)
4363 iter->trace->pipe_open(iter); 4360 iter->trace->pipe_open(iter);
4364 4361
4365 nonseekable_open(inode, filp); 4362 nonseekable_open(inode, filp);
4363
4364 tr->current_trace->ref++;
4366out: 4365out:
4367 mutex_unlock(&trace_types_lock); 4366 mutex_unlock(&trace_types_lock);
4368 return ret; 4367 return ret;
@@ -4382,6 +4381,8 @@ static int tracing_release_pipe(struct inode *inode, struct file *file)
4382 4381
4383 mutex_lock(&trace_types_lock); 4382 mutex_lock(&trace_types_lock);
4384 4383
4384 tr->current_trace->ref--;
4385
4385 if (iter->trace->pipe_close) 4386 if (iter->trace->pipe_close)
4386 iter->trace->pipe_close(iter); 4387 iter->trace->pipe_close(iter);
4387 4388
@@ -4389,7 +4390,6 @@ static int tracing_release_pipe(struct inode *inode, struct file *file)
4389 4390
4390 free_cpumask_var(iter->started); 4391 free_cpumask_var(iter->started);
4391 mutex_destroy(&iter->mutex); 4392 mutex_destroy(&iter->mutex);
4392 kfree(iter->trace);
4393 kfree(iter); 4393 kfree(iter);
4394 4394
4395 trace_array_put(tr); 4395 trace_array_put(tr);
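Together, the open/release hunks above replace the per-reader kmalloc'd copy of the current tracer with a reference count: readers pin the tracer while trace_pipe is open, and tracing_set_tracer() refuses to switch while the count is non-zero. A sketch of the life cycle (locking as in the hunks, error paths omitted):

        /* tracing_open_pipe() / tracing_buffers_open() */
        mutex_lock(&trace_types_lock);
        tr->current_trace->ref++;
        mutex_unlock(&trace_types_lock);

        /* tracing_release_pipe() / tracing_buffers_release() */
        mutex_lock(&trace_types_lock);
        tr->current_trace->ref--;
        mutex_unlock(&trace_types_lock);

        /* tracing_set_tracer() */
        if (tr->current_trace->ref)
                return -EBUSY;          /* a pipe reader is still attached */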
@@ -4422,7 +4422,7 @@ tracing_poll_pipe(struct file *filp, poll_table *poll_table)
4422 return trace_poll(iter, filp, poll_table); 4422 return trace_poll(iter, filp, poll_table);
4423} 4423}
4424 4424
4425/* Must be called with trace_types_lock mutex held. */ 4425/* Must be called with iter->mutex held. */
4426static int tracing_wait_pipe(struct file *filp) 4426static int tracing_wait_pipe(struct file *filp)
4427{ 4427{
4428 struct trace_iterator *iter = filp->private_data; 4428 struct trace_iterator *iter = filp->private_data;
@@ -4467,7 +4467,6 @@ tracing_read_pipe(struct file *filp, char __user *ubuf,
4467 size_t cnt, loff_t *ppos) 4467 size_t cnt, loff_t *ppos)
4468{ 4468{
4469 struct trace_iterator *iter = filp->private_data; 4469 struct trace_iterator *iter = filp->private_data;
4470 struct trace_array *tr = iter->tr;
4471 ssize_t sret; 4470 ssize_t sret;
4472 4471
4473 /* return any leftover data */ 4472 /* return any leftover data */
@@ -4477,12 +4476,6 @@ tracing_read_pipe(struct file *filp, char __user *ubuf,
4477 4476
4478 trace_seq_init(&iter->seq); 4477 trace_seq_init(&iter->seq);
4479 4478
4480 /* copy the tracer to avoid using a global lock all around */
4481 mutex_lock(&trace_types_lock);
4482 if (unlikely(iter->trace->name != tr->current_trace->name))
4483 *iter->trace = *tr->current_trace;
4484 mutex_unlock(&trace_types_lock);
4485
4486 /* 4479 /*
4487 * Avoid more than one consumer on a single file descriptor 4480 * Avoid more than one consumer on a single file descriptor
4488 * This is just a matter of traces coherency, the ring buffer itself 4481 * This is just a matter of traces coherency, the ring buffer itself
@@ -4642,7 +4635,6 @@ static ssize_t tracing_splice_read_pipe(struct file *filp,
4642 .ops = &tracing_pipe_buf_ops, 4635 .ops = &tracing_pipe_buf_ops,
4643 .spd_release = tracing_spd_release_pipe, 4636 .spd_release = tracing_spd_release_pipe,
4644 }; 4637 };
4645 struct trace_array *tr = iter->tr;
4646 ssize_t ret; 4638 ssize_t ret;
4647 size_t rem; 4639 size_t rem;
4648 unsigned int i; 4640 unsigned int i;
@@ -4650,12 +4642,6 @@ static ssize_t tracing_splice_read_pipe(struct file *filp,
4650 if (splice_grow_spd(pipe, &spd)) 4642 if (splice_grow_spd(pipe, &spd))
4651 return -ENOMEM; 4643 return -ENOMEM;
4652 4644
4653 /* copy the tracer to avoid using a global lock all around */
4654 mutex_lock(&trace_types_lock);
4655 if (unlikely(iter->trace->name != tr->current_trace->name))
4656 *iter->trace = *tr->current_trace;
4657 mutex_unlock(&trace_types_lock);
4658
4659 mutex_lock(&iter->mutex); 4645 mutex_lock(&iter->mutex);
4660 4646
4661 if (iter->trace->splice_read) { 4647 if (iter->trace->splice_read) {
@@ -4942,7 +4928,7 @@ tracing_mark_write(struct file *filp, const char __user *ubuf,
4942 *fpos += written; 4928 *fpos += written;
4943 4929
4944 out_unlock: 4930 out_unlock:
4945 for (i = 0; i < nr_pages; i++){ 4931 for (i = nr_pages - 1; i >= 0; i--) {
4946 kunmap_atomic(map_page[i]); 4932 kunmap_atomic(map_page[i]);
4947 put_page(pages[i]); 4933 put_page(pages[i]);
4948 } 4934 }
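The unmap loop is reversed because kmap_atomic() mappings nest like a stack and must be released in LIFO order. A minimal sketch (page_a/page_b are hypothetical struct page pointers):

        void *va0, *va1;

        va0 = kmap_atomic(page_a);
        va1 = kmap_atomic(page_b);
        /* ... touch both pages ... */
        kunmap_atomic(va1);             /* mapped last, unmapped first */
        kunmap_atomic(va0);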
@@ -5331,6 +5317,8 @@ static int tracing_buffers_open(struct inode *inode, struct file *filp)
5331 5317
5332 filp->private_data = info; 5318 filp->private_data = info;
5333 5319
5320 tr->current_trace->ref++;
5321
5334 mutex_unlock(&trace_types_lock); 5322 mutex_unlock(&trace_types_lock);
5335 5323
5336 ret = nonseekable_open(inode, filp); 5324 ret = nonseekable_open(inode, filp);
@@ -5361,21 +5349,16 @@ tracing_buffers_read(struct file *filp, char __user *ubuf,
5361 if (!count) 5349 if (!count)
5362 return 0; 5350 return 0;
5363 5351
5364 mutex_lock(&trace_types_lock);
5365
5366#ifdef CONFIG_TRACER_MAX_TRACE 5352#ifdef CONFIG_TRACER_MAX_TRACE
5367 if (iter->snapshot && iter->tr->current_trace->use_max_tr) { 5353 if (iter->snapshot && iter->tr->current_trace->use_max_tr)
5368 size = -EBUSY; 5354 return -EBUSY;
5369 goto out_unlock;
5370 }
5371#endif 5355#endif
5372 5356
5373 if (!info->spare) 5357 if (!info->spare)
5374 info->spare = ring_buffer_alloc_read_page(iter->trace_buffer->buffer, 5358 info->spare = ring_buffer_alloc_read_page(iter->trace_buffer->buffer,
5375 iter->cpu_file); 5359 iter->cpu_file);
5376 size = -ENOMEM;
5377 if (!info->spare) 5360 if (!info->spare)
5378 goto out_unlock; 5361 return -ENOMEM;
5379 5362
5380 /* Do we have previous read data to read? */ 5363 /* Do we have previous read data to read? */
5381 if (info->read < PAGE_SIZE) 5364 if (info->read < PAGE_SIZE)
@@ -5391,21 +5374,16 @@ tracing_buffers_read(struct file *filp, char __user *ubuf,
5391 5374
5392 if (ret < 0) { 5375 if (ret < 0) {
5393 if (trace_empty(iter)) { 5376 if (trace_empty(iter)) {
5394 if ((filp->f_flags & O_NONBLOCK)) { 5377 if ((filp->f_flags & O_NONBLOCK))
5395 size = -EAGAIN; 5378 return -EAGAIN;
5396 goto out_unlock; 5379
5397 }
5398 mutex_unlock(&trace_types_lock);
5399 ret = wait_on_pipe(iter, false); 5380 ret = wait_on_pipe(iter, false);
5400 mutex_lock(&trace_types_lock); 5381 if (ret)
5401 if (ret) { 5382 return ret;
5402 size = ret; 5383
5403 goto out_unlock;
5404 }
5405 goto again; 5384 goto again;
5406 } 5385 }
5407 size = 0; 5386 return 0;
5408 goto out_unlock;
5409 } 5387 }
5410 5388
5411 info->read = 0; 5389 info->read = 0;
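With trace_types_lock no longer held across the read, the blocking path collapses into plain early returns plus a retry. A sketch of the pattern, where read_one_page() is a hypothetical stand-in for the ring-buffer read the real function performs:

 again:
        ret = read_one_page(info);              /* hypothetical helper */
        if (ret < 0 && trace_empty(iter)) {
                if (filp->f_flags & O_NONBLOCK)
                        return -EAGAIN;         /* caller asked not to sleep */
                ret = wait_on_pipe(iter, false); /* sleep until data arrives */
                if (ret)
                        return ret;
                goto again;
        }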
@@ -5415,18 +5393,14 @@ tracing_buffers_read(struct file *filp, char __user *ubuf,
5415 size = count; 5393 size = count;
5416 5394
5417 ret = copy_to_user(ubuf, info->spare + info->read, size); 5395 ret = copy_to_user(ubuf, info->spare + info->read, size);
5418 if (ret == size) { 5396 if (ret == size)
5419 size = -EFAULT; 5397 return -EFAULT;
5420 goto out_unlock; 5398
5421 }
5422 size -= ret; 5399 size -= ret;
5423 5400
5424 *ppos += size; 5401 *ppos += size;
5425 info->read += size; 5402 info->read += size;
5426 5403
5427 out_unlock:
5428 mutex_unlock(&trace_types_lock);
5429
5430 return size; 5404 return size;
5431} 5405}
5432 5406
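copy_to_user() returns the number of bytes it could not copy, which is why "ret == size" now maps directly to -EFAULT; an annotated sketch of the hunk's logic:

        ret = copy_to_user(ubuf, info->spare + info->read, size);
        if (ret == size)
                return -EFAULT;         /* nothing reached user space */
        size -= ret;                    /* bytes actually delivered */
        *ppos += size;
        info->read += size;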
@@ -5437,6 +5411,8 @@ static int tracing_buffers_release(struct inode *inode, struct file *file)
5437 5411
5438 mutex_lock(&trace_types_lock); 5412 mutex_lock(&trace_types_lock);
5439 5413
5414 iter->tr->current_trace->ref--;
5415
5440 __trace_array_put(iter->tr); 5416 __trace_array_put(iter->tr);
5441 5417
5442 if (info->spare) 5418 if (info->spare)
@@ -5522,30 +5498,20 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
5522 int entries, size, i; 5498 int entries, size, i;
5523 ssize_t ret = 0; 5499 ssize_t ret = 0;
5524 5500
5525 mutex_lock(&trace_types_lock);
5526
5527#ifdef CONFIG_TRACER_MAX_TRACE 5501#ifdef CONFIG_TRACER_MAX_TRACE
5528 if (iter->snapshot && iter->tr->current_trace->use_max_tr) { 5502 if (iter->snapshot && iter->tr->current_trace->use_max_tr)
5529 ret = -EBUSY; 5503 return -EBUSY;
5530 goto out;
5531 }
5532#endif 5504#endif
5533 5505
5534 if (splice_grow_spd(pipe, &spd)) { 5506 if (splice_grow_spd(pipe, &spd))
5535 ret = -ENOMEM; 5507 return -ENOMEM;
5536 goto out;
5537 }
5538 5508
5539 if (*ppos & (PAGE_SIZE - 1)) { 5509 if (*ppos & (PAGE_SIZE - 1))
5540 ret = -EINVAL; 5510 return -EINVAL;
5541 goto out;
5542 }
5543 5511
5544 if (len & (PAGE_SIZE - 1)) { 5512 if (len & (PAGE_SIZE - 1)) {
5545 if (len < PAGE_SIZE) { 5513 if (len < PAGE_SIZE)
5546 ret = -EINVAL; 5514 return -EINVAL;
5547 goto out;
5548 }
5549 len &= PAGE_MASK; 5515 len &= PAGE_MASK;
5550 } 5516 }
5551 5517
@@ -5606,25 +5572,20 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
5606 /* did we read anything? */ 5572 /* did we read anything? */
5607 if (!spd.nr_pages) { 5573 if (!spd.nr_pages) {
5608 if (ret) 5574 if (ret)
5609 goto out; 5575 return ret;
5576
5577 if ((file->f_flags & O_NONBLOCK) || (flags & SPLICE_F_NONBLOCK))
5578 return -EAGAIN;
5610 5579
5611 if ((file->f_flags & O_NONBLOCK) || (flags & SPLICE_F_NONBLOCK)) {
5612 ret = -EAGAIN;
5613 goto out;
5614 }
5615 mutex_unlock(&trace_types_lock);
5616 ret = wait_on_pipe(iter, true); 5580 ret = wait_on_pipe(iter, true);
5617 mutex_lock(&trace_types_lock);
5618 if (ret) 5581 if (ret)
5619 goto out; 5582 return ret;
5620 5583
5621 goto again; 5584 goto again;
5622 } 5585 }
5623 5586
5624 ret = splice_to_pipe(pipe, &spd); 5587 ret = splice_to_pipe(pipe, &spd);
5625 splice_shrink_spd(&spd); 5588 splice_shrink_spd(&spd);
5626out:
5627 mutex_unlock(&trace_types_lock);
5628 5589
5629 return ret; 5590 return ret;
5630} 5591}
@@ -5854,28 +5815,11 @@ static __init int register_snapshot_cmd(void)
5854static inline __init int register_snapshot_cmd(void) { return 0; } 5815static inline __init int register_snapshot_cmd(void) { return 0; }
5855#endif /* defined(CONFIG_TRACER_SNAPSHOT) && defined(CONFIG_DYNAMIC_FTRACE) */ 5816#endif /* defined(CONFIG_TRACER_SNAPSHOT) && defined(CONFIG_DYNAMIC_FTRACE) */
5856 5817
5857struct dentry *tracing_init_dentry_tr(struct trace_array *tr) 5818static struct dentry *tracing_get_dentry(struct trace_array *tr)
5858{ 5819{
5859 if (tr->dir)
5860 return tr->dir;
5861
5862 if (!debugfs_initialized())
5863 return NULL;
5864
5865 if (tr->flags & TRACE_ARRAY_FL_GLOBAL)
5866 tr->dir = debugfs_create_dir("tracing", NULL);
5867
5868 if (!tr->dir)
5869 pr_warn_once("Could not create debugfs directory 'tracing'\n");
5870
5871 return tr->dir; 5820 return tr->dir;
5872} 5821}
5873 5822
5874struct dentry *tracing_init_dentry(void)
5875{
5876 return tracing_init_dentry_tr(&global_trace);
5877}
5878
5879static struct dentry *tracing_dentry_percpu(struct trace_array *tr, int cpu) 5823static struct dentry *tracing_dentry_percpu(struct trace_array *tr, int cpu)
5880{ 5824{
5881 struct dentry *d_tracer; 5825 struct dentry *d_tracer;
@@ -5883,8 +5827,8 @@ static struct dentry *tracing_dentry_percpu(struct trace_array *tr, int cpu)
5883 if (tr->percpu_dir) 5827 if (tr->percpu_dir)
5884 return tr->percpu_dir; 5828 return tr->percpu_dir;
5885 5829
5886 d_tracer = tracing_init_dentry_tr(tr); 5830 d_tracer = tracing_get_dentry(tr);
5887 if (!d_tracer) 5831 if (IS_ERR(d_tracer))
5888 return NULL; 5832 return NULL;
5889 5833
5890 tr->percpu_dir = debugfs_create_dir("per_cpu", d_tracer); 5834 tr->percpu_dir = debugfs_create_dir("per_cpu", d_tracer);
@@ -6086,8 +6030,8 @@ static struct dentry *trace_options_init_dentry(struct trace_array *tr)
6086 if (tr->options) 6030 if (tr->options)
6087 return tr->options; 6031 return tr->options;
6088 6032
6089 d_tracer = tracing_init_dentry_tr(tr); 6033 d_tracer = tracing_get_dentry(tr);
6090 if (!d_tracer) 6034 if (IS_ERR(d_tracer))
6091 return NULL; 6035 return NULL;
6092 6036
6093 tr->options = debugfs_create_dir("options", d_tracer); 6037 tr->options = debugfs_create_dir("options", d_tracer);
@@ -6416,7 +6360,7 @@ static int instance_delete(const char *name)
6416 goto out_unlock; 6360 goto out_unlock;
6417 6361
6418 ret = -EBUSY; 6362 ret = -EBUSY;
6419 if (tr->ref) 6363 if (tr->ref || (tr->current_trace && tr->current_trace->ref))
6420 goto out_unlock; 6364 goto out_unlock;
6421 6365
6422 list_del(&tr->list); 6366 list_del(&tr->list);
@@ -6571,6 +6515,33 @@ init_tracer_debugfs(struct trace_array *tr, struct dentry *d_tracer)
6571 6515
6572} 6516}
6573 6517
6518/**
6519 * tracing_init_dentry - initialize top level trace array
6520 *
6521 * This is called when creating files or directories in the tracing
6522 * directory. It is called via fs_initcall() by any of the boot up code
6523 * and expects to return the dentry of the top level tracing directory.
6524 */
6525struct dentry *tracing_init_dentry(void)
6526{
6527 struct trace_array *tr = &global_trace;
6528
6529 if (tr->dir)
6530 return tr->dir;
6531
6532 if (WARN_ON(!debugfs_initialized()))
6533 return ERR_PTR(-ENODEV);
6534
6535 tr->dir = debugfs_create_dir("tracing", NULL);
6536
6537 if (!tr->dir) {
6538 pr_warn_once("Could not create debugfs directory 'tracing'\n");
6539 return ERR_PTR(-ENOMEM);
6540 }
6541
6542 return tr->dir;
6543}
6544
6574static __init int tracer_init_debugfs(void) 6545static __init int tracer_init_debugfs(void)
6575{ 6546{
6576 struct dentry *d_tracer; 6547 struct dentry *d_tracer;
@@ -6578,7 +6549,7 @@ static __init int tracer_init_debugfs(void)
6578 trace_access_lock_init(); 6549 trace_access_lock_init();
6579 6550
6580 d_tracer = tracing_init_dentry(); 6551 d_tracer = tracing_init_dentry();
6581 if (!d_tracer) 6552 if (IS_ERR(d_tracer))
6582 return 0; 6553 return 0;
6583 6554
6584 init_tracer_debugfs(&global_trace, d_tracer); 6555 init_tracer_debugfs(&global_trace, d_tracer);
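tracing_init_dentry() now reports failure through the pointer itself (ERR_PTR(-ENODEV) or ERR_PTR(-ENOMEM)) instead of returning NULL, so every caller in this series switches from "!d_tracer" to IS_ERR(d_tracer). A minimal caller sketch (the file name and example_fops are illustrative):

        static __init int example_tracer_init(void)
        {
                struct dentry *d = tracing_init_dentry();

                if (IS_ERR(d))          /* debugfs missing or mkdir failed */
                        return 0;       /* callers above silently skip file creation */

                trace_create_file("example_file", 0444, d, NULL, &example_fops);
                return 0;
        }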
@@ -6811,7 +6782,6 @@ __init static int tracer_alloc_buffers(void)
6811 int ring_buf_size; 6782 int ring_buf_size;
6812 int ret = -ENOMEM; 6783 int ret = -ENOMEM;
6813 6784
6814
6815 if (!alloc_cpumask_var(&tracing_buffer_mask, GFP_KERNEL)) 6785 if (!alloc_cpumask_var(&tracing_buffer_mask, GFP_KERNEL))
6816 goto out; 6786 goto out;
6817 6787
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 8de48bac1ce2..dd8205a35760 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -388,6 +388,7 @@ struct tracer {
388 struct tracer *next; 388 struct tracer *next;
389 struct tracer_flags *flags; 389 struct tracer_flags *flags;
390 int enabled; 390 int enabled;
391 int ref;
391 bool print_max; 392 bool print_max;
392 bool allow_instances; 393 bool allow_instances;
393#ifdef CONFIG_TRACER_MAX_TRACE 394#ifdef CONFIG_TRACER_MAX_TRACE
@@ -541,7 +542,6 @@ struct dentry *trace_create_file(const char *name,
541 void *data, 542 void *data,
542 const struct file_operations *fops); 543 const struct file_operations *fops);
543 544
544struct dentry *tracing_init_dentry_tr(struct trace_array *tr);
545struct dentry *tracing_init_dentry(void); 545struct dentry *tracing_init_dentry(void);
546 546
547struct ring_buffer_event; 547struct ring_buffer_event;
diff --git a/kernel/trace/trace_branch.c b/kernel/trace/trace_branch.c
index 7d6e2afde669..57cbf1efdd44 100644
--- a/kernel/trace/trace_branch.c
+++ b/kernel/trace/trace_branch.c
@@ -7,7 +7,6 @@
7#include <linux/seq_file.h> 7#include <linux/seq_file.h>
8#include <linux/spinlock.h> 8#include <linux/spinlock.h>
9#include <linux/irqflags.h> 9#include <linux/irqflags.h>
10#include <linux/debugfs.h>
11#include <linux/uaccess.h> 10#include <linux/uaccess.h>
12#include <linux/module.h> 11#include <linux/module.h>
13#include <linux/ftrace.h> 12#include <linux/ftrace.h>
diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c
index 4b9c114ee9de..6fa484de2ba1 100644
--- a/kernel/trace/trace_event_perf.c
+++ b/kernel/trace/trace_event_perf.c
@@ -261,7 +261,7 @@ void perf_trace_del(struct perf_event *p_event, int flags)
261} 261}
262 262
263void *perf_trace_buf_prepare(int size, unsigned short type, 263void *perf_trace_buf_prepare(int size, unsigned short type,
264 struct pt_regs *regs, int *rctxp) 264 struct pt_regs **regs, int *rctxp)
265{ 265{
266 struct trace_entry *entry; 266 struct trace_entry *entry;
267 unsigned long flags; 267 unsigned long flags;
@@ -280,6 +280,8 @@ void *perf_trace_buf_prepare(int size, unsigned short type,
280 if (*rctxp < 0) 280 if (*rctxp < 0)
281 return NULL; 281 return NULL;
282 282
283 if (regs)
284 *regs = this_cpu_ptr(&__perf_regs[*rctxp]);
283 raw_data = this_cpu_ptr(perf_trace_buf[*rctxp]); 285 raw_data = this_cpu_ptr(perf_trace_buf[*rctxp]);
284 286
285 /* zero the dead bytes from align to not leak stack to user */ 287 /* zero the dead bytes from align to not leak stack to user */
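perf_trace_buf_prepare() now takes a struct pt_regs ** and hands back a per-recursion-context scratch pt_regs; callers that already hold real registers (the kprobe, syscall and uprobe handlers below) pass NULL instead. A sketch of a caller that needs the scratch regs, assuming the perf_fetch_caller_regs()/perf_trace_buf_submit() helpers of this kernel and with size/event_type/head as in the real callers:

        struct pt_regs *regs;
        int rctx;
        void *entry;

        entry = perf_trace_buf_prepare(size, event_type, &regs, &rctx);
        if (!entry)
                return;
        perf_fetch_caller_regs(regs);   /* fill the scratch pt_regs */
        /* ... copy the event payload into 'entry' ... */
        perf_trace_buf_submit(entry, size, rctx, 0, 1, regs, head, NULL);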
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index b03a0ea77b99..db54dda10ccc 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -2531,7 +2531,7 @@ static __init int event_trace_init(void)
2531 return -ENODEV; 2531 return -ENODEV;
2532 2532
2533 d_tracer = tracing_init_dentry(); 2533 d_tracer = tracing_init_dentry();
2534 if (!d_tracer) 2534 if (IS_ERR(d_tracer))
2535 return 0; 2535 return 0;
2536 2536
2537 entry = debugfs_create_file("available_events", 0444, d_tracer, 2537 entry = debugfs_create_file("available_events", 0444, d_tracer,
diff --git a/kernel/trace/trace_export.c b/kernel/trace/trace_export.c
index d4ddde28a81a..12e2b99be862 100644
--- a/kernel/trace/trace_export.c
+++ b/kernel/trace/trace_export.c
@@ -6,12 +6,10 @@
6#include <linux/stringify.h> 6#include <linux/stringify.h>
7#include <linux/kallsyms.h> 7#include <linux/kallsyms.h>
8#include <linux/seq_file.h> 8#include <linux/seq_file.h>
9#include <linux/debugfs.h>
10#include <linux/uaccess.h> 9#include <linux/uaccess.h>
11#include <linux/ftrace.h> 10#include <linux/ftrace.h>
12#include <linux/module.h> 11#include <linux/module.h>
13#include <linux/init.h> 12#include <linux/init.h>
14#include <linux/fs.h>
15 13
16#include "trace_output.h" 14#include "trace_output.h"
17 15
diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c
index ba476009e5de..2d25ad1526bb 100644
--- a/kernel/trace/trace_functions_graph.c
+++ b/kernel/trace/trace_functions_graph.c
@@ -1437,7 +1437,7 @@ static __init int init_graph_debugfs(void)
1437 struct dentry *d_tracer; 1437 struct dentry *d_tracer;
1438 1438
1439 d_tracer = tracing_init_dentry(); 1439 d_tracer = tracing_init_dentry();
1440 if (!d_tracer) 1440 if (IS_ERR(d_tracer))
1441 return 0; 1441 return 0;
1442 1442
1443 trace_create_file("max_graph_depth", 0644, d_tracer, 1443 trace_create_file("max_graph_depth", 0644, d_tracer,
diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c
index 9bb104f748d0..8523ea345f2b 100644
--- a/kernel/trace/trace_irqsoff.c
+++ b/kernel/trace/trace_irqsoff.c
@@ -10,11 +10,9 @@
10 * Copyright (C) 2004 Nadia Yvette Chambers 10 * Copyright (C) 2004 Nadia Yvette Chambers
11 */ 11 */
12#include <linux/kallsyms.h> 12#include <linux/kallsyms.h>
13#include <linux/debugfs.h>
14#include <linux/uaccess.h> 13#include <linux/uaccess.h>
15#include <linux/module.h> 14#include <linux/module.h>
16#include <linux/ftrace.h> 15#include <linux/ftrace.h>
17#include <linux/fs.h>
18 16
19#include "trace.h" 17#include "trace.h"
20 18
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index 5edb518be345..d73f565b4e06 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -1148,7 +1148,7 @@ kprobe_perf_func(struct trace_kprobe *tk, struct pt_regs *regs)
1148 size = ALIGN(__size + sizeof(u32), sizeof(u64)); 1148 size = ALIGN(__size + sizeof(u32), sizeof(u64));
1149 size -= sizeof(u32); 1149 size -= sizeof(u32);
1150 1150
1151 entry = perf_trace_buf_prepare(size, call->event.type, regs, &rctx); 1151 entry = perf_trace_buf_prepare(size, call->event.type, NULL, &rctx);
1152 if (!entry) 1152 if (!entry)
1153 return; 1153 return;
1154 1154
@@ -1179,7 +1179,7 @@ kretprobe_perf_func(struct trace_kprobe *tk, struct kretprobe_instance *ri,
1179 size = ALIGN(__size + sizeof(u32), sizeof(u64)); 1179 size = ALIGN(__size + sizeof(u32), sizeof(u64));
1180 size -= sizeof(u32); 1180 size -= sizeof(u32);
1181 1181
1182 entry = perf_trace_buf_prepare(size, call->event.type, regs, &rctx); 1182 entry = perf_trace_buf_prepare(size, call->event.type, NULL, &rctx);
1183 if (!entry) 1183 if (!entry)
1184 return; 1184 return;
1185 1185
@@ -1320,7 +1320,7 @@ static __init int init_kprobe_trace(void)
1320 return -EINVAL; 1320 return -EINVAL;
1321 1321
1322 d_tracer = tracing_init_dentry(); 1322 d_tracer = tracing_init_dentry();
1323 if (!d_tracer) 1323 if (IS_ERR(d_tracer))
1324 return 0; 1324 return 0;
1325 1325
1326 entry = debugfs_create_file("kprobe_events", 0644, d_tracer, 1326 entry = debugfs_create_file("kprobe_events", 0644, d_tracer,
diff --git a/kernel/trace/trace_nop.c b/kernel/trace/trace_nop.c
index fcf0a9e48916..8bb2071474dd 100644
--- a/kernel/trace/trace_nop.c
+++ b/kernel/trace/trace_nop.c
@@ -6,8 +6,6 @@
6 */ 6 */
7 7
8#include <linux/module.h> 8#include <linux/module.h>
9#include <linux/fs.h>
10#include <linux/debugfs.h>
11#include <linux/ftrace.h> 9#include <linux/ftrace.h>
12 10
13#include "trace.h" 11#include "trace.h"
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index b77b9a697619..692bf7184c8c 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -177,6 +177,50 @@ ftrace_print_hex_seq(struct trace_seq *p, const unsigned char *buf, int buf_len)
177} 177}
178EXPORT_SYMBOL(ftrace_print_hex_seq); 178EXPORT_SYMBOL(ftrace_print_hex_seq);
179 179
180const char *
181ftrace_print_array_seq(struct trace_seq *p, const void *buf, int buf_len,
182 size_t el_size)
183{
184 const char *ret = trace_seq_buffer_ptr(p);
185 const char *prefix = "";
186 void *ptr = (void *)buf;
187
188 trace_seq_putc(p, '{');
189
190 while (ptr < buf + buf_len) {
191 switch (el_size) {
192 case 1:
193 trace_seq_printf(p, "%s0x%x", prefix,
194 *(u8 *)ptr);
195 break;
196 case 2:
197 trace_seq_printf(p, "%s0x%x", prefix,
198 *(u16 *)ptr);
199 break;
200 case 4:
201 trace_seq_printf(p, "%s0x%x", prefix,
202 *(u32 *)ptr);
203 break;
204 case 8:
205 trace_seq_printf(p, "%s0x%llx", prefix,
206 *(u64 *)ptr);
207 break;
208 default:
209 trace_seq_printf(p, "BAD SIZE:%zu 0x%x", el_size,
210 *(u8 *)ptr);
211 el_size = 1;
212 }
213 prefix = ",";
214 ptr += el_size;
215 }
216
217 trace_seq_putc(p, '}');
218 trace_seq_putc(p, 0);
219
220 return ret;
221}
222EXPORT_SYMBOL(ftrace_print_array_seq);
223
180int ftrace_raw_output_prep(struct trace_iterator *iter, 224int ftrace_raw_output_prep(struct trace_iterator *iter,
181 struct trace_event *trace_event) 225 struct trace_event *trace_event)
182{ 226{
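Note that in this version the third argument of ftrace_print_array_seq() is the buffer length in bytes (the loop walks "ptr < buf + buf_len"), not an element count. A usage sketch from a print callback (field->vals and field->nvals are illustrative):

        const char *str;

        /* renders the u16 array as "{0x1,0x2,...}" */
        str = ftrace_print_array_seq(p, field->vals,
                                     field->nvals * sizeof(u16),
                                     sizeof(u16));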
diff --git a/kernel/trace/trace_printk.c b/kernel/trace/trace_printk.c
index c4e70b6bd7fa..36c1455b7567 100644
--- a/kernel/trace/trace_printk.c
+++ b/kernel/trace/trace_printk.c
@@ -5,7 +5,6 @@
5 * 5 *
6 */ 6 */
7#include <linux/seq_file.h> 7#include <linux/seq_file.h>
8#include <linux/debugfs.h>
9#include <linux/uaccess.h> 8#include <linux/uaccess.h>
10#include <linux/kernel.h> 9#include <linux/kernel.h>
11#include <linux/ftrace.h> 10#include <linux/ftrace.h>
@@ -15,7 +14,6 @@
15#include <linux/ctype.h> 14#include <linux/ctype.h>
16#include <linux/list.h> 15#include <linux/list.h>
17#include <linux/slab.h> 16#include <linux/slab.h>
18#include <linux/fs.h>
19 17
20#include "trace.h" 18#include "trace.h"
21 19
@@ -349,7 +347,7 @@ static __init int init_trace_printk_function_export(void)
349 struct dentry *d_tracer; 347 struct dentry *d_tracer;
350 348
351 d_tracer = tracing_init_dentry(); 349 d_tracer = tracing_init_dentry();
352 if (!d_tracer) 350 if (IS_ERR(d_tracer))
353 return 0; 351 return 0;
354 352
355 trace_create_file("printk_formats", 0444, d_tracer, 353 trace_create_file("printk_formats", 0444, d_tracer,
diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c
index 2e293beb186e..419ca37e72c9 100644
--- a/kernel/trace/trace_sched_switch.c
+++ b/kernel/trace/trace_sched_switch.c
@@ -5,8 +5,6 @@
5 * 5 *
6 */ 6 */
7#include <linux/module.h> 7#include <linux/module.h>
8#include <linux/fs.h>
9#include <linux/debugfs.h>
10#include <linux/kallsyms.h> 8#include <linux/kallsyms.h>
11#include <linux/uaccess.h> 9#include <linux/uaccess.h>
12#include <linux/ftrace.h> 10#include <linux/ftrace.h>
diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c
index 8fb84b362816..d6e1003724e9 100644
--- a/kernel/trace/trace_sched_wakeup.c
+++ b/kernel/trace/trace_sched_wakeup.c
@@ -10,8 +10,6 @@
10 * Copyright (C) 2004 Nadia Yvette Chambers 10 * Copyright (C) 2004 Nadia Yvette Chambers
11 */ 11 */
12#include <linux/module.h> 12#include <linux/module.h>
13#include <linux/fs.h>
14#include <linux/debugfs.h>
15#include <linux/kallsyms.h> 13#include <linux/kallsyms.h>
16#include <linux/uaccess.h> 14#include <linux/uaccess.h>
17#include <linux/ftrace.h> 15#include <linux/ftrace.h>
diff --git a/kernel/trace/trace_seq.c b/kernel/trace/trace_seq.c
index f8b45d8792f9..e694c9f9efa4 100644
--- a/kernel/trace/trace_seq.c
+++ b/kernel/trace/trace_seq.c
@@ -120,7 +120,7 @@ void trace_seq_bitmask(struct trace_seq *s, const unsigned long *maskp,
120 120
121 __trace_seq_init(s); 121 __trace_seq_init(s);
122 122
123 seq_buf_bitmask(&s->seq, maskp, nmaskbits); 123 seq_buf_printf(&s->seq, "%*pb", nmaskbits, maskp);
124 124
125 if (unlikely(seq_buf_has_overflowed(&s->seq))) { 125 if (unlikely(seq_buf_has_overflowed(&s->seq))) {
126 s->seq.len = save_len; 126 s->seq.len = save_len;
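seq_buf_bitmask() is replaced by the same "%*pb" specifier used elsewhere in this series; the field width carries the number of valid bits. A standalone sketch with a hypothetical 64-bit bitmap:

        unsigned char buf[64];
        struct seq_buf s;
        DECLARE_BITMAP(mask, 64);

        seq_buf_init(&s, buf, sizeof(buf));
        bitmap_zero(mask, 64);
        bitmap_set(mask, 0, 4);                 /* set bits 0-3 */
        seq_buf_printf(&s, "%*pb", 64, mask);   /* hex dump of all 64 bits */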
diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c
index 16eddb308c33..c3e4fcfddd45 100644
--- a/kernel/trace/trace_stack.c
+++ b/kernel/trace/trace_stack.c
@@ -7,12 +7,10 @@
7#include <linux/seq_file.h> 7#include <linux/seq_file.h>
8#include <linux/spinlock.h> 8#include <linux/spinlock.h>
9#include <linux/uaccess.h> 9#include <linux/uaccess.h>
10#include <linux/debugfs.h>
11#include <linux/ftrace.h> 10#include <linux/ftrace.h>
12#include <linux/module.h> 11#include <linux/module.h>
13#include <linux/sysctl.h> 12#include <linux/sysctl.h>
14#include <linux/init.h> 13#include <linux/init.h>
15#include <linux/fs.h>
16 14
17#include <asm/setup.h> 15#include <asm/setup.h>
18 16
@@ -462,7 +460,7 @@ static __init int stack_trace_init(void)
462 struct dentry *d_tracer; 460 struct dentry *d_tracer;
463 461
464 d_tracer = tracing_init_dentry(); 462 d_tracer = tracing_init_dentry();
465 if (!d_tracer) 463 if (IS_ERR(d_tracer))
466 return 0; 464 return 0;
467 465
468 trace_create_file("stack_max_size", 0644, d_tracer, 466 trace_create_file("stack_max_size", 0644, d_tracer,
diff --git a/kernel/trace/trace_stat.c b/kernel/trace/trace_stat.c
index 7af67360b330..75e19e86c954 100644
--- a/kernel/trace/trace_stat.c
+++ b/kernel/trace/trace_stat.c
@@ -276,7 +276,7 @@ static int tracing_stat_init(void)
276 struct dentry *d_tracing; 276 struct dentry *d_tracing;
277 277
278 d_tracing = tracing_init_dentry(); 278 d_tracing = tracing_init_dentry();
279 if (!d_tracing) 279 if (IS_ERR(d_tracing))
280 return 0; 280 return 0;
281 281
282 stat_dir = debugfs_create_dir("trace_stat", d_tracing); 282 stat_dir = debugfs_create_dir("trace_stat", d_tracing);
diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c
index c6ee36fcbf90..f97f6e3a676c 100644
--- a/kernel/trace/trace_syscalls.c
+++ b/kernel/trace/trace_syscalls.c
@@ -574,7 +574,7 @@ static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id)
574 size -= sizeof(u32); 574 size -= sizeof(u32);
575 575
576 rec = (struct syscall_trace_enter *)perf_trace_buf_prepare(size, 576 rec = (struct syscall_trace_enter *)perf_trace_buf_prepare(size,
577 sys_data->enter_event->event.type, regs, &rctx); 577 sys_data->enter_event->event.type, NULL, &rctx);
578 if (!rec) 578 if (!rec)
579 return; 579 return;
580 580
@@ -647,7 +647,7 @@ static void perf_syscall_exit(void *ignore, struct pt_regs *regs, long ret)
647 size -= sizeof(u32); 647 size -= sizeof(u32);
648 648
649 rec = (struct syscall_trace_exit *)perf_trace_buf_prepare(size, 649 rec = (struct syscall_trace_exit *)perf_trace_buf_prepare(size,
650 sys_data->exit_event->event.type, regs, &rctx); 650 sys_data->exit_event->event.type, NULL, &rctx);
651 if (!rec) 651 if (!rec)
652 return; 652 return;
653 653
diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c
index 8520acc34b18..7dc1c8abecd6 100644
--- a/kernel/trace/trace_uprobe.c
+++ b/kernel/trace/trace_uprobe.c
@@ -1111,7 +1111,7 @@ static void __uprobe_perf_func(struct trace_uprobe *tu,
1111 if (hlist_empty(head)) 1111 if (hlist_empty(head))
1112 goto out; 1112 goto out;
1113 1113
1114 entry = perf_trace_buf_prepare(size, call->event.type, regs, &rctx); 1114 entry = perf_trace_buf_prepare(size, call->event.type, NULL, &rctx);
1115 if (!entry) 1115 if (!entry)
1116 goto out; 1116 goto out;
1117 1117
@@ -1321,7 +1321,7 @@ static __init int init_uprobe_trace(void)
1321 struct dentry *d_tracer; 1321 struct dentry *d_tracer;
1322 1322
1323 d_tracer = tracing_init_dentry(); 1323 d_tracer = tracing_init_dentry();
1324 if (!d_tracer) 1324 if (IS_ERR(d_tracer))
1325 return 0; 1325 return 0;
1326 1326
1327 trace_create_file("uprobe_events", 0644, d_tracer, 1327 trace_create_file("uprobe_events", 0644, d_tracer,
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index 70bf11815f84..3174bf8e3538 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -154,7 +154,7 @@ static int get_softlockup_thresh(void)
154 */ 154 */
155static unsigned long get_timestamp(void) 155static unsigned long get_timestamp(void)
156{ 156{
157 return local_clock() >> 30LL; /* 2^30 ~= 10^9 */ 157 return running_clock() >> 30LL; /* 2^30 ~= 10^9 */
158} 158}
159 159
160static void set_sample_period(void) 160static void set_sample_period(void)
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index beeeac9e0e3e..f28849394791 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -3083,10 +3083,9 @@ static ssize_t wq_cpumask_show(struct device *dev,
3083 int written; 3083 int written;
3084 3084
3085 mutex_lock(&wq->mutex); 3085 mutex_lock(&wq->mutex);
3086 written = cpumask_scnprintf(buf, PAGE_SIZE, wq->unbound_attrs->cpumask); 3086 written = scnprintf(buf, PAGE_SIZE, "%*pb\n",
3087 cpumask_pr_args(wq->unbound_attrs->cpumask));
3087 mutex_unlock(&wq->mutex); 3088 mutex_unlock(&wq->mutex);
3088
3089 written += scnprintf(buf + written, PAGE_SIZE - written, "\n");
3090 return written; 3089 return written;
3091} 3090}
3092 3091