path: root/kernel
author    Ingo Molnar <mingo@kernel.org>    2015-02-26 06:24:50 -0500
committer Ingo Molnar <mingo@kernel.org>    2015-02-26 06:24:50 -0500
commit    e9e4e44309f866b115d08ab4a54834008c50a8a4 (patch)
tree      ae9f91e682a4d6592ef263f30a4a0b1a862b7987 /kernel
parent    8a26ce4e544659256349551283414df504889a59 (diff)
parent    c517d838eb7d07bbe9507871fab3931deccff539 (diff)
Merge tag 'v4.0-rc1' into perf/core, to refresh the tree
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Diffstat (limited to 'kernel')
-rw-r--r--  kernel/Kconfig.locks | 4
-rw-r--r--  kernel/Makefile | 7
-rw-r--r--  kernel/acct.c | 94
-rw-r--r--  kernel/audit.h | 17
-rw-r--r--  kernel/auditfilter.c | 2
-rw-r--r--  kernel/auditsc.c | 176
-rw-r--r--  kernel/cgroup.c | 12
-rw-r--r--  kernel/compat.c | 5
-rw-r--r--  kernel/cpu.c | 56
-rw-r--r--  kernel/cpuset.c | 44
-rw-r--r--  kernel/debug/debug_core.c | 19
-rw-r--r--  kernel/debug/kdb/kdb_io.c | 46
-rw-r--r--  kernel/debug/kdb/kdb_main.c | 16
-rw-r--r--  kernel/debug/kdb/kdb_private.h | 4
-rw-r--r--  kernel/events/Makefile | 2
-rw-r--r--  kernel/events/core.c | 12
-rw-r--r--  kernel/exit.c | 3
-rw-r--r--  kernel/fork.c | 17
-rw-r--r--  kernel/futex.c | 8
-rw-r--r--  kernel/gcov/Makefile | 36
-rw-r--r--  kernel/irq/manage.c | 3
-rw-r--r--  kernel/irq/proc.c | 11
-rw-r--r--  kernel/kexec.c | 25
-rw-r--r--  kernel/kprobes.c | 22
-rw-r--r--  kernel/livepatch/Kconfig | 18
-rw-r--r--  kernel/livepatch/Makefile | 3
-rw-r--r--  kernel/livepatch/core.c | 1015
-rw-r--r--  kernel/locking/Makefile | 11
-rw-r--r--  kernel/locking/mcs_spinlock.h | 16
-rw-r--r--  kernel/locking/mutex.c | 62
-rw-r--r--  kernel/locking/osq_lock.c (renamed from kernel/locking/mcs_spinlock.c) | 9
-rw-r--r--  kernel/locking/rtmutex.c | 10
-rw-r--r--  kernel/locking/rwsem-spinlock.c | 2
-rw-r--r--  kernel/locking/rwsem-xadd.c | 3
-rw-r--r--  kernel/locking/spinlock.c | 8
-rw-r--r--  kernel/module.c | 58
-rw-r--r--  kernel/notifier.c | 3
-rw-r--r--  kernel/padata.c | 11
-rw-r--r--  kernel/panic.c | 2
-rw-r--r--  kernel/power/Kconfig | 1
-rw-r--r--  kernel/power/process.c | 75
-rw-r--r--  kernel/power/qos.c | 91
-rw-r--r--  kernel/power/snapshot.c | 11
-rw-r--r--  kernel/power/suspend.c | 43
-rw-r--r--  kernel/printk/printk.c | 14
-rw-r--r--  kernel/profile.c | 3
-rw-r--r--  kernel/ptrace.c | 1
-rw-r--r--  kernel/rcu/Makefile | 3
-rw-r--r--  kernel/rcu/rcu.h | 6
-rw-r--r--  kernel/rcu/rcutorture.c | 66
-rw-r--r--  kernel/rcu/srcu.c | 2
-rw-r--r--  kernel/rcu/tiny.c | 113
-rw-r--r--  kernel/rcu/tiny_plugin.h | 9
-rw-r--r--  kernel/rcu/tree.c | 289
-rw-r--r--  kernel/rcu/tree.h | 62
-rw-r--r--  kernel/rcu/tree_plugin.h | 277
-rw-r--r--  kernel/rcu/tree_trace.c | 8
-rw-r--r--  kernel/resource.c | 25
-rw-r--r--  kernel/sched/Makefile | 2
-rw-r--r--  kernel/sched/auto_group.c | 6
-rw-r--r--  kernel/sched/clock.c | 13
-rw-r--r--  kernel/sched/completion.c | 31
-rw-r--r--  kernel/sched/core.c | 264
-rw-r--r--  kernel/sched/cpudeadline.c | 27
-rw-r--r--  kernel/sched/cpudeadline.h | 2
-rw-r--r--  kernel/sched/deadline.c | 87
-rw-r--r--  kernel/sched/debug.c | 1
-rw-r--r--  kernel/sched/fair.c | 9
-rw-r--r--  kernel/sched/idle.c | 19
-rw-r--r--  kernel/sched/rt.c | 26
-rw-r--r--  kernel/sched/sched.h | 98
-rw-r--r--  kernel/sched/stats.c | 11
-rw-r--r--  kernel/seccomp.c | 4
-rw-r--r--  kernel/signal.c | 6
-rw-r--r--  kernel/smpboot.c | 2
-rw-r--r--  kernel/softirq.c | 9
-rw-r--r--  kernel/sys.c | 12
-rw-r--r--  kernel/sysctl.c | 3
-rw-r--r--  kernel/taskstats.c | 13
-rw-r--r--  kernel/time/Makefile | 2
-rw-r--r--  kernel/time/alarmtimer.c | 2
-rw-r--r--  kernel/time/clocksource.c | 76
-rw-r--r--  kernel/time/hrtimer.c | 116
-rw-r--r--  kernel/time/ntp.c | 14
-rw-r--r--  kernel/time/posix-cpu-timers.c | 3
-rw-r--r--  kernel/time/tick-common.c | 50
-rw-r--r--  kernel/time/tick-sched.c | 11
-rw-r--r--  kernel/time/timecounter.c | 112
-rw-r--r--  kernel/time/timekeeping.c | 60
-rw-r--r--  kernel/time/timekeeping.h | 2
-rw-r--r--  kernel/trace/Makefile | 4
-rw-r--r--  kernel/trace/ftrace.c | 2
-rw-r--r--  kernel/trace/power-traces.c | 1
-rw-r--r--  kernel/trace/ring_buffer.c | 42
-rw-r--r--  kernel/trace/ring_buffer_benchmark.c | 18
-rw-r--r--  kernel/trace/trace.c | 194
-rw-r--r--  kernel/trace/trace.h | 2
-rw-r--r--  kernel/trace/trace_branch.c | 1
-rw-r--r--  kernel/trace/trace_events.c | 2
-rw-r--r--  kernel/trace/trace_export.c | 2
-rw-r--r--  kernel/trace/trace_functions_graph.c | 2
-rw-r--r--  kernel/trace/trace_irqsoff.c | 2
-rw-r--r--  kernel/trace/trace_kprobe.c | 2
-rw-r--r--  kernel/trace/trace_nop.c | 2
-rw-r--r--  kernel/trace/trace_output.c | 44
-rw-r--r--  kernel/trace/trace_printk.c | 4
-rw-r--r--  kernel/trace/trace_sched_switch.c | 2
-rw-r--r--  kernel/trace/trace_sched_wakeup.c | 2
-rw-r--r--  kernel/trace/trace_seq.c | 2
-rw-r--r--  kernel/trace/trace_stack.c | 4
-rw-r--r--  kernel/trace/trace_stat.c | 2
-rw-r--r--  kernel/trace/trace_uprobe.c | 2
-rw-r--r--  kernel/watchdog.c | 2
-rw-r--r--  kernel/workqueue.c | 5
114 files changed, 2832 insertions, 1577 deletions
diff --git a/kernel/Kconfig.locks b/kernel/Kconfig.locks
index 76768ee812b2..08561f1acd13 100644
--- a/kernel/Kconfig.locks
+++ b/kernel/Kconfig.locks
@@ -231,6 +231,10 @@ config RWSEM_SPIN_ON_OWNER
 	def_bool y
 	depends on SMP && RWSEM_XCHGADD_ALGORITHM && ARCH_SUPPORTS_ATOMIC_RMW
 
+config LOCK_SPIN_ON_OWNER
+	def_bool y
+	depends on MUTEX_SPIN_ON_OWNER || RWSEM_SPIN_ON_OWNER
+
 config ARCH_USE_QUEUE_RWLOCK
 	bool
 
diff --git a/kernel/Makefile b/kernel/Makefile
index a59481a3fa6c..1408b3353a3c 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -13,8 +13,8 @@ obj-y = fork.o exec_domain.o panic.o \
 
 ifdef CONFIG_FUNCTION_TRACER
 # Do not trace debug files and internal ftrace files
-CFLAGS_REMOVE_cgroup-debug.o = -pg
-CFLAGS_REMOVE_irq_work.o = -pg
+CFLAGS_REMOVE_cgroup-debug.o = $(CC_FLAGS_FTRACE)
+CFLAGS_REMOVE_irq_work.o = $(CC_FLAGS_FTRACE)
 endif
 
 # cond_syscall is currently not LTO compatible
@@ -26,6 +26,7 @@ obj-y += power/
 obj-y += printk/
 obj-y += irq/
 obj-y += rcu/
+obj-y += livepatch/
 
 obj-$(CONFIG_CHECKPOINT_RESTORE) += kcmp.o
 obj-$(CONFIG_FREEZER) += freezer.o
@@ -142,7 +143,7 @@ endif
 kernel/system_certificates.o: $(obj)/x509_certificate_list
 
 quiet_cmd_x509certs = CERTS $@
-      cmd_x509certs = cat $(X509_CERTIFICATES) /dev/null >$@ $(foreach X509,$(X509_CERTIFICATES),; echo " - Including cert $(X509)")
+      cmd_x509certs = cat $(X509_CERTIFICATES) /dev/null >$@ $(foreach X509,$(X509_CERTIFICATES),; $(kecho) " - Including cert $(X509)")
 
 targets += $(obj)/x509_certificate_list
 $(obj)/x509_certificate_list: $(X509_CERTIFICATES) $(obj)/.x509.list
diff --git a/kernel/acct.c b/kernel/acct.c
index 33738ef972f3..e6c10d1a4058 100644
--- a/kernel/acct.c
+++ b/kernel/acct.c
@@ -76,10 +76,11 @@ int acct_parm[3] = {4, 2, 30};
 /*
  * External references and all of the globals.
  */
-static void do_acct_process(struct bsd_acct_struct *acct);
 
 struct bsd_acct_struct {
 	struct fs_pin		pin;
+	atomic_long_t		count;
+	struct rcu_head		rcu;
 	struct mutex		lock;
 	int			active;
 	unsigned long		needcheck;
@@ -89,6 +90,8 @@ struct bsd_acct_struct {
 	struct completion	done;
 };
 
+static void do_acct_process(struct bsd_acct_struct *acct);
+
 /*
  * Check the amount of free space and suspend/resume accordingly.
  */
@@ -124,32 +127,56 @@ out:
 	return acct->active;
 }
 
+static void acct_put(struct bsd_acct_struct *p)
+{
+	if (atomic_long_dec_and_test(&p->count))
+		kfree_rcu(p, rcu);
+}
+
+static inline struct bsd_acct_struct *to_acct(struct fs_pin *p)
+{
+	return p ? container_of(p, struct bsd_acct_struct, pin) : NULL;
+}
+
 static struct bsd_acct_struct *acct_get(struct pid_namespace *ns)
 {
 	struct bsd_acct_struct *res;
 again:
 	smp_rmb();
 	rcu_read_lock();
-	res = ACCESS_ONCE(ns->bacct);
+	res = to_acct(ACCESS_ONCE(ns->bacct));
 	if (!res) {
 		rcu_read_unlock();
 		return NULL;
 	}
-	if (!atomic_long_inc_not_zero(&res->pin.count)) {
+	if (!atomic_long_inc_not_zero(&res->count)) {
 		rcu_read_unlock();
 		cpu_relax();
 		goto again;
 	}
 	rcu_read_unlock();
 	mutex_lock(&res->lock);
-	if (!res->ns) {
+	if (res != to_acct(ACCESS_ONCE(ns->bacct))) {
 		mutex_unlock(&res->lock);
-		pin_put(&res->pin);
+		acct_put(res);
 		goto again;
 	}
 	return res;
 }
 
+static void acct_pin_kill(struct fs_pin *pin)
+{
+	struct bsd_acct_struct *acct = to_acct(pin);
+	mutex_lock(&acct->lock);
+	do_acct_process(acct);
+	schedule_work(&acct->work);
+	wait_for_completion(&acct->done);
+	cmpxchg(&acct->ns->bacct, pin, NULL);
+	mutex_unlock(&acct->lock);
+	pin_remove(pin);
+	acct_put(acct);
+}
+
 static void close_work(struct work_struct *work)
 {
 	struct bsd_acct_struct *acct = container_of(work, struct bsd_acct_struct, work);
@@ -160,44 +187,13 @@ static void close_work(struct work_struct *work)
 	complete(&acct->done);
 }
 
-static void acct_kill(struct bsd_acct_struct *acct,
-		      struct bsd_acct_struct *new)
-{
-	if (acct) {
-		struct pid_namespace *ns = acct->ns;
-		do_acct_process(acct);
-		INIT_WORK(&acct->work, close_work);
-		init_completion(&acct->done);
-		schedule_work(&acct->work);
-		wait_for_completion(&acct->done);
-		pin_remove(&acct->pin);
-		ns->bacct = new;
-		acct->ns = NULL;
-		atomic_long_dec(&acct->pin.count);
-		mutex_unlock(&acct->lock);
-		pin_put(&acct->pin);
-	}
-}
-
-static void acct_pin_kill(struct fs_pin *pin)
-{
-	struct bsd_acct_struct *acct;
-	acct = container_of(pin, struct bsd_acct_struct, pin);
-	mutex_lock(&acct->lock);
-	if (!acct->ns) {
-		mutex_unlock(&acct->lock);
-		pin_put(pin);
-		acct = NULL;
-	}
-	acct_kill(acct, NULL);
-}
-
 static int acct_on(struct filename *pathname)
 {
 	struct file *file;
 	struct vfsmount *mnt, *internal;
 	struct pid_namespace *ns = task_active_pid_ns(current);
-	struct bsd_acct_struct *acct, *old;
+	struct bsd_acct_struct *acct;
+	struct fs_pin *old;
 	int err;
 
 	acct = kzalloc(sizeof(struct bsd_acct_struct), GFP_KERNEL);
@@ -238,21 +234,21 @@ static int acct_on(struct filename *pathname)
 	mnt = file->f_path.mnt;
 	file->f_path.mnt = internal;
 
-	atomic_long_set(&acct->pin.count, 1);
-	acct->pin.kill = acct_pin_kill;
+	atomic_long_set(&acct->count, 1);
+	init_fs_pin(&acct->pin, acct_pin_kill);
 	acct->file = file;
 	acct->needcheck = jiffies;
 	acct->ns = ns;
 	mutex_init(&acct->lock);
+	INIT_WORK(&acct->work, close_work);
+	init_completion(&acct->done);
 	mutex_lock_nested(&acct->lock, 1);	/* nobody has seen it yet */
 	pin_insert(&acct->pin, mnt);
 
-	old = acct_get(ns);
-	if (old)
-		acct_kill(old, acct);
-	else
-		ns->bacct = acct;
+	rcu_read_lock();
+	old = xchg(&ns->bacct, &acct->pin);
 	mutex_unlock(&acct->lock);
+	pin_kill(old);
 	mnt_drop_write(mnt);
 	mntput(mnt);
 	return 0;
@@ -288,7 +284,8 @@ SYSCALL_DEFINE1(acct, const char __user *, name)
 		mutex_unlock(&acct_on_mutex);
 		putname(tmp);
 	} else {
-		acct_kill(acct_get(task_active_pid_ns(current)), NULL);
+		rcu_read_lock();
+		pin_kill(task_active_pid_ns(current)->bacct);
 	}
 
 	return error;
@@ -296,7 +293,8 @@ SYSCALL_DEFINE1(acct, const char __user *, name)
 
 void acct_exit_ns(struct pid_namespace *ns)
 {
-	acct_kill(acct_get(ns), NULL);
+	rcu_read_lock();
+	pin_kill(ns->bacct);
 }
 
 /*
@@ -576,7 +574,7 @@ static void slow_acct_process(struct pid_namespace *ns)
 		if (acct) {
 			do_acct_process(acct);
 			mutex_unlock(&acct->lock);
-			pin_put(&acct->pin);
+			acct_put(acct);
 		}
 	}
 }
diff --git a/kernel/audit.h b/kernel/audit.h
index 3cdffad5a1d9..1caa0d345d90 100644
--- a/kernel/audit.h
+++ b/kernel/audit.h
@@ -24,12 +24,6 @@
 #include <linux/skbuff.h>
 #include <uapi/linux/mqueue.h>
 
-/* 0 = no checking
-   1 = put_count checking
-   2 = verbose put_count checking
-*/
-#define AUDIT_DEBUG 0
-
 /* AUDIT_NAMES is the number of slots we reserve in the audit_context
  * for saving names from getname(). If we get more names we will allocate
  * a name dynamically and also add those to the list anchored by names_list. */
@@ -74,9 +68,8 @@ struct audit_cap_data {
 	};
 };
 
-/* When fs/namei.c:getname() is called, we store the pointer in name and
- * we don't let putname() free it (instead we free all of the saved
- * pointers at syscall exit time).
+/* When fs/namei.c:getname() is called, we store the pointer in name and bump
+ * the refcnt in the associated filename struct.
  *
  * Further, in fs/namei.c:path_lookup() we store the inode and device.
  */
@@ -86,7 +79,6 @@ struct audit_names {
 	struct filename		*name;
 	int			name_len;	/* number of chars to log */
 	bool			hidden;		/* don't log this record */
-	bool			name_put;	/* call __putname()? */
 
 	unsigned long		ino;
 	dev_t			dev;
@@ -208,11 +200,6 @@ struct audit_context {
 	};
 	int fds[2];
 	struct audit_proctitle proctitle;
-
-#if AUDIT_DEBUG
-	int		    put_count;
-	int		    ino_count;
-#endif
 };
 
 extern u32 audit_ever_enabled;
diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c
index 4f68a326d92e..72e1660a79a3 100644
--- a/kernel/auditfilter.c
+++ b/kernel/auditfilter.c
@@ -425,7 +425,6 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data,
 		goto exit_nofree;
 
 	bufp = data->buf;
-	entry->rule.vers_ops = 2;
 	for (i = 0; i < data->field_count; i++) {
 		struct audit_field *f = &entry->rule.fields[i];
 
@@ -758,7 +757,6 @@ struct audit_entry *audit_dupe_rule(struct audit_krule *old)
 		return ERR_PTR(-ENOMEM);
 
 	new = &entry->rule;
-	new->vers_ops = old->vers_ops;
 	new->flags = old->flags;
 	new->pflags = old->pflags;
 	new->listnr = old->listnr;
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index 072566dd0caf..dc4ae70a7413 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -866,33 +866,10 @@ static inline void audit_free_names(struct audit_context *context)
 {
 	struct audit_names *n, *next;
 
-#if AUDIT_DEBUG == 2
-	if (context->put_count + context->ino_count != context->name_count) {
-		int i = 0;
-
-		pr_err("%s:%d(:%d): major=%d in_syscall=%d"
-		       " name_count=%d put_count=%d ino_count=%d"
-		       " [NOT freeing]\n", __FILE__, __LINE__,
-		       context->serial, context->major, context->in_syscall,
-		       context->name_count, context->put_count,
-		       context->ino_count);
-		list_for_each_entry(n, &context->names_list, list) {
-			pr_err("names[%d] = %p = %s\n", i++, n->name,
-			       n->name->name ?: "(null)");
-		}
-		dump_stack();
-		return;
-	}
-#endif
-#if AUDIT_DEBUG
-	context->put_count  = 0;
-	context->ino_count  = 0;
-#endif
-
 	list_for_each_entry_safe(n, next, &context->names_list, list) {
 		list_del(&n->list);
-		if (n->name && n->name_put)
-			final_putname(n->name);
+		if (n->name)
+			putname(n->name);
 		if (n->should_free)
 			kfree(n);
 	}
@@ -1711,9 +1688,6 @@ static struct audit_names *audit_alloc_name(struct audit_context *context,
 	list_add_tail(&aname->list, &context->names_list);
 
 	context->name_count++;
-#if AUDIT_DEBUG
-	context->ino_count++;
-#endif
 	return aname;
 }
 
@@ -1734,8 +1708,10 @@ __audit_reusename(const __user char *uptr)
 	list_for_each_entry(n, &context->names_list, list) {
 		if (!n->name)
 			continue;
-		if (n->name->uptr == uptr)
+		if (n->name->uptr == uptr) {
+			n->name->refcnt++;
 			return n->name;
+		}
 	}
 	return NULL;
 }
@@ -1752,19 +1728,8 @@ void __audit_getname(struct filename *name)
 	struct audit_context *context = current->audit_context;
 	struct audit_names *n;
 
-	if (!context->in_syscall) {
-#if AUDIT_DEBUG == 2
-		pr_err("%s:%d(:%d): ignoring getname(%p)\n",
-		       __FILE__, __LINE__, context->serial, name);
-		dump_stack();
-#endif
+	if (!context->in_syscall)
 		return;
-	}
-
-#if AUDIT_DEBUG
-	/* The filename _must_ have a populated ->name */
-	BUG_ON(!name->name);
-#endif
 
 	n = audit_alloc_name(context, AUDIT_TYPE_UNKNOWN);
 	if (!n)
@@ -1772,56 +1737,13 @@ void __audit_getname(struct filename *name)
 
 	n->name = name;
 	n->name_len = AUDIT_NAME_FULL;
-	n->name_put = true;
 	name->aname = n;
+	name->refcnt++;
 
 	if (!context->pwd.dentry)
 		get_fs_pwd(current->fs, &context->pwd);
 }
 
-/* audit_putname - intercept a putname request
- * @name: name to intercept and delay for putname
- *
- * If we have stored the name from getname in the audit context,
- * then we delay the putname until syscall exit.
- * Called from include/linux/fs.h:putname().
- */
-void audit_putname(struct filename *name)
-{
-	struct audit_context *context = current->audit_context;
-
-	BUG_ON(!context);
-	if (!name->aname || !context->in_syscall) {
-#if AUDIT_DEBUG == 2
-		pr_err("%s:%d(:%d): final_putname(%p)\n",
-		       __FILE__, __LINE__, context->serial, name);
-		if (context->name_count) {
-			struct audit_names *n;
-			int i = 0;
-
-			list_for_each_entry(n, &context->names_list, list)
-				pr_err("name[%d] = %p = %s\n", i++, n->name,
-				       n->name->name ?: "(null)");
-		}
-#endif
-		final_putname(name);
-	}
-#if AUDIT_DEBUG
-	else {
-		++context->put_count;
-		if (context->put_count > context->name_count) {
-			pr_err("%s:%d(:%d): major=%d in_syscall=%d putname(%p)"
-			       " name_count=%d put_count=%d\n",
-			       __FILE__, __LINE__,
-			       context->serial, context->major,
-			       context->in_syscall, name->name,
-			       context->name_count, context->put_count);
-			dump_stack();
-		}
-	}
-#endif
-}
-
 /**
  * __audit_inode - store the inode and device from a lookup
  * @name: name being audited
@@ -1842,10 +1764,6 @@ void __audit_inode(struct filename *name, const struct dentry *dentry,
 	if (!name)
 		goto out_alloc;
 
-#if AUDIT_DEBUG
-	/* The struct filename _must_ have a populated ->name */
-	BUG_ON(!name->name);
-#endif
 	/*
 	 * If we have a pointer to an audit_names entry already, then we can
 	 * just use it directly if the type is correct.
@@ -1863,7 +1781,17 @@ void __audit_inode(struct filename *name, const struct dentry *dentry,
 	}
 
 	list_for_each_entry_reverse(n, &context->names_list, list) {
-		if (!n->name || strcmp(n->name->name, name->name))
+		if (n->ino) {
+			/* valid inode number, use that for the comparison */
+			if (n->ino != inode->i_ino ||
+			    n->dev != inode->i_sb->s_dev)
+				continue;
+		} else if (n->name) {
+			/* inode number has not been set, check the name */
+			if (strcmp(n->name->name, name->name))
+				continue;
+		} else
+			/* no inode and no name (?!) ... this is odd ... */
 			continue;
 
 		/* match the correct record type */
@@ -1882,44 +1810,11 @@ out_alloc:
 	n = audit_alloc_name(context, AUDIT_TYPE_UNKNOWN);
 	if (!n)
 		return;
-	/* unfortunately, while we may have a path name to record with the
-	 * inode, we can't always rely on the string lasting until the end of
-	 * the syscall so we need to create our own copy, it may fail due to
-	 * memory allocation issues, but we do our best */
 	if (name) {
-		/* we can't use getname_kernel() due to size limits */
-		size_t len = strlen(name->name) + 1;
-		struct filename *new = __getname();
-
-		if (unlikely(!new))
-			goto out;
-
-		if (len <= (PATH_MAX - sizeof(*new))) {
-			new->name = (char *)(new) + sizeof(*new);
-			new->separate = false;
-		} else if (len <= PATH_MAX) {
-			/* this looks odd, but is due to final_putname() */
-			struct filename *new2;
-
-			new2 = kmalloc(sizeof(*new2), GFP_KERNEL);
-			if (unlikely(!new2)) {
-				__putname(new);
-				goto out;
-			}
-			new2->name = (char *)new;
-			new2->separate = true;
-			new = new2;
-		} else {
-			/* we should never get here, but let's be safe */
-			__putname(new);
-			goto out;
-		}
-		strlcpy((char *)new->name, name->name, len);
-		new->uptr = NULL;
-		new->aname = n;
-		n->name = new;
-		n->name_put = true;
+		n->name = name;
+		name->refcnt++;
 	}
+
 out:
 	if (parent) {
 		n->name_len = n->name ? parent_len(n->name->name) : AUDIT_NAME_FULL;
@@ -1970,11 +1865,16 @@ void __audit_inode_child(const struct inode *parent,
 
 	/* look for a parent entry first */
 	list_for_each_entry(n, &context->names_list, list) {
-		if (!n->name || n->type != AUDIT_TYPE_PARENT)
+		if (!n->name ||
+		    (n->type != AUDIT_TYPE_PARENT &&
+		     n->type != AUDIT_TYPE_UNKNOWN))
 			continue;
 
-		if (n->ino == parent->i_ino &&
-		    !audit_compare_dname_path(dname, n->name->name, n->name_len)) {
+		if (n->ino == parent->i_ino && n->dev == parent->i_sb->s_dev &&
+		    !audit_compare_dname_path(dname,
+					      n->name->name, n->name_len)) {
+			if (n->type == AUDIT_TYPE_UNKNOWN)
+				n->type = AUDIT_TYPE_PARENT;
 			found_parent = n;
 			break;
 		}
@@ -1983,11 +1883,8 @@ void __audit_inode_child(const struct inode *parent,
 	/* is there a matching child entry? */
 	list_for_each_entry(n, &context->names_list, list) {
 		/* can only match entries that have a name */
-		if (!n->name || n->type != type)
-			continue;
-
-		/* if we found a parent, make sure this one is a child of it */
-		if (found_parent && (n->name != found_parent->name))
+		if (!n->name ||
+		    (n->type != type && n->type != AUDIT_TYPE_UNKNOWN))
 			continue;
 
 		if (!strcmp(dname, n->name->name) ||
@@ -1995,6 +1892,8 @@ void __audit_inode_child(const struct inode *parent,
 		    found_parent ?
 		    found_parent->name_len :
 		    AUDIT_NAME_FULL)) {
+			if (n->type == AUDIT_TYPE_UNKNOWN)
+				n->type = type;
 			found_child = n;
 			break;
 		}
@@ -2019,10 +1918,10 @@ void __audit_inode_child(const struct inode *parent,
 		if (found_parent) {
 			found_child->name = found_parent->name;
 			found_child->name_len = AUDIT_NAME_FULL;
-			/* don't call __putname() */
-			found_child->name_put = false;
+			found_child->name->refcnt++;
 		}
 	}
+
 	if (inode)
 		audit_copy_inode(found_child, dentry, inode);
 	else
@@ -2405,7 +2304,6 @@ int __audit_log_bprm_fcaps(struct linux_binprm *bprm,
 	struct audit_aux_data_bprm_fcaps *ax;
 	struct audit_context *context = current->audit_context;
 	struct cpu_vfs_cap_data vcaps;
-	struct dentry *dentry;
 
 	ax = kmalloc(sizeof(*ax), GFP_KERNEL);
 	if (!ax)
@@ -2415,9 +2313,7 @@ int __audit_log_bprm_fcaps(struct linux_binprm *bprm,
 	ax->d.next = context->aux;
 	context->aux = (void *)ax;
 
-	dentry = dget(bprm->file->f_path.dentry);
-	get_vfs_caps_from_disk(dentry, &vcaps);
-	dput(dentry);
+	get_vfs_caps_from_disk(bprm->file->f_path.dentry, &vcaps);
 
 	ax->fcap.permitted = vcaps.permitted;
 	ax->fcap.inheritable = vcaps.inheritable;
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 04cfe8ace520..29a7b2cc593e 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -3077,7 +3077,7 @@ static int cgroup_add_file(struct cgroup *cgrp, struct cftype *cft)
 #endif
 	kn = __kernfs_create_file(cgrp->kn, cgroup_file_name(cgrp, cft, name),
 				  cgroup_file_mode(cft), 0, cft->kf_ops, cft,
-				  NULL, false, key);
+				  NULL, key);
 	if (IS_ERR(kn))
 		return PTR_ERR(kn);
 
@@ -4373,16 +4373,20 @@ static void css_free_work_fn(struct work_struct *work)
 {
 	struct cgroup_subsys_state *css =
 		container_of(work, struct cgroup_subsys_state, destroy_work);
+	struct cgroup_subsys *ss = css->ss;
 	struct cgroup *cgrp = css->cgroup;
 
 	percpu_ref_exit(&css->refcnt);
 
-	if (css->ss) {
+	if (ss) {
 		/* css free path */
+		int id = css->id;
+
 		if (css->parent)
 			css_put(css->parent);
 
-		css->ss->css_free(css);
+		ss->css_free(css);
+		cgroup_idr_remove(&ss->css_idr, id);
 		cgroup_put(cgrp);
 	} else {
 		/* cgroup free path */
@@ -4434,7 +4438,7 @@ static void css_release_work_fn(struct work_struct *work)
 
 	if (ss) {
 		/* css release path */
-		cgroup_idr_remove(&ss->css_idr, css->id);
+		cgroup_idr_replace(&ss->css_idr, NULL, css->id);
 		if (ss->css_released)
 			ss->css_released(css);
 	} else {
diff --git a/kernel/compat.c b/kernel/compat.c
index ebb3c369d03d..24f00610c575 100644
--- a/kernel/compat.c
+++ b/kernel/compat.c
@@ -276,8 +276,7 @@ COMPAT_SYSCALL_DEFINE2(nanosleep, struct compat_timespec __user *, rqtp,
 	 * core implementation decides to return random nonsense.
 	 */
 	if (ret == -ERESTART_RESTARTBLOCK) {
-		struct restart_block *restart
-			= &current_thread_info()->restart_block;
+		struct restart_block *restart = &current->restart_block;
 
 		restart->fn = compat_nanosleep_restart;
 		restart->nanosleep.compat_rmtp = rmtp;
@@ -860,7 +859,7 @@ COMPAT_SYSCALL_DEFINE4(clock_nanosleep, clockid_t, which_clock, int, flags,
 		return -EFAULT;
 
 	if (err == -ERESTART_RESTARTBLOCK) {
-		restart = &current_thread_info()->restart_block;
+		restart = &current->restart_block;
 		restart->fn = compat_clock_nanosleep_restart;
 		restart->nanosleep.compat_rmtp = rmtp;
 	}
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 5d220234b3ca..1972b161c61e 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -58,22 +58,23 @@ static int cpu_hotplug_disabled;
 
 static struct {
 	struct task_struct	*active_writer;
-	struct mutex		lock; /* Synchronizes accesses to refcount, */
+	/* wait queue to wake up the active_writer */
+	wait_queue_head_t	wq;
+	/* verifies that no writer will get active while readers are active */
+	struct mutex		lock;
 	/*
 	 * Also blocks the new readers during
 	 * an ongoing cpu hotplug operation.
 	 */
-	int			refcount;
-	/* And allows lockless put_online_cpus(). */
-	atomic_t		puts_pending;
+	atomic_t		refcount;
 
 #ifdef CONFIG_DEBUG_LOCK_ALLOC
 	struct lockdep_map dep_map;
 #endif
 } cpu_hotplug = {
 	.active_writer = NULL,
+	.wq = __WAIT_QUEUE_HEAD_INITIALIZER(cpu_hotplug.wq),
 	.lock = __MUTEX_INITIALIZER(cpu_hotplug.lock),
-	.refcount = 0,
 #ifdef CONFIG_DEBUG_LOCK_ALLOC
 	.dep_map = {.name = "cpu_hotplug.lock" },
 #endif
@@ -86,15 +87,6 @@ static struct {
 #define cpuhp_lock_acquire()      lock_map_acquire(&cpu_hotplug.dep_map)
 #define cpuhp_lock_release()      lock_map_release(&cpu_hotplug.dep_map)
 
-static void apply_puts_pending(int max)
-{
-	int delta;
-
-	if (atomic_read(&cpu_hotplug.puts_pending) >= max) {
-		delta = atomic_xchg(&cpu_hotplug.puts_pending, 0);
-		cpu_hotplug.refcount -= delta;
-	}
-}
 
 void get_online_cpus(void)
 {
@@ -103,8 +95,7 @@ void get_online_cpus(void)
 		return;
 	cpuhp_lock_acquire_read();
 	mutex_lock(&cpu_hotplug.lock);
-	apply_puts_pending(65536);
-	cpu_hotplug.refcount++;
+	atomic_inc(&cpu_hotplug.refcount);
 	mutex_unlock(&cpu_hotplug.lock);
 }
 EXPORT_SYMBOL_GPL(get_online_cpus);
@@ -116,8 +107,7 @@ bool try_get_online_cpus(void)
 	if (!mutex_trylock(&cpu_hotplug.lock))
 		return false;
 	cpuhp_lock_acquire_tryread();
-	apply_puts_pending(65536);
-	cpu_hotplug.refcount++;
+	atomic_inc(&cpu_hotplug.refcount);
 	mutex_unlock(&cpu_hotplug.lock);
 	return true;
 }
@@ -125,20 +115,18 @@ EXPORT_SYMBOL_GPL(try_get_online_cpus);
 
 void put_online_cpus(void)
 {
+	int refcount;
+
 	if (cpu_hotplug.active_writer == current)
 		return;
-	if (!mutex_trylock(&cpu_hotplug.lock)) {
-		atomic_inc(&cpu_hotplug.puts_pending);
-		cpuhp_lock_release();
-		return;
-	}
 
-	if (WARN_ON(!cpu_hotplug.refcount))
-		cpu_hotplug.refcount++; /* try to fix things up */
+	refcount = atomic_dec_return(&cpu_hotplug.refcount);
+	if (WARN_ON(refcount < 0)) /* try to fix things up */
+		atomic_inc(&cpu_hotplug.refcount);
+
+	if (refcount <= 0 && waitqueue_active(&cpu_hotplug.wq))
+		wake_up(&cpu_hotplug.wq);
 
-	if (!--cpu_hotplug.refcount && unlikely(cpu_hotplug.active_writer))
-		wake_up_process(cpu_hotplug.active_writer);
-	mutex_unlock(&cpu_hotplug.lock);
 	cpuhp_lock_release();
 
 }
@@ -168,18 +156,20 @@ EXPORT_SYMBOL_GPL(put_online_cpus);
  */
 void cpu_hotplug_begin(void)
 {
-	cpu_hotplug.active_writer = current;
+	DEFINE_WAIT(wait);
 
+	cpu_hotplug.active_writer = current;
 	cpuhp_lock_acquire();
+
 	for (;;) {
 		mutex_lock(&cpu_hotplug.lock);
-		apply_puts_pending(1);
-		if (likely(!cpu_hotplug.refcount))
+		prepare_to_wait(&cpu_hotplug.wq, &wait, TASK_UNINTERRUPTIBLE);
+		if (likely(!atomic_read(&cpu_hotplug.refcount)))
 			break;
-		__set_current_state(TASK_UNINTERRUPTIBLE);
 		mutex_unlock(&cpu_hotplug.lock);
 		schedule();
 	}
+	finish_wait(&cpu_hotplug.wq, &wait);
 }
 
 void cpu_hotplug_done(void)
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 64b257f6bca2..1d1fe9361d29 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -1707,40 +1707,27 @@ static int cpuset_common_seq_show(struct seq_file *sf, void *v)
 {
 	struct cpuset *cs = css_cs(seq_css(sf));
 	cpuset_filetype_t type = seq_cft(sf)->private;
-	ssize_t count;
-	char *buf, *s;
 	int ret = 0;
 
-	count = seq_get_buf(sf, &buf);
-	s = buf;
-
 	spin_lock_irq(&callback_lock);
 
 	switch (type) {
 	case FILE_CPULIST:
-		s += cpulist_scnprintf(s, count, cs->cpus_allowed);
+		seq_printf(sf, "%*pbl\n", cpumask_pr_args(cs->cpus_allowed));
 		break;
 	case FILE_MEMLIST:
-		s += nodelist_scnprintf(s, count, cs->mems_allowed);
+		seq_printf(sf, "%*pbl\n", nodemask_pr_args(&cs->mems_allowed));
 		break;
 	case FILE_EFFECTIVE_CPULIST:
-		s += cpulist_scnprintf(s, count, cs->effective_cpus);
+		seq_printf(sf, "%*pbl\n", cpumask_pr_args(cs->effective_cpus));
 		break;
 	case FILE_EFFECTIVE_MEMLIST:
-		s += nodelist_scnprintf(s, count, cs->effective_mems);
+		seq_printf(sf, "%*pbl\n", nodemask_pr_args(&cs->effective_mems));
 		break;
 	default:
 		ret = -EINVAL;
-		goto out_unlock;
 	}
 
-	if (s < buf + count - 1) {
-		*s++ = '\n';
-		seq_commit(sf, s - buf);
-	} else {
-		seq_commit(sf, -1);
-	}
-out_unlock:
 	spin_unlock_irq(&callback_lock);
 	return ret;
 }
@@ -2400,7 +2387,7 @@ void cpuset_cpus_allowed_fallback(struct task_struct *tsk)
 	 */
 }
 
-void cpuset_init_current_mems_allowed(void)
+void __init cpuset_init_current_mems_allowed(void)
 {
 	nodes_setall(current->mems_allowed);
 }
@@ -2610,8 +2597,6 @@ int cpuset_mems_allowed_intersects(const struct task_struct *tsk1,
 	return nodes_intersects(tsk1->mems_allowed, tsk2->mems_allowed);
 }
 
-#define CPUSET_NODELIST_LEN	(256)
-
 /**
  * cpuset_print_task_mems_allowed - prints task's cpuset and mems_allowed
  * @tsk: pointer to task_struct of some task.
@@ -2621,23 +2606,16 @@ int cpuset_mems_allowed_intersects(const struct task_struct *tsk1,
  */
 void cpuset_print_task_mems_allowed(struct task_struct *tsk)
 {
-	/* Statically allocated to prevent using excess stack. */
-	static char cpuset_nodelist[CPUSET_NODELIST_LEN];
-	static DEFINE_SPINLOCK(cpuset_buffer_lock);
 	struct cgroup *cgrp;
 
-	spin_lock(&cpuset_buffer_lock);
 	rcu_read_lock();
 
 	cgrp = task_cs(tsk)->css.cgroup;
-	nodelist_scnprintf(cpuset_nodelist, CPUSET_NODELIST_LEN,
-			   tsk->mems_allowed);
 	pr_info("%s cpuset=", tsk->comm);
 	pr_cont_cgroup_name(cgrp);
-	pr_cont(" mems_allowed=%s\n", cpuset_nodelist);
+	pr_cont(" mems_allowed=%*pbl\n", nodemask_pr_args(&tsk->mems_allowed));
 
 	rcu_read_unlock();
-	spin_unlock(&cpuset_buffer_lock);
 }
 
 /*
@@ -2715,10 +2693,8 @@ out:
 /* Display task mems_allowed in /proc/<pid>/status file. */
 void cpuset_task_status_allowed(struct seq_file *m, struct task_struct *task)
 {
-	seq_puts(m, "Mems_allowed:\t");
-	seq_nodemask(m, &task->mems_allowed);
-	seq_puts(m, "\n");
-	seq_puts(m, "Mems_allowed_list:\t");
-	seq_nodemask_list(m, &task->mems_allowed);
-	seq_puts(m, "\n");
+	seq_printf(m, "Mems_allowed:\t%*pb\n",
+		   nodemask_pr_args(&task->mems_allowed));
+	seq_printf(m, "Mems_allowed_list:\t%*pbl\n",
+		   nodemask_pr_args(&task->mems_allowed));
 }
diff --git a/kernel/debug/debug_core.c b/kernel/debug/debug_core.c
index 07ce18ca71e0..0874e2edd275 100644
--- a/kernel/debug/debug_core.c
+++ b/kernel/debug/debug_core.c
@@ -604,7 +604,7 @@ return_normal:
 					   online_cpus)
 			cpu_relax();
 		if (!time_left)
-			pr_crit("KGDB: Timed out waiting for secondary CPUs.\n");
+			pr_crit("Timed out waiting for secondary CPUs.\n");
 
 	/*
 	 * At this point the primary processor is completely
@@ -696,6 +696,14 @@ kgdb_handle_exception(int evector, int signo, int ecode, struct pt_regs *regs)
 
 	if (arch_kgdb_ops.enable_nmi)
 		arch_kgdb_ops.enable_nmi(0);
+	/*
+	 * Avoid entering the debugger if we were triggered due to an oops
+	 * but panic_timeout indicates the system should automatically
+	 * reboot on panic. We don't want to get stuck waiting for input
+	 * on such systems, especially if its "just" an oops.
+	 */
+	if (signo != SIGTRAP && panic_timeout)
+		return 1;
 
 	memset(ks, 0, sizeof(struct kgdb_state));
 	ks->cpu = raw_smp_processor_id();
@@ -828,6 +836,15 @@ static int kgdb_panic_event(struct notifier_block *self,
 				unsigned long val,
 				void *data)
 {
+	/*
+	 * Avoid entering the debugger if we were triggered due to a panic
+	 * We don't want to get stuck waiting for input from user in such case.
+	 * panic_timeout indicates the system should automatically
+	 * reboot on panic.
+	 */
+	if (panic_timeout)
+		return NOTIFY_DONE;
+
 	if (dbg_kdb_mode)
 		kdb_printf("PANIC: %s\n", (char *)data);
 	kgdb_breakpoint();
diff --git a/kernel/debug/kdb/kdb_io.c b/kernel/debug/kdb/kdb_io.c
index 7c70812caea5..fc1ef736253c 100644
--- a/kernel/debug/kdb/kdb_io.c
+++ b/kernel/debug/kdb/kdb_io.c
@@ -439,7 +439,7 @@ poll_again:
  * substituted for %d, %x or %o in the prompt.
  */
 
-char *kdb_getstr(char *buffer, size_t bufsize, char *prompt)
+char *kdb_getstr(char *buffer, size_t bufsize, const char *prompt)
 {
 	if (prompt && kdb_prompt_str != prompt)
 		strncpy(kdb_prompt_str, prompt, CMD_BUFLEN);
@@ -548,7 +548,7 @@ static int kdb_search_string(char *searched, char *searchfor)
 	return 0;
 }
 
-int vkdb_printf(const char *fmt, va_list ap)
+int vkdb_printf(enum kdb_msgsrc src, const char *fmt, va_list ap)
 {
 	int diag;
 	int linecount;
@@ -680,6 +680,12 @@ int vkdb_printf(const char *fmt, va_list ap)
 			size_avail = sizeof(kdb_buffer) - len;
 			goto kdb_print_out;
 		}
+		if (kdb_grepping_flag >= KDB_GREPPING_FLAG_SEARCH)
+			/*
+			 * This was a interactive search (using '/' at more
+			 * prompt) and it has completed. Clear the flag.
+			 */
+			kdb_grepping_flag = 0;
 		/*
 		 * at this point the string is a full line and
 		 * should be printed, up to the null.
@@ -691,19 +697,20 @@ kdb_printit:
 	 * Write to all consoles.
 	 */
 	retlen = strlen(kdb_buffer);
+	cp = (char *) printk_skip_level(kdb_buffer);
 	if (!dbg_kdb_mode && kgdb_connected) {
-		gdbstub_msg_write(kdb_buffer, retlen);
+		gdbstub_msg_write(cp, retlen - (cp - kdb_buffer));
 	} else {
 		if (dbg_io_ops && !dbg_io_ops->is_console) {
-			len = retlen;
-			cp = kdb_buffer;
+			len = retlen - (cp - kdb_buffer);
+			cp2 = cp;
 			while (len--) {
-				dbg_io_ops->write_char(*cp);
-				cp++;
+				dbg_io_ops->write_char(*cp2);
+				cp2++;
 			}
 		}
 		while (c) {
-			c->write(c, kdb_buffer, retlen);
+			c->write(c, cp, retlen - (cp - kdb_buffer));
 			touch_nmi_watchdog();
 			c = c->next;
 		}
@@ -711,7 +718,10 @@ kdb_printit:
 	if (logging) {
 		saved_loglevel = console_loglevel;
 		console_loglevel = CONSOLE_LOGLEVEL_SILENT;
-		printk(KERN_INFO "%s", kdb_buffer);
+		if (printk_get_level(kdb_buffer) || src == KDB_MSGSRC_PRINTK)
+			printk("%s", kdb_buffer);
+		else
+			pr_info("%s", kdb_buffer);
 	}
 
 	if (KDB_STATE(PAGER)) {
@@ -794,11 +804,23 @@ kdb_printit:
 			kdb_nextline = linecount - 1;
 			kdb_printf("\r");
 			suspend_grep = 1; /* for this recursion */
+		} else if (buf1[0] == '/' && !kdb_grepping_flag) {
+			kdb_printf("\r");
+			kdb_getstr(kdb_grep_string, KDB_GREP_STRLEN,
+				   kdbgetenv("SEARCHPROMPT") ?: "search> ");
+			*strchrnul(kdb_grep_string, '\n') = '\0';
+			kdb_grepping_flag += KDB_GREPPING_FLAG_SEARCH;
+			suspend_grep = 1; /* for this recursion */
 		} else if (buf1[0] && buf1[0] != '\n') {
 			/* user hit something other than enter */
 			suspend_grep = 1; /* for this recursion */
-			kdb_printf("\nOnly 'q' or 'Q' are processed at more "
-				   "prompt, input ignored\n");
+			if (buf1[0] != '/')
+				kdb_printf(
+				    "\nOnly 'q', 'Q' or '/' are processed at "
+				    "more prompt, input ignored\n");
+			else
+				kdb_printf("\n'/' cannot be used during | "
+					   "grep filtering, input ignored\n");
 		} else if (kdb_grepping_flag) {
 			/* user hit enter */
 			suspend_grep = 1; /* for this recursion */
@@ -844,7 +866,7 @@ int kdb_printf(const char *fmt, ...)
 	int r;
 
 	va_start(ap, fmt);
-	r = vkdb_printf(fmt, ap);
+	r = vkdb_printf(KDB_MSGSRC_INTERNAL, fmt, ap);
 	va_end(ap);
 
 	return r;
diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c
index 7b40c5f07dce..4121345498e0 100644
--- a/kernel/debug/kdb/kdb_main.c
+++ b/kernel/debug/kdb/kdb_main.c
@@ -50,8 +50,7 @@
 static int kdb_cmd_enabled = CONFIG_KDB_DEFAULT_ENABLE;
 module_param_named(cmd_enable, kdb_cmd_enabled, int, 0600);
 
-#define GREP_LEN 256
-char kdb_grep_string[GREP_LEN];
+char kdb_grep_string[KDB_GREP_STRLEN];
 int kdb_grepping_flag;
 EXPORT_SYMBOL(kdb_grepping_flag);
 int kdb_grep_leading;
@@ -870,7 +869,7 @@ static void parse_grep(const char *str)
 	len = strlen(cp);
 	if (!len)
 		return;
-	if (len >= GREP_LEN) {
+	if (len >= KDB_GREP_STRLEN) {
 		kdb_printf("search string too long\n");
 		return;
 	}
@@ -915,13 +914,12 @@ int kdb_parse(const char *cmdstr)
 	char *cp;
 	char *cpp, quoted;
 	kdbtab_t *tp;
-	int i, escaped, ignore_errors = 0, check_grep;
+	int i, escaped, ignore_errors = 0, check_grep = 0;
 
 	/*
 	 * First tokenize the command string.
 	 */
 	cp = (char *)cmdstr;
-	kdb_grepping_flag = check_grep = 0;
 
 	if (KDB_FLAG(CMD_INTERRUPT)) {
 		/* Previous command was interrupted, newline must not
@@ -1247,7 +1245,6 @@ static int kdb_local(kdb_reason_t reason, int error, struct pt_regs *regs,
 		kdb_printf("due to NonMaskable Interrupt @ "
 			   kdb_machreg_fmt "\n",
 			   instruction_pointer(regs));
-		kdb_dumpregs(regs);
 		break;
 	case KDB_REASON_SSTEP:
 	case KDB_REASON_BREAK:
@@ -1281,6 +1278,9 @@ static int kdb_local(kdb_reason_t reason, int error, struct pt_regs *regs,
 	 */
 	kdb_nextline = 1;
 	KDB_STATE_CLEAR(SUPPRESS);
+	kdb_grepping_flag = 0;
+	/* ensure the old search does not leak into '/' commands */
+	kdb_grep_string[0] = '\0';
 
 	cmdbuf = cmd_cur;
 	*cmdbuf = '\0';
@@ -2256,7 +2256,7 @@ static int kdb_cpu(int argc, const char **argv)
 	/*
 	 * Validate cpunum
 	 */
-	if ((cpunum > NR_CPUS) || !kgdb_info[cpunum].enter_kgdb)
+	if ((cpunum >= CONFIG_NR_CPUS) || !kgdb_info[cpunum].enter_kgdb)
 		return KDB_BADCPUNUM;
 
 	dbg_switch_cpu = cpunum;
@@ -2583,7 +2583,7 @@ static int kdb_summary(int argc, const char **argv)
 #define K(x) ((x) << (PAGE_SHIFT - 10))
 	kdb_printf("\nMemTotal: %8lu kB\nMemFree: %8lu kB\n"
 		   "Buffers: %8lu kB\n",
-		   val.totalram, val.freeram, val.bufferram);
+		   K(val.totalram), K(val.freeram), K(val.bufferram));
 	return 0;
 }
 
diff --git a/kernel/debug/kdb/kdb_private.h b/kernel/debug/kdb/kdb_private.h
index eaacd1693954..75014d7f4568 100644
--- a/kernel/debug/kdb/kdb_private.h
+++ b/kernel/debug/kdb/kdb_private.h
@@ -196,7 +196,9 @@ extern int kdb_main_loop(kdb_reason_t, kdb_reason_t,
 
 /* Miscellaneous functions and data areas */
 extern int kdb_grepping_flag;
+#define KDB_GREPPING_FLAG_SEARCH 0x8000
 extern char kdb_grep_string[];
+#define KDB_GREP_STRLEN 256
 extern int kdb_grep_leading;
 extern int kdb_grep_trailing;
 extern char *kdb_cmds[];
@@ -209,7 +211,7 @@ extern void kdb_ps1(const struct task_struct *p);
 extern void kdb_print_nameval(const char *name, unsigned long val);
 extern void kdb_send_sig_info(struct task_struct *p, struct siginfo *info);
 extern void kdb_meminfo_proc_show(void);
-extern char *kdb_getstr(char *, size_t, char *);
+extern char *kdb_getstr(char *, size_t, const char *);
 extern void kdb_gdb_state_pass(char *buf);
 
 /* Defines for kdb_symbol_print */
diff --git a/kernel/events/Makefile b/kernel/events/Makefile
index 103f5d147b2f..2925188f50ea 100644
--- a/kernel/events/Makefile
+++ b/kernel/events/Makefile
@@ -1,5 +1,5 @@
 ifdef CONFIG_FUNCTION_TRACER
-CFLAGS_REMOVE_core.o = -pg
+CFLAGS_REMOVE_core.o = $(CC_FLAGS_FTRACE)
 endif
 
 obj-y := core.o ring_buffer.o callchain.o
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 20cece0a7aea..af924bc38121 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -8539,6 +8539,18 @@ void __init perf_event_init(void)
 		     != 1024);
 }
 
+ssize_t perf_event_sysfs_show(struct device *dev, struct device_attribute *attr,
+			      char *page)
+{
+	struct perf_pmu_events_attr *pmu_attr =
+		container_of(attr, struct perf_pmu_events_attr, attr);
+
+	if (pmu_attr->event_str)
+		return sprintf(page, "%s\n", pmu_attr->event_str);
+
+	return 0;
+}
+
 static int __init perf_event_sysfs_init(void)
 {
 	struct pmu *pmu;
diff --git a/kernel/exit.c b/kernel/exit.c
index 6806c55475ee..feff10bbb307 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -435,7 +435,8 @@ static void exit_mm(struct task_struct *tsk)
 	task_unlock(tsk);
 	mm_update_next_owner(mm);
 	mmput(mm);
-	clear_thread_flag(TIF_MEMDIE);
+	if (test_thread_flag(TIF_MEMDIE))
+		unmark_oom_victim();
 }
 
 static struct task_struct *find_alive_thread(struct task_struct *p)
diff --git a/kernel/fork.c b/kernel/fork.c
index 4dc2ddade9f1..cf65139615a0 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -438,12 +438,8 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
 			atomic_inc(&mapping->i_mmap_writable);
 			flush_dcache_mmap_lock(mapping);
 			/* insert tmp into the share list, just after mpnt */
-			if (unlikely(tmp->vm_flags & VM_NONLINEAR))
-				vma_nonlinear_insert(tmp,
-						&mapping->i_mmap_nonlinear);
-			else
-				vma_interval_tree_insert_after(tmp, mpnt,
-							&mapping->i_mmap);
+			vma_interval_tree_insert_after(tmp, mpnt,
+					&mapping->i_mmap);
 			flush_dcache_mmap_unlock(mapping);
 			i_mmap_unlock_write(mapping);
 		}
@@ -559,6 +555,7 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p)
 	INIT_LIST_HEAD(&mm->mmlist);
 	mm->core_state = NULL;
 	atomic_long_set(&mm->nr_ptes, 0);
+	mm_nr_pmds_init(mm);
 	mm->map_count = 0;
 	mm->locked_vm = 0;
 	mm->pinned_vm = 0;
@@ -607,6 +604,14 @@ static void check_mm(struct mm_struct *mm)
 			printk(KERN_ALERT "BUG: Bad rss-counter state "
 					  "mm:%p idx:%d val:%ld\n", mm, i, x);
 	}
+
+	if (atomic_long_read(&mm->nr_ptes))
+		pr_alert("BUG: non-zero nr_ptes on freeing mm: %ld\n",
+				atomic_long_read(&mm->nr_ptes));
+	if (mm_nr_pmds(mm))
+		pr_alert("BUG: non-zero nr_pmds on freeing mm: %ld\n",
+				mm_nr_pmds(mm));
+
 #if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS
 	VM_BUG_ON_MM(mm->pmd_huge_pte, mm);
 #endif
diff --git a/kernel/futex.c b/kernel/futex.c
index 63678b573d61..2a5e3830e953 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -2217,7 +2217,7 @@ retry:
 	if (!abs_time)
 		goto out;
 
-	restart = &current_thread_info()->restart_block;
+	restart = &current->restart_block;
 	restart->fn = futex_wait_restart;
 	restart->futex.uaddr = uaddr;
 	restart->futex.val = val;
@@ -2258,7 +2258,7 @@ static long futex_wait_restart(struct restart_block *restart)
  * if there are waiters then it will block, it does PI, etc. (Due to
  * races the kernel might see a 0 value of the futex too.)
  */
-static int futex_lock_pi(u32 __user *uaddr, unsigned int flags, int detect,
+static int futex_lock_pi(u32 __user *uaddr, unsigned int flags,
 			 ktime_t *time, int trylock)
 {
 	struct hrtimer_sleeper timeout, *to = NULL;
@@ -2953,11 +2953,11 @@ long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout,
 	case FUTEX_WAKE_OP:
 		return futex_wake_op(uaddr, flags, uaddr2, val, val2, val3);
 	case FUTEX_LOCK_PI:
-		return futex_lock_pi(uaddr, flags, val, timeout, 0);
+		return futex_lock_pi(uaddr, flags, timeout, 0);
 	case FUTEX_UNLOCK_PI:
 		return futex_unlock_pi(uaddr, flags);
 	case FUTEX_TRYLOCK_PI:
-		return futex_lock_pi(uaddr, flags, 0, timeout, 1);
+		return futex_lock_pi(uaddr, flags, NULL, 1);
 	case FUTEX_WAIT_REQUEUE_PI:
 		val3 = FUTEX_BITSET_MATCH_ANY;
 		return futex_wait_requeue_pi(uaddr, flags, val, timeout, val3,
diff --git a/kernel/gcov/Makefile b/kernel/gcov/Makefile
index 52aa7e8de927..752d6486b67e 100644
--- a/kernel/gcov/Makefile
+++ b/kernel/gcov/Makefile
@@ -1,33 +1,7 @@
 ccflags-y := -DSRCTREE='"$(srctree)"' -DOBJTREE='"$(objtree)"'
 
-# if-lt
-# Usage VAR := $(call if-lt, $(a), $(b))
-# Returns 1 if (a < b)
-if-lt = $(shell [ $(1) -lt $(2) ] && echo 1)
-
-ifeq ($(CONFIG_GCOV_FORMAT_3_4),y)
-  cc-ver := 0304
-else ifeq ($(CONFIG_GCOV_FORMAT_4_7),y)
-  cc-ver := 0407
-else
-# Use cc-version if available, otherwise set 0
-#
-# scripts/Kbuild.include, which contains cc-version function, is not included
-# during make clean "make -f scripts/Makefile.clean obj=kernel/gcov"
-# Meaning cc-ver is empty causing if-lt test to fail with
-# "/bin/sh: line 0: [: -lt: unary operator expected" error mesage.
-# This has no affect on the clean phase, but the error message could be
-# confusing/annoying. So this dummy workaround sets cc-ver to zero if cc-version
-# is not available. We can probably move if-lt to Kbuild.include, so it's also
-# not defined during clean or to include Kbuild.include in
-# scripts/Makefile.clean. But the following workaround seems least invasive.
-  cc-ver := $(if $(call cc-version),$(call cc-version),0)
-endif
-
-obj-$(CONFIG_GCOV_KERNEL) := base.o fs.o
-
-ifeq ($(call if-lt, $(cc-ver), 0407),1)
-  obj-$(CONFIG_GCOV_KERNEL) += gcc_3_4.o
-else
-  obj-$(CONFIG_GCOV_KERNEL) += gcc_4_7.o
-endif
+obj-y := base.o fs.o
+obj-$(CONFIG_GCOV_FORMAT_3_4) += gcc_3_4.o
+obj-$(CONFIG_GCOV_FORMAT_4_7) += gcc_4_7.o
+obj-$(CONFIG_GCOV_FORMAT_AUTODETECT) += $(call cc-ifversion, -lt, 0407, \
+		gcc_3_4.o, gcc_4_7.o)
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 80692373abd6..196a06fbc122 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -243,6 +243,9 @@ int irq_set_affinity_hint(unsigned int irq, const struct cpumask *m)
243 return -EINVAL; 243 return -EINVAL;
244 desc->affinity_hint = m; 244 desc->affinity_hint = m;
245 irq_put_desc_unlock(desc, flags); 245 irq_put_desc_unlock(desc, flags);
246 /* set the initial affinity to prevent every interrupt being on CPU0 */
247 if (m)
248 __irq_set_affinity(irq, m, false);
246 return 0; 249 return 0;
247} 250}
248EXPORT_SYMBOL_GPL(irq_set_affinity_hint); 251EXPORT_SYMBOL_GPL(irq_set_affinity_hint);
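With the hunk above, irq_set_affinity_hint() also applies the hint as the interrupt's initial affinity, so a driver that spreads its per-queue vectors no longer leaves them all on CPU0. A hedged sketch of such a caller (queue_irqs[] and nvec are illustrative):

/* Hypothetical driver fragment; not part of the patch above. */
#include <linux/interrupt.h>
#include <linux/cpumask.h>

static void demo_spread_irqs(int *queue_irqs, int nvec)
{
	int i, cpu = cpumask_first(cpu_online_mask);

	for (i = 0; i < nvec; i++) {
		/* With the change above this also sets the initial affinity. */
		irq_set_affinity_hint(queue_irqs[i], cpumask_of(cpu));
		cpu = cpumask_next(cpu, cpu_online_mask);
		if (cpu >= nr_cpu_ids)
			cpu = cpumask_first(cpu_online_mask);
	}
}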
diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c
index 9dc9bfd8a678..df2f4642d1e7 100644
--- a/kernel/irq/proc.c
+++ b/kernel/irq/proc.c
@@ -46,10 +46,9 @@ static int show_irq_affinity(int type, struct seq_file *m, void *v)
 	mask = desc->pending_mask;
 #endif
 	if (type)
-		seq_cpumask_list(m, mask);
+		seq_printf(m, "%*pbl\n", cpumask_pr_args(mask));
 	else
-		seq_cpumask(m, mask);
-	seq_putc(m, '\n');
+		seq_printf(m, "%*pb\n", cpumask_pr_args(mask));
 	return 0;
 }
 
@@ -67,8 +66,7 @@ static int irq_affinity_hint_proc_show(struct seq_file *m, void *v)
 	cpumask_copy(mask, desc->affinity_hint);
 	raw_spin_unlock_irqrestore(&desc->lock, flags);
 
-	seq_cpumask(m, mask);
-	seq_putc(m, '\n');
+	seq_printf(m, "%*pb\n", cpumask_pr_args(mask));
 	free_cpumask_var(mask);
 
 	return 0;
@@ -186,8 +184,7 @@ static const struct file_operations irq_affinity_list_proc_fops = {
 
 static int default_affinity_show(struct seq_file *m, void *v)
 {
-	seq_cpumask(m, irq_default_affinity);
-	seq_putc(m, '\n');
+	seq_printf(m, "%*pb\n", cpumask_pr_args(irq_default_affinity));
 	return 0;
 }
 
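The seq_cpumask()/seq_cpumask_list() helpers are replaced by the bitmap printf extensions; cpumask_pr_args(mask) supplies the nr_cpu_ids/cpumask_bits() pair expected by "%*pb" and "%*pbl". The same pattern works outside seq_file, e.g.:

/* Illustrative only: printing a cpumask with the %*pb / %*pbl extensions. */
#include <linux/printk.h>
#include <linux/cpumask.h>

static void demo_print_mask(const struct cpumask *mask)
{
	/* cpumask_pr_args() expands to "nr_cpu_ids, cpumask_bits(mask)" */
	pr_info("mask bitmap: %*pb\n", cpumask_pr_args(mask));
	pr_info("mask list:   %*pbl\n", cpumask_pr_args(mask));
}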
diff --git a/kernel/kexec.c b/kernel/kexec.c
index 9a8a01abbaed..38c25b1f2fd5 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -444,7 +444,7 @@ arch_kexec_apply_relocations(const Elf_Ehdr *ehdr, Elf_Shdr *sechdrs,
444} 444}
445 445
446/* 446/*
447 * Free up memory used by kernel, initrd, and comand line. This is temporary 447 * Free up memory used by kernel, initrd, and command line. This is temporary
448 * memory allocation which is not needed any more after these buffers have 448 * memory allocation which is not needed any more after these buffers have
449 * been loaded into separate segments and have been copied elsewhere. 449 * been loaded into separate segments and have been copied elsewhere.
450 */ 450 */
@@ -856,8 +856,6 @@ static int kimage_set_destination(struct kimage *image,
856 856
857 destination &= PAGE_MASK; 857 destination &= PAGE_MASK;
858 result = kimage_add_entry(image, destination | IND_DESTINATION); 858 result = kimage_add_entry(image, destination | IND_DESTINATION);
859 if (result == 0)
860 image->destination = destination;
861 859
862 return result; 860 return result;
863} 861}
@@ -869,8 +867,6 @@ static int kimage_add_page(struct kimage *image, unsigned long page)
869 867
870 page &= PAGE_MASK; 868 page &= PAGE_MASK;
871 result = kimage_add_entry(image, page | IND_SOURCE); 869 result = kimage_add_entry(image, page | IND_SOURCE);
872 if (result == 0)
873 image->destination += PAGE_SIZE;
874 870
875 return result; 871 return result;
876} 872}
@@ -1288,19 +1284,22 @@ SYSCALL_DEFINE4(kexec_load, unsigned long, entry, unsigned long, nr_segments,
 	if (nr_segments > 0) {
 		unsigned long i;
 
-		/* Loading another kernel to reboot into */
-		if ((flags & KEXEC_ON_CRASH) == 0)
-			result = kimage_alloc_init(&image, entry, nr_segments,
-						   segments, flags);
-		/* Loading another kernel to switch to if this one crashes */
-		else if (flags & KEXEC_ON_CRASH) {
-			/* Free any current crash dump kernel before
+		if (flags & KEXEC_ON_CRASH) {
+			/*
+			 * Loading another kernel to switch to if this one
+			 * crashes. Free any current crash dump kernel before
 			 * we corrupt it.
 			 */
+
 			kimage_free(xchg(&kexec_crash_image, NULL));
 			result = kimage_alloc_init(&image, entry, nr_segments,
 						   segments, flags);
 			crash_map_reserved_pages();
+		} else {
+			/* Loading another kernel to reboot into. */
+
+			result = kimage_alloc_init(&image, entry, nr_segments,
+						   segments, flags);
 		}
 		if (result)
 			goto out;
@@ -2512,7 +2511,7 @@ static int kexec_apply_relocations(struct kimage *image)
2512 continue; 2511 continue;
2513 2512
2514 /* 2513 /*
2515 * Respective archicture needs to provide support for applying 2514 * Respective architecture needs to provide support for applying
2516 * relocations of type SHT_RELA/SHT_REL. 2515 * relocations of type SHT_RELA/SHT_REL.
2517 */ 2516 */
2518 if (sechdrs[i].sh_type == SHT_RELA) 2517 if (sechdrs[i].sh_type == SHT_RELA)
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index ee619929cf90..c90e417bb963 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -717,7 +717,7 @@ static void prepare_optimized_kprobe(struct kprobe *p)
717 struct optimized_kprobe *op; 717 struct optimized_kprobe *op;
718 718
719 op = container_of(p, struct optimized_kprobe, kp); 719 op = container_of(p, struct optimized_kprobe, kp);
720 arch_prepare_optimized_kprobe(op); 720 arch_prepare_optimized_kprobe(op, p);
721} 721}
722 722
723/* Allocate new optimized_kprobe and try to prepare optimized instructions */ 723/* Allocate new optimized_kprobe and try to prepare optimized instructions */
@@ -731,7 +731,7 @@ static struct kprobe *alloc_aggr_kprobe(struct kprobe *p)
731 731
732 INIT_LIST_HEAD(&op->list); 732 INIT_LIST_HEAD(&op->list);
733 op->kp.addr = p->addr; 733 op->kp.addr = p->addr;
734 arch_prepare_optimized_kprobe(op); 734 arch_prepare_optimized_kprobe(op, p);
735 735
736 return &op->kp; 736 return &op->kp;
737} 737}
@@ -869,7 +869,8 @@ static void __disarm_kprobe(struct kprobe *p, bool reopt)
 {
 	struct kprobe *_p;
 
-	unoptimize_kprobe(p, false);	/* Try to unoptimize */
+	/* Try to unoptimize */
+	unoptimize_kprobe(p, kprobes_all_disarmed);
 
 	if (!kprobe_queued(p)) {
 		arch_disarm_kprobe(p);
@@ -1571,7 +1572,13 @@ static struct kprobe *__disable_kprobe(struct kprobe *p)
 
 		/* Try to disarm and disable this/parent probe */
 		if (p == orig_p || aggr_kprobe_disabled(orig_p)) {
-			disarm_kprobe(orig_p, true);
+			/*
+			 * If kprobes_all_disarmed is set, orig_p
+			 * should have already been disarmed, so
+			 * skip the unneeded disarming process.
+			 */
+			if (!kprobes_all_disarmed)
+				disarm_kprobe(orig_p, true);
 			orig_p->flags |= KPROBE_FLAG_DISABLED;
 		}
 	}
@@ -2320,6 +2327,12 @@ static void arm_all_kprobes(void)
2320 if (!kprobes_all_disarmed) 2327 if (!kprobes_all_disarmed)
2321 goto already_enabled; 2328 goto already_enabled;
2322 2329
2330 /*
2331 * optimize_kprobe() called by arm_kprobe() checks
2332 * kprobes_all_disarmed, so set kprobes_all_disarmed before
2333 * arm_kprobe.
2334 */
2335 kprobes_all_disarmed = false;
2323 /* Arming kprobes doesn't optimize kprobe itself */ 2336 /* Arming kprobes doesn't optimize kprobe itself */
2324 for (i = 0; i < KPROBE_TABLE_SIZE; i++) { 2337 for (i = 0; i < KPROBE_TABLE_SIZE; i++) {
2325 head = &kprobe_table[i]; 2338 head = &kprobe_table[i];
@@ -2328,7 +2341,6 @@ static void arm_all_kprobes(void)
2328 arm_kprobe(p); 2341 arm_kprobe(p);
2329 } 2342 }
2330 2343
2331 kprobes_all_disarmed = false;
2332 printk(KERN_INFO "Kprobes globally enabled\n"); 2344 printk(KERN_INFO "Kprobes globally enabled\n");
2333 2345
2334already_enabled: 2346already_enabled:
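The kprobes hunks above adjust how the global kprobes_all_disarmed state (the kprobes/enabled switch) interacts with arming, optimizing and disarming individual probes. For orientation, a probe itself is registered roughly as in this sketch, patterned on the kprobes sample module (the probed symbol and handler are illustrative):

/* Minimal kprobe registration sketch; symbol and handler are assumptions. */
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/kprobes.h>

static int demo_pre(struct kprobe *p, struct pt_regs *regs)
{
	pr_info("hit %s\n", p->symbol_name);
	return 0;
}

static struct kprobe demo_kp = {
	.symbol_name	= "do_fork",	/* any probe-able kernel function */
	.pre_handler	= demo_pre,
};

static int __init demo_init(void)
{
	return register_kprobe(&demo_kp);
}

static void __exit demo_exit(void)
{
	unregister_kprobe(&demo_kp);
}

module_init(demo_init);
module_exit(demo_exit);
MODULE_LICENSE("GPL");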
diff --git a/kernel/livepatch/Kconfig b/kernel/livepatch/Kconfig
new file mode 100644
index 000000000000..045022557936
--- /dev/null
+++ b/kernel/livepatch/Kconfig
@@ -0,0 +1,18 @@
1config HAVE_LIVEPATCH
2 bool
3 help
4 Arch supports kernel live patching
5
6config LIVEPATCH
7 bool "Kernel Live Patching"
8 depends on DYNAMIC_FTRACE_WITH_REGS
9 depends on MODULES
10 depends on SYSFS
11 depends on KALLSYMS_ALL
12 depends on HAVE_LIVEPATCH
13 help
14 Say Y here if you want to support kernel live patching.
15 This option has no runtime impact until a kernel "patch"
16 module uses the interface provided by this option to register
17 a patch, causing calls to patched functions to be redirected
18 to new function code contained in the patch module.
diff --git a/kernel/livepatch/Makefile b/kernel/livepatch/Makefile
new file mode 100644
index 000000000000..e8780c0901d9
--- /dev/null
+++ b/kernel/livepatch/Makefile
@@ -0,0 +1,3 @@
1obj-$(CONFIG_LIVEPATCH) += livepatch.o
2
3livepatch-objs := core.o
diff --git a/kernel/livepatch/core.c b/kernel/livepatch/core.c
new file mode 100644
index 000000000000..ff7f47d026ac
--- /dev/null
+++ b/kernel/livepatch/core.c
@@ -0,0 +1,1015 @@
1/*
2 * core.c - Kernel Live Patching Core
3 *
4 * Copyright (C) 2014 Seth Jennings <sjenning@redhat.com>
5 * Copyright (C) 2014 SUSE
6 *
7 * This program is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License
9 * as published by the Free Software Foundation; either version 2
10 * of the License, or (at your option) any later version.
11 *
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, see <http://www.gnu.org/licenses/>.
19 */
20
21#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
22
23#include <linux/module.h>
24#include <linux/kernel.h>
25#include <linux/mutex.h>
26#include <linux/slab.h>
27#include <linux/ftrace.h>
28#include <linux/list.h>
29#include <linux/kallsyms.h>
30#include <linux/livepatch.h>
31
32/**
33 * struct klp_ops - structure for tracking registered ftrace ops structs
34 *
35 * A single ftrace_ops is shared between all enabled replacement functions
36 * (klp_func structs) which have the same old_addr. This allows the switch
37 * between function versions to happen instantaneously by updating the klp_ops
38 * struct's func_stack list. The winner is the klp_func at the top of the
39 * func_stack (front of the list).
40 *
41 * @node: node for the global klp_ops list
42 * @func_stack: list head for the stack of klp_func's (active func is on top)
43 * @fops: registered ftrace ops struct
44 */
45struct klp_ops {
46 struct list_head node;
47 struct list_head func_stack;
48 struct ftrace_ops fops;
49};
50
51/*
52 * The klp_mutex protects the global lists and state transitions of any
53 * structure reachable from them. References to any structure must be obtained
54 * under mutex protection (except in klp_ftrace_handler(), which uses RCU to
55 * ensure it gets consistent data).
56 */
57static DEFINE_MUTEX(klp_mutex);
58
59static LIST_HEAD(klp_patches);
60static LIST_HEAD(klp_ops);
61
62static struct kobject *klp_root_kobj;
63
64static struct klp_ops *klp_find_ops(unsigned long old_addr)
65{
66 struct klp_ops *ops;
67 struct klp_func *func;
68
69 list_for_each_entry(ops, &klp_ops, node) {
70 func = list_first_entry(&ops->func_stack, struct klp_func,
71 stack_node);
72 if (func->old_addr == old_addr)
73 return ops;
74 }
75
76 return NULL;
77}
78
79static bool klp_is_module(struct klp_object *obj)
80{
81 return obj->name;
82}
83
84static bool klp_is_object_loaded(struct klp_object *obj)
85{
86 return !obj->name || obj->mod;
87}
88
89/* sets obj->mod if object is not vmlinux and module is found */
90static void klp_find_object_module(struct klp_object *obj)
91{
92 if (!klp_is_module(obj))
93 return;
94
95 mutex_lock(&module_mutex);
96 /*
97 * We don't need to take a reference on the module here because we have
98 * the klp_mutex, which is also taken by the module notifier. This
99 * prevents any module from unloading until we release the klp_mutex.
100 */
101 obj->mod = find_module(obj->name);
102 mutex_unlock(&module_mutex);
103}
104
105/* klp_mutex must be held by caller */
106static bool klp_is_patch_registered(struct klp_patch *patch)
107{
108 struct klp_patch *mypatch;
109
110 list_for_each_entry(mypatch, &klp_patches, list)
111 if (mypatch == patch)
112 return true;
113
114 return false;
115}
116
117static bool klp_initialized(void)
118{
119 return klp_root_kobj;
120}
121
122struct klp_find_arg {
123 const char *objname;
124 const char *name;
125 unsigned long addr;
126 /*
127 * If count == 0, the symbol was not found. If count == 1, a unique
128 * match was found and addr is set. If count > 1, there is
129 * unresolvable ambiguity among "count" number of symbols with the same
130 * name in the same object.
131 */
132 unsigned long count;
133};
134
135static int klp_find_callback(void *data, const char *name,
136 struct module *mod, unsigned long addr)
137{
138 struct klp_find_arg *args = data;
139
140 if ((mod && !args->objname) || (!mod && args->objname))
141 return 0;
142
143 if (strcmp(args->name, name))
144 return 0;
145
146 if (args->objname && strcmp(args->objname, mod->name))
147 return 0;
148
149 /*
150 * args->addr might be overwritten if another match is found
151 * but klp_find_object_symbol() handles this and only returns the
152 * addr if count == 1.
153 */
154 args->addr = addr;
155 args->count++;
156
157 return 0;
158}
159
160static int klp_find_object_symbol(const char *objname, const char *name,
161 unsigned long *addr)
162{
163 struct klp_find_arg args = {
164 .objname = objname,
165 .name = name,
166 .addr = 0,
167 .count = 0
168 };
169
170 kallsyms_on_each_symbol(klp_find_callback, &args);
171
172 if (args.count == 0)
173 pr_err("symbol '%s' not found in symbol table\n", name);
174 else if (args.count > 1)
175 pr_err("unresolvable ambiguity (%lu matches) on symbol '%s' in object '%s'\n",
176 args.count, name, objname);
177 else {
178 *addr = args.addr;
179 return 0;
180 }
181
182 *addr = 0;
183 return -EINVAL;
184}
185
186struct klp_verify_args {
187 const char *name;
188 const unsigned long addr;
189};
190
191static int klp_verify_callback(void *data, const char *name,
192 struct module *mod, unsigned long addr)
193{
194 struct klp_verify_args *args = data;
195
196 if (!mod &&
197 !strcmp(args->name, name) &&
198 args->addr == addr)
199 return 1;
200
201 return 0;
202}
203
204static int klp_verify_vmlinux_symbol(const char *name, unsigned long addr)
205{
206 struct klp_verify_args args = {
207 .name = name,
208 .addr = addr,
209 };
210
211 if (kallsyms_on_each_symbol(klp_verify_callback, &args))
212 return 0;
213
214 pr_err("symbol '%s' not found at specified address 0x%016lx, kernel mismatch?\n",
215 name, addr);
216 return -EINVAL;
217}
218
219static int klp_find_verify_func_addr(struct klp_object *obj,
220 struct klp_func *func)
221{
222 int ret;
223
224#if defined(CONFIG_RANDOMIZE_BASE)
225 /* KASLR is enabled, disregard old_addr from user */
226 func->old_addr = 0;
227#endif
228
229 if (!func->old_addr || klp_is_module(obj))
230 ret = klp_find_object_symbol(obj->name, func->old_name,
231 &func->old_addr);
232 else
233 ret = klp_verify_vmlinux_symbol(func->old_name,
234 func->old_addr);
235
236 return ret;
237}
238
239/*
240 * external symbols are located outside the parent object (where the parent
241 * object is either vmlinux or the kmod being patched).
242 */
243static int klp_find_external_symbol(struct module *pmod, const char *name,
244 unsigned long *addr)
245{
246 const struct kernel_symbol *sym;
247
248 /* first, check if it's an exported symbol */
249 preempt_disable();
250 sym = find_symbol(name, NULL, NULL, true, true);
251 preempt_enable();
252 if (sym) {
253 *addr = sym->value;
254 return 0;
255 }
256
257 /* otherwise check if it's in another .o within the patch module */
258 return klp_find_object_symbol(pmod->name, name, addr);
259}
260
261static int klp_write_object_relocations(struct module *pmod,
262 struct klp_object *obj)
263{
264 int ret;
265 struct klp_reloc *reloc;
266
267 if (WARN_ON(!klp_is_object_loaded(obj)))
268 return -EINVAL;
269
270 if (WARN_ON(!obj->relocs))
271 return -EINVAL;
272
273 for (reloc = obj->relocs; reloc->name; reloc++) {
274 if (!klp_is_module(obj)) {
275 ret = klp_verify_vmlinux_symbol(reloc->name,
276 reloc->val);
277 if (ret)
278 return ret;
279 } else {
280 /* module, reloc->val needs to be discovered */
281 if (reloc->external)
282 ret = klp_find_external_symbol(pmod,
283 reloc->name,
284 &reloc->val);
285 else
286 ret = klp_find_object_symbol(obj->mod->name,
287 reloc->name,
288 &reloc->val);
289 if (ret)
290 return ret;
291 }
292 ret = klp_write_module_reloc(pmod, reloc->type, reloc->loc,
293 reloc->val + reloc->addend);
294 if (ret) {
295 pr_err("relocation failed for symbol '%s' at 0x%016lx (%d)\n",
296 reloc->name, reloc->val, ret);
297 return ret;
298 }
299 }
300
301 return 0;
302}
303
304static void notrace klp_ftrace_handler(unsigned long ip,
305 unsigned long parent_ip,
306 struct ftrace_ops *fops,
307 struct pt_regs *regs)
308{
309 struct klp_ops *ops;
310 struct klp_func *func;
311
312 ops = container_of(fops, struct klp_ops, fops);
313
314 rcu_read_lock();
315 func = list_first_or_null_rcu(&ops->func_stack, struct klp_func,
316 stack_node);
317 rcu_read_unlock();
318
319 if (WARN_ON_ONCE(!func))
320 return;
321
322 klp_arch_set_pc(regs, (unsigned long)func->new_func);
323}
324
325static int klp_disable_func(struct klp_func *func)
326{
327 struct klp_ops *ops;
328 int ret;
329
330 if (WARN_ON(func->state != KLP_ENABLED))
331 return -EINVAL;
332
333 if (WARN_ON(!func->old_addr))
334 return -EINVAL;
335
336 ops = klp_find_ops(func->old_addr);
337 if (WARN_ON(!ops))
338 return -EINVAL;
339
340 if (list_is_singular(&ops->func_stack)) {
341 ret = unregister_ftrace_function(&ops->fops);
342 if (ret) {
343 pr_err("failed to unregister ftrace handler for function '%s' (%d)\n",
344 func->old_name, ret);
345 return ret;
346 }
347
348 ret = ftrace_set_filter_ip(&ops->fops, func->old_addr, 1, 0);
349 if (ret)
350 pr_warn("function unregister succeeded but failed to clear the filter\n");
351
352 list_del_rcu(&func->stack_node);
353 list_del(&ops->node);
354 kfree(ops);
355 } else {
356 list_del_rcu(&func->stack_node);
357 }
358
359 func->state = KLP_DISABLED;
360
361 return 0;
362}
363
364static int klp_enable_func(struct klp_func *func)
365{
366 struct klp_ops *ops;
367 int ret;
368
369 if (WARN_ON(!func->old_addr))
370 return -EINVAL;
371
372 if (WARN_ON(func->state != KLP_DISABLED))
373 return -EINVAL;
374
375 ops = klp_find_ops(func->old_addr);
376 if (!ops) {
377 ops = kzalloc(sizeof(*ops), GFP_KERNEL);
378 if (!ops)
379 return -ENOMEM;
380
381 ops->fops.func = klp_ftrace_handler;
382 ops->fops.flags = FTRACE_OPS_FL_SAVE_REGS |
383 FTRACE_OPS_FL_DYNAMIC |
384 FTRACE_OPS_FL_IPMODIFY;
385
386 list_add(&ops->node, &klp_ops);
387
388 INIT_LIST_HEAD(&ops->func_stack);
389 list_add_rcu(&func->stack_node, &ops->func_stack);
390
391 ret = ftrace_set_filter_ip(&ops->fops, func->old_addr, 0, 0);
392 if (ret) {
393 pr_err("failed to set ftrace filter for function '%s' (%d)\n",
394 func->old_name, ret);
395 goto err;
396 }
397
398 ret = register_ftrace_function(&ops->fops);
399 if (ret) {
400 pr_err("failed to register ftrace handler for function '%s' (%d)\n",
401 func->old_name, ret);
402 ftrace_set_filter_ip(&ops->fops, func->old_addr, 1, 0);
403 goto err;
404 }
405
406
407 } else {
408 list_add_rcu(&func->stack_node, &ops->func_stack);
409 }
410
411 func->state = KLP_ENABLED;
412
413 return 0;
414
415err:
416 list_del_rcu(&func->stack_node);
417 list_del(&ops->node);
418 kfree(ops);
419 return ret;
420}
421
422static int klp_disable_object(struct klp_object *obj)
423{
424 struct klp_func *func;
425 int ret;
426
427 for (func = obj->funcs; func->old_name; func++) {
428 if (func->state != KLP_ENABLED)
429 continue;
430
431 ret = klp_disable_func(func);
432 if (ret)
433 return ret;
434 }
435
436 obj->state = KLP_DISABLED;
437
438 return 0;
439}
440
441static int klp_enable_object(struct klp_object *obj)
442{
443 struct klp_func *func;
444 int ret;
445
446 if (WARN_ON(obj->state != KLP_DISABLED))
447 return -EINVAL;
448
449 if (WARN_ON(!klp_is_object_loaded(obj)))
450 return -EINVAL;
451
452 for (func = obj->funcs; func->old_name; func++) {
453 ret = klp_enable_func(func);
454 if (ret)
455 goto unregister;
456 }
457 obj->state = KLP_ENABLED;
458
459 return 0;
460
461unregister:
462 WARN_ON(klp_disable_object(obj));
463 return ret;
464}
465
466static int __klp_disable_patch(struct klp_patch *patch)
467{
468 struct klp_object *obj;
469 int ret;
470
471 /* enforce stacking: only the last enabled patch can be disabled */
472 if (!list_is_last(&patch->list, &klp_patches) &&
473 list_next_entry(patch, list)->state == KLP_ENABLED)
474 return -EBUSY;
475
476 pr_notice("disabling patch '%s'\n", patch->mod->name);
477
478 for (obj = patch->objs; obj->funcs; obj++) {
479 if (obj->state != KLP_ENABLED)
480 continue;
481
482 ret = klp_disable_object(obj);
483 if (ret)
484 return ret;
485 }
486
487 patch->state = KLP_DISABLED;
488
489 return 0;
490}
491
492/**
493 * klp_disable_patch() - disables a registered patch
494 * @patch: The registered, enabled patch to be disabled
495 *
496 * Unregisters the patched functions from ftrace.
497 *
498 * Return: 0 on success, otherwise error
499 */
500int klp_disable_patch(struct klp_patch *patch)
501{
502 int ret;
503
504 mutex_lock(&klp_mutex);
505
506 if (!klp_is_patch_registered(patch)) {
507 ret = -EINVAL;
508 goto err;
509 }
510
511 if (patch->state == KLP_DISABLED) {
512 ret = -EINVAL;
513 goto err;
514 }
515
516 ret = __klp_disable_patch(patch);
517
518err:
519 mutex_unlock(&klp_mutex);
520 return ret;
521}
522EXPORT_SYMBOL_GPL(klp_disable_patch);
523
524static int __klp_enable_patch(struct klp_patch *patch)
525{
526 struct klp_object *obj;
527 int ret;
528
529 if (WARN_ON(patch->state != KLP_DISABLED))
530 return -EINVAL;
531
532 /* enforce stacking: only the first disabled patch can be enabled */
533 if (patch->list.prev != &klp_patches &&
534 list_prev_entry(patch, list)->state == KLP_DISABLED)
535 return -EBUSY;
536
537 pr_notice_once("tainting kernel with TAINT_LIVEPATCH\n");
538 add_taint(TAINT_LIVEPATCH, LOCKDEP_STILL_OK);
539
540 pr_notice("enabling patch '%s'\n", patch->mod->name);
541
542 for (obj = patch->objs; obj->funcs; obj++) {
543 klp_find_object_module(obj);
544
545 if (!klp_is_object_loaded(obj))
546 continue;
547
548 ret = klp_enable_object(obj);
549 if (ret)
550 goto unregister;
551 }
552
553 patch->state = KLP_ENABLED;
554
555 return 0;
556
557unregister:
558 WARN_ON(__klp_disable_patch(patch));
559 return ret;
560}
561
562/**
563 * klp_enable_patch() - enables a registered patch
564 * @patch: The registered, disabled patch to be enabled
565 *
566 * Performs the needed symbol lookups and code relocations,
567 * then registers the patched functions with ftrace.
568 *
569 * Return: 0 on success, otherwise error
570 */
571int klp_enable_patch(struct klp_patch *patch)
572{
573 int ret;
574
575 mutex_lock(&klp_mutex);
576
577 if (!klp_is_patch_registered(patch)) {
578 ret = -EINVAL;
579 goto err;
580 }
581
582 ret = __klp_enable_patch(patch);
583
584err:
585 mutex_unlock(&klp_mutex);
586 return ret;
587}
588EXPORT_SYMBOL_GPL(klp_enable_patch);
589
590/*
591 * Sysfs Interface
592 *
593 * /sys/kernel/livepatch
594 * /sys/kernel/livepatch/<patch>
595 * /sys/kernel/livepatch/<patch>/enabled
596 * /sys/kernel/livepatch/<patch>/<object>
597 * /sys/kernel/livepatch/<patch>/<object>/<func>
598 */
599
600static ssize_t enabled_store(struct kobject *kobj, struct kobj_attribute *attr,
601 const char *buf, size_t count)
602{
603 struct klp_patch *patch;
604 int ret;
605 unsigned long val;
606
607 ret = kstrtoul(buf, 10, &val);
608 if (ret)
609 return -EINVAL;
610
611 if (val != KLP_DISABLED && val != KLP_ENABLED)
612 return -EINVAL;
613
614 patch = container_of(kobj, struct klp_patch, kobj);
615
616 mutex_lock(&klp_mutex);
617
618 if (val == patch->state) {
619 /* already in requested state */
620 ret = -EINVAL;
621 goto err;
622 }
623
624 if (val == KLP_ENABLED) {
625 ret = __klp_enable_patch(patch);
626 if (ret)
627 goto err;
628 } else {
629 ret = __klp_disable_patch(patch);
630 if (ret)
631 goto err;
632 }
633
634 mutex_unlock(&klp_mutex);
635
636 return count;
637
638err:
639 mutex_unlock(&klp_mutex);
640 return ret;
641}
642
643static ssize_t enabled_show(struct kobject *kobj,
644 struct kobj_attribute *attr, char *buf)
645{
646 struct klp_patch *patch;
647
648 patch = container_of(kobj, struct klp_patch, kobj);
649 return snprintf(buf, PAGE_SIZE-1, "%d\n", patch->state);
650}
651
652static struct kobj_attribute enabled_kobj_attr = __ATTR_RW(enabled);
653static struct attribute *klp_patch_attrs[] = {
654 &enabled_kobj_attr.attr,
655 NULL
656};
657
658static void klp_kobj_release_patch(struct kobject *kobj)
659{
660 /*
661 * Once we have a consistency model we'll need to module_put() the
662 * patch module here. See klp_register_patch() for more details.
663 */
664}
665
666static struct kobj_type klp_ktype_patch = {
667 .release = klp_kobj_release_patch,
668 .sysfs_ops = &kobj_sysfs_ops,
669 .default_attrs = klp_patch_attrs,
670};
671
672static void klp_kobj_release_func(struct kobject *kobj)
673{
674}
675
676static struct kobj_type klp_ktype_func = {
677 .release = klp_kobj_release_func,
678 .sysfs_ops = &kobj_sysfs_ops,
679};
680
681/*
682 * Free all functions' kobjects in the array up to some limit. When limit is
683 * NULL, all kobjects are freed.
684 */
685static void klp_free_funcs_limited(struct klp_object *obj,
686 struct klp_func *limit)
687{
688 struct klp_func *func;
689
690 for (func = obj->funcs; func->old_name && func != limit; func++)
691 kobject_put(&func->kobj);
692}
693
694/* Clean up when a patched object is unloaded */
695static void klp_free_object_loaded(struct klp_object *obj)
696{
697 struct klp_func *func;
698
699 obj->mod = NULL;
700
701 for (func = obj->funcs; func->old_name; func++)
702 func->old_addr = 0;
703}
704
705/*
706 * Free all objects' kobjects in the array up to some limit. When limit is
707 * NULL, all kobjects are freed.
708 */
709static void klp_free_objects_limited(struct klp_patch *patch,
710 struct klp_object *limit)
711{
712 struct klp_object *obj;
713
714 for (obj = patch->objs; obj->funcs && obj != limit; obj++) {
715 klp_free_funcs_limited(obj, NULL);
716 kobject_put(obj->kobj);
717 }
718}
719
720static void klp_free_patch(struct klp_patch *patch)
721{
722 klp_free_objects_limited(patch, NULL);
723 if (!list_empty(&patch->list))
724 list_del(&patch->list);
725 kobject_put(&patch->kobj);
726}
727
728static int klp_init_func(struct klp_object *obj, struct klp_func *func)
729{
730 INIT_LIST_HEAD(&func->stack_node);
731 func->state = KLP_DISABLED;
732
733 return kobject_init_and_add(&func->kobj, &klp_ktype_func,
734 obj->kobj, func->old_name);
735}
736
737/* parts of the initialization that are done only when the object is loaded */
738static int klp_init_object_loaded(struct klp_patch *patch,
739 struct klp_object *obj)
740{
741 struct klp_func *func;
742 int ret;
743
744 if (obj->relocs) {
745 ret = klp_write_object_relocations(patch->mod, obj);
746 if (ret)
747 return ret;
748 }
749
750 for (func = obj->funcs; func->old_name; func++) {
751 ret = klp_find_verify_func_addr(obj, func);
752 if (ret)
753 return ret;
754 }
755
756 return 0;
757}
758
759static int klp_init_object(struct klp_patch *patch, struct klp_object *obj)
760{
761 struct klp_func *func;
762 int ret;
763 const char *name;
764
765 if (!obj->funcs)
766 return -EINVAL;
767
768 obj->state = KLP_DISABLED;
769
770 klp_find_object_module(obj);
771
772 name = klp_is_module(obj) ? obj->name : "vmlinux";
773 obj->kobj = kobject_create_and_add(name, &patch->kobj);
774 if (!obj->kobj)
775 return -ENOMEM;
776
777 for (func = obj->funcs; func->old_name; func++) {
778 ret = klp_init_func(obj, func);
779 if (ret)
780 goto free;
781 }
782
783 if (klp_is_object_loaded(obj)) {
784 ret = klp_init_object_loaded(patch, obj);
785 if (ret)
786 goto free;
787 }
788
789 return 0;
790
791free:
792 klp_free_funcs_limited(obj, func);
793 kobject_put(obj->kobj);
794 return ret;
795}
796
797static int klp_init_patch(struct klp_patch *patch)
798{
799 struct klp_object *obj;
800 int ret;
801
802 if (!patch->objs)
803 return -EINVAL;
804
805 mutex_lock(&klp_mutex);
806
807 patch->state = KLP_DISABLED;
808
809 ret = kobject_init_and_add(&patch->kobj, &klp_ktype_patch,
810 klp_root_kobj, patch->mod->name);
811 if (ret)
812 goto unlock;
813
814 for (obj = patch->objs; obj->funcs; obj++) {
815 ret = klp_init_object(patch, obj);
816 if (ret)
817 goto free;
818 }
819
820 list_add_tail(&patch->list, &klp_patches);
821
822 mutex_unlock(&klp_mutex);
823
824 return 0;
825
826free:
827 klp_free_objects_limited(patch, obj);
828 kobject_put(&patch->kobj);
829unlock:
830 mutex_unlock(&klp_mutex);
831 return ret;
832}
833
834/**
835 * klp_unregister_patch() - unregisters a patch
836 * @patch: Disabled patch to be unregistered
837 *
838 * Frees the data structures and removes the sysfs interface.
839 *
840 * Return: 0 on success, otherwise error
841 */
842int klp_unregister_patch(struct klp_patch *patch)
843{
844 int ret = 0;
845
846 mutex_lock(&klp_mutex);
847
848 if (!klp_is_patch_registered(patch)) {
849 ret = -EINVAL;
850 goto out;
851 }
852
853 if (patch->state == KLP_ENABLED) {
854 ret = -EBUSY;
855 goto out;
856 }
857
858 klp_free_patch(patch);
859
860out:
861 mutex_unlock(&klp_mutex);
862 return ret;
863}
864EXPORT_SYMBOL_GPL(klp_unregister_patch);
865
866/**
867 * klp_register_patch() - registers a patch
868 * @patch: Patch to be registered
869 *
870 * Initializes the data structure associated with the patch and
871 * creates the sysfs interface.
872 *
873 * Return: 0 on success, otherwise error
874 */
875int klp_register_patch(struct klp_patch *patch)
876{
877 int ret;
878
879 if (!klp_initialized())
880 return -ENODEV;
881
882 if (!patch || !patch->mod)
883 return -EINVAL;
884
885 /*
886 * A reference is taken on the patch module to prevent it from being
887 * unloaded. Right now, we don't allow patch modules to unload since
888 * there is currently no method to determine if a thread is still
889 * running in the patched code contained in the patch module once
890 * the ftrace registration is successful.
891 */
892 if (!try_module_get(patch->mod))
893 return -ENODEV;
894
895 ret = klp_init_patch(patch);
896 if (ret)
897 module_put(patch->mod);
898
899 return ret;
900}
901EXPORT_SYMBOL_GPL(klp_register_patch);
902
903static void klp_module_notify_coming(struct klp_patch *patch,
904 struct klp_object *obj)
905{
906 struct module *pmod = patch->mod;
907 struct module *mod = obj->mod;
908 int ret;
909
910 ret = klp_init_object_loaded(patch, obj);
911 if (ret)
912 goto err;
913
914 if (patch->state == KLP_DISABLED)
915 return;
916
917 pr_notice("applying patch '%s' to loading module '%s'\n",
918 pmod->name, mod->name);
919
920 ret = klp_enable_object(obj);
921 if (!ret)
922 return;
923
924err:
925 pr_warn("failed to apply patch '%s' to module '%s' (%d)\n",
926 pmod->name, mod->name, ret);
927}
928
929static void klp_module_notify_going(struct klp_patch *patch,
930 struct klp_object *obj)
931{
932 struct module *pmod = patch->mod;
933 struct module *mod = obj->mod;
934 int ret;
935
936 if (patch->state == KLP_DISABLED)
937 goto disabled;
938
939 pr_notice("reverting patch '%s' on unloading module '%s'\n",
940 pmod->name, mod->name);
941
942 ret = klp_disable_object(obj);
943 if (ret)
944 pr_warn("failed to revert patch '%s' on module '%s' (%d)\n",
945 pmod->name, mod->name, ret);
946
947disabled:
948 klp_free_object_loaded(obj);
949}
950
951static int klp_module_notify(struct notifier_block *nb, unsigned long action,
952 void *data)
953{
954 struct module *mod = data;
955 struct klp_patch *patch;
956 struct klp_object *obj;
957
958 if (action != MODULE_STATE_COMING && action != MODULE_STATE_GOING)
959 return 0;
960
961 mutex_lock(&klp_mutex);
962
963 list_for_each_entry(patch, &klp_patches, list) {
964 for (obj = patch->objs; obj->funcs; obj++) {
965 if (!klp_is_module(obj) || strcmp(obj->name, mod->name))
966 continue;
967
968 if (action == MODULE_STATE_COMING) {
969 obj->mod = mod;
970 klp_module_notify_coming(patch, obj);
971 } else /* MODULE_STATE_GOING */
972 klp_module_notify_going(patch, obj);
973
974 break;
975 }
976 }
977
978 mutex_unlock(&klp_mutex);
979
980 return 0;
981}
982
983static struct notifier_block klp_module_nb = {
984 .notifier_call = klp_module_notify,
985 .priority = INT_MIN+1, /* called late but before ftrace notifier */
986};
987
988static int klp_init(void)
989{
990 int ret;
991
992 ret = klp_check_compiler_support();
993 if (ret) {
994 pr_info("Your compiler is too old; turning off.\n");
995 return -EINVAL;
996 }
997
998 ret = register_module_notifier(&klp_module_nb);
999 if (ret)
1000 return ret;
1001
1002 klp_root_kobj = kobject_create_and_add("livepatch", kernel_kobj);
1003 if (!klp_root_kobj) {
1004 ret = -ENOMEM;
1005 goto unregister;
1006 }
1007
1008 return 0;
1009
1010unregister:
1011 unregister_module_notifier(&klp_module_nb);
1012 return ret;
1013}
1014
1015module_init(klp_init);
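klp_register_patch() and klp_enable_patch() above are what a patch module calls from its init path, with klp_disable_patch()/klp_unregister_patch() on the way out. A minimal sketch of such a module, in the spirit of the livepatch sample code (the patched function, its replacement and all names are illustrative):

/* Hypothetical patch module; the target function is just an example. */
#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/seq_file.h>
#include <linux/livepatch.h>

/* replacement for the vmlinux function being patched */
static int livepatch_cmdline_proc_show(struct seq_file *m, void *v)
{
	seq_printf(m, "%s\n", "this has been live patched");
	return 0;
}

static struct klp_func funcs[] = {
	{
		.old_name = "cmdline_proc_show",
		.new_func = livepatch_cmdline_proc_show,
	}, { }
};

static struct klp_object objs[] = {
	{
		/* .name = NULL means the object is vmlinux */
		.funcs = funcs,
	}, { }
};

static struct klp_patch patch = {
	.mod  = THIS_MODULE,
	.objs = objs,
};

static int __init demo_patch_init(void)
{
	int ret;

	ret = klp_register_patch(&patch);
	if (ret)
		return ret;
	ret = klp_enable_patch(&patch);
	if (ret) {
		WARN_ON(klp_unregister_patch(&patch));
		return ret;
	}
	return 0;
}

static void __exit demo_patch_exit(void)
{
	WARN_ON(klp_disable_patch(&patch));
	WARN_ON(klp_unregister_patch(&patch));
}

module_init(demo_patch_init);
module_exit(demo_patch_exit);
MODULE_LICENSE("GPL");

Once loaded, such a patch can also be toggled through /sys/kernel/livepatch/<patch>/enabled, which ends up in enabled_store() above.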
diff --git a/kernel/locking/Makefile b/kernel/locking/Makefile
index 8541bfdfd232..de7a416cca2a 100644
--- a/kernel/locking/Makefile
+++ b/kernel/locking/Makefile
@@ -1,11 +1,11 @@
 
-obj-y += mutex.o semaphore.o rwsem.o mcs_spinlock.o
+obj-y += mutex.o semaphore.o rwsem.o
 
 ifdef CONFIG_FUNCTION_TRACER
-CFLAGS_REMOVE_lockdep.o = -pg
-CFLAGS_REMOVE_lockdep_proc.o = -pg
-CFLAGS_REMOVE_mutex-debug.o = -pg
-CFLAGS_REMOVE_rtmutex-debug.o = -pg
+CFLAGS_REMOVE_lockdep.o = $(CC_FLAGS_FTRACE)
+CFLAGS_REMOVE_lockdep_proc.o = $(CC_FLAGS_FTRACE)
+CFLAGS_REMOVE_mutex-debug.o = $(CC_FLAGS_FTRACE)
+CFLAGS_REMOVE_rtmutex-debug.o = $(CC_FLAGS_FTRACE)
 endif
 
 obj-$(CONFIG_DEBUG_MUTEXES) += mutex-debug.o
@@ -14,6 +14,7 @@ ifeq ($(CONFIG_PROC_FS),y)
 obj-$(CONFIG_LOCKDEP) += lockdep_proc.o
 endif
 obj-$(CONFIG_SMP) += spinlock.o
+obj-$(CONFIG_LOCK_SPIN_ON_OWNER) += osq_lock.o
 obj-$(CONFIG_SMP) += lglock.o
 obj-$(CONFIG_PROVE_LOCKING) += spinlock.o
 obj-$(CONFIG_RT_MUTEXES) += rtmutex.o
diff --git a/kernel/locking/mcs_spinlock.h b/kernel/locking/mcs_spinlock.h
index 4d60986fcbee..d1fe2ba5bac9 100644
--- a/kernel/locking/mcs_spinlock.h
+++ b/kernel/locking/mcs_spinlock.h
@@ -108,20 +108,4 @@ void mcs_spin_unlock(struct mcs_spinlock **lock, struct mcs_spinlock *node)
108 arch_mcs_spin_unlock_contended(&next->locked); 108 arch_mcs_spin_unlock_contended(&next->locked);
109} 109}
110 110
111/*
112 * Cancellable version of the MCS lock above.
113 *
114 * Intended for adaptive spinning of sleeping locks:
115 * mutex_lock()/rwsem_down_{read,write}() etc.
116 */
117
118struct optimistic_spin_node {
119 struct optimistic_spin_node *next, *prev;
120 int locked; /* 1 if lock acquired */
121 int cpu; /* encoded CPU # value */
122};
123
124extern bool osq_lock(struct optimistic_spin_queue *lock);
125extern void osq_unlock(struct optimistic_spin_queue *lock);
126
127#endif /* __LINUX_MCS_SPINLOCK_H */ 111#endif /* __LINUX_MCS_SPINLOCK_H */
diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c
index 454195194d4a..94674e5919cb 100644
--- a/kernel/locking/mutex.c
+++ b/kernel/locking/mutex.c
@@ -81,7 +81,7 @@ __visible void __sched __mutex_lock_slowpath(atomic_t *lock_count);
81 * The mutex must later on be released by the same task that 81 * The mutex must later on be released by the same task that
82 * acquired it. Recursive locking is not allowed. The task 82 * acquired it. Recursive locking is not allowed. The task
83 * may not exit without first unlocking the mutex. Also, kernel 83 * may not exit without first unlocking the mutex. Also, kernel
84 * memory where the mutex resides mutex must not be freed with 84 * memory where the mutex resides must not be freed with
85 * the mutex still locked. The mutex must first be initialized 85 * the mutex still locked. The mutex must first be initialized
86 * (or statically defined) before it can be locked. memset()-ing 86 * (or statically defined) before it can be locked. memset()-ing
87 * the mutex to 0 is not allowed. 87 * the mutex to 0 is not allowed.
@@ -147,7 +147,7 @@ static __always_inline void ww_mutex_lock_acquired(struct ww_mutex *ww,
147} 147}
148 148
149/* 149/*
150 * after acquiring lock with fastpath or when we lost out in contested 150 * After acquiring lock with fastpath or when we lost out in contested
151 * slowpath, set ctx and wake up any waiters so they can recheck. 151 * slowpath, set ctx and wake up any waiters so they can recheck.
152 * 152 *
153 * This function is never called when CONFIG_DEBUG_LOCK_ALLOC is set, 153 * This function is never called when CONFIG_DEBUG_LOCK_ALLOC is set,
@@ -191,19 +191,32 @@ ww_mutex_set_context_fastpath(struct ww_mutex *lock,
 	spin_unlock_mutex(&lock->base.wait_lock, flags);
 }
 
-
-#ifdef CONFIG_MUTEX_SPIN_ON_OWNER
 /*
- * In order to avoid a stampede of mutex spinners from acquiring the mutex
- * more or less simultaneously, the spinners need to acquire a MCS lock
- * first before spinning on the owner field.
+ * After acquiring lock in the slowpath set ctx and wake up any
+ * waiters so they can recheck.
  *
+ * Callers must hold the mutex wait_lock.
  */
+static __always_inline void
+ww_mutex_set_context_slowpath(struct ww_mutex *lock,
+			      struct ww_acquire_ctx *ctx)
+{
+	struct mutex_waiter *cur;
 
-/*
- * Mutex spinning code migrated from kernel/sched/core.c
- */
+	ww_mutex_lock_acquired(lock, ctx);
+	lock->ctx = ctx;
+
+	/*
+	 * Give any possible sleeping processes the chance to wake up,
+	 * so they can recheck if they have to back off.
+	 */
+	list_for_each_entry(cur, &lock->base.wait_list, list) {
+		debug_mutex_wake_waiter(&lock->base, cur);
+		wake_up_process(cur->task);
+	}
+}
 
+#ifdef CONFIG_MUTEX_SPIN_ON_OWNER
 static inline bool owner_running(struct mutex *lock, struct task_struct *owner)
 {
 	if (lock->owner != owner)
@@ -469,7 +487,7 @@ void __sched ww_mutex_unlock(struct ww_mutex *lock)
469EXPORT_SYMBOL(ww_mutex_unlock); 487EXPORT_SYMBOL(ww_mutex_unlock);
470 488
471static inline int __sched 489static inline int __sched
472__mutex_lock_check_stamp(struct mutex *lock, struct ww_acquire_ctx *ctx) 490__ww_mutex_lock_check_stamp(struct mutex *lock, struct ww_acquire_ctx *ctx)
473{ 491{
474 struct ww_mutex *ww = container_of(lock, struct ww_mutex, base); 492 struct ww_mutex *ww = container_of(lock, struct ww_mutex, base);
475 struct ww_acquire_ctx *hold_ctx = ACCESS_ONCE(ww->ctx); 493 struct ww_acquire_ctx *hold_ctx = ACCESS_ONCE(ww->ctx);
@@ -557,7 +575,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
557 } 575 }
558 576
559 if (use_ww_ctx && ww_ctx->acquired > 0) { 577 if (use_ww_ctx && ww_ctx->acquired > 0) {
560 ret = __mutex_lock_check_stamp(lock, ww_ctx); 578 ret = __ww_mutex_lock_check_stamp(lock, ww_ctx);
561 if (ret) 579 if (ret)
562 goto err; 580 goto err;
563 } 581 }
@@ -569,6 +587,8 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
569 schedule_preempt_disabled(); 587 schedule_preempt_disabled();
570 spin_lock_mutex(&lock->wait_lock, flags); 588 spin_lock_mutex(&lock->wait_lock, flags);
571 } 589 }
590 __set_task_state(task, TASK_RUNNING);
591
572 mutex_remove_waiter(lock, &waiter, current_thread_info()); 592 mutex_remove_waiter(lock, &waiter, current_thread_info());
573 /* set it to 0 if there are no waiters left: */ 593 /* set it to 0 if there are no waiters left: */
574 if (likely(list_empty(&lock->wait_list))) 594 if (likely(list_empty(&lock->wait_list)))
@@ -582,23 +602,7 @@ skip_wait:
 
 	if (use_ww_ctx) {
 		struct ww_mutex *ww = container_of(lock, struct ww_mutex, base);
-		struct mutex_waiter *cur;
-
-		/*
-		 * This branch gets optimized out for the common case,
-		 * and is only important for ww_mutex_lock.
-		 */
-		ww_mutex_lock_acquired(ww, ww_ctx);
-		ww->ctx = ww_ctx;
-
-		/*
-		 * Give any possible sleeping processes the chance to wake up,
-		 * so they can recheck if they have to back off.
-		 */
-		list_for_each_entry(cur, &lock->wait_list, list) {
-			debug_mutex_wake_waiter(lock, cur);
-			wake_up_process(cur->task);
-		}
+		ww_mutex_set_context_slowpath(ww, ww_ctx);
 	}
 
 	spin_unlock_mutex(&lock->wait_lock, flags);
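ww_mutex_set_context_slowpath() and the renamed __ww_mutex_lock_check_stamp() above serve the wait/wound locking pattern. As a reminder of what a caller of that API looks like, a hedged sketch (demo_obj and demo_ww_class are illustrative; the caller is expected to unlock both mutexes and call ww_acquire_fini() when finished, and a real user would normally retry on -EDEADLK rather than just returning it):

/* Illustrative wait/wound acquisition of two objects. */
#include <linux/ww_mutex.h>

static DEFINE_WW_CLASS(demo_ww_class);

struct demo_obj {
	struct ww_mutex lock;
};

static int demo_lock_pair(struct demo_obj *a, struct demo_obj *b,
			  struct ww_acquire_ctx *ctx)
{
	int ret;

	ww_acquire_init(ctx, &demo_ww_class);

	ret = ww_mutex_lock(&a->lock, ctx);
	if (ret)
		goto err_fini;

	ret = ww_mutex_lock(&b->lock, ctx);
	if (ret) {
		/* -EDEADLK: we are the younger context and must back off */
		ww_mutex_unlock(&a->lock);
		goto err_fini;
	}

	ww_acquire_done(ctx);
	return 0;

err_fini:
	ww_acquire_fini(ctx);
	return ret;
}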
diff --git a/kernel/locking/mcs_spinlock.c b/kernel/locking/osq_lock.c
index 9887a905a762..c112d00341b0 100644
--- a/kernel/locking/mcs_spinlock.c
+++ b/kernel/locking/osq_lock.c
@@ -1,8 +1,6 @@
1#include <linux/percpu.h> 1#include <linux/percpu.h>
2#include <linux/sched.h> 2#include <linux/sched.h>
3#include "mcs_spinlock.h" 3#include <linux/osq_lock.h>
4
5#ifdef CONFIG_SMP
6 4
7/* 5/*
8 * An MCS like lock especially tailored for optimistic spinning for sleeping 6 * An MCS like lock especially tailored for optimistic spinning for sleeping
@@ -111,7 +109,7 @@ bool osq_lock(struct optimistic_spin_queue *lock)
111 * cmpxchg in an attempt to undo our queueing. 109 * cmpxchg in an attempt to undo our queueing.
112 */ 110 */
113 111
114 while (!smp_load_acquire(&node->locked)) { 112 while (!ACCESS_ONCE(node->locked)) {
115 /* 113 /*
116 * If we need to reschedule bail... so we can block. 114 * If we need to reschedule bail... so we can block.
117 */ 115 */
@@ -203,6 +201,3 @@ void osq_unlock(struct optimistic_spin_queue *lock)
203 if (next) 201 if (next)
204 ACCESS_ONCE(next->locked) = 1; 202 ACCESS_ONCE(next->locked) = 1;
205} 203}
206
207#endif
208
diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c
index 7c98873a3077..e16e5542bf13 100644
--- a/kernel/locking/rtmutex.c
+++ b/kernel/locking/rtmutex.c
@@ -1130,6 +1130,7 @@ __rt_mutex_slowlock(struct rt_mutex *lock, int state,
1130 set_current_state(state); 1130 set_current_state(state);
1131 } 1131 }
1132 1132
1133 __set_current_state(TASK_RUNNING);
1133 return ret; 1134 return ret;
1134} 1135}
1135 1136
@@ -1188,12 +1189,12 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state,
1188 ret = task_blocks_on_rt_mutex(lock, &waiter, current, chwalk); 1189 ret = task_blocks_on_rt_mutex(lock, &waiter, current, chwalk);
1189 1190
1190 if (likely(!ret)) 1191 if (likely(!ret))
1192 /* sleep on the mutex */
1191 ret = __rt_mutex_slowlock(lock, state, timeout, &waiter); 1193 ret = __rt_mutex_slowlock(lock, state, timeout, &waiter);
1192 1194
1193 set_current_state(TASK_RUNNING);
1194
1195 if (unlikely(ret)) { 1195 if (unlikely(ret)) {
1196 remove_waiter(lock, &waiter); 1196 if (rt_mutex_has_waiters(lock))
1197 remove_waiter(lock, &waiter);
1197 rt_mutex_handle_deadlock(ret, chwalk, &waiter); 1198 rt_mutex_handle_deadlock(ret, chwalk, &waiter);
1198 } 1199 }
1199 1200
@@ -1626,10 +1627,9 @@ int rt_mutex_finish_proxy_lock(struct rt_mutex *lock,
1626 1627
1627 set_current_state(TASK_INTERRUPTIBLE); 1628 set_current_state(TASK_INTERRUPTIBLE);
1628 1629
1630 /* sleep on the mutex */
1629 ret = __rt_mutex_slowlock(lock, TASK_INTERRUPTIBLE, to, waiter); 1631 ret = __rt_mutex_slowlock(lock, TASK_INTERRUPTIBLE, to, waiter);
1630 1632
1631 set_current_state(TASK_RUNNING);
1632
1633 if (unlikely(ret)) 1633 if (unlikely(ret))
1634 remove_waiter(lock, waiter); 1634 remove_waiter(lock, waiter);
1635 1635
diff --git a/kernel/locking/rwsem-spinlock.c b/kernel/locking/rwsem-spinlock.c
index 2c93571162cb..2555ae15ec14 100644
--- a/kernel/locking/rwsem-spinlock.c
+++ b/kernel/locking/rwsem-spinlock.c
@@ -154,7 +154,7 @@ void __sched __down_read(struct rw_semaphore *sem)
154 set_task_state(tsk, TASK_UNINTERRUPTIBLE); 154 set_task_state(tsk, TASK_UNINTERRUPTIBLE);
155 } 155 }
156 156
157 tsk->state = TASK_RUNNING; 157 __set_task_state(tsk, TASK_RUNNING);
158 out: 158 out:
159 ; 159 ;
160} 160}
diff --git a/kernel/locking/rwsem-xadd.c b/kernel/locking/rwsem-xadd.c
index 7628c3fc37ca..2f7cc4076f50 100644
--- a/kernel/locking/rwsem-xadd.c
+++ b/kernel/locking/rwsem-xadd.c
@@ -242,8 +242,7 @@ struct rw_semaphore __sched *rwsem_down_read_failed(struct rw_semaphore *sem)
242 schedule(); 242 schedule();
243 } 243 }
244 244
245 tsk->state = TASK_RUNNING; 245 __set_task_state(tsk, TASK_RUNNING);
246
247 return sem; 246 return sem;
248} 247}
249EXPORT_SYMBOL(rwsem_down_read_failed); 248EXPORT_SYMBOL(rwsem_down_read_failed);
diff --git a/kernel/locking/spinlock.c b/kernel/locking/spinlock.c
index 4b082b5cac9e..db3ccb1dd614 100644
--- a/kernel/locking/spinlock.c
+++ b/kernel/locking/spinlock.c
@@ -363,6 +363,14 @@ void __lockfunc _raw_spin_lock_nested(raw_spinlock_t *lock, int subclass)
363} 363}
364EXPORT_SYMBOL(_raw_spin_lock_nested); 364EXPORT_SYMBOL(_raw_spin_lock_nested);
365 365
366void __lockfunc _raw_spin_lock_bh_nested(raw_spinlock_t *lock, int subclass)
367{
368 __local_bh_disable_ip(_RET_IP_, SOFTIRQ_LOCK_OFFSET);
369 spin_acquire(&lock->dep_map, subclass, 0, _RET_IP_);
370 LOCK_CONTENDED(lock, do_raw_spin_trylock, do_raw_spin_lock);
371}
372EXPORT_SYMBOL(_raw_spin_lock_bh_nested);
373
366unsigned long __lockfunc _raw_spin_lock_irqsave_nested(raw_spinlock_t *lock, 374unsigned long __lockfunc _raw_spin_lock_irqsave_nested(raw_spinlock_t *lock,
367 int subclass) 375 int subclass)
368{ 376{
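_raw_spin_lock_bh_nested() parallels _raw_spin_lock_nested() for BH-disabling locks; callers are assumed to reach it through a spin_lock_bh_nested() wrapper when taking two locks of the same lockdep class. A sketch under that assumption (the bucket structure and locking discipline are illustrative):

/* Illustrative nested BH spinlocks; spin_lock_bh_nested() is assumed. */
#include <linux/spinlock.h>

struct demo_bucket {
	spinlock_t lock;
	/* ... */
};

/* Move an entry between two buckets protected by same-class locks. */
static void demo_relink(struct demo_bucket *from, struct demo_bucket *to)
{
	spin_lock_bh(&from->lock);
	spin_lock_bh_nested(&to->lock, SINGLE_DEPTH_NESTING);

	/* ... unhash from 'from', rehash into 'to' ... */

	spin_unlock_bh(&to->lock);
	spin_unlock_bh(&from->lock);
}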
diff --git a/kernel/module.c b/kernel/module.c
index d856e96a3cce..b34813f725e9 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -56,6 +56,7 @@
56#include <linux/async.h> 56#include <linux/async.h>
57#include <linux/percpu.h> 57#include <linux/percpu.h>
58#include <linux/kmemleak.h> 58#include <linux/kmemleak.h>
59#include <linux/kasan.h>
59#include <linux/jump_label.h> 60#include <linux/jump_label.h>
60#include <linux/pfn.h> 61#include <linux/pfn.h>
61#include <linux/bsearch.h> 62#include <linux/bsearch.h>
@@ -1225,6 +1226,12 @@ static const struct kernel_symbol *resolve_symbol(struct module *mod,
1225 const unsigned long *crc; 1226 const unsigned long *crc;
1226 int err; 1227 int err;
1227 1228
1229 /*
1230 * The module_mutex should not be a heavily contended lock;
1231 * if we get the occasional sleep here, we'll go an extra iteration
1232 * in the wait_event_interruptible(), which is harmless.
1233 */
1234 sched_annotate_sleep();
1228 mutex_lock(&module_mutex); 1235 mutex_lock(&module_mutex);
1229 sym = find_symbol(name, &owner, &crc, 1236 sym = find_symbol(name, &owner, &crc,
1230 !(mod->taints & (1 << TAINT_PROPRIETARY_MODULE)), true); 1237 !(mod->taints & (1 << TAINT_PROPRIETARY_MODULE)), true);
@@ -1807,6 +1814,7 @@ static void unset_module_init_ro_nx(struct module *mod) { }
1807void __weak module_memfree(void *module_region) 1814void __weak module_memfree(void *module_region)
1808{ 1815{
1809 vfree(module_region); 1816 vfree(module_region);
1817 kasan_module_free(module_region);
1810} 1818}
1811 1819
1812void __weak module_arch_cleanup(struct module *mod) 1820void __weak module_arch_cleanup(struct module *mod)
@@ -2978,6 +2986,12 @@ static bool finished_loading(const char *name)
2978 struct module *mod; 2986 struct module *mod;
2979 bool ret; 2987 bool ret;
2980 2988
2989 /*
2990 * The module_mutex should not be a heavily contended lock;
2991 * if we get the occasional sleep here, we'll go an extra iteration
2992 * in the wait_event_interruptible(), which is harmless.
2993 */
2994 sched_annotate_sleep();
2981 mutex_lock(&module_mutex); 2995 mutex_lock(&module_mutex);
2982 mod = find_module_all(name, strlen(name), true); 2996 mod = find_module_all(name, strlen(name), true);
2983 ret = !mod || mod->state == MODULE_STATE_LIVE 2997 ret = !mod || mod->state == MODULE_STATE_LIVE
@@ -3011,8 +3025,13 @@ static void do_free_init(struct rcu_head *head)
 	kfree(m);
 }
 
-/* This is where the real work happens */
-static int do_init_module(struct module *mod)
+/*
+ * This is where the real work happens.
+ *
+ * Keep it uninlined to provide a reliable breakpoint target, e.g. for the gdb
+ * helper command 'lx-symbols'.
+ */
+static noinline int do_init_module(struct module *mod)
 {
 	int ret = 0;
 	struct mod_initfree *freeinit;
@@ -3120,32 +3139,6 @@ static int may_init_module(void)
3120} 3139}
3121 3140
3122/* 3141/*
3123 * Can't use wait_event_interruptible() because our condition
3124 * 'finished_loading()' contains a blocking primitive itself (mutex_lock).
3125 */
3126static int wait_finished_loading(struct module *mod)
3127{
3128 DEFINE_WAIT_FUNC(wait, woken_wake_function);
3129 int ret = 0;
3130
3131 add_wait_queue(&module_wq, &wait);
3132 for (;;) {
3133 if (finished_loading(mod->name))
3134 break;
3135
3136 if (signal_pending(current)) {
3137 ret = -ERESTARTSYS;
3138 break;
3139 }
3140
3141 wait_woken(&wait, TASK_INTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT);
3142 }
3143 remove_wait_queue(&module_wq, &wait);
3144
3145 return ret;
3146}
3147
3148/*
3149 * We try to place it in the list now to make sure it's unique before 3142 * We try to place it in the list now to make sure it's unique before
3150 * we dedicate too many resources. In particular, temporary percpu 3143 * we dedicate too many resources. In particular, temporary percpu
3151 * memory exhaustion. 3144 * memory exhaustion.
@@ -3165,8 +3158,8 @@ again:
3165 || old->state == MODULE_STATE_UNFORMED) { 3158 || old->state == MODULE_STATE_UNFORMED) {
3166 /* Wait in case it fails to load. */ 3159 /* Wait in case it fails to load. */
3167 mutex_unlock(&module_mutex); 3160 mutex_unlock(&module_mutex);
3168 3161 err = wait_event_interruptible(module_wq,
3169 err = wait_finished_loading(mod); 3162 finished_loading(mod->name));
3170 if (err) 3163 if (err)
3171 goto out_unlocked; 3164 goto out_unlocked;
3172 goto again; 3165 goto again;
@@ -3265,7 +3258,7 @@ static int load_module(struct load_info *info, const char __user *uargs,
3265 mod->sig_ok = info->sig_ok; 3258 mod->sig_ok = info->sig_ok;
3266 if (!mod->sig_ok) { 3259 if (!mod->sig_ok) {
3267 pr_notice_once("%s: module verification failed: signature " 3260 pr_notice_once("%s: module verification failed: signature "
3268 "and/or required key missing - tainting " 3261 "and/or required key missing - tainting "
3269 "kernel\n", mod->name); 3262 "kernel\n", mod->name);
3270 add_taint_module(mod, TAINT_UNSIGNED_MODULE, LOCKDEP_STILL_OK); 3263 add_taint_module(mod, TAINT_UNSIGNED_MODULE, LOCKDEP_STILL_OK);
3271 } 3264 }
@@ -3356,6 +3349,9 @@ static int load_module(struct load_info *info, const char __user *uargs,
3356 module_bug_cleanup(mod); 3349 module_bug_cleanup(mod);
3357 mutex_unlock(&module_mutex); 3350 mutex_unlock(&module_mutex);
3358 3351
3352 /* Free lock-classes: */
3353 lockdep_free_key_range(mod->module_core, mod->core_size);
3354
3359 /* we can't deallocate the module until we clear memory protection */ 3355 /* we can't deallocate the module until we clear memory protection */
3360 unset_module_init_ro_nx(mod); 3356 unset_module_init_ro_nx(mod);
3361 unset_module_core_ro_nx(mod); 3357 unset_module_core_ro_nx(mod);
diff --git a/kernel/notifier.c b/kernel/notifier.c
index 4803da6eab62..ae9fc7cc360e 100644
--- a/kernel/notifier.c
+++ b/kernel/notifier.c
@@ -402,6 +402,7 @@ int raw_notifier_call_chain(struct raw_notifier_head *nh,
402} 402}
403EXPORT_SYMBOL_GPL(raw_notifier_call_chain); 403EXPORT_SYMBOL_GPL(raw_notifier_call_chain);
404 404
405#ifdef CONFIG_SRCU
405/* 406/*
406 * SRCU notifier chain routines. Registration and unregistration 407 * SRCU notifier chain routines. Registration and unregistration
407 * use a mutex, and call_chain is synchronized by SRCU (no locks). 408 * use a mutex, and call_chain is synchronized by SRCU (no locks).
@@ -528,6 +529,8 @@ void srcu_init_notifier_head(struct srcu_notifier_head *nh)
528} 529}
529EXPORT_SYMBOL_GPL(srcu_init_notifier_head); 530EXPORT_SYMBOL_GPL(srcu_init_notifier_head);
530 531
532#endif /* CONFIG_SRCU */
533
531static ATOMIC_NOTIFIER_HEAD(die_chain); 534static ATOMIC_NOTIFIER_HEAD(die_chain);
532 535
533int notrace notify_die(enum die_val val, const char *str, 536int notrace notify_die(enum die_val val, const char *str,
diff --git a/kernel/padata.c b/kernel/padata.c
index 161402f0b517..b38bea9c466a 100644
--- a/kernel/padata.c
+++ b/kernel/padata.c
@@ -917,15 +917,10 @@ static ssize_t show_cpumask(struct padata_instance *pinst,
917 else 917 else
918 cpumask = pinst->cpumask.pcpu; 918 cpumask = pinst->cpumask.pcpu;
919 919
920 len = bitmap_scnprintf(buf, PAGE_SIZE, cpumask_bits(cpumask), 920 len = snprintf(buf, PAGE_SIZE, "%*pb\n",
921 nr_cpu_ids); 921 nr_cpu_ids, cpumask_bits(cpumask));
922 if (PAGE_SIZE - len < 2)
923 len = -EINVAL;
924 else
925 len += sprintf(buf + len, "\n");
926
927 mutex_unlock(&pinst->lock); 922 mutex_unlock(&pinst->lock);
928 return len; 923 return len < PAGE_SIZE ? len : -EINVAL;
929} 924}
930 925
931static ssize_t store_cpumask(struct padata_instance *pinst, 926static ssize_t store_cpumask(struct padata_instance *pinst,
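The padata hunk above replaces bitmap_scnprintf() plus a hand-rolled trailing newline with the kernel's "%*pb" bitmap format specifier: the field width carries the number of bits and the pointer argument carries the bitmap itself. A hedged sketch of the same idiom, with illustrative demo_* names (cpumask_pr_args() simply expands to nr_cpu_ids, cpumask_bits(mask)):

#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/cpumask.h>
#include <linux/seq_file.h>

/* seq_file form, as used by the profile.c hunk further down. */
static int demo_cpumask_show(struct seq_file *m, void *v)
{
	/* Prints e.g. "f\n" when CPUs 0-3 are online. */
	seq_printf(m, "%*pb\n", cpumask_pr_args(cpu_online_mask));
	return 0;
}

/* Raw-buffer form, as used by show_cpumask() above; PAGE_SIZE bounds it. */
static ssize_t demo_cpumask_print(char *buf)
{
	ssize_t len = snprintf(buf, PAGE_SIZE, "%*pb\n",
			       nr_cpu_ids, cpumask_bits(cpu_online_mask));

	return len < PAGE_SIZE ? len : -EINVAL;
}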
diff --git a/kernel/panic.c b/kernel/panic.c
index 4d8d6f906dec..8136ad76e5fd 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -226,6 +226,7 @@ static const struct tnt tnts[] = {
226 { TAINT_OOT_MODULE, 'O', ' ' }, 226 { TAINT_OOT_MODULE, 'O', ' ' },
227 { TAINT_UNSIGNED_MODULE, 'E', ' ' }, 227 { TAINT_UNSIGNED_MODULE, 'E', ' ' },
228 { TAINT_SOFTLOCKUP, 'L', ' ' }, 228 { TAINT_SOFTLOCKUP, 'L', ' ' },
229 { TAINT_LIVEPATCH, 'K', ' ' },
229}; 230};
230 231
231/** 232/**
@@ -246,6 +247,7 @@ static const struct tnt tnts[] = {
246 * 'O' - Out-of-tree module has been loaded. 247 * 'O' - Out-of-tree module has been loaded.
247 * 'E' - Unsigned module has been loaded. 248 * 'E' - Unsigned module has been loaded.
248 * 'L' - A soft lockup has previously occurred. 249 * 'L' - A soft lockup has previously occurred.
250 * 'K' - Kernel has been live patched.
249 * 251 *
250 * The string is overwritten by the next call to print_tainted(). 252 * The string is overwritten by the next call to print_tainted().
251 */ 253 */
diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig
index 48b28d387c7f..7e01f78f0417 100644
--- a/kernel/power/Kconfig
+++ b/kernel/power/Kconfig
@@ -251,6 +251,7 @@ config APM_EMULATION
251 251
252config PM_OPP 252config PM_OPP
253 bool 253 bool
254 select SRCU
254 ---help--- 255 ---help---
255 SOCs have a standard set of tuples consisting of frequency and 256 SOCs have a standard set of tuples consisting of frequency and
256 voltage pairs that the device will support per voltage domain. This 257 voltage pairs that the device will support per voltage domain. This
diff --git a/kernel/power/process.c b/kernel/power/process.c
index 5a6ec8678b9a..564f786df470 100644
--- a/kernel/power/process.c
+++ b/kernel/power/process.c
@@ -84,8 +84,8 @@ static int try_to_freeze_tasks(bool user_only)
84 elapsed_msecs = elapsed_msecs64; 84 elapsed_msecs = elapsed_msecs64;
85 85
86 if (todo) { 86 if (todo) {
87 printk("\n"); 87 pr_cont("\n");
88 printk(KERN_ERR "Freezing of tasks %s after %d.%03d seconds " 88 pr_err("Freezing of tasks %s after %d.%03d seconds "
89 "(%d tasks refusing to freeze, wq_busy=%d):\n", 89 "(%d tasks refusing to freeze, wq_busy=%d):\n",
90 wakeup ? "aborted" : "failed", 90 wakeup ? "aborted" : "failed",
91 elapsed_msecs / 1000, elapsed_msecs % 1000, 91 elapsed_msecs / 1000, elapsed_msecs % 1000,
@@ -101,37 +101,13 @@ static int try_to_freeze_tasks(bool user_only)
101 read_unlock(&tasklist_lock); 101 read_unlock(&tasklist_lock);
102 } 102 }
103 } else { 103 } else {
104 printk("(elapsed %d.%03d seconds) ", elapsed_msecs / 1000, 104 pr_cont("(elapsed %d.%03d seconds) ", elapsed_msecs / 1000,
105 elapsed_msecs % 1000); 105 elapsed_msecs % 1000);
106 } 106 }
107 107
108 return todo ? -EBUSY : 0; 108 return todo ? -EBUSY : 0;
109} 109}
110 110
111static bool __check_frozen_processes(void)
112{
113 struct task_struct *g, *p;
114
115 for_each_process_thread(g, p)
116 if (p != current && !freezer_should_skip(p) && !frozen(p))
117 return false;
118
119 return true;
120}
121
122/*
123 * Returns true if all freezable tasks (except for current) are frozen already
124 */
125static bool check_frozen_processes(void)
126{
127 bool ret;
128
129 read_lock(&tasklist_lock);
130 ret = __check_frozen_processes();
131 read_unlock(&tasklist_lock);
132 return ret;
133}
134
135/** 111/**
136 * freeze_processes - Signal user space processes to enter the refrigerator. 112 * freeze_processes - Signal user space processes to enter the refrigerator.
137 * The current thread will not be frozen. The same process that calls 113 * The current thread will not be frozen. The same process that calls
@@ -142,7 +118,6 @@ static bool check_frozen_processes(void)
142int freeze_processes(void) 118int freeze_processes(void)
143{ 119{
144 int error; 120 int error;
145 int oom_kills_saved;
146 121
147 error = __usermodehelper_disable(UMH_FREEZING); 122 error = __usermodehelper_disable(UMH_FREEZING);
148 if (error) 123 if (error)
@@ -155,31 +130,24 @@ int freeze_processes(void)
155 atomic_inc(&system_freezing_cnt); 130 atomic_inc(&system_freezing_cnt);
156 131
157 pm_wakeup_clear(); 132 pm_wakeup_clear();
158 printk("Freezing user space processes ... "); 133 pr_info("Freezing user space processes ... ");
159 pm_freezing = true; 134 pm_freezing = true;
160 oom_kills_saved = oom_kills_count();
161 error = try_to_freeze_tasks(true); 135 error = try_to_freeze_tasks(true);
162 if (!error) { 136 if (!error) {
163 __usermodehelper_set_disable_depth(UMH_DISABLED); 137 __usermodehelper_set_disable_depth(UMH_DISABLED);
164 oom_killer_disable(); 138 pr_cont("done.");
165
166 /*
167 * There might have been an OOM kill while we were
168 * freezing tasks and the killed task might be still
169 * on the way out so we have to double check for race.
170 */
171 if (oom_kills_count() != oom_kills_saved &&
172 !check_frozen_processes()) {
173 __usermodehelper_set_disable_depth(UMH_ENABLED);
174 printk("OOM in progress.");
175 error = -EBUSY;
176 } else {
177 printk("done.");
178 }
179 } 139 }
180 printk("\n"); 140 pr_cont("\n");
181 BUG_ON(in_atomic()); 141 BUG_ON(in_atomic());
182 142
143 /*
144 * Now that the whole userspace is frozen we need to disable
145 * the OOM killer to disallow any further interference with
146 * killable tasks.
147 */
148 if (!error && !oom_killer_disable())
149 error = -EBUSY;
150
183 if (error) 151 if (error)
184 thaw_processes(); 152 thaw_processes();
185 return error; 153 return error;
@@ -197,13 +165,14 @@ int freeze_kernel_threads(void)
197{ 165{
198 int error; 166 int error;
199 167
200 printk("Freezing remaining freezable tasks ... "); 168 pr_info("Freezing remaining freezable tasks ... ");
169
201 pm_nosig_freezing = true; 170 pm_nosig_freezing = true;
202 error = try_to_freeze_tasks(false); 171 error = try_to_freeze_tasks(false);
203 if (!error) 172 if (!error)
204 printk("done."); 173 pr_cont("done.");
205 174
206 printk("\n"); 175 pr_cont("\n");
207 BUG_ON(in_atomic()); 176 BUG_ON(in_atomic());
208 177
209 if (error) 178 if (error)
@@ -224,7 +193,7 @@ void thaw_processes(void)
224 193
225 oom_killer_enable(); 194 oom_killer_enable();
226 195
227 printk("Restarting tasks ... "); 196 pr_info("Restarting tasks ... ");
228 197
229 __usermodehelper_set_disable_depth(UMH_FREEZING); 198 __usermodehelper_set_disable_depth(UMH_FREEZING);
230 thaw_workqueues(); 199 thaw_workqueues();
@@ -243,7 +212,7 @@ void thaw_processes(void)
243 usermodehelper_enable(); 212 usermodehelper_enable();
244 213
245 schedule(); 214 schedule();
246 printk("done.\n"); 215 pr_cont("done.\n");
247 trace_suspend_resume(TPS("thaw_processes"), 0, false); 216 trace_suspend_resume(TPS("thaw_processes"), 0, false);
248} 217}
249 218
@@ -252,7 +221,7 @@ void thaw_kernel_threads(void)
252 struct task_struct *g, *p; 221 struct task_struct *g, *p;
253 222
254 pm_nosig_freezing = false; 223 pm_nosig_freezing = false;
255 printk("Restarting kernel threads ... "); 224 pr_info("Restarting kernel threads ... ");
256 225
257 thaw_workqueues(); 226 thaw_workqueues();
258 227
@@ -264,5 +233,5 @@ void thaw_kernel_threads(void)
264 read_unlock(&tasklist_lock); 233 read_unlock(&tasklist_lock);
265 234
266 schedule(); 235 schedule();
267 printk("done.\n"); 236 pr_cont("done.\n");
268} 237}
diff --git a/kernel/power/qos.c b/kernel/power/qos.c
index 5f4c006c4b1e..97b0df71303e 100644
--- a/kernel/power/qos.c
+++ b/kernel/power/qos.c
@@ -41,6 +41,8 @@
41#include <linux/platform_device.h> 41#include <linux/platform_device.h>
42#include <linux/init.h> 42#include <linux/init.h>
43#include <linux/kernel.h> 43#include <linux/kernel.h>
44#include <linux/debugfs.h>
45#include <linux/seq_file.h>
44 46
45#include <linux/uaccess.h> 47#include <linux/uaccess.h>
46#include <linux/export.h> 48#include <linux/export.h>
@@ -182,6 +184,81 @@ static inline void pm_qos_set_value(struct pm_qos_constraints *c, s32 value)
182 c->target_value = value; 184 c->target_value = value;
183} 185}
184 186
187static inline int pm_qos_get_value(struct pm_qos_constraints *c);
188static int pm_qos_dbg_show_requests(struct seq_file *s, void *unused)
189{
190 struct pm_qos_object *qos = (struct pm_qos_object *)s->private;
191 struct pm_qos_constraints *c;
192 struct pm_qos_request *req;
193 char *type;
194 unsigned long flags;
195 int tot_reqs = 0;
196 int active_reqs = 0;
197
198 if (IS_ERR_OR_NULL(qos)) {
199 pr_err("%s: bad qos param!\n", __func__);
200 return -EINVAL;
201 }
202 c = qos->constraints;
203 if (IS_ERR_OR_NULL(c)) {
204 pr_err("%s: Bad constraints on qos?\n", __func__);
205 return -EINVAL;
206 }
207
208 /* Lock to ensure we have a snapshot */
209 spin_lock_irqsave(&pm_qos_lock, flags);
210 if (plist_head_empty(&c->list)) {
211 seq_puts(s, "Empty!\n");
212 goto out;
213 }
214
215 switch (c->type) {
216 case PM_QOS_MIN:
217 type = "Minimum";
218 break;
219 case PM_QOS_MAX:
220 type = "Maximum";
221 break;
222 case PM_QOS_SUM:
223 type = "Sum";
224 break;
225 default:
226 type = "Unknown";
227 }
228
229 plist_for_each_entry(req, &c->list, node) {
230 char *state = "Default";
231
232 if ((req->node).prio != c->default_value) {
233 active_reqs++;
234 state = "Active";
235 }
236 tot_reqs++;
237 seq_printf(s, "%d: %d: %s\n", tot_reqs,
238 (req->node).prio, state);
239 }
240
241 seq_printf(s, "Type=%s, Value=%d, Requests: active=%d / total=%d\n",
242 type, pm_qos_get_value(c), active_reqs, tot_reqs);
243
244out:
245 spin_unlock_irqrestore(&pm_qos_lock, flags);
246 return 0;
247}
248
249static int pm_qos_dbg_open(struct inode *inode, struct file *file)
250{
251 return single_open(file, pm_qos_dbg_show_requests,
252 inode->i_private);
253}
254
255static const struct file_operations pm_qos_debug_fops = {
256 .open = pm_qos_dbg_open,
257 .read = seq_read,
258 .llseek = seq_lseek,
259 .release = single_release,
260};
261
185/** 262/**
186 * pm_qos_update_target - manages the constraints list and calls the notifiers 263 * pm_qos_update_target - manages the constraints list and calls the notifiers
187 * if needed 264 * if needed
@@ -509,12 +586,17 @@ int pm_qos_remove_notifier(int pm_qos_class, struct notifier_block *notifier)
509EXPORT_SYMBOL_GPL(pm_qos_remove_notifier); 586EXPORT_SYMBOL_GPL(pm_qos_remove_notifier);
510 587
511/* User space interface to PM QoS classes via misc devices */ 588/* User space interface to PM QoS classes via misc devices */
512static int register_pm_qos_misc(struct pm_qos_object *qos) 589static int register_pm_qos_misc(struct pm_qos_object *qos, struct dentry *d)
513{ 590{
514 qos->pm_qos_power_miscdev.minor = MISC_DYNAMIC_MINOR; 591 qos->pm_qos_power_miscdev.minor = MISC_DYNAMIC_MINOR;
515 qos->pm_qos_power_miscdev.name = qos->name; 592 qos->pm_qos_power_miscdev.name = qos->name;
516 qos->pm_qos_power_miscdev.fops = &pm_qos_power_fops; 593 qos->pm_qos_power_miscdev.fops = &pm_qos_power_fops;
517 594
595 if (d) {
596 (void)debugfs_create_file(qos->name, S_IRUGO, d,
597 (void *)qos, &pm_qos_debug_fops);
598 }
599
518 return misc_register(&qos->pm_qos_power_miscdev); 600 return misc_register(&qos->pm_qos_power_miscdev);
519} 601}
520 602
@@ -608,11 +690,16 @@ static int __init pm_qos_power_init(void)
608{ 690{
609 int ret = 0; 691 int ret = 0;
610 int i; 692 int i;
693 struct dentry *d;
611 694
612 BUILD_BUG_ON(ARRAY_SIZE(pm_qos_array) != PM_QOS_NUM_CLASSES); 695 BUILD_BUG_ON(ARRAY_SIZE(pm_qos_array) != PM_QOS_NUM_CLASSES);
613 696
697 d = debugfs_create_dir("pm_qos", NULL);
698 if (IS_ERR_OR_NULL(d))
699 d = NULL;
700
614 for (i = PM_QOS_CPU_DMA_LATENCY; i < PM_QOS_NUM_CLASSES; i++) { 701 for (i = PM_QOS_CPU_DMA_LATENCY; i < PM_QOS_NUM_CLASSES; i++) {
615 ret = register_pm_qos_misc(pm_qos_array[i]); 702 ret = register_pm_qos_misc(pm_qos_array[i], d);
616 if (ret < 0) { 703 if (ret < 0) {
617 printk(KERN_ERR "pm_qos_param: %s setup failed\n", 704 printk(KERN_ERR "pm_qos_param: %s setup failed\n",
618 pm_qos_array[i]->name); 705 pm_qos_array[i]->name);
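The pm_qos hunks above expose each constraint list through debugfs using the stock single_open()/seq_file plumbing. A stripped-down sketch of that pattern as a stand-alone module; the demo_* names are illustrative and not part of the patch.

#include <linux/module.h>
#include <linux/debugfs.h>
#include <linux/seq_file.h>
#include <linux/err.h>

static struct dentry *demo_dir;
static int demo_value = 42;

static int demo_show(struct seq_file *s, void *unused)
{
	/* One-shot dump, the same shape as pm_qos_dbg_show_requests(). */
	seq_printf(s, "Value=%d\n", *(int *)s->private);
	return 0;
}

static int demo_open(struct inode *inode, struct file *file)
{
	return single_open(file, demo_show, inode->i_private);
}

static const struct file_operations demo_fops = {
	.open		= demo_open,
	.read		= seq_read,
	.llseek		= seq_lseek,
	.release	= single_release,
};

static int __init demo_init(void)
{
	demo_dir = debugfs_create_dir("demo_qos", NULL);
	if (IS_ERR_OR_NULL(demo_dir))
		return 0;	/* debugfs is optional, as in pm_qos_power_init() */

	debugfs_create_file("state", S_IRUGO, demo_dir, &demo_value,
			    &demo_fops);
	return 0;
}

static void __exit demo_exit(void)
{
	debugfs_remove_recursive(demo_dir);
}

module_init(demo_init);
module_exit(demo_exit);
MODULE_LICENSE("GPL");

As in pm_qos_power_init() above, a missing or broken debugfs is treated as non-fatal: only the debug view is skipped, the misc-device side is still registered.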
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index 0c40c16174b4..c24d5a23bf93 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -1472,9 +1472,9 @@ static inline unsigned long preallocate_highmem_fraction(unsigned long nr_pages,
1472/** 1472/**
1473 * free_unnecessary_pages - Release preallocated pages not needed for the image 1473 * free_unnecessary_pages - Release preallocated pages not needed for the image
1474 */ 1474 */
1475static void free_unnecessary_pages(void) 1475static unsigned long free_unnecessary_pages(void)
1476{ 1476{
1477 unsigned long save, to_free_normal, to_free_highmem; 1477 unsigned long save, to_free_normal, to_free_highmem, free;
1478 1478
1479 save = count_data_pages(); 1479 save = count_data_pages();
1480 if (alloc_normal >= save) { 1480 if (alloc_normal >= save) {
@@ -1495,6 +1495,7 @@ static void free_unnecessary_pages(void)
1495 else 1495 else
1496 to_free_normal = 0; 1496 to_free_normal = 0;
1497 } 1497 }
1498 free = to_free_normal + to_free_highmem;
1498 1499
1499 memory_bm_position_reset(&copy_bm); 1500 memory_bm_position_reset(&copy_bm);
1500 1501
@@ -1518,6 +1519,8 @@ static void free_unnecessary_pages(void)
1518 swsusp_unset_page_free(page); 1519 swsusp_unset_page_free(page);
1519 __free_page(page); 1520 __free_page(page);
1520 } 1521 }
1522
1523 return free;
1521} 1524}
1522 1525
1523/** 1526/**
@@ -1707,7 +1710,7 @@ int hibernate_preallocate_memory(void)
1707 * pages in memory, but we have allocated more. Release the excessive 1710 * pages in memory, but we have allocated more. Release the excessive
1708 * ones now. 1711 * ones now.
1709 */ 1712 */
1710 free_unnecessary_pages(); 1713 pages -= free_unnecessary_pages();
1711 1714
1712 out: 1715 out:
1713 stop = ktime_get(); 1716 stop = ktime_get();
@@ -2310,8 +2313,6 @@ static inline void free_highmem_data(void)
2310 free_image_page(buffer, PG_UNSAFE_CLEAR); 2313 free_image_page(buffer, PG_UNSAFE_CLEAR);
2311} 2314}
2312#else 2315#else
2313static inline int get_safe_write_buffer(void) { return 0; }
2314
2315static unsigned int 2316static unsigned int
2316count_highmem_image_pages(struct memory_bitmap *bm) { return 0; } 2317count_highmem_image_pages(struct memory_bitmap *bm) { return 0; }
2317 2318
diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c
index c347e3ce3a55..b7d6b3a721b1 100644
--- a/kernel/power/suspend.c
+++ b/kernel/power/suspend.c
@@ -37,7 +37,9 @@ const char *pm_states[PM_SUSPEND_MAX];
37static const struct platform_suspend_ops *suspend_ops; 37static const struct platform_suspend_ops *suspend_ops;
38static const struct platform_freeze_ops *freeze_ops; 38static const struct platform_freeze_ops *freeze_ops;
39static DECLARE_WAIT_QUEUE_HEAD(suspend_freeze_wait_head); 39static DECLARE_WAIT_QUEUE_HEAD(suspend_freeze_wait_head);
40static bool suspend_freeze_wake; 40
41enum freeze_state __read_mostly suspend_freeze_state;
42static DEFINE_SPINLOCK(suspend_freeze_lock);
41 43
42void freeze_set_ops(const struct platform_freeze_ops *ops) 44void freeze_set_ops(const struct platform_freeze_ops *ops)
43{ 45{
@@ -48,22 +50,49 @@ void freeze_set_ops(const struct platform_freeze_ops *ops)
48 50
49static void freeze_begin(void) 51static void freeze_begin(void)
50{ 52{
51 suspend_freeze_wake = false; 53 suspend_freeze_state = FREEZE_STATE_NONE;
52} 54}
53 55
54static void freeze_enter(void) 56static void freeze_enter(void)
55{ 57{
56 cpuidle_use_deepest_state(true); 58 spin_lock_irq(&suspend_freeze_lock);
59 if (pm_wakeup_pending())
60 goto out;
61
62 suspend_freeze_state = FREEZE_STATE_ENTER;
63 spin_unlock_irq(&suspend_freeze_lock);
64
65 get_online_cpus();
57 cpuidle_resume(); 66 cpuidle_resume();
58 wait_event(suspend_freeze_wait_head, suspend_freeze_wake); 67
68 /* Push all the CPUs into the idle loop. */
69 wake_up_all_idle_cpus();
70 pr_debug("PM: suspend-to-idle\n");
71 /* Make the current CPU wait so it can enter the idle loop too. */
72 wait_event(suspend_freeze_wait_head,
73 suspend_freeze_state == FREEZE_STATE_WAKE);
74 pr_debug("PM: resume from suspend-to-idle\n");
75
59 cpuidle_pause(); 76 cpuidle_pause();
60 cpuidle_use_deepest_state(false); 77 put_online_cpus();
78
79 spin_lock_irq(&suspend_freeze_lock);
80
81 out:
82 suspend_freeze_state = FREEZE_STATE_NONE;
83 spin_unlock_irq(&suspend_freeze_lock);
61} 84}
62 85
63void freeze_wake(void) 86void freeze_wake(void)
64{ 87{
65 suspend_freeze_wake = true; 88 unsigned long flags;
66 wake_up(&suspend_freeze_wait_head); 89
90 spin_lock_irqsave(&suspend_freeze_lock, flags);
91 if (suspend_freeze_state > FREEZE_STATE_NONE) {
92 suspend_freeze_state = FREEZE_STATE_WAKE;
93 wake_up(&suspend_freeze_wait_head);
94 }
95 spin_unlock_irqrestore(&suspend_freeze_lock, flags);
67} 96}
68EXPORT_SYMBOL_GPL(freeze_wake); 97EXPORT_SYMBOL_GPL(freeze_wake);
69 98
diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index 02d6b6d28796..01cfd69c54c6 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -935,8 +935,8 @@ static int __init ignore_loglevel_setup(char *str)
935 935
936early_param("ignore_loglevel", ignore_loglevel_setup); 936early_param("ignore_loglevel", ignore_loglevel_setup);
937module_param(ignore_loglevel, bool, S_IRUGO | S_IWUSR); 937module_param(ignore_loglevel, bool, S_IRUGO | S_IWUSR);
938MODULE_PARM_DESC(ignore_loglevel, "ignore loglevel setting, to" 938MODULE_PARM_DESC(ignore_loglevel,
939 "print all kernel messages to the console."); 939 "ignore loglevel setting (prints all kernel messages to the console)");
940 940
941#ifdef CONFIG_BOOT_PRINTK_DELAY 941#ifdef CONFIG_BOOT_PRINTK_DELAY
942 942
@@ -1419,16 +1419,16 @@ static void call_console_drivers(int level, const char *text, size_t len)
1419} 1419}
1420 1420
1421/* 1421/*
1422 * Zap console related locks when oopsing. Only zap at most once 1422 * Zap console related locks when oopsing.
1423 * every 10 seconds, to leave time for slow consoles to print a 1423 * To leave time for slow consoles to print a full oops,
1424 * full oops. 1424 * only zap at most once every 30 seconds.
1425 */ 1425 */
1426static void zap_locks(void) 1426static void zap_locks(void)
1427{ 1427{
1428 static unsigned long oops_timestamp; 1428 static unsigned long oops_timestamp;
1429 1429
1430 if (time_after_eq(jiffies, oops_timestamp) && 1430 if (time_after_eq(jiffies, oops_timestamp) &&
1431 !time_after(jiffies, oops_timestamp + 30 * HZ)) 1431 !time_after(jiffies, oops_timestamp + 30 * HZ))
1432 return; 1432 return;
1433 1433
1434 oops_timestamp = jiffies; 1434 oops_timestamp = jiffies;
@@ -1811,7 +1811,7 @@ int vprintk_default(const char *fmt, va_list args)
1811 1811
1812#ifdef CONFIG_KGDB_KDB 1812#ifdef CONFIG_KGDB_KDB
1813 if (unlikely(kdb_trap_printk)) { 1813 if (unlikely(kdb_trap_printk)) {
1814 r = vkdb_printf(fmt, args); 1814 r = vkdb_printf(KDB_MSGSRC_PRINTK, fmt, args);
1815 return r; 1815 return r;
1816 } 1816 }
1817#endif 1817#endif
diff --git a/kernel/profile.c b/kernel/profile.c
index 54bf5ba26420..a7bcd28d6e9f 100644
--- a/kernel/profile.c
+++ b/kernel/profile.c
@@ -422,8 +422,7 @@ void profile_tick(int type)
422 422
423static int prof_cpu_mask_proc_show(struct seq_file *m, void *v) 423static int prof_cpu_mask_proc_show(struct seq_file *m, void *v)
424{ 424{
425 seq_cpumask(m, prof_cpu_mask); 425 seq_printf(m, "%*pb\n", cpumask_pr_args(prof_cpu_mask));
426 seq_putc(m, '\n');
427 return 0; 426 return 0;
428} 427}
429 428
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 1eb9d90c3af9..227fec36b12a 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -1077,7 +1077,6 @@ int generic_ptrace_pokedata(struct task_struct *tsk, unsigned long addr,
1077} 1077}
1078 1078
1079#if defined CONFIG_COMPAT 1079#if defined CONFIG_COMPAT
1080#include <linux/compat.h>
1081 1080
1082int compat_ptrace_request(struct task_struct *child, compat_long_t request, 1081int compat_ptrace_request(struct task_struct *child, compat_long_t request,
1083 compat_ulong_t addr, compat_ulong_t data) 1082 compat_ulong_t addr, compat_ulong_t data)
diff --git a/kernel/rcu/Makefile b/kernel/rcu/Makefile
index e6fae503d1bc..50a808424b06 100644
--- a/kernel/rcu/Makefile
+++ b/kernel/rcu/Makefile
@@ -1,4 +1,5 @@
1obj-y += update.o srcu.o 1obj-y += update.o
2obj-$(CONFIG_SRCU) += srcu.o
2obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o 3obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o
3obj-$(CONFIG_TREE_RCU) += tree.o 4obj-$(CONFIG_TREE_RCU) += tree.o
4obj-$(CONFIG_PREEMPT_RCU) += tree.o 5obj-$(CONFIG_PREEMPT_RCU) += tree.o
diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h
index 07bb02eda844..80adef7d4c3d 100644
--- a/kernel/rcu/rcu.h
+++ b/kernel/rcu/rcu.h
@@ -137,4 +137,10 @@ int rcu_jiffies_till_stall_check(void);
137 137
138void rcu_early_boot_tests(void); 138void rcu_early_boot_tests(void);
139 139
140/*
141 * This function really isn't for public consumption, but RCU is special in
142 * that context switches can allow the state machine to make progress.
143 */
144extern void resched_cpu(int cpu);
145
140#endif /* __LINUX_RCU_H */ 146#endif /* __LINUX_RCU_H */
diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c
index 4d559baf06e0..30d42aa55d83 100644
--- a/kernel/rcu/rcutorture.c
+++ b/kernel/rcu/rcutorture.c
@@ -244,7 +244,8 @@ struct rcu_torture_ops {
244 int (*readlock)(void); 244 int (*readlock)(void);
245 void (*read_delay)(struct torture_random_state *rrsp); 245 void (*read_delay)(struct torture_random_state *rrsp);
246 void (*readunlock)(int idx); 246 void (*readunlock)(int idx);
247 int (*completed)(void); 247 unsigned long (*started)(void);
248 unsigned long (*completed)(void);
248 void (*deferred_free)(struct rcu_torture *p); 249 void (*deferred_free)(struct rcu_torture *p);
249 void (*sync)(void); 250 void (*sync)(void);
250 void (*exp_sync)(void); 251 void (*exp_sync)(void);
@@ -296,11 +297,6 @@ static void rcu_torture_read_unlock(int idx) __releases(RCU)
296 rcu_read_unlock(); 297 rcu_read_unlock();
297} 298}
298 299
299static int rcu_torture_completed(void)
300{
301 return rcu_batches_completed();
302}
303
304/* 300/*
305 * Update callback in the pipe. This should be invoked after a grace period. 301 * Update callback in the pipe. This should be invoked after a grace period.
306 */ 302 */
@@ -356,7 +352,7 @@ rcu_torture_cb(struct rcu_head *p)
356 cur_ops->deferred_free(rp); 352 cur_ops->deferred_free(rp);
357} 353}
358 354
359static int rcu_no_completed(void) 355static unsigned long rcu_no_completed(void)
360{ 356{
361 return 0; 357 return 0;
362} 358}
@@ -377,7 +373,8 @@ static struct rcu_torture_ops rcu_ops = {
377 .readlock = rcu_torture_read_lock, 373 .readlock = rcu_torture_read_lock,
378 .read_delay = rcu_read_delay, 374 .read_delay = rcu_read_delay,
379 .readunlock = rcu_torture_read_unlock, 375 .readunlock = rcu_torture_read_unlock,
380 .completed = rcu_torture_completed, 376 .started = rcu_batches_started,
377 .completed = rcu_batches_completed,
381 .deferred_free = rcu_torture_deferred_free, 378 .deferred_free = rcu_torture_deferred_free,
382 .sync = synchronize_rcu, 379 .sync = synchronize_rcu,
383 .exp_sync = synchronize_rcu_expedited, 380 .exp_sync = synchronize_rcu_expedited,
@@ -407,11 +404,6 @@ static void rcu_bh_torture_read_unlock(int idx) __releases(RCU_BH)
407 rcu_read_unlock_bh(); 404 rcu_read_unlock_bh();
408} 405}
409 406
410static int rcu_bh_torture_completed(void)
411{
412 return rcu_batches_completed_bh();
413}
414
415static void rcu_bh_torture_deferred_free(struct rcu_torture *p) 407static void rcu_bh_torture_deferred_free(struct rcu_torture *p)
416{ 408{
417 call_rcu_bh(&p->rtort_rcu, rcu_torture_cb); 409 call_rcu_bh(&p->rtort_rcu, rcu_torture_cb);
@@ -423,7 +415,8 @@ static struct rcu_torture_ops rcu_bh_ops = {
423 .readlock = rcu_bh_torture_read_lock, 415 .readlock = rcu_bh_torture_read_lock,
424 .read_delay = rcu_read_delay, /* just reuse rcu's version. */ 416 .read_delay = rcu_read_delay, /* just reuse rcu's version. */
425 .readunlock = rcu_bh_torture_read_unlock, 417 .readunlock = rcu_bh_torture_read_unlock,
426 .completed = rcu_bh_torture_completed, 418 .started = rcu_batches_started_bh,
419 .completed = rcu_batches_completed_bh,
427 .deferred_free = rcu_bh_torture_deferred_free, 420 .deferred_free = rcu_bh_torture_deferred_free,
428 .sync = synchronize_rcu_bh, 421 .sync = synchronize_rcu_bh,
429 .exp_sync = synchronize_rcu_bh_expedited, 422 .exp_sync = synchronize_rcu_bh_expedited,
@@ -466,6 +459,7 @@ static struct rcu_torture_ops rcu_busted_ops = {
466 .readlock = rcu_torture_read_lock, 459 .readlock = rcu_torture_read_lock,
467 .read_delay = rcu_read_delay, /* just reuse rcu's version. */ 460 .read_delay = rcu_read_delay, /* just reuse rcu's version. */
468 .readunlock = rcu_torture_read_unlock, 461 .readunlock = rcu_torture_read_unlock,
462 .started = rcu_no_completed,
469 .completed = rcu_no_completed, 463 .completed = rcu_no_completed,
470 .deferred_free = rcu_busted_torture_deferred_free, 464 .deferred_free = rcu_busted_torture_deferred_free,
471 .sync = synchronize_rcu_busted, 465 .sync = synchronize_rcu_busted,
@@ -510,7 +504,7 @@ static void srcu_torture_read_unlock(int idx) __releases(&srcu_ctl)
510 srcu_read_unlock(&srcu_ctl, idx); 504 srcu_read_unlock(&srcu_ctl, idx);
511} 505}
512 506
513static int srcu_torture_completed(void) 507static unsigned long srcu_torture_completed(void)
514{ 508{
515 return srcu_batches_completed(&srcu_ctl); 509 return srcu_batches_completed(&srcu_ctl);
516} 510}
@@ -564,6 +558,7 @@ static struct rcu_torture_ops srcu_ops = {
564 .readlock = srcu_torture_read_lock, 558 .readlock = srcu_torture_read_lock,
565 .read_delay = srcu_read_delay, 559 .read_delay = srcu_read_delay,
566 .readunlock = srcu_torture_read_unlock, 560 .readunlock = srcu_torture_read_unlock,
561 .started = NULL,
567 .completed = srcu_torture_completed, 562 .completed = srcu_torture_completed,
568 .deferred_free = srcu_torture_deferred_free, 563 .deferred_free = srcu_torture_deferred_free,
569 .sync = srcu_torture_synchronize, 564 .sync = srcu_torture_synchronize,
@@ -600,7 +595,8 @@ static struct rcu_torture_ops sched_ops = {
600 .readlock = sched_torture_read_lock, 595 .readlock = sched_torture_read_lock,
601 .read_delay = rcu_read_delay, /* just reuse rcu's version. */ 596 .read_delay = rcu_read_delay, /* just reuse rcu's version. */
602 .readunlock = sched_torture_read_unlock, 597 .readunlock = sched_torture_read_unlock,
603 .completed = rcu_no_completed, 598 .started = rcu_batches_started_sched,
599 .completed = rcu_batches_completed_sched,
604 .deferred_free = rcu_sched_torture_deferred_free, 600 .deferred_free = rcu_sched_torture_deferred_free,
605 .sync = synchronize_sched, 601 .sync = synchronize_sched,
606 .exp_sync = synchronize_sched_expedited, 602 .exp_sync = synchronize_sched_expedited,
@@ -638,6 +634,7 @@ static struct rcu_torture_ops tasks_ops = {
638 .readlock = tasks_torture_read_lock, 634 .readlock = tasks_torture_read_lock,
639 .read_delay = rcu_read_delay, /* just reuse rcu's version. */ 635 .read_delay = rcu_read_delay, /* just reuse rcu's version. */
640 .readunlock = tasks_torture_read_unlock, 636 .readunlock = tasks_torture_read_unlock,
637 .started = rcu_no_completed,
641 .completed = rcu_no_completed, 638 .completed = rcu_no_completed,
642 .deferred_free = rcu_tasks_torture_deferred_free, 639 .deferred_free = rcu_tasks_torture_deferred_free,
643 .sync = synchronize_rcu_tasks, 640 .sync = synchronize_rcu_tasks,
@@ -1015,8 +1012,8 @@ static void rcutorture_trace_dump(void)
1015static void rcu_torture_timer(unsigned long unused) 1012static void rcu_torture_timer(unsigned long unused)
1016{ 1013{
1017 int idx; 1014 int idx;
1018 int completed; 1015 unsigned long started;
1019 int completed_end; 1016 unsigned long completed;
1020 static DEFINE_TORTURE_RANDOM(rand); 1017 static DEFINE_TORTURE_RANDOM(rand);
1021 static DEFINE_SPINLOCK(rand_lock); 1018 static DEFINE_SPINLOCK(rand_lock);
1022 struct rcu_torture *p; 1019 struct rcu_torture *p;
@@ -1024,7 +1021,10 @@ static void rcu_torture_timer(unsigned long unused)
1024 unsigned long long ts; 1021 unsigned long long ts;
1025 1022
1026 idx = cur_ops->readlock(); 1023 idx = cur_ops->readlock();
1027 completed = cur_ops->completed(); 1024 if (cur_ops->started)
1025 started = cur_ops->started();
1026 else
1027 started = cur_ops->completed();
1028 ts = rcu_trace_clock_local(); 1028 ts = rcu_trace_clock_local();
1029 p = rcu_dereference_check(rcu_torture_current, 1029 p = rcu_dereference_check(rcu_torture_current,
1030 rcu_read_lock_bh_held() || 1030 rcu_read_lock_bh_held() ||
@@ -1047,14 +1047,16 @@ static void rcu_torture_timer(unsigned long unused)
1047 /* Should not happen, but... */ 1047 /* Should not happen, but... */
1048 pipe_count = RCU_TORTURE_PIPE_LEN; 1048 pipe_count = RCU_TORTURE_PIPE_LEN;
1049 } 1049 }
1050 completed_end = cur_ops->completed(); 1050 completed = cur_ops->completed();
1051 if (pipe_count > 1) { 1051 if (pipe_count > 1) {
1052 do_trace_rcu_torture_read(cur_ops->name, &p->rtort_rcu, ts, 1052 do_trace_rcu_torture_read(cur_ops->name, &p->rtort_rcu, ts,
1053 completed, completed_end); 1053 started, completed);
1054 rcutorture_trace_dump(); 1054 rcutorture_trace_dump();
1055 } 1055 }
1056 __this_cpu_inc(rcu_torture_count[pipe_count]); 1056 __this_cpu_inc(rcu_torture_count[pipe_count]);
1057 completed = completed_end - completed; 1057 completed = completed - started;
1058 if (cur_ops->started)
1059 completed++;
1058 if (completed > RCU_TORTURE_PIPE_LEN) { 1060 if (completed > RCU_TORTURE_PIPE_LEN) {
1059 /* Should not happen, but... */ 1061 /* Should not happen, but... */
1060 completed = RCU_TORTURE_PIPE_LEN; 1062 completed = RCU_TORTURE_PIPE_LEN;
@@ -1073,8 +1075,8 @@ static void rcu_torture_timer(unsigned long unused)
1073static int 1075static int
1074rcu_torture_reader(void *arg) 1076rcu_torture_reader(void *arg)
1075{ 1077{
1076 int completed; 1078 unsigned long started;
1077 int completed_end; 1079 unsigned long completed;
1078 int idx; 1080 int idx;
1079 DEFINE_TORTURE_RANDOM(rand); 1081 DEFINE_TORTURE_RANDOM(rand);
1080 struct rcu_torture *p; 1082 struct rcu_torture *p;
@@ -1093,7 +1095,10 @@ rcu_torture_reader(void *arg)
1093 mod_timer(&t, jiffies + 1); 1095 mod_timer(&t, jiffies + 1);
1094 } 1096 }
1095 idx = cur_ops->readlock(); 1097 idx = cur_ops->readlock();
1096 completed = cur_ops->completed(); 1098 if (cur_ops->started)
1099 started = cur_ops->started();
1100 else
1101 started = cur_ops->completed();
1097 ts = rcu_trace_clock_local(); 1102 ts = rcu_trace_clock_local();
1098 p = rcu_dereference_check(rcu_torture_current, 1103 p = rcu_dereference_check(rcu_torture_current,
1099 rcu_read_lock_bh_held() || 1104 rcu_read_lock_bh_held() ||
@@ -1114,14 +1119,16 @@ rcu_torture_reader(void *arg)
1114 /* Should not happen, but... */ 1119 /* Should not happen, but... */
1115 pipe_count = RCU_TORTURE_PIPE_LEN; 1120 pipe_count = RCU_TORTURE_PIPE_LEN;
1116 } 1121 }
1117 completed_end = cur_ops->completed(); 1122 completed = cur_ops->completed();
1118 if (pipe_count > 1) { 1123 if (pipe_count > 1) {
1119 do_trace_rcu_torture_read(cur_ops->name, &p->rtort_rcu, 1124 do_trace_rcu_torture_read(cur_ops->name, &p->rtort_rcu,
1120 ts, completed, completed_end); 1125 ts, started, completed);
1121 rcutorture_trace_dump(); 1126 rcutorture_trace_dump();
1122 } 1127 }
1123 __this_cpu_inc(rcu_torture_count[pipe_count]); 1128 __this_cpu_inc(rcu_torture_count[pipe_count]);
1124 completed = completed_end - completed; 1129 completed = completed - started;
1130 if (cur_ops->started)
1131 completed++;
1125 if (completed > RCU_TORTURE_PIPE_LEN) { 1132 if (completed > RCU_TORTURE_PIPE_LEN) {
1126 /* Should not happen, but... */ 1133 /* Should not happen, but... */
1127 completed = RCU_TORTURE_PIPE_LEN; 1134 completed = RCU_TORTURE_PIPE_LEN;
@@ -1420,6 +1427,9 @@ static int rcu_torture_barrier(void *arg)
1420 cur_ops->cb_barrier(); /* Implies smp_mb() for wait_event(). */ 1427 cur_ops->cb_barrier(); /* Implies smp_mb() for wait_event(). */
1421 if (atomic_read(&barrier_cbs_invoked) != n_barrier_cbs) { 1428 if (atomic_read(&barrier_cbs_invoked) != n_barrier_cbs) {
1422 n_rcu_torture_barrier_error++; 1429 n_rcu_torture_barrier_error++;
1430 pr_err("barrier_cbs_invoked = %d, n_barrier_cbs = %d\n",
1431 atomic_read(&barrier_cbs_invoked),
1432 n_barrier_cbs);
1423 WARN_ON_ONCE(1); 1433 WARN_ON_ONCE(1);
1424 } 1434 }
1425 n_barrier_successes++; 1435 n_barrier_successes++;
diff --git a/kernel/rcu/srcu.c b/kernel/rcu/srcu.c
index e037f3eb2f7b..445bf8ffe3fb 100644
--- a/kernel/rcu/srcu.c
+++ b/kernel/rcu/srcu.c
@@ -546,7 +546,7 @@ EXPORT_SYMBOL_GPL(srcu_barrier);
546 * Report the number of batches, correlated with, but not necessarily 546 * Report the number of batches, correlated with, but not necessarily
547 * precisely the same as, the number of grace periods that have elapsed. 547 * precisely the same as, the number of grace periods that have elapsed.
548 */ 548 */
549long srcu_batches_completed(struct srcu_struct *sp) 549unsigned long srcu_batches_completed(struct srcu_struct *sp)
550{ 550{
551 return sp->completed; 551 return sp->completed;
552} 552}
diff --git a/kernel/rcu/tiny.c b/kernel/rcu/tiny.c
index 0db5649f8817..cc9ceca7bde1 100644
--- a/kernel/rcu/tiny.c
+++ b/kernel/rcu/tiny.c
@@ -47,54 +47,14 @@ static void __call_rcu(struct rcu_head *head,
47 void (*func)(struct rcu_head *rcu), 47 void (*func)(struct rcu_head *rcu),
48 struct rcu_ctrlblk *rcp); 48 struct rcu_ctrlblk *rcp);
49 49
50static long long rcu_dynticks_nesting = DYNTICK_TASK_EXIT_IDLE;
51
52#include "tiny_plugin.h" 50#include "tiny_plugin.h"
53 51
54/* Common code for rcu_idle_enter() and rcu_irq_exit(), see kernel/rcu/tree.c. */
55static void rcu_idle_enter_common(long long newval)
56{
57 if (newval) {
58 RCU_TRACE(trace_rcu_dyntick(TPS("--="),
59 rcu_dynticks_nesting, newval));
60 rcu_dynticks_nesting = newval;
61 return;
62 }
63 RCU_TRACE(trace_rcu_dyntick(TPS("Start"),
64 rcu_dynticks_nesting, newval));
65 if (IS_ENABLED(CONFIG_RCU_TRACE) && !is_idle_task(current)) {
66 struct task_struct *idle __maybe_unused = idle_task(smp_processor_id());
67
68 RCU_TRACE(trace_rcu_dyntick(TPS("Entry error: not idle task"),
69 rcu_dynticks_nesting, newval));
70 ftrace_dump(DUMP_ALL);
71 WARN_ONCE(1, "Current pid: %d comm: %s / Idle pid: %d comm: %s",
72 current->pid, current->comm,
73 idle->pid, idle->comm); /* must be idle task! */
74 }
75 rcu_sched_qs(); /* implies rcu_bh_inc() */
76 barrier();
77 rcu_dynticks_nesting = newval;
78}
79
80/* 52/*
81 * Enter idle, which is an extended quiescent state if we have fully 53 * Enter idle, which is an extended quiescent state if we have fully
82 * entered that mode (i.e., if the new value of dynticks_nesting is zero). 54 * entered that mode.
83 */ 55 */
84void rcu_idle_enter(void) 56void rcu_idle_enter(void)
85{ 57{
86 unsigned long flags;
87 long long newval;
88
89 local_irq_save(flags);
90 WARN_ON_ONCE((rcu_dynticks_nesting & DYNTICK_TASK_NEST_MASK) == 0);
91 if ((rcu_dynticks_nesting & DYNTICK_TASK_NEST_MASK) ==
92 DYNTICK_TASK_NEST_VALUE)
93 newval = 0;
94 else
95 newval = rcu_dynticks_nesting - DYNTICK_TASK_NEST_VALUE;
96 rcu_idle_enter_common(newval);
97 local_irq_restore(flags);
98} 58}
99EXPORT_SYMBOL_GPL(rcu_idle_enter); 59EXPORT_SYMBOL_GPL(rcu_idle_enter);
100 60
@@ -103,55 +63,14 @@ EXPORT_SYMBOL_GPL(rcu_idle_enter);
103 */ 63 */
104void rcu_irq_exit(void) 64void rcu_irq_exit(void)
105{ 65{
106 unsigned long flags;
107 long long newval;
108
109 local_irq_save(flags);
110 newval = rcu_dynticks_nesting - 1;
111 WARN_ON_ONCE(newval < 0);
112 rcu_idle_enter_common(newval);
113 local_irq_restore(flags);
114} 66}
115EXPORT_SYMBOL_GPL(rcu_irq_exit); 67EXPORT_SYMBOL_GPL(rcu_irq_exit);
116 68
117/* Common code for rcu_idle_exit() and rcu_irq_enter(), see kernel/rcu/tree.c. */
118static void rcu_idle_exit_common(long long oldval)
119{
120 if (oldval) {
121 RCU_TRACE(trace_rcu_dyntick(TPS("++="),
122 oldval, rcu_dynticks_nesting));
123 return;
124 }
125 RCU_TRACE(trace_rcu_dyntick(TPS("End"), oldval, rcu_dynticks_nesting));
126 if (IS_ENABLED(CONFIG_RCU_TRACE) && !is_idle_task(current)) {
127 struct task_struct *idle __maybe_unused = idle_task(smp_processor_id());
128
129 RCU_TRACE(trace_rcu_dyntick(TPS("Exit error: not idle task"),
130 oldval, rcu_dynticks_nesting));
131 ftrace_dump(DUMP_ALL);
132 WARN_ONCE(1, "Current pid: %d comm: %s / Idle pid: %d comm: %s",
133 current->pid, current->comm,
134 idle->pid, idle->comm); /* must be idle task! */
135 }
136}
137
138/* 69/*
139 * Exit idle, so that we are no longer in an extended quiescent state. 70 * Exit idle, so that we are no longer in an extended quiescent state.
140 */ 71 */
141void rcu_idle_exit(void) 72void rcu_idle_exit(void)
142{ 73{
143 unsigned long flags;
144 long long oldval;
145
146 local_irq_save(flags);
147 oldval = rcu_dynticks_nesting;
148 WARN_ON_ONCE(rcu_dynticks_nesting < 0);
149 if (rcu_dynticks_nesting & DYNTICK_TASK_NEST_MASK)
150 rcu_dynticks_nesting += DYNTICK_TASK_NEST_VALUE;
151 else
152 rcu_dynticks_nesting = DYNTICK_TASK_EXIT_IDLE;
153 rcu_idle_exit_common(oldval);
154 local_irq_restore(flags);
155} 74}
156EXPORT_SYMBOL_GPL(rcu_idle_exit); 75EXPORT_SYMBOL_GPL(rcu_idle_exit);
157 76
@@ -160,15 +79,6 @@ EXPORT_SYMBOL_GPL(rcu_idle_exit);
160 */ 79 */
161void rcu_irq_enter(void) 80void rcu_irq_enter(void)
162{ 81{
163 unsigned long flags;
164 long long oldval;
165
166 local_irq_save(flags);
167 oldval = rcu_dynticks_nesting;
168 rcu_dynticks_nesting++;
169 WARN_ON_ONCE(rcu_dynticks_nesting == 0);
170 rcu_idle_exit_common(oldval);
171 local_irq_restore(flags);
172} 82}
173EXPORT_SYMBOL_GPL(rcu_irq_enter); 83EXPORT_SYMBOL_GPL(rcu_irq_enter);
174 84
@@ -179,23 +89,13 @@ EXPORT_SYMBOL_GPL(rcu_irq_enter);
179 */ 89 */
180bool notrace __rcu_is_watching(void) 90bool notrace __rcu_is_watching(void)
181{ 91{
182 return rcu_dynticks_nesting; 92 return true;
183} 93}
184EXPORT_SYMBOL(__rcu_is_watching); 94EXPORT_SYMBOL(__rcu_is_watching);
185 95
186#endif /* defined(CONFIG_DEBUG_LOCK_ALLOC) || defined(CONFIG_RCU_TRACE) */ 96#endif /* defined(CONFIG_DEBUG_LOCK_ALLOC) || defined(CONFIG_RCU_TRACE) */
187 97
188/* 98/*
189 * Test whether the current CPU was interrupted from idle. Nested
190 * interrupts don't count, we must be running at the first interrupt
191 * level.
192 */
193static int rcu_is_cpu_rrupt_from_idle(void)
194{
195 return rcu_dynticks_nesting <= 1;
196}
197
198/*
199 * Helper function for rcu_sched_qs() and rcu_bh_qs(). 99 * Helper function for rcu_sched_qs() and rcu_bh_qs().
200 * Also irqs are disabled to avoid confusion due to interrupt handlers 100 * Also irqs are disabled to avoid confusion due to interrupt handlers
201 * invoking call_rcu(). 101 * invoking call_rcu().
@@ -250,7 +150,7 @@ void rcu_bh_qs(void)
250void rcu_check_callbacks(int user) 150void rcu_check_callbacks(int user)
251{ 151{
252 RCU_TRACE(check_cpu_stalls()); 152 RCU_TRACE(check_cpu_stalls());
253 if (user || rcu_is_cpu_rrupt_from_idle()) 153 if (user)
254 rcu_sched_qs(); 154 rcu_sched_qs();
255 else if (!in_softirq()) 155 else if (!in_softirq())
256 rcu_bh_qs(); 156 rcu_bh_qs();
@@ -357,6 +257,11 @@ static void __call_rcu(struct rcu_head *head,
357 rcp->curtail = &head->next; 257 rcp->curtail = &head->next;
358 RCU_TRACE(rcp->qlen++); 258 RCU_TRACE(rcp->qlen++);
359 local_irq_restore(flags); 259 local_irq_restore(flags);
260
261 if (unlikely(is_idle_task(current))) {
262 /* force scheduling for rcu_sched_qs() */
263 resched_cpu(0);
264 }
360} 265}
361 266
362/* 267/*
@@ -383,6 +288,8 @@ EXPORT_SYMBOL_GPL(call_rcu_bh);
383void __init rcu_init(void) 288void __init rcu_init(void)
384{ 289{
385 open_softirq(RCU_SOFTIRQ, rcu_process_callbacks); 290 open_softirq(RCU_SOFTIRQ, rcu_process_callbacks);
291 RCU_TRACE(reset_cpu_stall_ticks(&rcu_sched_ctrlblk));
292 RCU_TRACE(reset_cpu_stall_ticks(&rcu_bh_ctrlblk));
386 293
387 rcu_early_boot_tests(); 294 rcu_early_boot_tests();
388} 295}
diff --git a/kernel/rcu/tiny_plugin.h b/kernel/rcu/tiny_plugin.h
index 858c56569127..f94e209a10d6 100644
--- a/kernel/rcu/tiny_plugin.h
+++ b/kernel/rcu/tiny_plugin.h
@@ -145,17 +145,16 @@ static void check_cpu_stall(struct rcu_ctrlblk *rcp)
145 rcp->ticks_this_gp++; 145 rcp->ticks_this_gp++;
146 j = jiffies; 146 j = jiffies;
147 js = ACCESS_ONCE(rcp->jiffies_stall); 147 js = ACCESS_ONCE(rcp->jiffies_stall);
148 if (*rcp->curtail && ULONG_CMP_GE(j, js)) { 148 if (rcp->rcucblist && ULONG_CMP_GE(j, js)) {
149 pr_err("INFO: %s stall on CPU (%lu ticks this GP) idle=%llx (t=%lu jiffies q=%ld)\n", 149 pr_err("INFO: %s stall on CPU (%lu ticks this GP) idle=%llx (t=%lu jiffies q=%ld)\n",
150 rcp->name, rcp->ticks_this_gp, rcu_dynticks_nesting, 150 rcp->name, rcp->ticks_this_gp, DYNTICK_TASK_EXIT_IDLE,
151 jiffies - rcp->gp_start, rcp->qlen); 151 jiffies - rcp->gp_start, rcp->qlen);
152 dump_stack(); 152 dump_stack();
153 }
154 if (*rcp->curtail && ULONG_CMP_GE(j, js))
155 ACCESS_ONCE(rcp->jiffies_stall) = jiffies + 153 ACCESS_ONCE(rcp->jiffies_stall) = jiffies +
156 3 * rcu_jiffies_till_stall_check() + 3; 154 3 * rcu_jiffies_till_stall_check() + 3;
157 else if (ULONG_CMP_GE(j, js)) 155 } else if (ULONG_CMP_GE(j, js)) {
158 ACCESS_ONCE(rcp->jiffies_stall) = jiffies + rcu_jiffies_till_stall_check(); 156 ACCESS_ONCE(rcp->jiffies_stall) = jiffies + rcu_jiffies_till_stall_check();
157 }
159} 158}
160 159
161static void reset_cpu_stall_ticks(struct rcu_ctrlblk *rcp) 160static void reset_cpu_stall_ticks(struct rcu_ctrlblk *rcp)
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 4c106fcc0d54..48d640ca1a05 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -156,6 +156,10 @@ static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu)
156static void invoke_rcu_core(void); 156static void invoke_rcu_core(void);
157static void invoke_rcu_callbacks(struct rcu_state *rsp, struct rcu_data *rdp); 157static void invoke_rcu_callbacks(struct rcu_state *rsp, struct rcu_data *rdp);
158 158
159/* rcuc/rcub kthread realtime priority */
160static int kthread_prio = CONFIG_RCU_KTHREAD_PRIO;
161module_param(kthread_prio, int, 0644);
162
159/* 163/*
160 * Track the rcutorture test sequence number and the update version 164 * Track the rcutorture test sequence number and the update version
161 * number within a given test. The rcutorture_testseq is incremented 165 * number within a given test. The rcutorture_testseq is incremented
@@ -215,6 +219,9 @@ static DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks) = {
215#endif /* #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */ 219#endif /* #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */
216}; 220};
217 221
222DEFINE_PER_CPU_SHARED_ALIGNED(unsigned long, rcu_qs_ctr);
223EXPORT_PER_CPU_SYMBOL_GPL(rcu_qs_ctr);
224
218/* 225/*
219 * Let the RCU core know that this CPU has gone through the scheduler, 226 * Let the RCU core know that this CPU has gone through the scheduler,
220 * which is a quiescent state. This is called when the need for a 227 * which is a quiescent state. This is called when the need for a
@@ -284,6 +291,22 @@ void rcu_note_context_switch(void)
284} 291}
285EXPORT_SYMBOL_GPL(rcu_note_context_switch); 292EXPORT_SYMBOL_GPL(rcu_note_context_switch);
286 293
294/*
295 * Register a quiescent state for all RCU flavors. If there is an
296 * emergency, invoke rcu_momentary_dyntick_idle() to do a heavy-weight
297 * dyntick-idle quiescent state visible to other CPUs (but only for those
298 * RCU flavors in desperate need of a quiescent state, which will normally
299 * be none of them). Either way, do a lightweight quiescent state for
300 * all RCU flavors.
301 */
302void rcu_all_qs(void)
303{
304 if (unlikely(raw_cpu_read(rcu_sched_qs_mask)))
305 rcu_momentary_dyntick_idle();
306 this_cpu_inc(rcu_qs_ctr);
307}
308EXPORT_SYMBOL_GPL(rcu_all_qs);
309
287static long blimit = 10; /* Maximum callbacks per rcu_do_batch. */ 310static long blimit = 10; /* Maximum callbacks per rcu_do_batch. */
288static long qhimark = 10000; /* If this many pending, ignore blimit. */ 311static long qhimark = 10000; /* If this many pending, ignore blimit. */
289static long qlowmark = 100; /* Once only this many pending, use blimit. */ 312static long qlowmark = 100; /* Once only this many pending, use blimit. */
@@ -315,18 +338,54 @@ static void force_quiescent_state(struct rcu_state *rsp);
315static int rcu_pending(void); 338static int rcu_pending(void);
316 339
317/* 340/*
318 * Return the number of RCU-sched batches processed thus far for debug & stats. 341 * Return the number of RCU batches started thus far for debug & stats.
342 */
343unsigned long rcu_batches_started(void)
344{
345 return rcu_state_p->gpnum;
346}
347EXPORT_SYMBOL_GPL(rcu_batches_started);
348
349/*
350 * Return the number of RCU-sched batches started thus far for debug & stats.
319 */ 351 */
320long rcu_batches_completed_sched(void) 352unsigned long rcu_batches_started_sched(void)
353{
354 return rcu_sched_state.gpnum;
355}
356EXPORT_SYMBOL_GPL(rcu_batches_started_sched);
357
358/*
359 * Return the number of RCU BH batches started thus far for debug & stats.
360 */
361unsigned long rcu_batches_started_bh(void)
362{
363 return rcu_bh_state.gpnum;
364}
365EXPORT_SYMBOL_GPL(rcu_batches_started_bh);
366
367/*
368 * Return the number of RCU batches completed thus far for debug & stats.
369 */
370unsigned long rcu_batches_completed(void)
371{
372 return rcu_state_p->completed;
373}
374EXPORT_SYMBOL_GPL(rcu_batches_completed);
375
376/*
377 * Return the number of RCU-sched batches completed thus far for debug & stats.
378 */
379unsigned long rcu_batches_completed_sched(void)
321{ 380{
322 return rcu_sched_state.completed; 381 return rcu_sched_state.completed;
323} 382}
324EXPORT_SYMBOL_GPL(rcu_batches_completed_sched); 383EXPORT_SYMBOL_GPL(rcu_batches_completed_sched);
325 384
326/* 385/*
327 * Return the number of RCU BH batches processed thus far for debug & stats. 386 * Return the number of RCU BH batches completed thus far for debug & stats.
328 */ 387 */
329long rcu_batches_completed_bh(void) 388unsigned long rcu_batches_completed_bh(void)
330{ 389{
331 return rcu_bh_state.completed; 390 return rcu_bh_state.completed;
332} 391}
@@ -930,17 +989,14 @@ static int dyntick_save_progress_counter(struct rcu_data *rdp,
930 trace_rcu_fqs(rdp->rsp->name, rdp->gpnum, rdp->cpu, TPS("dti")); 989 trace_rcu_fqs(rdp->rsp->name, rdp->gpnum, rdp->cpu, TPS("dti"));
931 return 1; 990 return 1;
932 } else { 991 } else {
992 if (ULONG_CMP_LT(ACCESS_ONCE(rdp->gpnum) + ULONG_MAX / 4,
993 rdp->mynode->gpnum))
994 ACCESS_ONCE(rdp->gpwrap) = true;
933 return 0; 995 return 0;
934 } 996 }
935} 997}
936 998
937/* 999/*
938 * This function really isn't for public consumption, but RCU is special in
939 * that context switches can allow the state machine to make progress.
940 */
941extern void resched_cpu(int cpu);
942
943/*
944 * Return true if the specified CPU has passed through a quiescent 1000 * Return true if the specified CPU has passed through a quiescent
945 * state by virtue of being in or having passed through a dynticks 1001 * state by virtue of being in or having passed through a dynticks
946 * idle state since the last call to dyntick_save_progress_counter() 1002 * idle state since the last call to dyntick_save_progress_counter()
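The ->gpwrap hunk above decides that a CPU's stale ->gpnum snapshot has been lapped by comparing it against the node's current gpnum with ULONG_CMP_LT(), a wraparound-safe modular comparison, offset by a quarter of the counter space. A small userspace sketch of why the modular form is used rather than a plain '<'; the macro body is written out here to match my reading of the rcupdate.h definition, so treat it as an assumption.

#include <limits.h>
#include <stdio.h>

/* Modular "a < b": true when b is ahead of a by no more than half the space. */
#define ULONG_CMP_LT(a, b)	(ULONG_MAX / 2 < (a) - (b))

int main(void)
{
	unsigned long old_gpnum = ULONG_MAX - 2;	/* snapshot taken long ago */
	unsigned long cur_gpnum = 5;			/* counter has since wrapped */

	/* A plain "<" is fooled once the counter wraps around zero... */
	printf("plain  <  : %d\n", old_gpnum < cur_gpnum);		/* 0 */
	/* ...while the modular compare still sees the snapshot as older. */
	printf("ULONG_CMP : %d\n", ULONG_CMP_LT(old_gpnum, cur_gpnum));	/* 1 */
	/* The hunk sets ->gpwrap only when the snapshot trails by more than
	 * ULONG_MAX / 4; an 8-grace-period gap is nowhere near a lap. */
	printf("lapped    : %d\n",
	       ULONG_CMP_LT(old_gpnum + ULONG_MAX / 4, cur_gpnum));	/* 0 */
	return 0;
}

The complementary ULONG_CMP_GE() form appears in the tiny_plugin.h stall-check hunk earlier in this diff.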
@@ -1043,6 +1099,22 @@ static void record_gp_stall_check_time(struct rcu_state *rsp)
1043 j1 = rcu_jiffies_till_stall_check(); 1099 j1 = rcu_jiffies_till_stall_check();
1044 ACCESS_ONCE(rsp->jiffies_stall) = j + j1; 1100 ACCESS_ONCE(rsp->jiffies_stall) = j + j1;
1045 rsp->jiffies_resched = j + j1 / 2; 1101 rsp->jiffies_resched = j + j1 / 2;
1102 rsp->n_force_qs_gpstart = ACCESS_ONCE(rsp->n_force_qs);
1103}
1104
1105/*
1106 * Complain about starvation of grace-period kthread.
1107 */
1108static void rcu_check_gp_kthread_starvation(struct rcu_state *rsp)
1109{
1110 unsigned long gpa;
1111 unsigned long j;
1112
1113 j = jiffies;
1114 gpa = ACCESS_ONCE(rsp->gp_activity);
1115 if (j - gpa > 2 * HZ)
1116 pr_err("%s kthread starved for %ld jiffies!\n",
1117 rsp->name, j - gpa);
1046} 1118}
1047 1119
1048/* 1120/*
@@ -1065,11 +1137,13 @@ static void rcu_dump_cpu_stacks(struct rcu_state *rsp)
1065 } 1137 }
1066} 1138}
1067 1139
1068static void print_other_cpu_stall(struct rcu_state *rsp) 1140static void print_other_cpu_stall(struct rcu_state *rsp, unsigned long gpnum)
1069{ 1141{
1070 int cpu; 1142 int cpu;
1071 long delta; 1143 long delta;
1072 unsigned long flags; 1144 unsigned long flags;
1145 unsigned long gpa;
1146 unsigned long j;
1073 int ndetected = 0; 1147 int ndetected = 0;
1074 struct rcu_node *rnp = rcu_get_root(rsp); 1148 struct rcu_node *rnp = rcu_get_root(rsp);
1075 long totqlen = 0; 1149 long totqlen = 0;
@@ -1107,30 +1181,34 @@ static void print_other_cpu_stall(struct rcu_state *rsp)
1107 raw_spin_unlock_irqrestore(&rnp->lock, flags); 1181 raw_spin_unlock_irqrestore(&rnp->lock, flags);
1108 } 1182 }
1109 1183
1110 /*
1111 * Now rat on any tasks that got kicked up to the root rcu_node
1112 * due to CPU offlining.
1113 */
1114 rnp = rcu_get_root(rsp);
1115 raw_spin_lock_irqsave(&rnp->lock, flags);
1116 ndetected += rcu_print_task_stall(rnp);
1117 raw_spin_unlock_irqrestore(&rnp->lock, flags);
1118
1119 print_cpu_stall_info_end(); 1184 print_cpu_stall_info_end();
1120 for_each_possible_cpu(cpu) 1185 for_each_possible_cpu(cpu)
1121 totqlen += per_cpu_ptr(rsp->rda, cpu)->qlen; 1186 totqlen += per_cpu_ptr(rsp->rda, cpu)->qlen;
1122 pr_cont("(detected by %d, t=%ld jiffies, g=%ld, c=%ld, q=%lu)\n", 1187 pr_cont("(detected by %d, t=%ld jiffies, g=%ld, c=%ld, q=%lu)\n",
1123 smp_processor_id(), (long)(jiffies - rsp->gp_start), 1188 smp_processor_id(), (long)(jiffies - rsp->gp_start),
1124 (long)rsp->gpnum, (long)rsp->completed, totqlen); 1189 (long)rsp->gpnum, (long)rsp->completed, totqlen);
1125 if (ndetected == 0) 1190 if (ndetected) {
1126 pr_err("INFO: Stall ended before state dump start\n");
1127 else
1128 rcu_dump_cpu_stacks(rsp); 1191 rcu_dump_cpu_stacks(rsp);
1192 } else {
1193 if (ACCESS_ONCE(rsp->gpnum) != gpnum ||
1194 ACCESS_ONCE(rsp->completed) == gpnum) {
1195 pr_err("INFO: Stall ended before state dump start\n");
1196 } else {
1197 j = jiffies;
1198 gpa = ACCESS_ONCE(rsp->gp_activity);
1199 pr_err("All QSes seen, last %s kthread activity %ld (%ld-%ld), jiffies_till_next_fqs=%ld\n",
1200 rsp->name, j - gpa, j, gpa,
1201 jiffies_till_next_fqs);
1202 /* In this case, the current CPU might be at fault. */
1203 sched_show_task(current);
1204 }
1205 }
1129 1206
1130 /* Complain about tasks blocking the grace period. */ 1207 /* Complain about tasks blocking the grace period. */
1131
1132 rcu_print_detail_task_stall(rsp); 1208 rcu_print_detail_task_stall(rsp);
1133 1209
1210 rcu_check_gp_kthread_starvation(rsp);
1211
1134 force_quiescent_state(rsp); /* Kick them all. */ 1212 force_quiescent_state(rsp); /* Kick them all. */
1135} 1213}
1136 1214
@@ -1155,6 +1233,9 @@ static void print_cpu_stall(struct rcu_state *rsp)
1155 pr_cont(" (t=%lu jiffies g=%ld c=%ld q=%lu)\n", 1233 pr_cont(" (t=%lu jiffies g=%ld c=%ld q=%lu)\n",
1156 jiffies - rsp->gp_start, 1234 jiffies - rsp->gp_start,
1157 (long)rsp->gpnum, (long)rsp->completed, totqlen); 1235 (long)rsp->gpnum, (long)rsp->completed, totqlen);
1236
1237 rcu_check_gp_kthread_starvation(rsp);
1238
1158 rcu_dump_cpu_stacks(rsp); 1239 rcu_dump_cpu_stacks(rsp);
1159 1240
1160 raw_spin_lock_irqsave(&rnp->lock, flags); 1241 raw_spin_lock_irqsave(&rnp->lock, flags);
@@ -1225,7 +1306,7 @@ static void check_cpu_stall(struct rcu_state *rsp, struct rcu_data *rdp)
1225 ULONG_CMP_GE(j, js + RCU_STALL_RAT_DELAY)) { 1306 ULONG_CMP_GE(j, js + RCU_STALL_RAT_DELAY)) {
1226 1307
1227 /* They had a few time units to dump stack, so complain. */ 1308 /* They had a few time units to dump stack, so complain. */
1228 print_other_cpu_stall(rsp); 1309 print_other_cpu_stall(rsp, gpnum);
1229 } 1310 }
1230} 1311}
1231 1312
@@ -1562,7 +1643,8 @@ static bool __note_gp_changes(struct rcu_state *rsp, struct rcu_node *rnp,
1562 bool ret; 1643 bool ret;
1563 1644
1564 /* Handle the ends of any preceding grace periods first. */ 1645 /* Handle the ends of any preceding grace periods first. */
1565 if (rdp->completed == rnp->completed) { 1646 if (rdp->completed == rnp->completed &&
1647 !unlikely(ACCESS_ONCE(rdp->gpwrap))) {
1566 1648
1567 /* No grace period end, so just accelerate recent callbacks. */ 1649 /* No grace period end, so just accelerate recent callbacks. */
1568 ret = rcu_accelerate_cbs(rsp, rnp, rdp); 1650 ret = rcu_accelerate_cbs(rsp, rnp, rdp);
@@ -1577,7 +1659,7 @@ static bool __note_gp_changes(struct rcu_state *rsp, struct rcu_node *rnp,
1577 trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("cpuend")); 1659 trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("cpuend"));
1578 } 1660 }
1579 1661
1580 if (rdp->gpnum != rnp->gpnum) { 1662 if (rdp->gpnum != rnp->gpnum || unlikely(ACCESS_ONCE(rdp->gpwrap))) {
1581 /* 1663 /*
1582 * If the current grace period is waiting for this CPU, 1664 * If the current grace period is waiting for this CPU,
1583 * set up to detect a quiescent state, otherwise don't 1665 * set up to detect a quiescent state, otherwise don't
@@ -1586,8 +1668,10 @@ static bool __note_gp_changes(struct rcu_state *rsp, struct rcu_node *rnp,
1586 rdp->gpnum = rnp->gpnum; 1668 rdp->gpnum = rnp->gpnum;
1587 trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("cpustart")); 1669 trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("cpustart"));
1588 rdp->passed_quiesce = 0; 1670 rdp->passed_quiesce = 0;
1671 rdp->rcu_qs_ctr_snap = __this_cpu_read(rcu_qs_ctr);
1589 rdp->qs_pending = !!(rnp->qsmask & rdp->grpmask); 1672 rdp->qs_pending = !!(rnp->qsmask & rdp->grpmask);
1590 zero_cpu_stall_ticks(rdp); 1673 zero_cpu_stall_ticks(rdp);
1674 ACCESS_ONCE(rdp->gpwrap) = false;
1591 } 1675 }
1592 return ret; 1676 return ret;
1593} 1677}
@@ -1601,7 +1685,8 @@ static void note_gp_changes(struct rcu_state *rsp, struct rcu_data *rdp)
1601 local_irq_save(flags); 1685 local_irq_save(flags);
1602 rnp = rdp->mynode; 1686 rnp = rdp->mynode;
1603 if ((rdp->gpnum == ACCESS_ONCE(rnp->gpnum) && 1687 if ((rdp->gpnum == ACCESS_ONCE(rnp->gpnum) &&
1604 rdp->completed == ACCESS_ONCE(rnp->completed)) || /* w/out lock. */ 1688 rdp->completed == ACCESS_ONCE(rnp->completed) &&
1689 !unlikely(ACCESS_ONCE(rdp->gpwrap))) || /* w/out lock. */
1605 !raw_spin_trylock(&rnp->lock)) { /* irqs already off, so later. */ 1690 !raw_spin_trylock(&rnp->lock)) { /* irqs already off, so later. */
1606 local_irq_restore(flags); 1691 local_irq_restore(flags);
1607 return; 1692 return;
@@ -1621,6 +1706,7 @@ static int rcu_gp_init(struct rcu_state *rsp)
1621 struct rcu_data *rdp; 1706 struct rcu_data *rdp;
1622 struct rcu_node *rnp = rcu_get_root(rsp); 1707 struct rcu_node *rnp = rcu_get_root(rsp);
1623 1708
1709 ACCESS_ONCE(rsp->gp_activity) = jiffies;
1624 rcu_bind_gp_kthread(); 1710 rcu_bind_gp_kthread();
1625 raw_spin_lock_irq(&rnp->lock); 1711 raw_spin_lock_irq(&rnp->lock);
1626 smp_mb__after_unlock_lock(); 1712 smp_mb__after_unlock_lock();
@@ -1681,6 +1767,7 @@ static int rcu_gp_init(struct rcu_state *rsp)
1681 rnp->grphi, rnp->qsmask); 1767 rnp->grphi, rnp->qsmask);
1682 raw_spin_unlock_irq(&rnp->lock); 1768 raw_spin_unlock_irq(&rnp->lock);
1683 cond_resched_rcu_qs(); 1769 cond_resched_rcu_qs();
1770 ACCESS_ONCE(rsp->gp_activity) = jiffies;
1684 } 1771 }
1685 1772
1686 mutex_unlock(&rsp->onoff_mutex); 1773 mutex_unlock(&rsp->onoff_mutex);
@@ -1697,6 +1784,7 @@ static int rcu_gp_fqs(struct rcu_state *rsp, int fqs_state_in)
1697 unsigned long maxj; 1784 unsigned long maxj;
1698 struct rcu_node *rnp = rcu_get_root(rsp); 1785 struct rcu_node *rnp = rcu_get_root(rsp);
1699 1786
1787 ACCESS_ONCE(rsp->gp_activity) = jiffies;
1700 rsp->n_force_qs++; 1788 rsp->n_force_qs++;
1701 if (fqs_state == RCU_SAVE_DYNTICK) { 1789 if (fqs_state == RCU_SAVE_DYNTICK) {
1702 /* Collect dyntick-idle snapshots. */ 1790 /* Collect dyntick-idle snapshots. */
@@ -1735,6 +1823,7 @@ static void rcu_gp_cleanup(struct rcu_state *rsp)
1735 struct rcu_data *rdp; 1823 struct rcu_data *rdp;
1736 struct rcu_node *rnp = rcu_get_root(rsp); 1824 struct rcu_node *rnp = rcu_get_root(rsp);
1737 1825
1826 ACCESS_ONCE(rsp->gp_activity) = jiffies;
1738 raw_spin_lock_irq(&rnp->lock); 1827 raw_spin_lock_irq(&rnp->lock);
1739 smp_mb__after_unlock_lock(); 1828 smp_mb__after_unlock_lock();
1740 gp_duration = jiffies - rsp->gp_start; 1829 gp_duration = jiffies - rsp->gp_start;
@@ -1771,6 +1860,7 @@ static void rcu_gp_cleanup(struct rcu_state *rsp)
1771 nocb += rcu_future_gp_cleanup(rsp, rnp); 1860 nocb += rcu_future_gp_cleanup(rsp, rnp);
1772 raw_spin_unlock_irq(&rnp->lock); 1861 raw_spin_unlock_irq(&rnp->lock);
1773 cond_resched_rcu_qs(); 1862 cond_resched_rcu_qs();
1863 ACCESS_ONCE(rsp->gp_activity) = jiffies;
1774 } 1864 }
1775 rnp = rcu_get_root(rsp); 1865 rnp = rcu_get_root(rsp);
1776 raw_spin_lock_irq(&rnp->lock); 1866 raw_spin_lock_irq(&rnp->lock);
@@ -1820,6 +1910,7 @@ static int __noreturn rcu_gp_kthread(void *arg)
1820 if (rcu_gp_init(rsp)) 1910 if (rcu_gp_init(rsp))
1821 break; 1911 break;
1822 cond_resched_rcu_qs(); 1912 cond_resched_rcu_qs();
1913 ACCESS_ONCE(rsp->gp_activity) = jiffies;
1823 WARN_ON(signal_pending(current)); 1914 WARN_ON(signal_pending(current));
1824 trace_rcu_grace_period(rsp->name, 1915 trace_rcu_grace_period(rsp->name,
1825 ACCESS_ONCE(rsp->gpnum), 1916 ACCESS_ONCE(rsp->gpnum),
@@ -1863,9 +1954,11 @@ static int __noreturn rcu_gp_kthread(void *arg)
1863 ACCESS_ONCE(rsp->gpnum), 1954 ACCESS_ONCE(rsp->gpnum),
1864 TPS("fqsend")); 1955 TPS("fqsend"));
1865 cond_resched_rcu_qs(); 1956 cond_resched_rcu_qs();
1957 ACCESS_ONCE(rsp->gp_activity) = jiffies;
1866 } else { 1958 } else {
1867 /* Deal with stray signal. */ 1959 /* Deal with stray signal. */
1868 cond_resched_rcu_qs(); 1960 cond_resched_rcu_qs();
1961 ACCESS_ONCE(rsp->gp_activity) = jiffies;
1869 WARN_ON(signal_pending(current)); 1962 WARN_ON(signal_pending(current));
1870 trace_rcu_grace_period(rsp->name, 1963 trace_rcu_grace_period(rsp->name,
1871 ACCESS_ONCE(rsp->gpnum), 1964 ACCESS_ONCE(rsp->gpnum),
@@ -2042,8 +2135,10 @@ rcu_report_qs_rdp(int cpu, struct rcu_state *rsp, struct rcu_data *rdp)
2042 rnp = rdp->mynode; 2135 rnp = rdp->mynode;
2043 raw_spin_lock_irqsave(&rnp->lock, flags); 2136 raw_spin_lock_irqsave(&rnp->lock, flags);
2044 smp_mb__after_unlock_lock(); 2137 smp_mb__after_unlock_lock();
2045 if (rdp->passed_quiesce == 0 || rdp->gpnum != rnp->gpnum || 2138 if ((rdp->passed_quiesce == 0 &&
2046 rnp->completed == rnp->gpnum) { 2139 rdp->rcu_qs_ctr_snap == __this_cpu_read(rcu_qs_ctr)) ||
2140 rdp->gpnum != rnp->gpnum || rnp->completed == rnp->gpnum ||
2141 rdp->gpwrap) {
2047 2142
2048 /* 2143 /*
2049 * The grace period in which this quiescent state was 2144 * The grace period in which this quiescent state was
@@ -2052,6 +2147,7 @@ rcu_report_qs_rdp(int cpu, struct rcu_state *rsp, struct rcu_data *rdp)
2052 * within the current grace period. 2147 * within the current grace period.
2053 */ 2148 */
2054 rdp->passed_quiesce = 0; /* need qs for new gp. */ 2149 rdp->passed_quiesce = 0; /* need qs for new gp. */
2150 rdp->rcu_qs_ctr_snap = __this_cpu_read(rcu_qs_ctr);
2055 raw_spin_unlock_irqrestore(&rnp->lock, flags); 2151 raw_spin_unlock_irqrestore(&rnp->lock, flags);
2056 return; 2152 return;
2057 } 2153 }
@@ -2096,7 +2192,8 @@ rcu_check_quiescent_state(struct rcu_state *rsp, struct rcu_data *rdp)
2096 * Was there a quiescent state since the beginning of the grace 2192 * Was there a quiescent state since the beginning of the grace
2097 * period? If no, then exit and wait for the next call. 2193 * period? If no, then exit and wait for the next call.
2098 */ 2194 */
2099 if (!rdp->passed_quiesce) 2195 if (!rdp->passed_quiesce &&
2196 rdp->rcu_qs_ctr_snap == __this_cpu_read(rcu_qs_ctr))
2100 return; 2197 return;
2101 2198
2102 /* 2199 /*
@@ -2227,6 +2324,46 @@ static void rcu_cleanup_dying_cpu(struct rcu_state *rsp)
2227} 2324}
2228 2325
2229/* 2326/*
2327 * All CPUs for the specified rcu_node structure have gone offline,
2328 * and all tasks that were preempted within an RCU read-side critical
2329 * section while running on one of those CPUs have since exited their RCU
2330 * read-side critical section. Some other CPU is reporting this fact with
2331 * the specified rcu_node structure's ->lock held and interrupts disabled.
2332 * This function therefore goes up the tree of rcu_node structures,
2333 * clearing the corresponding bits in the ->qsmaskinit fields. Note that
2334 * the leaf rcu_node structure's ->qsmaskinit field has already been
 2335 * updated.
2336 *
2337 * This function does check that the specified rcu_node structure has
2338 * all CPUs offline and no blocked tasks, so it is OK to invoke it
2339 * prematurely. That said, invoking it after the fact will cost you
2340 * a needless lock acquisition. So once it has done its work, don't
2341 * invoke it again.
2342 */
2343static void rcu_cleanup_dead_rnp(struct rcu_node *rnp_leaf)
2344{
2345 long mask;
2346 struct rcu_node *rnp = rnp_leaf;
2347
2348 if (rnp->qsmaskinit || rcu_preempt_has_tasks(rnp))
2349 return;
2350 for (;;) {
2351 mask = rnp->grpmask;
2352 rnp = rnp->parent;
2353 if (!rnp)
2354 break;
2355 raw_spin_lock(&rnp->lock); /* irqs already disabled. */
2356 smp_mb__after_unlock_lock(); /* GP memory ordering. */
2357 rnp->qsmaskinit &= ~mask;
2358 if (rnp->qsmaskinit) {
2359 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
2360 return;
2361 }
2362 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
2363 }
2364}
2365
2366/*
2230 * The CPU has been completely removed, and some other CPU is reporting 2367 * The CPU has been completely removed, and some other CPU is reporting
2231 * this fact from process context. Do the remainder of the cleanup, 2368 * this fact from process context. Do the remainder of the cleanup,
2232 * including orphaning the outgoing CPU's RCU callbacks, and also 2369 * including orphaning the outgoing CPU's RCU callbacks, and also
@@ -2236,8 +2373,6 @@ static void rcu_cleanup_dying_cpu(struct rcu_state *rsp)
2236static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp) 2373static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp)
2237{ 2374{
2238 unsigned long flags; 2375 unsigned long flags;
2239 unsigned long mask;
2240 int need_report = 0;
2241 struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); 2376 struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu);
2242 struct rcu_node *rnp = rdp->mynode; /* Outgoing CPU's rdp & rnp. */ 2377 struct rcu_node *rnp = rdp->mynode; /* Outgoing CPU's rdp & rnp. */
2243 2378
@@ -2251,40 +2386,15 @@ static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp)
2251 /* Orphan the dead CPU's callbacks, and adopt them if appropriate. */ 2386 /* Orphan the dead CPU's callbacks, and adopt them if appropriate. */
2252 rcu_send_cbs_to_orphanage(cpu, rsp, rnp, rdp); 2387 rcu_send_cbs_to_orphanage(cpu, rsp, rnp, rdp);
2253 rcu_adopt_orphan_cbs(rsp, flags); 2388 rcu_adopt_orphan_cbs(rsp, flags);
2389 raw_spin_unlock_irqrestore(&rsp->orphan_lock, flags);
2254 2390
2255 /* Remove the outgoing CPU from the masks in the rcu_node hierarchy. */ 2391 /* Remove outgoing CPU from mask in the leaf rcu_node structure. */
2256 mask = rdp->grpmask; /* rnp->grplo is constant. */ 2392 raw_spin_lock_irqsave(&rnp->lock, flags);
2257 do { 2393 smp_mb__after_unlock_lock(); /* Enforce GP memory-order guarantee. */
2258 raw_spin_lock(&rnp->lock); /* irqs already disabled. */ 2394 rnp->qsmaskinit &= ~rdp->grpmask;
2259 smp_mb__after_unlock_lock(); 2395 if (rnp->qsmaskinit == 0 && !rcu_preempt_has_tasks(rnp))
2260 rnp->qsmaskinit &= ~mask; 2396 rcu_cleanup_dead_rnp(rnp);
2261 if (rnp->qsmaskinit != 0) { 2397 rcu_report_qs_rnp(rdp->grpmask, rsp, rnp, flags); /* Rlses rnp->lock. */
2262 if (rnp != rdp->mynode)
2263 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
2264 break;
2265 }
2266 if (rnp == rdp->mynode)
2267 need_report = rcu_preempt_offline_tasks(rsp, rnp, rdp);
2268 else
2269 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
2270 mask = rnp->grpmask;
2271 rnp = rnp->parent;
2272 } while (rnp != NULL);
2273
2274 /*
2275 * We still hold the leaf rcu_node structure lock here, and
2276 * irqs are still disabled. The reason for this subterfuge is
2277 * because invoking rcu_report_unblock_qs_rnp() with ->orphan_lock
2278 * held leads to deadlock.
2279 */
2280 raw_spin_unlock(&rsp->orphan_lock); /* irqs remain disabled. */
2281 rnp = rdp->mynode;
2282 if (need_report & RCU_OFL_TASKS_NORM_GP)
2283 rcu_report_unblock_qs_rnp(rnp, flags);
2284 else
2285 raw_spin_unlock_irqrestore(&rnp->lock, flags);
2286 if (need_report & RCU_OFL_TASKS_EXP_GP)
2287 rcu_report_exp_rnp(rsp, rnp, true);
2288 WARN_ONCE(rdp->qlen != 0 || rdp->nxtlist != NULL, 2398 WARN_ONCE(rdp->qlen != 0 || rdp->nxtlist != NULL,
2289 "rcu_cleanup_dead_cpu: Callbacks on offline CPU %d: qlen=%lu, nxtlist=%p\n", 2399 "rcu_cleanup_dead_cpu: Callbacks on offline CPU %d: qlen=%lu, nxtlist=%p\n",
2290 cpu, rdp->qlen, rdp->nxtlist); 2400 cpu, rdp->qlen, rdp->nxtlist);
@@ -2300,6 +2410,10 @@ static void rcu_cleanup_dying_cpu(struct rcu_state *rsp)
2300{ 2410{
2301} 2411}
2302 2412
2413static void __maybe_unused rcu_cleanup_dead_rnp(struct rcu_node *rnp_leaf)
2414{
2415}
2416
2303static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp) 2417static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp)
2304{ 2418{
2305} 2419}
@@ -2496,12 +2610,6 @@ static void force_qs_rnp(struct rcu_state *rsp,
2496 } 2610 }
2497 raw_spin_unlock_irqrestore(&rnp->lock, flags); 2611 raw_spin_unlock_irqrestore(&rnp->lock, flags);
2498 } 2612 }
2499 rnp = rcu_get_root(rsp);
2500 if (rnp->qsmask == 0) {
2501 raw_spin_lock_irqsave(&rnp->lock, flags);
2502 smp_mb__after_unlock_lock();
2503 rcu_initiate_boost(rnp, flags); /* releases rnp->lock. */
2504 }
2505} 2613}
2506 2614
2507/* 2615/*
@@ -2601,7 +2709,7 @@ static void rcu_process_callbacks(struct softirq_action *unused)
2601 * Schedule RCU callback invocation. If the specified type of RCU 2709 * Schedule RCU callback invocation. If the specified type of RCU
2602 * does not support RCU priority boosting, just do a direct call, 2710 * does not support RCU priority boosting, just do a direct call,
2603 * otherwise wake up the per-CPU kernel kthread. Note that because we 2711 * otherwise wake up the per-CPU kernel kthread. Note that because we
2604 * are running on the current CPU with interrupts disabled, the 2712 * are running on the current CPU with softirqs disabled, the
2605 * rcu_cpu_kthread_task cannot disappear out from under us. 2713 * rcu_cpu_kthread_task cannot disappear out from under us.
2606 */ 2714 */
2607static void invoke_rcu_callbacks(struct rcu_state *rsp, struct rcu_data *rdp) 2715static void invoke_rcu_callbacks(struct rcu_state *rsp, struct rcu_data *rdp)
@@ -3141,9 +3249,12 @@ static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp)
3141 3249
3142 /* Is the RCU core waiting for a quiescent state from this CPU? */ 3250 /* Is the RCU core waiting for a quiescent state from this CPU? */
3143 if (rcu_scheduler_fully_active && 3251 if (rcu_scheduler_fully_active &&
3144 rdp->qs_pending && !rdp->passed_quiesce) { 3252 rdp->qs_pending && !rdp->passed_quiesce &&
3253 rdp->rcu_qs_ctr_snap == __this_cpu_read(rcu_qs_ctr)) {
3145 rdp->n_rp_qs_pending++; 3254 rdp->n_rp_qs_pending++;
3146 } else if (rdp->qs_pending && rdp->passed_quiesce) { 3255 } else if (rdp->qs_pending &&
3256 (rdp->passed_quiesce ||
3257 rdp->rcu_qs_ctr_snap != __this_cpu_read(rcu_qs_ctr))) {
3147 rdp->n_rp_report_qs++; 3258 rdp->n_rp_report_qs++;
3148 return 1; 3259 return 1;
3149 } 3260 }
@@ -3167,7 +3278,8 @@ static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp)
3167 } 3278 }
3168 3279
3169 /* Has a new RCU grace period started? */ 3280 /* Has a new RCU grace period started? */
3170 if (ACCESS_ONCE(rnp->gpnum) != rdp->gpnum) { /* outside lock */ 3281 if (ACCESS_ONCE(rnp->gpnum) != rdp->gpnum ||
3282 unlikely(ACCESS_ONCE(rdp->gpwrap))) { /* outside lock */
3171 rdp->n_rp_gp_started++; 3283 rdp->n_rp_gp_started++;
3172 return 1; 3284 return 1;
3173 } 3285 }
@@ -3350,6 +3462,7 @@ static void _rcu_barrier(struct rcu_state *rsp)
3350 } else { 3462 } else {
3351 _rcu_barrier_trace(rsp, "OnlineNoCB", cpu, 3463 _rcu_barrier_trace(rsp, "OnlineNoCB", cpu,
3352 rsp->n_barrier_done); 3464 rsp->n_barrier_done);
3465 smp_mb__before_atomic();
3353 atomic_inc(&rsp->barrier_cpu_count); 3466 atomic_inc(&rsp->barrier_cpu_count);
3354 __call_rcu(&rdp->barrier_head, 3467 __call_rcu(&rdp->barrier_head,
3355 rcu_barrier_callback, rsp, cpu, 0); 3468 rcu_barrier_callback, rsp, cpu, 0);
@@ -3417,9 +3530,6 @@ rcu_boot_init_percpu_data(int cpu, struct rcu_state *rsp)
3417 /* Set up local state, ensuring consistent view of global state. */ 3530 /* Set up local state, ensuring consistent view of global state. */
3418 raw_spin_lock_irqsave(&rnp->lock, flags); 3531 raw_spin_lock_irqsave(&rnp->lock, flags);
3419 rdp->grpmask = 1UL << (cpu - rdp->mynode->grplo); 3532 rdp->grpmask = 1UL << (cpu - rdp->mynode->grplo);
3420 init_callback_list(rdp);
3421 rdp->qlen_lazy = 0;
3422 ACCESS_ONCE(rdp->qlen) = 0;
3423 rdp->dynticks = &per_cpu(rcu_dynticks, cpu); 3533 rdp->dynticks = &per_cpu(rcu_dynticks, cpu);
3424 WARN_ON_ONCE(rdp->dynticks->dynticks_nesting != DYNTICK_TASK_EXIT_IDLE); 3534 WARN_ON_ONCE(rdp->dynticks->dynticks_nesting != DYNTICK_TASK_EXIT_IDLE);
3425 WARN_ON_ONCE(atomic_read(&rdp->dynticks->dynticks) != 1); 3535 WARN_ON_ONCE(atomic_read(&rdp->dynticks->dynticks) != 1);
@@ -3476,6 +3586,7 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp)
3476 rdp->gpnum = rnp->completed; 3586 rdp->gpnum = rnp->completed;
3477 rdp->completed = rnp->completed; 3587 rdp->completed = rnp->completed;
3478 rdp->passed_quiesce = 0; 3588 rdp->passed_quiesce = 0;
3589 rdp->rcu_qs_ctr_snap = __this_cpu_read(rcu_qs_ctr);
3479 rdp->qs_pending = 0; 3590 rdp->qs_pending = 0;
3480 trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("cpuonl")); 3591 trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("cpuonl"));
3481 } 3592 }
@@ -3567,17 +3678,35 @@ static int rcu_pm_notify(struct notifier_block *self,
3567static int __init rcu_spawn_gp_kthread(void) 3678static int __init rcu_spawn_gp_kthread(void)
3568{ 3679{
3569 unsigned long flags; 3680 unsigned long flags;
3681 int kthread_prio_in = kthread_prio;
3570 struct rcu_node *rnp; 3682 struct rcu_node *rnp;
3571 struct rcu_state *rsp; 3683 struct rcu_state *rsp;
3684 struct sched_param sp;
3572 struct task_struct *t; 3685 struct task_struct *t;
3573 3686
3687 /* Force priority into range. */
3688 if (IS_ENABLED(CONFIG_RCU_BOOST) && kthread_prio < 1)
3689 kthread_prio = 1;
3690 else if (kthread_prio < 0)
3691 kthread_prio = 0;
3692 else if (kthread_prio > 99)
3693 kthread_prio = 99;
3694 if (kthread_prio != kthread_prio_in)
3695 pr_alert("rcu_spawn_gp_kthread(): Limited prio to %d from %d\n",
3696 kthread_prio, kthread_prio_in);
3697
3574 rcu_scheduler_fully_active = 1; 3698 rcu_scheduler_fully_active = 1;
3575 for_each_rcu_flavor(rsp) { 3699 for_each_rcu_flavor(rsp) {
3576 t = kthread_run(rcu_gp_kthread, rsp, "%s", rsp->name); 3700 t = kthread_create(rcu_gp_kthread, rsp, "%s", rsp->name);
3577 BUG_ON(IS_ERR(t)); 3701 BUG_ON(IS_ERR(t));
3578 rnp = rcu_get_root(rsp); 3702 rnp = rcu_get_root(rsp);
3579 raw_spin_lock_irqsave(&rnp->lock, flags); 3703 raw_spin_lock_irqsave(&rnp->lock, flags);
3580 rsp->gp_kthread = t; 3704 rsp->gp_kthread = t;
3705 if (kthread_prio) {
3706 sp.sched_priority = kthread_prio;
3707 sched_setscheduler_nocheck(t, SCHED_FIFO, &sp);
3708 }
3709 wake_up_process(t);
3581 raw_spin_unlock_irqrestore(&rnp->lock, flags); 3710 raw_spin_unlock_irqrestore(&rnp->lock, flags);
3582 } 3711 }
3583 rcu_spawn_nocb_kthreads(); 3712 rcu_spawn_nocb_kthreads();
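
The tree.c changes above lean on a simple snapshot-and-compare trick: ->rcu_qs_ctr_snap records the per-CPU rcu_qs_ctr when the CPU notices a new grace period, and any later difference counts as a quiescent state (see the new checks in rcu_report_qs_rdp() and __rcu_pending()). A minimal user-space sketch of that idea, using illustrative names rather than the kernel's data structures:

#include <stdio.h>

/* Illustrative stand-ins for the per-CPU rcu_qs_ctr and its snapshot. */
static unsigned long qs_ctr;       /* bumped at known quiescent points, e.g. rcu_all_qs() */
static unsigned long qs_ctr_snap;  /* snapshot taken when the CPU notices a new GP */

static void quiescent_point(void)
{
	qs_ctr++;
}

static void note_new_gp(void)
{
	qs_ctr_snap = qs_ctr;      /* like rdp->rcu_qs_ctr_snap = __this_cpu_read(rcu_qs_ctr) */
}

static int passed_quiescent_state(void)
{
	/* The counter moved since the snapshot, so a quiescent state happened. */
	return qs_ctr != qs_ctr_snap;
}

int main(void)
{
	note_new_gp();
	printf("before: %d\n", passed_quiescent_state());  /* 0: still owe a QS */
	quiescent_point();
	printf("after:  %d\n", passed_quiescent_state());  /* 1: QS can be reported */
	return 0;
}
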
diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h
index 8e7b1843896e..119de399eb2f 100644
--- a/kernel/rcu/tree.h
+++ b/kernel/rcu/tree.h
@@ -27,7 +27,6 @@
27#include <linux/threads.h> 27#include <linux/threads.h>
28#include <linux/cpumask.h> 28#include <linux/cpumask.h>
29#include <linux/seqlock.h> 29#include <linux/seqlock.h>
30#include <linux/irq_work.h>
31 30
32/* 31/*
33 * Define shape of hierarchy based on NR_CPUS, CONFIG_RCU_FANOUT, and 32 * Define shape of hierarchy based on NR_CPUS, CONFIG_RCU_FANOUT, and
@@ -172,11 +171,6 @@ struct rcu_node {
172 /* queued on this rcu_node structure that */ 171 /* queued on this rcu_node structure that */
173 /* are blocking the current grace period, */ 172 /* are blocking the current grace period, */
174 /* there can be no such task. */ 173 /* there can be no such task. */
175 struct completion boost_completion;
176 /* Used to ensure that the rt_mutex used */
177 /* to carry out the boosting is fully */
178 /* released with no future boostee accesses */
179 /* before that rt_mutex is re-initialized. */
180 struct rt_mutex boost_mtx; 174 struct rt_mutex boost_mtx;
181 /* Used only for the priority-boosting */ 175 /* Used only for the priority-boosting */
182 /* side effect, not as a lock. */ 176 /* side effect, not as a lock. */
@@ -257,9 +251,12 @@ struct rcu_data {
257 /* in order to detect GP end. */ 251 /* in order to detect GP end. */
258 unsigned long gpnum; /* Highest gp number that this CPU */ 252 unsigned long gpnum; /* Highest gp number that this CPU */
259 /* is aware of having started. */ 253 /* is aware of having started. */
254 unsigned long rcu_qs_ctr_snap;/* Snapshot of rcu_qs_ctr to check */
255 /* for rcu_all_qs() invocations. */
260 bool passed_quiesce; /* User-mode/idle loop etc. */ 256 bool passed_quiesce; /* User-mode/idle loop etc. */
261 bool qs_pending; /* Core waits for quiesc state. */ 257 bool qs_pending; /* Core waits for quiesc state. */
262 bool beenonline; /* CPU online at least once. */ 258 bool beenonline; /* CPU online at least once. */
259 bool gpwrap; /* Possible gpnum/completed wrap. */
263 struct rcu_node *mynode; /* This CPU's leaf of hierarchy */ 260 struct rcu_node *mynode; /* This CPU's leaf of hierarchy */
264 unsigned long grpmask; /* Mask to apply to leaf qsmask. */ 261 unsigned long grpmask; /* Mask to apply to leaf qsmask. */
265#ifdef CONFIG_RCU_CPU_STALL_INFO 262#ifdef CONFIG_RCU_CPU_STALL_INFO
@@ -340,14 +337,10 @@ struct rcu_data {
340#ifdef CONFIG_RCU_NOCB_CPU 337#ifdef CONFIG_RCU_NOCB_CPU
341 struct rcu_head *nocb_head; /* CBs waiting for kthread. */ 338 struct rcu_head *nocb_head; /* CBs waiting for kthread. */
342 struct rcu_head **nocb_tail; 339 struct rcu_head **nocb_tail;
343 atomic_long_t nocb_q_count; /* # CBs waiting for kthread */ 340 atomic_long_t nocb_q_count; /* # CBs waiting for nocb */
344 atomic_long_t nocb_q_count_lazy; /* (approximate). */ 341 atomic_long_t nocb_q_count_lazy; /* invocation (all stages). */
345 struct rcu_head *nocb_follower_head; /* CBs ready to invoke. */ 342 struct rcu_head *nocb_follower_head; /* CBs ready to invoke. */
346 struct rcu_head **nocb_follower_tail; 343 struct rcu_head **nocb_follower_tail;
347 atomic_long_t nocb_follower_count; /* # CBs ready to invoke. */
348 atomic_long_t nocb_follower_count_lazy; /* (approximate). */
349 int nocb_p_count; /* # CBs being invoked by kthread */
350 int nocb_p_count_lazy; /* (approximate). */
351 wait_queue_head_t nocb_wq; /* For nocb kthreads to sleep on. */ 344 wait_queue_head_t nocb_wq; /* For nocb kthreads to sleep on. */
352 struct task_struct *nocb_kthread; 345 struct task_struct *nocb_kthread;
353 int nocb_defer_wakeup; /* Defer wakeup of nocb_kthread. */ 346 int nocb_defer_wakeup; /* Defer wakeup of nocb_kthread. */
@@ -356,8 +349,6 @@ struct rcu_data {
356 struct rcu_head *nocb_gp_head ____cacheline_internodealigned_in_smp; 349 struct rcu_head *nocb_gp_head ____cacheline_internodealigned_in_smp;
357 /* CBs waiting for GP. */ 350 /* CBs waiting for GP. */
358 struct rcu_head **nocb_gp_tail; 351 struct rcu_head **nocb_gp_tail;
359 long nocb_gp_count;
360 long nocb_gp_count_lazy;
361 bool nocb_leader_sleep; /* Is the nocb leader thread asleep? */ 352 bool nocb_leader_sleep; /* Is the nocb leader thread asleep? */
362 struct rcu_data *nocb_next_follower; 353 struct rcu_data *nocb_next_follower;
363 /* Next follower in wakeup chain. */ 354 /* Next follower in wakeup chain. */
@@ -488,10 +479,14 @@ struct rcu_state {
488 /* due to no GP active. */ 479 /* due to no GP active. */
489 unsigned long gp_start; /* Time at which GP started, */ 480 unsigned long gp_start; /* Time at which GP started, */
490 /* but in jiffies. */ 481 /* but in jiffies. */
482 unsigned long gp_activity; /* Time of last GP kthread */
483 /* activity in jiffies. */
491 unsigned long jiffies_stall; /* Time at which to check */ 484 unsigned long jiffies_stall; /* Time at which to check */
492 /* for CPU stalls. */ 485 /* for CPU stalls. */
493 unsigned long jiffies_resched; /* Time at which to resched */ 486 unsigned long jiffies_resched; /* Time at which to resched */
494 /* a reluctant CPU. */ 487 /* a reluctant CPU. */
488 unsigned long n_force_qs_gpstart; /* Snapshot of n_force_qs at */
489 /* GP start. */
495 unsigned long gp_max; /* Maximum GP duration in */ 490 unsigned long gp_max; /* Maximum GP duration in */
496 /* jiffies. */ 491 /* jiffies. */
497 const char *name; /* Name of structure. */ 492 const char *name; /* Name of structure. */
@@ -514,13 +509,6 @@ extern struct list_head rcu_struct_flavors;
514#define for_each_rcu_flavor(rsp) \ 509#define for_each_rcu_flavor(rsp) \
515 list_for_each_entry((rsp), &rcu_struct_flavors, flavors) 510 list_for_each_entry((rsp), &rcu_struct_flavors, flavors)
516 511
517/* Return values for rcu_preempt_offline_tasks(). */
518
519#define RCU_OFL_TASKS_NORM_GP 0x1 /* Tasks blocking normal */
520 /* GP were moved to root. */
521#define RCU_OFL_TASKS_EXP_GP 0x2 /* Tasks blocking expedited */
522 /* GP were moved to root. */
523
524/* 512/*
525 * RCU implementation internal declarations: 513 * RCU implementation internal declarations:
526 */ 514 */
@@ -546,27 +534,16 @@ DECLARE_PER_CPU(char, rcu_cpu_has_work);
546 534
547/* Forward declarations for rcutree_plugin.h */ 535/* Forward declarations for rcutree_plugin.h */
548static void rcu_bootup_announce(void); 536static void rcu_bootup_announce(void);
549long rcu_batches_completed(void);
550static void rcu_preempt_note_context_switch(void); 537static void rcu_preempt_note_context_switch(void);
551static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp); 538static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp);
552#ifdef CONFIG_HOTPLUG_CPU 539#ifdef CONFIG_HOTPLUG_CPU
553static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp, 540static bool rcu_preempt_has_tasks(struct rcu_node *rnp);
554 unsigned long flags);
555#endif /* #ifdef CONFIG_HOTPLUG_CPU */ 541#endif /* #ifdef CONFIG_HOTPLUG_CPU */
556static void rcu_print_detail_task_stall(struct rcu_state *rsp); 542static void rcu_print_detail_task_stall(struct rcu_state *rsp);
557static int rcu_print_task_stall(struct rcu_node *rnp); 543static int rcu_print_task_stall(struct rcu_node *rnp);
558static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp); 544static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp);
559#ifdef CONFIG_HOTPLUG_CPU
560static int rcu_preempt_offline_tasks(struct rcu_state *rsp,
561 struct rcu_node *rnp,
562 struct rcu_data *rdp);
563#endif /* #ifdef CONFIG_HOTPLUG_CPU */
564static void rcu_preempt_check_callbacks(void); 545static void rcu_preempt_check_callbacks(void);
565void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu)); 546void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu));
566#if defined(CONFIG_HOTPLUG_CPU) || defined(CONFIG_PREEMPT_RCU)
567static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp,
568 bool wake);
569#endif /* #if defined(CONFIG_HOTPLUG_CPU) || defined(CONFIG_PREEMPT_RCU) */
570static void __init __rcu_init_preempt(void); 547static void __init __rcu_init_preempt(void);
571static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags); 548static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags);
572static void rcu_preempt_boost_start_gp(struct rcu_node *rnp); 549static void rcu_preempt_boost_start_gp(struct rcu_node *rnp);
@@ -622,24 +599,15 @@ static void rcu_dynticks_task_exit(void);
622#endif /* #ifndef RCU_TREE_NONCORE */ 599#endif /* #ifndef RCU_TREE_NONCORE */
623 600
624#ifdef CONFIG_RCU_TRACE 601#ifdef CONFIG_RCU_TRACE
625#ifdef CONFIG_RCU_NOCB_CPU 602/* Read out queue lengths for tracing. */
626/* Sum up queue lengths for tracing. */
627static inline void rcu_nocb_q_lengths(struct rcu_data *rdp, long *ql, long *qll) 603static inline void rcu_nocb_q_lengths(struct rcu_data *rdp, long *ql, long *qll)
628{ 604{
629 *ql = atomic_long_read(&rdp->nocb_q_count) + 605#ifdef CONFIG_RCU_NOCB_CPU
630 rdp->nocb_p_count + 606 *ql = atomic_long_read(&rdp->nocb_q_count);
631 atomic_long_read(&rdp->nocb_follower_count) + 607 *qll = atomic_long_read(&rdp->nocb_q_count_lazy);
632 rdp->nocb_p_count + rdp->nocb_gp_count;
633 *qll = atomic_long_read(&rdp->nocb_q_count_lazy) +
634 rdp->nocb_p_count_lazy +
635 atomic_long_read(&rdp->nocb_follower_count_lazy) +
636 rdp->nocb_p_count_lazy + rdp->nocb_gp_count_lazy;
637}
638#else /* #ifdef CONFIG_RCU_NOCB_CPU */ 608#else /* #ifdef CONFIG_RCU_NOCB_CPU */
639static inline void rcu_nocb_q_lengths(struct rcu_data *rdp, long *ql, long *qll)
640{
641 *ql = 0; 609 *ql = 0;
642 *qll = 0; 610 *qll = 0;
643}
644#endif /* #else #ifdef CONFIG_RCU_NOCB_CPU */ 611#endif /* #else #ifdef CONFIG_RCU_NOCB_CPU */
612}
645#endif /* #ifdef CONFIG_RCU_TRACE */ 613#endif /* #ifdef CONFIG_RCU_TRACE */
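
The new ->gpwrap flag exists because the wrap-safe counter comparisons used elsewhere in RCU only tolerate a difference of up to ULONG_MAX/2; a CPU that sleeps through more grace periods than that would otherwise keep trusting a stale ->gpnum/->completed snapshot. A standalone sketch of that failure mode, reusing the ULONG_CMP_GE() definition from kernel/rcu/rcu.h:

#include <stdio.h>
#include <limits.h>

/* Wrap-safe comparison as defined in kernel/rcu/rcu.h. */
#define ULONG_CMP_GE(a, b)	(ULONG_MAX / 2 >= (a) - (b))

int main(void)
{
	unsigned long cpu_gpnum = 1000;		/* last grace-period number this CPU saw */
	unsigned long global_gpnum;

	/* Modest drift: the stale snapshot is correctly seen as behind. */
	global_gpnum = cpu_gpnum + 10;
	printf("drift 10:            GE=%d\n", ULONG_CMP_GE(cpu_gpnum, global_gpnum));

	/*
	 * Drift beyond ULONG_MAX/2 (a CPU idle for that many grace
	 * periods): the comparison now reports the stale snapshot as
	 * up to date, so the CPU would never resynchronize on its own.
	 * Setting ->gpwrap forces __note_gp_changes() to resync anyway.
	 */
	global_gpnum = cpu_gpnum + ULONG_MAX / 2 + 2;
	printf("drift > ULONG_MAX/2: GE=%d\n", ULONG_CMP_GE(cpu_gpnum, global_gpnum));
	return 0;
}
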
diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
index 3ec85cb5d544..0a571e9a0f1d 100644
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -34,10 +34,6 @@
34 34
35#include "../locking/rtmutex_common.h" 35#include "../locking/rtmutex_common.h"
36 36
37/* rcuc/rcub kthread realtime priority */
38static int kthread_prio = CONFIG_RCU_KTHREAD_PRIO;
39module_param(kthread_prio, int, 0644);
40
41/* 37/*
42 * Control variables for per-CPU and per-rcu_node kthreads. These 38 * Control variables for per-CPU and per-rcu_node kthreads. These
43 * handle all flavors of RCU. 39 * handle all flavors of RCU.
@@ -53,7 +49,6 @@ DEFINE_PER_CPU(char, rcu_cpu_has_work);
53static cpumask_var_t rcu_nocb_mask; /* CPUs to have callbacks offloaded. */ 49static cpumask_var_t rcu_nocb_mask; /* CPUs to have callbacks offloaded. */
54static bool have_rcu_nocb_mask; /* Was rcu_nocb_mask allocated? */ 50static bool have_rcu_nocb_mask; /* Was rcu_nocb_mask allocated? */
55static bool __read_mostly rcu_nocb_poll; /* Offload kthread are to poll. */ 51static bool __read_mostly rcu_nocb_poll; /* Offload kthread are to poll. */
56static char __initdata nocb_buf[NR_CPUS * 5];
57#endif /* #ifdef CONFIG_RCU_NOCB_CPU */ 52#endif /* #ifdef CONFIG_RCU_NOCB_CPU */
58 53
59/* 54/*
@@ -103,6 +98,8 @@ RCU_STATE_INITIALIZER(rcu_preempt, 'p', call_rcu);
103static struct rcu_state *rcu_state_p = &rcu_preempt_state; 98static struct rcu_state *rcu_state_p = &rcu_preempt_state;
104 99
105static int rcu_preempted_readers_exp(struct rcu_node *rnp); 100static int rcu_preempted_readers_exp(struct rcu_node *rnp);
101static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp,
102 bool wake);
106 103
107/* 104/*
108 * Tell them what RCU they are running. 105 * Tell them what RCU they are running.
@@ -114,25 +111,6 @@ static void __init rcu_bootup_announce(void)
114} 111}
115 112
116/* 113/*
117 * Return the number of RCU-preempt batches processed thus far
118 * for debug and statistics.
119 */
120static long rcu_batches_completed_preempt(void)
121{
122 return rcu_preempt_state.completed;
123}
124EXPORT_SYMBOL_GPL(rcu_batches_completed_preempt);
125
126/*
127 * Return the number of RCU batches processed thus far for debug & stats.
128 */
129long rcu_batches_completed(void)
130{
131 return rcu_batches_completed_preempt();
132}
133EXPORT_SYMBOL_GPL(rcu_batches_completed);
134
135/*
136 * Record a preemptible-RCU quiescent state for the specified CPU. Note 114 * Record a preemptible-RCU quiescent state for the specified CPU. Note
137 * that this just means that the task currently running on the CPU is 115 * that this just means that the task currently running on the CPU is
138 * not in a quiescent state. There might be any number of tasks blocked 116 * not in a quiescent state. There might be any number of tasks blocked
@@ -307,15 +285,25 @@ static struct list_head *rcu_next_node_entry(struct task_struct *t,
307} 285}
308 286
309/* 287/*
288 * Return true if the specified rcu_node structure has tasks that were
289 * preempted within an RCU read-side critical section.
290 */
291static bool rcu_preempt_has_tasks(struct rcu_node *rnp)
292{
293 return !list_empty(&rnp->blkd_tasks);
294}
295
296/*
310 * Handle special cases during rcu_read_unlock(), such as needing to 297 * Handle special cases during rcu_read_unlock(), such as needing to
311 * notify RCU core processing or task having blocked during the RCU 298 * notify RCU core processing or task having blocked during the RCU
312 * read-side critical section. 299 * read-side critical section.
313 */ 300 */
314void rcu_read_unlock_special(struct task_struct *t) 301void rcu_read_unlock_special(struct task_struct *t)
315{ 302{
316 int empty; 303 bool empty;
317 int empty_exp; 304 bool empty_exp;
318 int empty_exp_now; 305 bool empty_norm;
306 bool empty_exp_now;
319 unsigned long flags; 307 unsigned long flags;
320 struct list_head *np; 308 struct list_head *np;
321#ifdef CONFIG_RCU_BOOST 309#ifdef CONFIG_RCU_BOOST
@@ -338,6 +326,7 @@ void rcu_read_unlock_special(struct task_struct *t)
338 special = t->rcu_read_unlock_special; 326 special = t->rcu_read_unlock_special;
339 if (special.b.need_qs) { 327 if (special.b.need_qs) {
340 rcu_preempt_qs(); 328 rcu_preempt_qs();
329 t->rcu_read_unlock_special.b.need_qs = false;
341 if (!t->rcu_read_unlock_special.s) { 330 if (!t->rcu_read_unlock_special.s) {
342 local_irq_restore(flags); 331 local_irq_restore(flags);
343 return; 332 return;
@@ -367,7 +356,8 @@ void rcu_read_unlock_special(struct task_struct *t)
367 break; 356 break;
368 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ 357 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
369 } 358 }
370 empty = !rcu_preempt_blocked_readers_cgp(rnp); 359 empty = !rcu_preempt_has_tasks(rnp);
360 empty_norm = !rcu_preempt_blocked_readers_cgp(rnp);
371 empty_exp = !rcu_preempted_readers_exp(rnp); 361 empty_exp = !rcu_preempted_readers_exp(rnp);
372 smp_mb(); /* ensure expedited fastpath sees end of RCU c-s. */ 362 smp_mb(); /* ensure expedited fastpath sees end of RCU c-s. */
373 np = rcu_next_node_entry(t, rnp); 363 np = rcu_next_node_entry(t, rnp);
@@ -387,13 +377,21 @@ void rcu_read_unlock_special(struct task_struct *t)
387#endif /* #ifdef CONFIG_RCU_BOOST */ 377#endif /* #ifdef CONFIG_RCU_BOOST */
388 378
389 /* 379 /*
380 * If this was the last task on the list, go see if we
381 * need to propagate ->qsmaskinit bit clearing up the
382 * rcu_node tree.
383 */
384 if (!empty && !rcu_preempt_has_tasks(rnp))
385 rcu_cleanup_dead_rnp(rnp);
386
387 /*
390 * If this was the last task on the current list, and if 388 * If this was the last task on the current list, and if
391 * we aren't waiting on any CPUs, report the quiescent state. 389 * we aren't waiting on any CPUs, report the quiescent state.
392 * Note that rcu_report_unblock_qs_rnp() releases rnp->lock, 390 * Note that rcu_report_unblock_qs_rnp() releases rnp->lock,
393 * so we must take a snapshot of the expedited state. 391 * so we must take a snapshot of the expedited state.
394 */ 392 */
395 empty_exp_now = !rcu_preempted_readers_exp(rnp); 393 empty_exp_now = !rcu_preempted_readers_exp(rnp);
396 if (!empty && !rcu_preempt_blocked_readers_cgp(rnp)) { 394 if (!empty_norm && !rcu_preempt_blocked_readers_cgp(rnp)) {
397 trace_rcu_quiescent_state_report(TPS("preempt_rcu"), 395 trace_rcu_quiescent_state_report(TPS("preempt_rcu"),
398 rnp->gpnum, 396 rnp->gpnum,
399 0, rnp->qsmask, 397 0, rnp->qsmask,
@@ -408,10 +406,8 @@ void rcu_read_unlock_special(struct task_struct *t)
408 406
409#ifdef CONFIG_RCU_BOOST 407#ifdef CONFIG_RCU_BOOST
410 /* Unboost if we were boosted. */ 408 /* Unboost if we were boosted. */
411 if (drop_boost_mutex) { 409 if (drop_boost_mutex)
412 rt_mutex_unlock(&rnp->boost_mtx); 410 rt_mutex_unlock(&rnp->boost_mtx);
413 complete(&rnp->boost_completion);
414 }
415#endif /* #ifdef CONFIG_RCU_BOOST */ 411#endif /* #ifdef CONFIG_RCU_BOOST */
416 412
417 /* 413 /*
@@ -519,99 +515,13 @@ static int rcu_print_task_stall(struct rcu_node *rnp)
519static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp) 515static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp)
520{ 516{
521 WARN_ON_ONCE(rcu_preempt_blocked_readers_cgp(rnp)); 517 WARN_ON_ONCE(rcu_preempt_blocked_readers_cgp(rnp));
522 if (!list_empty(&rnp->blkd_tasks)) 518 if (rcu_preempt_has_tasks(rnp))
523 rnp->gp_tasks = rnp->blkd_tasks.next; 519 rnp->gp_tasks = rnp->blkd_tasks.next;
524 WARN_ON_ONCE(rnp->qsmask); 520 WARN_ON_ONCE(rnp->qsmask);
525} 521}
526 522
527#ifdef CONFIG_HOTPLUG_CPU 523#ifdef CONFIG_HOTPLUG_CPU
528 524
529/*
530 * Handle tasklist migration for case in which all CPUs covered by the
531 * specified rcu_node have gone offline. Move them up to the root
532 * rcu_node. The reason for not just moving them to the immediate
533 * parent is to remove the need for rcu_read_unlock_special() to
534 * make more than two attempts to acquire the target rcu_node's lock.
535 * Returns true if there were tasks blocking the current RCU grace
536 * period.
537 *
538 * Returns 1 if there was previously a task blocking the current grace
539 * period on the specified rcu_node structure.
540 *
541 * The caller must hold rnp->lock with irqs disabled.
542 */
543static int rcu_preempt_offline_tasks(struct rcu_state *rsp,
544 struct rcu_node *rnp,
545 struct rcu_data *rdp)
546{
547 struct list_head *lp;
548 struct list_head *lp_root;
549 int retval = 0;
550 struct rcu_node *rnp_root = rcu_get_root(rsp);
551 struct task_struct *t;
552
553 if (rnp == rnp_root) {
554 WARN_ONCE(1, "Last CPU thought to be offlined?");
555 return 0; /* Shouldn't happen: at least one CPU online. */
556 }
557
558 /* If we are on an internal node, complain bitterly. */
559 WARN_ON_ONCE(rnp != rdp->mynode);
560
561 /*
562 * Move tasks up to root rcu_node. Don't try to get fancy for
563 * this corner-case operation -- just put this node's tasks
564 * at the head of the root node's list, and update the root node's
565 * ->gp_tasks and ->exp_tasks pointers to those of this node's,
566 * if non-NULL. This might result in waiting for more tasks than
567 * absolutely necessary, but this is a good performance/complexity
568 * tradeoff.
569 */
570 if (rcu_preempt_blocked_readers_cgp(rnp) && rnp->qsmask == 0)
571 retval |= RCU_OFL_TASKS_NORM_GP;
572 if (rcu_preempted_readers_exp(rnp))
573 retval |= RCU_OFL_TASKS_EXP_GP;
574 lp = &rnp->blkd_tasks;
575 lp_root = &rnp_root->blkd_tasks;
576 while (!list_empty(lp)) {
577 t = list_entry(lp->next, typeof(*t), rcu_node_entry);
578 raw_spin_lock(&rnp_root->lock); /* irqs already disabled */
579 smp_mb__after_unlock_lock();
580 list_del(&t->rcu_node_entry);
581 t->rcu_blocked_node = rnp_root;
582 list_add(&t->rcu_node_entry, lp_root);
583 if (&t->rcu_node_entry == rnp->gp_tasks)
584 rnp_root->gp_tasks = rnp->gp_tasks;
585 if (&t->rcu_node_entry == rnp->exp_tasks)
586 rnp_root->exp_tasks = rnp->exp_tasks;
587#ifdef CONFIG_RCU_BOOST
588 if (&t->rcu_node_entry == rnp->boost_tasks)
589 rnp_root->boost_tasks = rnp->boost_tasks;
590#endif /* #ifdef CONFIG_RCU_BOOST */
591 raw_spin_unlock(&rnp_root->lock); /* irqs still disabled */
592 }
593
594 rnp->gp_tasks = NULL;
595 rnp->exp_tasks = NULL;
596#ifdef CONFIG_RCU_BOOST
597 rnp->boost_tasks = NULL;
598 /*
599 * In case root is being boosted and leaf was not. Make sure
600 * that we boost the tasks blocking the current grace period
601 * in this case.
602 */
603 raw_spin_lock(&rnp_root->lock); /* irqs already disabled */
604 smp_mb__after_unlock_lock();
605 if (rnp_root->boost_tasks != NULL &&
606 rnp_root->boost_tasks != rnp_root->gp_tasks &&
607 rnp_root->boost_tasks != rnp_root->exp_tasks)
608 rnp_root->boost_tasks = rnp_root->gp_tasks;
609 raw_spin_unlock(&rnp_root->lock); /* irqs still disabled */
610#endif /* #ifdef CONFIG_RCU_BOOST */
611
612 return retval;
613}
614
615#endif /* #ifdef CONFIG_HOTPLUG_CPU */ 525#endif /* #ifdef CONFIG_HOTPLUG_CPU */
616 526
617/* 527/*
@@ -771,7 +681,7 @@ sync_rcu_preempt_exp_init(struct rcu_state *rsp, struct rcu_node *rnp)
771 681
772 raw_spin_lock_irqsave(&rnp->lock, flags); 682 raw_spin_lock_irqsave(&rnp->lock, flags);
773 smp_mb__after_unlock_lock(); 683 smp_mb__after_unlock_lock();
774 if (list_empty(&rnp->blkd_tasks)) { 684 if (!rcu_preempt_has_tasks(rnp)) {
775 raw_spin_unlock_irqrestore(&rnp->lock, flags); 685 raw_spin_unlock_irqrestore(&rnp->lock, flags);
776 } else { 686 } else {
777 rnp->exp_tasks = rnp->blkd_tasks.next; 687 rnp->exp_tasks = rnp->blkd_tasks.next;
@@ -933,15 +843,6 @@ static void __init rcu_bootup_announce(void)
933} 843}
934 844
935/* 845/*
936 * Return the number of RCU batches processed thus far for debug & stats.
937 */
938long rcu_batches_completed(void)
939{
940 return rcu_batches_completed_sched();
941}
942EXPORT_SYMBOL_GPL(rcu_batches_completed);
943
944/*
945 * Because preemptible RCU does not exist, we never have to check for 846 * Because preemptible RCU does not exist, we never have to check for
946 * CPUs being in quiescent states. 847 * CPUs being in quiescent states.
947 */ 848 */
@@ -960,11 +861,12 @@ static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp)
960 861
961#ifdef CONFIG_HOTPLUG_CPU 862#ifdef CONFIG_HOTPLUG_CPU
962 863
963/* Because preemptible RCU does not exist, no quieting of tasks. */ 864/*
964static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp, unsigned long flags) 865 * Because there is no preemptible RCU, there can be no readers blocked.
965 __releases(rnp->lock) 866 */
867static bool rcu_preempt_has_tasks(struct rcu_node *rnp)
966{ 868{
967 raw_spin_unlock_irqrestore(&rnp->lock, flags); 869 return false;
968} 870}
969 871
970#endif /* #ifdef CONFIG_HOTPLUG_CPU */ 872#endif /* #ifdef CONFIG_HOTPLUG_CPU */
@@ -996,23 +898,6 @@ static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp)
996 WARN_ON_ONCE(rnp->qsmask); 898 WARN_ON_ONCE(rnp->qsmask);
997} 899}
998 900
999#ifdef CONFIG_HOTPLUG_CPU
1000
1001/*
1002 * Because preemptible RCU does not exist, it never needs to migrate
1003 * tasks that were blocked within RCU read-side critical sections, and
1004 * such non-existent tasks cannot possibly have been blocking the current
1005 * grace period.
1006 */
1007static int rcu_preempt_offline_tasks(struct rcu_state *rsp,
1008 struct rcu_node *rnp,
1009 struct rcu_data *rdp)
1010{
1011 return 0;
1012}
1013
1014#endif /* #ifdef CONFIG_HOTPLUG_CPU */
1015
1016/* 901/*
1017 * Because preemptible RCU does not exist, it never has any callbacks 902 * Because preemptible RCU does not exist, it never has any callbacks
1018 * to check. 903 * to check.
@@ -1031,20 +916,6 @@ void synchronize_rcu_expedited(void)
1031} 916}
1032EXPORT_SYMBOL_GPL(synchronize_rcu_expedited); 917EXPORT_SYMBOL_GPL(synchronize_rcu_expedited);
1033 918
1034#ifdef CONFIG_HOTPLUG_CPU
1035
1036/*
1037 * Because preemptible RCU does not exist, there is never any need to
1038 * report on tasks preempted in RCU read-side critical sections during
1039 * expedited RCU grace periods.
1040 */
1041static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp,
1042 bool wake)
1043{
1044}
1045
1046#endif /* #ifdef CONFIG_HOTPLUG_CPU */
1047
1048/* 919/*
1049 * Because preemptible RCU does not exist, rcu_barrier() is just 920 * Because preemptible RCU does not exist, rcu_barrier() is just
1050 * another name for rcu_barrier_sched(). 921 * another name for rcu_barrier_sched().
@@ -1080,7 +951,7 @@ void exit_rcu(void)
1080 951
1081static void rcu_initiate_boost_trace(struct rcu_node *rnp) 952static void rcu_initiate_boost_trace(struct rcu_node *rnp)
1082{ 953{
1083 if (list_empty(&rnp->blkd_tasks)) 954 if (!rcu_preempt_has_tasks(rnp))
1084 rnp->n_balk_blkd_tasks++; 955 rnp->n_balk_blkd_tasks++;
1085 else if (rnp->exp_tasks == NULL && rnp->gp_tasks == NULL) 956 else if (rnp->exp_tasks == NULL && rnp->gp_tasks == NULL)
1086 rnp->n_balk_exp_gp_tasks++; 957 rnp->n_balk_exp_gp_tasks++;
@@ -1127,7 +998,8 @@ static int rcu_boost(struct rcu_node *rnp)
1127 struct task_struct *t; 998 struct task_struct *t;
1128 struct list_head *tb; 999 struct list_head *tb;
1129 1000
1130 if (rnp->exp_tasks == NULL && rnp->boost_tasks == NULL) 1001 if (ACCESS_ONCE(rnp->exp_tasks) == NULL &&
1002 ACCESS_ONCE(rnp->boost_tasks) == NULL)
1131 return 0; /* Nothing left to boost. */ 1003 return 0; /* Nothing left to boost. */
1132 1004
1133 raw_spin_lock_irqsave(&rnp->lock, flags); 1005 raw_spin_lock_irqsave(&rnp->lock, flags);
@@ -1175,15 +1047,11 @@ static int rcu_boost(struct rcu_node *rnp)
1175 */ 1047 */
1176 t = container_of(tb, struct task_struct, rcu_node_entry); 1048 t = container_of(tb, struct task_struct, rcu_node_entry);
1177 rt_mutex_init_proxy_locked(&rnp->boost_mtx, t); 1049 rt_mutex_init_proxy_locked(&rnp->boost_mtx, t);
1178 init_completion(&rnp->boost_completion);
1179 raw_spin_unlock_irqrestore(&rnp->lock, flags); 1050 raw_spin_unlock_irqrestore(&rnp->lock, flags);
1180 /* Lock only for side effect: boosts task t's priority. */ 1051 /* Lock only for side effect: boosts task t's priority. */
1181 rt_mutex_lock(&rnp->boost_mtx); 1052 rt_mutex_lock(&rnp->boost_mtx);
1182 rt_mutex_unlock(&rnp->boost_mtx); /* Then keep lockdep happy. */ 1053 rt_mutex_unlock(&rnp->boost_mtx); /* Then keep lockdep happy. */
1183 1054
1184 /* Wait for boostee to be done w/boost_mtx before reinitializing. */
1185 wait_for_completion(&rnp->boost_completion);
1186
1187 return ACCESS_ONCE(rnp->exp_tasks) != NULL || 1055 return ACCESS_ONCE(rnp->exp_tasks) != NULL ||
1188 ACCESS_ONCE(rnp->boost_tasks) != NULL; 1056 ACCESS_ONCE(rnp->boost_tasks) != NULL;
1189} 1057}
@@ -1416,12 +1284,8 @@ static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu)
1416 for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++, mask >>= 1) 1284 for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++, mask >>= 1)
1417 if ((mask & 0x1) && cpu != outgoingcpu) 1285 if ((mask & 0x1) && cpu != outgoingcpu)
1418 cpumask_set_cpu(cpu, cm); 1286 cpumask_set_cpu(cpu, cm);
1419 if (cpumask_weight(cm) == 0) { 1287 if (cpumask_weight(cm) == 0)
1420 cpumask_setall(cm); 1288 cpumask_setall(cm);
1421 for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++)
1422 cpumask_clear_cpu(cpu, cm);
1423 WARN_ON_ONCE(cpumask_weight(cm) == 0);
1424 }
1425 set_cpus_allowed_ptr(t, cm); 1289 set_cpus_allowed_ptr(t, cm);
1426 free_cpumask_var(cm); 1290 free_cpumask_var(cm);
1427} 1291}
@@ -1446,12 +1310,8 @@ static void __init rcu_spawn_boost_kthreads(void)
1446 for_each_possible_cpu(cpu) 1310 for_each_possible_cpu(cpu)
1447 per_cpu(rcu_cpu_has_work, cpu) = 0; 1311 per_cpu(rcu_cpu_has_work, cpu) = 0;
1448 BUG_ON(smpboot_register_percpu_thread(&rcu_cpu_thread_spec)); 1312 BUG_ON(smpboot_register_percpu_thread(&rcu_cpu_thread_spec));
1449 rnp = rcu_get_root(rcu_state_p); 1313 rcu_for_each_leaf_node(rcu_state_p, rnp)
1450 (void)rcu_spawn_one_boost_kthread(rcu_state_p, rnp); 1314 (void)rcu_spawn_one_boost_kthread(rcu_state_p, rnp);
1451 if (NUM_RCU_NODES > 1) {
1452 rcu_for_each_leaf_node(rcu_state_p, rnp)
1453 (void)rcu_spawn_one_boost_kthread(rcu_state_p, rnp);
1454 }
1455} 1315}
1456 1316
1457static void rcu_prepare_kthreads(int cpu) 1317static void rcu_prepare_kthreads(int cpu)
@@ -1605,7 +1465,8 @@ static bool __maybe_unused rcu_try_advance_all_cbs(void)
1605 * completed since we last checked and there are 1465 * completed since we last checked and there are
1606 * callbacks not yet ready to invoke. 1466 * callbacks not yet ready to invoke.
1607 */ 1467 */
1608 if (rdp->completed != rnp->completed && 1468 if ((rdp->completed != rnp->completed ||
1469 unlikely(ACCESS_ONCE(rdp->gpwrap))) &&
1609 rdp->nxttail[RCU_DONE_TAIL] != rdp->nxttail[RCU_NEXT_TAIL]) 1470 rdp->nxttail[RCU_DONE_TAIL] != rdp->nxttail[RCU_NEXT_TAIL])
1610 note_gp_changes(rsp, rdp); 1471 note_gp_changes(rsp, rdp);
1611 1472
@@ -1898,11 +1759,12 @@ static void print_cpu_stall_info(struct rcu_state *rsp, int cpu)
1898 ticks_value = rsp->gpnum - rdp->gpnum; 1759 ticks_value = rsp->gpnum - rdp->gpnum;
1899 } 1760 }
1900 print_cpu_stall_fast_no_hz(fast_no_hz, cpu); 1761 print_cpu_stall_fast_no_hz(fast_no_hz, cpu);
1901 pr_err("\t%d: (%lu %s) idle=%03x/%llx/%d softirq=%u/%u %s\n", 1762 pr_err("\t%d: (%lu %s) idle=%03x/%llx/%d softirq=%u/%u fqs=%ld %s\n",
1902 cpu, ticks_value, ticks_title, 1763 cpu, ticks_value, ticks_title,
1903 atomic_read(&rdtp->dynticks) & 0xfff, 1764 atomic_read(&rdtp->dynticks) & 0xfff,
1904 rdtp->dynticks_nesting, rdtp->dynticks_nmi_nesting, 1765 rdtp->dynticks_nesting, rdtp->dynticks_nmi_nesting,
1905 rdp->softirq_snap, kstat_softirqs_cpu(RCU_SOFTIRQ, cpu), 1766 rdp->softirq_snap, kstat_softirqs_cpu(RCU_SOFTIRQ, cpu),
1767 ACCESS_ONCE(rsp->n_force_qs) - rsp->n_force_qs_gpstart,
1906 fast_no_hz); 1768 fast_no_hz);
1907} 1769}
1908 1770
@@ -2056,9 +1918,26 @@ static void wake_nocb_leader(struct rcu_data *rdp, bool force)
2056static bool rcu_nocb_cpu_needs_barrier(struct rcu_state *rsp, int cpu) 1918static bool rcu_nocb_cpu_needs_barrier(struct rcu_state *rsp, int cpu)
2057{ 1919{
2058 struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); 1920 struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu);
1921 unsigned long ret;
1922#ifdef CONFIG_PROVE_RCU
2059 struct rcu_head *rhp; 1923 struct rcu_head *rhp;
1924#endif /* #ifdef CONFIG_PROVE_RCU */
2060 1925
2061 /* No-CBs CPUs might have callbacks on any of three lists. */ 1926 /*
1927 * Check count of all no-CBs callbacks awaiting invocation.
1928 * There needs to be a barrier before this function is called,
1929 * but associated with a prior determination that no more
1930 * callbacks would be posted. In the worst case, the first
1931 * barrier in _rcu_barrier() suffices (but the caller cannot
 1932 * necessarily rely on this, and it is not a substitute for the caller
1933 * getting the concurrency design right!). There must also be
 1934 * a barrier between the following load and posting of a callback
1935 * (if a callback is in fact needed). This is associated with an
1936 * atomic_inc() in the caller.
1937 */
1938 ret = atomic_long_read(&rdp->nocb_q_count);
1939
1940#ifdef CONFIG_PROVE_RCU
2062 rhp = ACCESS_ONCE(rdp->nocb_head); 1941 rhp = ACCESS_ONCE(rdp->nocb_head);
2063 if (!rhp) 1942 if (!rhp)
2064 rhp = ACCESS_ONCE(rdp->nocb_gp_head); 1943 rhp = ACCESS_ONCE(rdp->nocb_gp_head);
@@ -2072,8 +1951,9 @@ static bool rcu_nocb_cpu_needs_barrier(struct rcu_state *rsp, int cpu)
2072 cpu, rhp->func); 1951 cpu, rhp->func);
2073 WARN_ON_ONCE(1); 1952 WARN_ON_ONCE(1);
2074 } 1953 }
1954#endif /* #ifdef CONFIG_PROVE_RCU */
2075 1955
2076 return !!rhp; 1956 return !!ret;
2077} 1957}
2078 1958
2079/* 1959/*
@@ -2095,9 +1975,10 @@ static void __call_rcu_nocb_enqueue(struct rcu_data *rdp,
2095 struct task_struct *t; 1975 struct task_struct *t;
2096 1976
2097 /* Enqueue the callback on the nocb list and update counts. */ 1977 /* Enqueue the callback on the nocb list and update counts. */
1978 atomic_long_add(rhcount, &rdp->nocb_q_count);
1979 /* rcu_barrier() relies on ->nocb_q_count add before xchg. */
2098 old_rhpp = xchg(&rdp->nocb_tail, rhtp); 1980 old_rhpp = xchg(&rdp->nocb_tail, rhtp);
2099 ACCESS_ONCE(*old_rhpp) = rhp; 1981 ACCESS_ONCE(*old_rhpp) = rhp;
2100 atomic_long_add(rhcount, &rdp->nocb_q_count);
2101 atomic_long_add(rhcount_lazy, &rdp->nocb_q_count_lazy); 1982 atomic_long_add(rhcount_lazy, &rdp->nocb_q_count_lazy);
2102 smp_mb__after_atomic(); /* Store *old_rhpp before _wake test. */ 1983 smp_mb__after_atomic(); /* Store *old_rhpp before _wake test. */
2103 1984
@@ -2288,9 +2169,6 @@ wait_again:
2288 /* Move callbacks to wait-for-GP list, which is empty. */ 2169 /* Move callbacks to wait-for-GP list, which is empty. */
2289 ACCESS_ONCE(rdp->nocb_head) = NULL; 2170 ACCESS_ONCE(rdp->nocb_head) = NULL;
2290 rdp->nocb_gp_tail = xchg(&rdp->nocb_tail, &rdp->nocb_head); 2171 rdp->nocb_gp_tail = xchg(&rdp->nocb_tail, &rdp->nocb_head);
2291 rdp->nocb_gp_count = atomic_long_xchg(&rdp->nocb_q_count, 0);
2292 rdp->nocb_gp_count_lazy =
2293 atomic_long_xchg(&rdp->nocb_q_count_lazy, 0);
2294 gotcbs = true; 2172 gotcbs = true;
2295 } 2173 }
2296 2174
@@ -2338,9 +2216,6 @@ wait_again:
2338 /* Append callbacks to follower's "done" list. */ 2216 /* Append callbacks to follower's "done" list. */
2339 tail = xchg(&rdp->nocb_follower_tail, rdp->nocb_gp_tail); 2217 tail = xchg(&rdp->nocb_follower_tail, rdp->nocb_gp_tail);
2340 *tail = rdp->nocb_gp_head; 2218 *tail = rdp->nocb_gp_head;
2341 atomic_long_add(rdp->nocb_gp_count, &rdp->nocb_follower_count);
2342 atomic_long_add(rdp->nocb_gp_count_lazy,
2343 &rdp->nocb_follower_count_lazy);
2344 smp_mb__after_atomic(); /* Store *tail before wakeup. */ 2219 smp_mb__after_atomic(); /* Store *tail before wakeup. */
2345 if (rdp != my_rdp && tail == &rdp->nocb_follower_head) { 2220 if (rdp != my_rdp && tail == &rdp->nocb_follower_head) {
2346 /* 2221 /*
@@ -2415,13 +2290,11 @@ static int rcu_nocb_kthread(void *arg)
2415 trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, "WokeNonEmpty"); 2290 trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, "WokeNonEmpty");
2416 ACCESS_ONCE(rdp->nocb_follower_head) = NULL; 2291 ACCESS_ONCE(rdp->nocb_follower_head) = NULL;
2417 tail = xchg(&rdp->nocb_follower_tail, &rdp->nocb_follower_head); 2292 tail = xchg(&rdp->nocb_follower_tail, &rdp->nocb_follower_head);
2418 c = atomic_long_xchg(&rdp->nocb_follower_count, 0);
2419 cl = atomic_long_xchg(&rdp->nocb_follower_count_lazy, 0);
2420 rdp->nocb_p_count += c;
2421 rdp->nocb_p_count_lazy += cl;
2422 2293
2423 /* Each pass through the following loop invokes a callback. */ 2294 /* Each pass through the following loop invokes a callback. */
2424 trace_rcu_batch_start(rdp->rsp->name, cl, c, -1); 2295 trace_rcu_batch_start(rdp->rsp->name,
2296 atomic_long_read(&rdp->nocb_q_count_lazy),
2297 atomic_long_read(&rdp->nocb_q_count), -1);
2425 c = cl = 0; 2298 c = cl = 0;
2426 while (list) { 2299 while (list) {
2427 next = list->next; 2300 next = list->next;
@@ -2443,9 +2316,9 @@ static int rcu_nocb_kthread(void *arg)
2443 list = next; 2316 list = next;
2444 } 2317 }
2445 trace_rcu_batch_end(rdp->rsp->name, c, !!list, 0, 0, 1); 2318 trace_rcu_batch_end(rdp->rsp->name, c, !!list, 0, 0, 1);
2446 ACCESS_ONCE(rdp->nocb_p_count) = rdp->nocb_p_count - c; 2319 smp_mb__before_atomic(); /* _add after CB invocation. */
2447 ACCESS_ONCE(rdp->nocb_p_count_lazy) = 2320 atomic_long_add(-c, &rdp->nocb_q_count);
2448 rdp->nocb_p_count_lazy - cl; 2321 atomic_long_add(-cl, &rdp->nocb_q_count_lazy);
2449 rdp->n_nocbs_invoked += c; 2322 rdp->n_nocbs_invoked += c;
2450 } 2323 }
2451 return 0; 2324 return 0;
@@ -2513,8 +2386,8 @@ void __init rcu_init_nohz(void)
2513 cpumask_and(rcu_nocb_mask, cpu_possible_mask, 2386 cpumask_and(rcu_nocb_mask, cpu_possible_mask,
2514 rcu_nocb_mask); 2387 rcu_nocb_mask);
2515 } 2388 }
2516 cpulist_scnprintf(nocb_buf, sizeof(nocb_buf), rcu_nocb_mask); 2389 pr_info("\tOffload RCU callbacks from CPUs: %*pbl.\n",
2517 pr_info("\tOffload RCU callbacks from CPUs: %s.\n", nocb_buf); 2390 cpumask_pr_args(rcu_nocb_mask));
2518 if (rcu_nocb_poll) 2391 if (rcu_nocb_poll)
2519 pr_info("\tPoll for callbacks from no-CBs CPUs.\n"); 2392 pr_info("\tPoll for callbacks from no-CBs CPUs.\n");
2520 2393
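
The no-CBs accounting in this file now keeps a single ->nocb_q_count covering every stage: it is added to before the xchg() that publishes new callbacks and subtracted from only after the callbacks have been invoked, so rcu_nocb_cpu_needs_barrier() can trust a zero reading. A rough user-space sketch of that single-counter discipline (names are illustrative, not the kernel's):

#include <stdatomic.h>
#include <stdio.h>

/* One counter covering queued, per-GP, and ready-to-invoke callbacks. */
static atomic_long nr_cbs;

static void enqueue_cb(void)
{
	/*
	 * Count first, then publish: anyone who can see the callback
	 * also sees a non-zero count.
	 */
	atomic_fetch_add(&nr_cbs, 1);
	/* ... xchg() onto the queue tail would go here ... */
}

static void invoke_cbs(long n)
{
	/* ... invoke the callbacks ... */
	atomic_fetch_sub(&nr_cbs, n);	/* drop the count only afterwards */
}

static int needs_barrier(void)
{
	return atomic_load(&nr_cbs) != 0;
}

int main(void)
{
	enqueue_cb();
	printf("needs barrier: %d\n", needs_barrier());	/* 1 */
	invoke_cbs(1);
	printf("needs barrier: %d\n", needs_barrier());	/* 0 */
	return 0;
}
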
diff --git a/kernel/rcu/tree_trace.c b/kernel/rcu/tree_trace.c
index 5cdc62e1beeb..fbb6240509ea 100644
--- a/kernel/rcu/tree_trace.c
+++ b/kernel/rcu/tree_trace.c
@@ -46,6 +46,8 @@
46#define RCU_TREE_NONCORE 46#define RCU_TREE_NONCORE
47#include "tree.h" 47#include "tree.h"
48 48
49DECLARE_PER_CPU_SHARED_ALIGNED(unsigned long, rcu_qs_ctr);
50
49static int r_open(struct inode *inode, struct file *file, 51static int r_open(struct inode *inode, struct file *file,
50 const struct seq_operations *op) 52 const struct seq_operations *op)
51{ 53{
@@ -115,11 +117,13 @@ static void print_one_rcu_data(struct seq_file *m, struct rcu_data *rdp)
115 117
116 if (!rdp->beenonline) 118 if (!rdp->beenonline)
117 return; 119 return;
118 seq_printf(m, "%3d%cc=%ld g=%ld pq=%d qp=%d", 120 seq_printf(m, "%3d%cc=%ld g=%ld pq=%d/%d qp=%d",
119 rdp->cpu, 121 rdp->cpu,
120 cpu_is_offline(rdp->cpu) ? '!' : ' ', 122 cpu_is_offline(rdp->cpu) ? '!' : ' ',
121 ulong2long(rdp->completed), ulong2long(rdp->gpnum), 123 ulong2long(rdp->completed), ulong2long(rdp->gpnum),
122 rdp->passed_quiesce, rdp->qs_pending); 124 rdp->passed_quiesce,
125 rdp->rcu_qs_ctr_snap == per_cpu(rcu_qs_ctr, rdp->cpu),
126 rdp->qs_pending);
123 seq_printf(m, " dt=%d/%llx/%d df=%lu", 127 seq_printf(m, " dt=%d/%llx/%d df=%lu",
124 atomic_read(&rdp->dynticks->dynticks), 128 atomic_read(&rdp->dynticks->dynticks),
125 rdp->dynticks->dynticks_nesting, 129 rdp->dynticks->dynticks_nesting,
diff --git a/kernel/resource.c b/kernel/resource.c
index 0bcebffc4e77..19f2357dfda3 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -22,6 +22,7 @@
22#include <linux/device.h> 22#include <linux/device.h>
23#include <linux/pfn.h> 23#include <linux/pfn.h>
24#include <linux/mm.h> 24#include <linux/mm.h>
25#include <linux/resource_ext.h>
25#include <asm/io.h> 26#include <asm/io.h>
26 27
27 28
@@ -1529,6 +1530,30 @@ int iomem_is_exclusive(u64 addr)
1529 return err; 1530 return err;
1530} 1531}
1531 1532
1533struct resource_entry *resource_list_create_entry(struct resource *res,
1534 size_t extra_size)
1535{
1536 struct resource_entry *entry;
1537
1538 entry = kzalloc(sizeof(*entry) + extra_size, GFP_KERNEL);
1539 if (entry) {
1540 INIT_LIST_HEAD(&entry->node);
1541 entry->res = res ? res : &entry->__res;
1542 }
1543
1544 return entry;
1545}
1546EXPORT_SYMBOL(resource_list_create_entry);
1547
1548void resource_list_free(struct list_head *head)
1549{
1550 struct resource_entry *entry, *tmp;
1551
1552 list_for_each_entry_safe(entry, tmp, head, node)
1553 resource_list_destroy_entry(entry);
1554}
1555EXPORT_SYMBOL(resource_list_free);
1556
1532static int __init strict_iomem(char *str) 1557static int __init strict_iomem(char *str)
1533{ 1558{
1534 if (strstr(str, "relaxed")) 1559 if (strstr(str, "relaxed"))
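
The two helpers added to resource.c are meant to be paired with the list glue in the newly included <linux/resource_ext.h> (resource_list_destroy_entry() and friends). A hypothetical caller might use them as below; build_window_list(), example(), and the constants are made up for illustration and are not part of this patch:

#include <linux/errno.h>
#include <linux/ioport.h>
#include <linux/list.h>
#include <linux/resource_ext.h>

static int build_window_list(struct list_head *windows)
{
	struct resource_entry *entry;

	/* Passing a NULL resource makes the entry use its embedded __res. */
	entry = resource_list_create_entry(NULL, 0);
	if (!entry)
		return -ENOMEM;

	entry->res->start = 0x1000;
	entry->res->end   = 0x1fff;
	entry->res->flags = IORESOURCE_MEM;
	list_add_tail(&entry->node, windows);
	return 0;
}

static void example(void)
{
	LIST_HEAD(windows);

	if (!build_window_list(&windows)) {
		/* ... hand the list to a bus/bridge driver ... */
	}
	resource_list_free(&windows);	/* tears down every entry on the list */
}

Because resource_list_free() walks the list with list_for_each_entry_safe() and destroys each entry, callers do not need to track which entries carried an embedded resource and which wrapped an external one.
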
diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile
index ab32b7b0db5c..46be87024875 100644
--- a/kernel/sched/Makefile
+++ b/kernel/sched/Makefile
@@ -1,5 +1,5 @@
1ifdef CONFIG_FUNCTION_TRACER 1ifdef CONFIG_FUNCTION_TRACER
2CFLAGS_REMOVE_clock.o = -pg 2CFLAGS_REMOVE_clock.o = $(CC_FLAGS_FTRACE)
3endif 3endif
4 4
5ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y) 5ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y)
diff --git a/kernel/sched/auto_group.c b/kernel/sched/auto_group.c
index 8a2e230fb86a..eae160dd669d 100644
--- a/kernel/sched/auto_group.c
+++ b/kernel/sched/auto_group.c
@@ -87,8 +87,7 @@ static inline struct autogroup *autogroup_create(void)
87 * so we don't have to move tasks around upon policy change, 87 * so we don't have to move tasks around upon policy change,
88 * or flail around trying to allocate bandwidth on the fly. 88 * or flail around trying to allocate bandwidth on the fly.
89 * A bandwidth exception in __sched_setscheduler() allows 89 * A bandwidth exception in __sched_setscheduler() allows
90 * the policy change to proceed. Thereafter, task_group() 90 * the policy change to proceed.
91 * returns &root_task_group, so zero bandwidth is required.
92 */ 91 */
93 free_rt_sched_group(tg); 92 free_rt_sched_group(tg);
94 tg->rt_se = root_task_group.rt_se; 93 tg->rt_se = root_task_group.rt_se;
@@ -115,9 +114,6 @@ bool task_wants_autogroup(struct task_struct *p, struct task_group *tg)
115 if (tg != &root_task_group) 114 if (tg != &root_task_group)
116 return false; 115 return false;
117 116
118 if (p->sched_class != &fair_sched_class)
119 return false;
120
121 /* 117 /*
122 * We can only assume the task group can't go away on us if 118 * We can only assume the task group can't go away on us if
123 * autogroup_move_group() can see us on ->thread_group list. 119 * autogroup_move_group() can see us on ->thread_group list.
diff --git a/kernel/sched/clock.c b/kernel/sched/clock.c
index c27e4f8f4879..c0a205101c23 100644
--- a/kernel/sched/clock.c
+++ b/kernel/sched/clock.c
@@ -420,3 +420,16 @@ u64 local_clock(void)
420 420
421EXPORT_SYMBOL_GPL(cpu_clock); 421EXPORT_SYMBOL_GPL(cpu_clock);
422EXPORT_SYMBOL_GPL(local_clock); 422EXPORT_SYMBOL_GPL(local_clock);
423
424/*
425 * Running clock - returns the time that has elapsed while a guest has been
426 * running.
427 * On a guest this value should be local_clock minus the time the guest was
428 * suspended by the hypervisor (for any reason).
429 * On bare metal this function should return the same as local_clock.
430 * Architectures and sub-architectures can override this.
431 */
432u64 __weak running_clock(void)
433{
434 return local_clock();
435}
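[Editor's note, not part of the patch] running_clock() above is defined __weak so that paravirtualized architectures can supply their own version. A minimal sketch of that weak-default pattern in plain C (GCC/Clang on ELF; the constant return value is just a stand-in for local_clock()):

#include <stdio.h>

/* Default implementation. Linking another object file that defines a
 * non-weak running_clock() silently replaces this one, which is how an
 * architecture overrides the scheduler's default. */
__attribute__((weak)) unsigned long long running_clock(void)
{
	return 42;
}

int main(void)
{
	printf("running_clock() = %llu\n", running_clock());
	return 0;
}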
diff --git a/kernel/sched/completion.c b/kernel/sched/completion.c
index 607f852b4d04..8d0f35debf35 100644
--- a/kernel/sched/completion.c
+++ b/kernel/sched/completion.c
@@ -268,6 +268,15 @@ bool try_wait_for_completion(struct completion *x)
268 unsigned long flags; 268 unsigned long flags;
269 int ret = 1; 269 int ret = 1;
270 270
271 /*
272 * Since x->done will need to be locked only
273 * in the non-blocking case, we check x->done
274 * first without taking the lock so we can
275 * return early in the blocking case.
276 */
277 if (!READ_ONCE(x->done))
278 return 0;
279
271 spin_lock_irqsave(&x->wait.lock, flags); 280 spin_lock_irqsave(&x->wait.lock, flags);
272 if (!x->done) 281 if (!x->done)
273 ret = 0; 282 ret = 0;
@@ -288,13 +297,21 @@ EXPORT_SYMBOL(try_wait_for_completion);
288 */ 297 */
289bool completion_done(struct completion *x) 298bool completion_done(struct completion *x)
290{ 299{
291 unsigned long flags; 300 if (!READ_ONCE(x->done))
292 int ret = 1; 301 return false;
293 302
294 spin_lock_irqsave(&x->wait.lock, flags); 303 /*
295 if (!x->done) 304 * If ->done, we need to wait for complete() to release ->wait.lock
296 ret = 0; 305 * otherwise we can end up freeing the completion before complete()
297 spin_unlock_irqrestore(&x->wait.lock, flags); 306 * is done referencing it.
298 return ret; 307 *
308 * The RMB pairs with complete()'s RELEASE of ->wait.lock and orders
309 * the loads of ->done and ->wait.lock such that we cannot observe
310 * the lock before complete() acquires it while observing the ->done
311 * after it's acquired the lock.
312 */
313 smp_rmb();
314 spin_unlock_wait(&x->wait.lock);
315 return true;
299} 316}
300EXPORT_SYMBOL(completion_done); 317EXPORT_SYMBOL(completion_done);
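[Editor's note, not part of the patch] Both try_wait_for_completion() and completion_done() above now test x->done with READ_ONCE() before touching ->wait.lock, so the common case avoids the lock entirely. A hedged pthread/stdatomic sketch of the same "racy read first, lock only to commit" shape (completion_like and try_wait are illustrative names):

#include <pthread.h>
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

struct completion_like {
	pthread_mutex_t lock;
	atomic_uint done;
};

/* Fast path: if done reads as 0 we can bail out without the lock, mirroring
 * the READ_ONCE(x->done) check added above; only the path that actually
 * consumes a completion still re-checks and decrements under the lock. */
static bool try_wait(struct completion_like *x)
{
	bool ret = true;

	if (!atomic_load_explicit(&x->done, memory_order_relaxed))
		return false;

	pthread_mutex_lock(&x->lock);
	if (!x->done)
		ret = false;
	else
		x->done--;
	pthread_mutex_unlock(&x->lock);
	return ret;
}

int main(void)
{
	struct completion_like c = { PTHREAD_MUTEX_INITIALIZER, 1 };

	printf("%d\n", try_wait(&c));	/* 1: consumed the one completion */
	printf("%d\n", try_wait(&c));	/* 0: fast path sees done == 0 */
	return 0;
}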
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index ae1188f62693..f0f831e8a345 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -119,7 +119,9 @@ void update_rq_clock(struct rq *rq)
119{ 119{
120 s64 delta; 120 s64 delta;
121 121
122 if (rq->skip_clock_update > 0) 122 lockdep_assert_held(&rq->lock);
123
124 if (rq->clock_skip_update & RQCF_ACT_SKIP)
123 return; 125 return;
124 126
125 delta = sched_clock_cpu(cpu_of(rq)) - rq->clock; 127 delta = sched_clock_cpu(cpu_of(rq)) - rq->clock;
@@ -305,66 +307,6 @@ __read_mostly int scheduler_running;
305int sysctl_sched_rt_runtime = 950000; 307int sysctl_sched_rt_runtime = 950000;
306 308
307/* 309/*
308 * __task_rq_lock - lock the rq @p resides on.
309 */
310static inline struct rq *__task_rq_lock(struct task_struct *p)
311 __acquires(rq->lock)
312{
313 struct rq *rq;
314
315 lockdep_assert_held(&p->pi_lock);
316
317 for (;;) {
318 rq = task_rq(p);
319 raw_spin_lock(&rq->lock);
320 if (likely(rq == task_rq(p) && !task_on_rq_migrating(p)))
321 return rq;
322 raw_spin_unlock(&rq->lock);
323
324 while (unlikely(task_on_rq_migrating(p)))
325 cpu_relax();
326 }
327}
328
329/*
330 * task_rq_lock - lock p->pi_lock and lock the rq @p resides on.
331 */
332static struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags)
333 __acquires(p->pi_lock)
334 __acquires(rq->lock)
335{
336 struct rq *rq;
337
338 for (;;) {
339 raw_spin_lock_irqsave(&p->pi_lock, *flags);
340 rq = task_rq(p);
341 raw_spin_lock(&rq->lock);
342 if (likely(rq == task_rq(p) && !task_on_rq_migrating(p)))
343 return rq;
344 raw_spin_unlock(&rq->lock);
345 raw_spin_unlock_irqrestore(&p->pi_lock, *flags);
346
347 while (unlikely(task_on_rq_migrating(p)))
348 cpu_relax();
349 }
350}
351
352static void __task_rq_unlock(struct rq *rq)
353 __releases(rq->lock)
354{
355 raw_spin_unlock(&rq->lock);
356}
357
358static inline void
359task_rq_unlock(struct rq *rq, struct task_struct *p, unsigned long *flags)
360 __releases(rq->lock)
361 __releases(p->pi_lock)
362{
363 raw_spin_unlock(&rq->lock);
364 raw_spin_unlock_irqrestore(&p->pi_lock, *flags);
365}
366
367/*
368 * this_rq_lock - lock this runqueue and disable interrupts. 310 * this_rq_lock - lock this runqueue and disable interrupts.
369 */ 311 */
370static struct rq *this_rq_lock(void) 312static struct rq *this_rq_lock(void)
@@ -490,6 +432,11 @@ static __init void init_hrtick(void)
490 */ 432 */
491void hrtick_start(struct rq *rq, u64 delay) 433void hrtick_start(struct rq *rq, u64 delay)
492{ 434{
435 /*
436 * Don't schedule slices shorter than 10000ns, that just
437 * doesn't make sense. Rely on vruntime for fairness.
438 */
439 delay = max_t(u64, delay, 10000LL);
493 __hrtimer_start_range_ns(&rq->hrtick_timer, ns_to_ktime(delay), 0, 440 __hrtimer_start_range_ns(&rq->hrtick_timer, ns_to_ktime(delay), 0,
494 HRTIMER_MODE_REL_PINNED, 0); 441 HRTIMER_MODE_REL_PINNED, 0);
495} 442}
@@ -1046,7 +993,7 @@ void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
1046 * this case, we can save a useless back to back clock update. 993 * this case, we can save a useless back to back clock update.
1047 */ 994 */
1048 if (task_on_rq_queued(rq->curr) && test_tsk_need_resched(rq->curr)) 995 if (task_on_rq_queued(rq->curr) && test_tsk_need_resched(rq->curr))
1049 rq->skip_clock_update = 1; 996 rq_clock_skip_update(rq, true);
1050} 997}
1051 998
1052#ifdef CONFIG_SMP 999#ifdef CONFIG_SMP
@@ -1814,6 +1761,10 @@ void __dl_clear_params(struct task_struct *p)
1814 dl_se->dl_period = 0; 1761 dl_se->dl_period = 0;
1815 dl_se->flags = 0; 1762 dl_se->flags = 0;
1816 dl_se->dl_bw = 0; 1763 dl_se->dl_bw = 0;
1764
1765 dl_se->dl_throttled = 0;
1766 dl_se->dl_new = 1;
1767 dl_se->dl_yielded = 0;
1817} 1768}
1818 1769
1819/* 1770/*
@@ -1832,6 +1783,9 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
1832 p->se.prev_sum_exec_runtime = 0; 1783 p->se.prev_sum_exec_runtime = 0;
1833 p->se.nr_migrations = 0; 1784 p->se.nr_migrations = 0;
1834 p->se.vruntime = 0; 1785 p->se.vruntime = 0;
1786#ifdef CONFIG_SMP
1787 p->se.avg.decay_count = 0;
1788#endif
1835 INIT_LIST_HEAD(&p->se.group_node); 1789 INIT_LIST_HEAD(&p->se.group_node);
1836 1790
1837#ifdef CONFIG_SCHEDSTATS 1791#ifdef CONFIG_SCHEDSTATS
@@ -1839,7 +1793,7 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
1839#endif 1793#endif
1840 1794
1841 RB_CLEAR_NODE(&p->dl.rb_node); 1795 RB_CLEAR_NODE(&p->dl.rb_node);
1842 hrtimer_init(&p->dl.dl_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); 1796 init_dl_task_timer(&p->dl);
1843 __dl_clear_params(p); 1797 __dl_clear_params(p);
1844 1798
1845 INIT_LIST_HEAD(&p->rt.run_list); 1799 INIT_LIST_HEAD(&p->rt.run_list);
@@ -2049,6 +2003,9 @@ static inline int dl_bw_cpus(int i)
2049 * allocated bandwidth to reflect the new situation. 2003 * allocated bandwidth to reflect the new situation.
2050 * 2004 *
2051 * This function is called while holding p's rq->lock. 2005 * This function is called while holding p's rq->lock.
2006 *
2007 * XXX we should delay bw change until the task's 0-lag point, see
2008 * __setparam_dl().
2052 */ 2009 */
2053static int dl_overflow(struct task_struct *p, int policy, 2010static int dl_overflow(struct task_struct *p, int policy,
2054 const struct sched_attr *attr) 2011 const struct sched_attr *attr)
@@ -2748,6 +2705,10 @@ again:
2748 * - explicit schedule() call 2705 * - explicit schedule() call
2749 * - return from syscall or exception to user-space 2706 * - return from syscall or exception to user-space
2750 * - return from interrupt-handler to user-space 2707 * - return from interrupt-handler to user-space
2708 *
2709 * WARNING: all callers must re-check need_resched() afterward and reschedule
2710 * accordingly in case an event triggered the need for rescheduling (such as
2711 * an interrupt waking up a task) while preemption was disabled in __schedule().
2751 */ 2712 */
2752static void __sched __schedule(void) 2713static void __sched __schedule(void)
2753{ 2714{
@@ -2756,7 +2717,6 @@ static void __sched __schedule(void)
2756 struct rq *rq; 2717 struct rq *rq;
2757 int cpu; 2718 int cpu;
2758 2719
2759need_resched:
2760 preempt_disable(); 2720 preempt_disable();
2761 cpu = smp_processor_id(); 2721 cpu = smp_processor_id();
2762 rq = cpu_rq(cpu); 2722 rq = cpu_rq(cpu);
@@ -2776,6 +2736,8 @@ need_resched:
2776 smp_mb__before_spinlock(); 2736 smp_mb__before_spinlock();
2777 raw_spin_lock_irq(&rq->lock); 2737 raw_spin_lock_irq(&rq->lock);
2778 2738
2739 rq->clock_skip_update <<= 1; /* promote REQ to ACT */
2740
2779 switch_count = &prev->nivcsw; 2741 switch_count = &prev->nivcsw;
2780 if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) { 2742 if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) {
2781 if (unlikely(signal_pending_state(prev->state, prev))) { 2743 if (unlikely(signal_pending_state(prev->state, prev))) {
@@ -2800,13 +2762,13 @@ need_resched:
2800 switch_count = &prev->nvcsw; 2762 switch_count = &prev->nvcsw;
2801 } 2763 }
2802 2764
2803 if (task_on_rq_queued(prev) || rq->skip_clock_update < 0) 2765 if (task_on_rq_queued(prev))
2804 update_rq_clock(rq); 2766 update_rq_clock(rq);
2805 2767
2806 next = pick_next_task(rq, prev); 2768 next = pick_next_task(rq, prev);
2807 clear_tsk_need_resched(prev); 2769 clear_tsk_need_resched(prev);
2808 clear_preempt_need_resched(); 2770 clear_preempt_need_resched();
2809 rq->skip_clock_update = 0; 2771 rq->clock_skip_update = 0;
2810 2772
2811 if (likely(prev != next)) { 2773 if (likely(prev != next)) {
2812 rq->nr_switches++; 2774 rq->nr_switches++;
@@ -2821,8 +2783,6 @@ need_resched:
2821 post_schedule(rq); 2783 post_schedule(rq);
2822 2784
2823 sched_preempt_enable_no_resched(); 2785 sched_preempt_enable_no_resched();
2824 if (need_resched())
2825 goto need_resched;
2826} 2786}
2827 2787
2828static inline void sched_submit_work(struct task_struct *tsk) 2788static inline void sched_submit_work(struct task_struct *tsk)
@@ -2842,7 +2802,9 @@ asmlinkage __visible void __sched schedule(void)
2842 struct task_struct *tsk = current; 2802 struct task_struct *tsk = current;
2843 2803
2844 sched_submit_work(tsk); 2804 sched_submit_work(tsk);
2845 __schedule(); 2805 do {
2806 __schedule();
2807 } while (need_resched());
2846} 2808}
2847EXPORT_SYMBOL(schedule); 2809EXPORT_SYMBOL(schedule);
2848 2810
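[Editor's note, not part of the patch] The retry that used to hide behind the need_resched label inside __schedule() has moved out to the callers: schedule() now wraps a single pass in a do/while, and the new WARNING comment tells every other caller to do the same. A toy sketch of the refactor (pending, need_resched and the function names here are stand-ins, not kernel code):

#include <stdbool.h>
#include <stdio.h>

static int pending = 3;			/* pretend three wakeups arrived */

static bool need_resched(void)
{
	return pending > 0;
}

static void __schedule(void)
{
	/* one scheduling pass; the internal goto-retry is gone */
	pending--;
	printf("pass done, %d still pending\n", pending);
}

/* The retry now lives in the callers: schedule(), preempt_schedule_common()
 * and the cond_resched() helpers all wrap a pass in the same loop. */
static void schedule(void)
{
	do {
		__schedule();
	} while (need_resched());
}

int main(void)
{
	schedule();
	return 0;
}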
@@ -2877,6 +2839,21 @@ void __sched schedule_preempt_disabled(void)
2877 preempt_disable(); 2839 preempt_disable();
2878} 2840}
2879 2841
2842static void __sched notrace preempt_schedule_common(void)
2843{
2844 do {
2845 __preempt_count_add(PREEMPT_ACTIVE);
2846 __schedule();
2847 __preempt_count_sub(PREEMPT_ACTIVE);
2848
2849 /*
2850 * Check again in case we missed a preemption opportunity
2851 * between schedule and now.
2852 */
2853 barrier();
2854 } while (need_resched());
2855}
2856
2880#ifdef CONFIG_PREEMPT 2857#ifdef CONFIG_PREEMPT
2881/* 2858/*
2882 * this is the entry point to schedule() from in-kernel preemption 2859 * this is the entry point to schedule() from in-kernel preemption
@@ -2892,17 +2869,7 @@ asmlinkage __visible void __sched notrace preempt_schedule(void)
2892 if (likely(!preemptible())) 2869 if (likely(!preemptible()))
2893 return; 2870 return;
2894 2871
2895 do { 2872 preempt_schedule_common();
2896 __preempt_count_add(PREEMPT_ACTIVE);
2897 __schedule();
2898 __preempt_count_sub(PREEMPT_ACTIVE);
2899
2900 /*
2901 * Check again in case we missed a preemption opportunity
2902 * between schedule and now.
2903 */
2904 barrier();
2905 } while (need_resched());
2906} 2873}
2907NOKPROBE_SYMBOL(preempt_schedule); 2874NOKPROBE_SYMBOL(preempt_schedule);
2908EXPORT_SYMBOL(preempt_schedule); 2875EXPORT_SYMBOL(preempt_schedule);
@@ -3251,15 +3218,31 @@ __setparam_dl(struct task_struct *p, const struct sched_attr *attr)
3251{ 3218{
3252 struct sched_dl_entity *dl_se = &p->dl; 3219 struct sched_dl_entity *dl_se = &p->dl;
3253 3220
3254 init_dl_task_timer(dl_se);
3255 dl_se->dl_runtime = attr->sched_runtime; 3221 dl_se->dl_runtime = attr->sched_runtime;
3256 dl_se->dl_deadline = attr->sched_deadline; 3222 dl_se->dl_deadline = attr->sched_deadline;
3257 dl_se->dl_period = attr->sched_period ?: dl_se->dl_deadline; 3223 dl_se->dl_period = attr->sched_period ?: dl_se->dl_deadline;
3258 dl_se->flags = attr->sched_flags; 3224 dl_se->flags = attr->sched_flags;
3259 dl_se->dl_bw = to_ratio(dl_se->dl_period, dl_se->dl_runtime); 3225 dl_se->dl_bw = to_ratio(dl_se->dl_period, dl_se->dl_runtime);
3260 dl_se->dl_throttled = 0; 3226
3261 dl_se->dl_new = 1; 3227 /*
3262 dl_se->dl_yielded = 0; 3228 * Changing the parameters of a task is 'tricky' and we're not doing
3229 * the correct thing -- also see task_dead_dl() and switched_from_dl().
3230 *
3231 * What we SHOULD do is delay the bandwidth release until the 0-lag
3232 * point. This would include retaining the task_struct until that time
3233 * and change dl_overflow() to not immediately decrement the current
3234 * amount.
3235 *
3236 * Instead we retain the current runtime/deadline and let the new
3237 * parameters take effect after the current reservation period lapses.
3238 * This is safe (albeit pessimistic) because the 0-lag point is always
3239 * before the current scheduling deadline.
3240 *
3241 * We can still have temporary overloads because we do not delay the
3242 * change in bandwidth until that time; so admission control is
3243 * not on the safe side. It does however guarantee tasks will never
3244 * consume more than promised.
3245 */
3263} 3246}
3264 3247
3265/* 3248/*
@@ -3382,6 +3365,20 @@ static bool check_same_owner(struct task_struct *p)
3382 return match; 3365 return match;
3383} 3366}
3384 3367
3368static bool dl_param_changed(struct task_struct *p,
3369 const struct sched_attr *attr)
3370{
3371 struct sched_dl_entity *dl_se = &p->dl;
3372
3373 if (dl_se->dl_runtime != attr->sched_runtime ||
3374 dl_se->dl_deadline != attr->sched_deadline ||
3375 dl_se->dl_period != attr->sched_period ||
3376 dl_se->flags != attr->sched_flags)
3377 return true;
3378
3379 return false;
3380}
3381
3385static int __sched_setscheduler(struct task_struct *p, 3382static int __sched_setscheduler(struct task_struct *p,
3386 const struct sched_attr *attr, 3383 const struct sched_attr *attr,
3387 bool user) 3384 bool user)
@@ -3510,7 +3507,7 @@ recheck:
3510 goto change; 3507 goto change;
3511 if (rt_policy(policy) && attr->sched_priority != p->rt_priority) 3508 if (rt_policy(policy) && attr->sched_priority != p->rt_priority)
3512 goto change; 3509 goto change;
3513 if (dl_policy(policy)) 3510 if (dl_policy(policy) && dl_param_changed(p, attr))
3514 goto change; 3511 goto change;
3515 3512
3516 p->sched_reset_on_fork = reset_on_fork; 3513 p->sched_reset_on_fork = reset_on_fork;
@@ -4202,17 +4199,10 @@ SYSCALL_DEFINE0(sched_yield)
4202 return 0; 4199 return 0;
4203} 4200}
4204 4201
4205static void __cond_resched(void)
4206{
4207 __preempt_count_add(PREEMPT_ACTIVE);
4208 __schedule();
4209 __preempt_count_sub(PREEMPT_ACTIVE);
4210}
4211
4212int __sched _cond_resched(void) 4202int __sched _cond_resched(void)
4213{ 4203{
4214 if (should_resched()) { 4204 if (should_resched()) {
4215 __cond_resched(); 4205 preempt_schedule_common();
4216 return 1; 4206 return 1;
4217 } 4207 }
4218 return 0; 4208 return 0;
@@ -4237,7 +4227,7 @@ int __cond_resched_lock(spinlock_t *lock)
4237 if (spin_needbreak(lock) || resched) { 4227 if (spin_needbreak(lock) || resched) {
4238 spin_unlock(lock); 4228 spin_unlock(lock);
4239 if (resched) 4229 if (resched)
4240 __cond_resched(); 4230 preempt_schedule_common();
4241 else 4231 else
4242 cpu_relax(); 4232 cpu_relax();
4243 ret = 1; 4233 ret = 1;
@@ -4253,7 +4243,7 @@ int __sched __cond_resched_softirq(void)
4253 4243
4254 if (should_resched()) { 4244 if (should_resched()) {
4255 local_bh_enable(); 4245 local_bh_enable();
4256 __cond_resched(); 4246 preempt_schedule_common();
4257 local_bh_disable(); 4247 local_bh_disable();
4258 return 1; 4248 return 1;
4259 } 4249 }
@@ -4368,36 +4358,29 @@ EXPORT_SYMBOL_GPL(yield_to);
4368 * This task is about to go to sleep on IO. Increment rq->nr_iowait so 4358 * This task is about to go to sleep on IO. Increment rq->nr_iowait so
4369 * that process accounting knows that this is a task in IO wait state. 4359 * that process accounting knows that this is a task in IO wait state.
4370 */ 4360 */
4371void __sched io_schedule(void)
4372{
4373 struct rq *rq = raw_rq();
4374
4375 delayacct_blkio_start();
4376 atomic_inc(&rq->nr_iowait);
4377 blk_flush_plug(current);
4378 current->in_iowait = 1;
4379 schedule();
4380 current->in_iowait = 0;
4381 atomic_dec(&rq->nr_iowait);
4382 delayacct_blkio_end();
4383}
4384EXPORT_SYMBOL(io_schedule);
4385
4386long __sched io_schedule_timeout(long timeout) 4361long __sched io_schedule_timeout(long timeout)
4387{ 4362{
4388 struct rq *rq = raw_rq(); 4363 int old_iowait = current->in_iowait;
4364 struct rq *rq;
4389 long ret; 4365 long ret;
4390 4366
4367 current->in_iowait = 1;
4368 if (old_iowait)
4369 blk_schedule_flush_plug(current);
4370 else
4371 blk_flush_plug(current);
4372
4391 delayacct_blkio_start(); 4373 delayacct_blkio_start();
4374 rq = raw_rq();
4392 atomic_inc(&rq->nr_iowait); 4375 atomic_inc(&rq->nr_iowait);
4393 blk_flush_plug(current);
4394 current->in_iowait = 1;
4395 ret = schedule_timeout(timeout); 4376 ret = schedule_timeout(timeout);
4396 current->in_iowait = 0; 4377 current->in_iowait = old_iowait;
4397 atomic_dec(&rq->nr_iowait); 4378 atomic_dec(&rq->nr_iowait);
4398 delayacct_blkio_end(); 4379 delayacct_blkio_end();
4380
4399 return ret; 4381 return ret;
4400} 4382}
4383EXPORT_SYMBOL(io_schedule_timeout);
4401 4384
4402/** 4385/**
4403 * sys_sched_get_priority_max - return maximum RT priority. 4386 * sys_sched_get_priority_max - return maximum RT priority.
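[Editor's note, not part of the patch] io_schedule_timeout() now remembers the old value of current->in_iowait and restores it instead of clearing it, so a caller that was already in iowait keeps that state across the sleep. A small userspace sketch of the save/restore idea (in_iowait, block_for_io and io_wait are illustrative):

#include <stdbool.h>
#include <stdio.h>

static _Thread_local bool in_iowait;

static void block_for_io(void)
{
	printf("sleeping, in_iowait=%d\n", in_iowait);
}

/* Save the previous flag and restore it instead of clearing it, so a nested
 * caller that already marked itself as in iowait keeps that state -- the
 * same fix io_schedule_timeout() applies with old_iowait above. */
static void io_wait(void)
{
	bool old = in_iowait;

	in_iowait = true;
	block_for_io();
	in_iowait = old;
}

int main(void)
{
	io_wait();			/* in_iowait stays false afterwards */
	in_iowait = true;
	io_wait();			/* in_iowait stays true afterwards */
	printf("final in_iowait=%d\n", in_iowait);
	return 0;
}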
@@ -4508,9 +4491,10 @@ void sched_show_task(struct task_struct *p)
4508{ 4491{
4509 unsigned long free = 0; 4492 unsigned long free = 0;
4510 int ppid; 4493 int ppid;
4511 unsigned state; 4494 unsigned long state = p->state;
4512 4495
4513 state = p->state ? __ffs(p->state) + 1 : 0; 4496 if (state)
4497 state = __ffs(state) + 1;
4514 printk(KERN_INFO "%-15.15s %c", p->comm, 4498 printk(KERN_INFO "%-15.15s %c", p->comm,
4515 state < sizeof(stat_nam) - 1 ? stat_nam[state] : '?'); 4499 state < sizeof(stat_nam) - 1 ? stat_nam[state] : '?');
4516#if BITS_PER_LONG == 32 4500#if BITS_PER_LONG == 32
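[Editor's note, not part of the patch] sched_show_task() now snapshots p->state once and derives the letter index from it with __ffs(). A sketch of that bitmask-to-letter mapping using the compiler builtin (stat_nam here is a made-up subset of the real state letters):

#include <stdio.h>

static const char stat_nam[] = "RSDTX";	/* made-up subset of state letters */

/* The state word is a bitmask; the lowest set bit selects the letter and 0
 * means running -- the same "__ffs(state) + 1" computation sched_show_task()
 * now performs on a local snapshot of the state. */
static char state_char(unsigned long state)
{
	unsigned long idx = state ? __builtin_ctzl(state) + 1 : 0;

	return idx < sizeof(stat_nam) - 1 ? stat_nam[idx] : '?';
}

int main(void)
{
	printf("%c %c %c %c\n",
	       state_char(0), state_char(1), state_char(4),
	       state_char(1UL << 20));	/* out of range -> '?' */
	return 0;
}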
@@ -4642,6 +4626,9 @@ int cpuset_cpumask_can_shrink(const struct cpumask *cur,
4642 struct dl_bw *cur_dl_b; 4626 struct dl_bw *cur_dl_b;
4643 unsigned long flags; 4627 unsigned long flags;
4644 4628
4629 if (!cpumask_weight(cur))
4630 return ret;
4631
4645 rcu_read_lock_sched(); 4632 rcu_read_lock_sched();
4646 cur_dl_b = dl_bw_of(cpumask_any(cur)); 4633 cur_dl_b = dl_bw_of(cpumask_any(cur));
4647 trial_cpus = cpumask_weight(trial); 4634 trial_cpus = cpumask_weight(trial);
@@ -4740,7 +4727,7 @@ static struct rq *move_queued_task(struct task_struct *p, int new_cpu)
4740 4727
4741void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask) 4728void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
4742{ 4729{
4743 if (p->sched_class && p->sched_class->set_cpus_allowed) 4730 if (p->sched_class->set_cpus_allowed)
4744 p->sched_class->set_cpus_allowed(p, new_mask); 4731 p->sched_class->set_cpus_allowed(p, new_mask);
4745 4732
4746 cpumask_copy(&p->cpus_allowed, new_mask); 4733 cpumask_copy(&p->cpus_allowed, new_mask);
@@ -5408,9 +5395,7 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
5408 struct cpumask *groupmask) 5395 struct cpumask *groupmask)
5409{ 5396{
5410 struct sched_group *group = sd->groups; 5397 struct sched_group *group = sd->groups;
5411 char str[256];
5412 5398
5413 cpulist_scnprintf(str, sizeof(str), sched_domain_span(sd));
5414 cpumask_clear(groupmask); 5399 cpumask_clear(groupmask);
5415 5400
5416 printk(KERN_DEBUG "%*s domain %d: ", level, "", level); 5401 printk(KERN_DEBUG "%*s domain %d: ", level, "", level);
@@ -5423,7 +5408,8 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
5423 return -1; 5408 return -1;
5424 } 5409 }
5425 5410
5426 printk(KERN_CONT "span %s level %s\n", str, sd->name); 5411 printk(KERN_CONT "span %*pbl level %s\n",
5412 cpumask_pr_args(sched_domain_span(sd)), sd->name);
5427 5413
5428 if (!cpumask_test_cpu(cpu, sched_domain_span(sd))) { 5414 if (!cpumask_test_cpu(cpu, sched_domain_span(sd))) {
5429 printk(KERN_ERR "ERROR: domain->span does not contain " 5415 printk(KERN_ERR "ERROR: domain->span does not contain "
@@ -5468,9 +5454,8 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
5468 5454
5469 cpumask_or(groupmask, groupmask, sched_group_cpus(group)); 5455 cpumask_or(groupmask, groupmask, sched_group_cpus(group));
5470 5456
5471 cpulist_scnprintf(str, sizeof(str), sched_group_cpus(group)); 5457 printk(KERN_CONT " %*pbl",
5472 5458 cpumask_pr_args(sched_group_cpus(group)));
5473 printk(KERN_CONT " %s", str);
5474 if (group->sgc->capacity != SCHED_CAPACITY_SCALE) { 5459 if (group->sgc->capacity != SCHED_CAPACITY_SCALE) {
5475 printk(KERN_CONT " (cpu_capacity = %d)", 5460 printk(KERN_CONT " (cpu_capacity = %d)",
5476 group->sgc->capacity); 5461 group->sgc->capacity);
@@ -7250,6 +7235,11 @@ void __init sched_init(void)
7250 enter_lazy_tlb(&init_mm, current); 7235 enter_lazy_tlb(&init_mm, current);
7251 7236
7252 /* 7237 /*
7238 * During early bootup we pretend to be a normal task:
7239 */
7240 current->sched_class = &fair_sched_class;
7241
7242 /*
7253 * Make us the idle thread. Technically, schedule() should not be 7243 * Make us the idle thread. Technically, schedule() should not be
7254 * called from this thread, however somewhere below it might be, 7244 * called from this thread, however somewhere below it might be,
7255 * but because we are the idle thread, we just pick up running again 7245 * but because we are the idle thread, we just pick up running again
@@ -7259,11 +7249,6 @@ void __init sched_init(void)
7259 7249
7260 calc_load_update = jiffies + LOAD_FREQ; 7250 calc_load_update = jiffies + LOAD_FREQ;
7261 7251
7262 /*
7263 * During early bootup we pretend to be a normal task:
7264 */
7265 current->sched_class = &fair_sched_class;
7266
7267#ifdef CONFIG_SMP 7252#ifdef CONFIG_SMP
7268 zalloc_cpumask_var(&sched_domains_tmpmask, GFP_NOWAIT); 7253 zalloc_cpumask_var(&sched_domains_tmpmask, GFP_NOWAIT);
7269 /* May be allocated at isolcpus cmdline parse time */ 7254 /* May be allocated at isolcpus cmdline parse time */
@@ -7324,6 +7309,9 @@ void ___might_sleep(const char *file, int line, int preempt_offset)
7324 in_atomic(), irqs_disabled(), 7309 in_atomic(), irqs_disabled(),
7325 current->pid, current->comm); 7310 current->pid, current->comm);
7326 7311
7312 if (task_stack_end_corrupted(current))
7313 printk(KERN_EMERG "Thread overran stack, or stack corrupted\n");
7314
7327 debug_show_held_locks(current); 7315 debug_show_held_locks(current);
7328 if (irqs_disabled()) 7316 if (irqs_disabled())
7329 print_irqtrace_events(current); 7317 print_irqtrace_events(current);
@@ -7587,6 +7575,12 @@ static inline int tg_has_rt_tasks(struct task_group *tg)
7587{ 7575{
7588 struct task_struct *g, *p; 7576 struct task_struct *g, *p;
7589 7577
7578 /*
7579 * Autogroups do not have RT tasks; see autogroup_create().
7580 */
7581 if (task_group_is_autogroup(tg))
7582 return 0;
7583
7590 for_each_process_thread(g, p) { 7584 for_each_process_thread(g, p) {
7591 if (rt_task(p) && task_group(p) == tg) 7585 if (rt_task(p) && task_group(p) == tg)
7592 return 1; 7586 return 1;
@@ -7679,6 +7673,17 @@ static int tg_set_rt_bandwidth(struct task_group *tg,
7679{ 7673{
7680 int i, err = 0; 7674 int i, err = 0;
7681 7675
7676 /*
7677 * Disallowing the root group RT runtime is BAD, it would disallow the
7678 * kernel creating (and or operating) RT threads.
7679 */
7680 if (tg == &root_task_group && rt_runtime == 0)
7681 return -EINVAL;
7682
7683 /* No period doesn't make any sense. */
7684 if (rt_period == 0)
7685 return -EINVAL;
7686
7682 mutex_lock(&rt_constraints_mutex); 7687 mutex_lock(&rt_constraints_mutex);
7683 read_lock(&tasklist_lock); 7688 read_lock(&tasklist_lock);
7684 err = __rt_schedulable(tg, rt_period, rt_runtime); 7689 err = __rt_schedulable(tg, rt_period, rt_runtime);
@@ -7735,9 +7740,6 @@ static int sched_group_set_rt_period(struct task_group *tg, long rt_period_us)
7735 rt_period = (u64)rt_period_us * NSEC_PER_USEC; 7740 rt_period = (u64)rt_period_us * NSEC_PER_USEC;
7736 rt_runtime = tg->rt_bandwidth.rt_runtime; 7741 rt_runtime = tg->rt_bandwidth.rt_runtime;
7737 7742
7738 if (rt_period == 0)
7739 return -EINVAL;
7740
7741 return tg_set_rt_bandwidth(tg, rt_period, rt_runtime); 7743 return tg_set_rt_bandwidth(tg, rt_period, rt_runtime);
7742} 7744}
7743 7745
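[Editor's note, not part of the patch] tg_set_rt_bandwidth() above gains two up-front checks: a zero period is rejected in the one place both setters funnel through, and the root group can no longer be given zero runtime. A hedged sketch of centralizing that validation (task_group_like and set_rt_bandwidth are illustrative names; the negative-errno return mirrors the kernel's convention):

#include <errno.h>
#include <stdbool.h>
#include <stdio.h>

struct task_group_like {
	bool is_root;
};

/* Put the sanity checks in the one setter both the runtime and the period
 * paths call, as tg_set_rt_bandwidth() now does: a zero period never makes
 * sense, and zero runtime for the root group would keep the kernel itself
 * from creating or running RT threads. */
static int set_rt_bandwidth(struct task_group_like *tg,
			    unsigned long long period_ns,
			    unsigned long long runtime_ns)
{
	if (tg->is_root && runtime_ns == 0)
		return -EINVAL;
	if (period_ns == 0)
		return -EINVAL;
	/* ... apply the new bandwidth ... */
	return 0;
}

int main(void)
{
	struct task_group_like root = { true }, child = { false };

	printf("%d %d %d\n",
	       set_rt_bandwidth(&root, 1000000, 0),		/* -EINVAL */
	       set_rt_bandwidth(&child, 0, 500000),		/* -EINVAL */
	       set_rt_bandwidth(&child, 1000000, 500000));	/* 0 */
	return 0;
}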
diff --git a/kernel/sched/cpudeadline.c b/kernel/sched/cpudeadline.c
index 539ca3ce071b..c6acb07466bb 100644
--- a/kernel/sched/cpudeadline.c
+++ b/kernel/sched/cpudeadline.c
@@ -107,7 +107,8 @@ int cpudl_find(struct cpudl *cp, struct task_struct *p,
107 int best_cpu = -1; 107 int best_cpu = -1;
108 const struct sched_dl_entity *dl_se = &p->dl; 108 const struct sched_dl_entity *dl_se = &p->dl;
109 109
110 if (later_mask && cpumask_and(later_mask, later_mask, cp->free_cpus)) { 110 if (later_mask &&
111 cpumask_and(later_mask, cp->free_cpus, &p->cpus_allowed)) {
111 best_cpu = cpumask_any(later_mask); 112 best_cpu = cpumask_any(later_mask);
112 goto out; 113 goto out;
113 } else if (cpumask_test_cpu(cpudl_maximum(cp), &p->cpus_allowed) && 114 } else if (cpumask_test_cpu(cpudl_maximum(cp), &p->cpus_allowed) &&
@@ -186,6 +187,26 @@ out:
186} 187}
187 188
188/* 189/*
190 * cpudl_set_freecpu - Set the cpudl.free_cpus
191 * @cp: the cpudl max-heap context
192 * @cpu: rd attached cpu
193 */
194void cpudl_set_freecpu(struct cpudl *cp, int cpu)
195{
196 cpumask_set_cpu(cpu, cp->free_cpus);
197}
198
199/*
200 * cpudl_clear_freecpu - Clear the cpudl.free_cpus
201 * @cp: the cpudl max-heap context
202 * @cpu: rd attached cpu
203 */
204void cpudl_clear_freecpu(struct cpudl *cp, int cpu)
205{
206 cpumask_clear_cpu(cpu, cp->free_cpus);
207}
208
209/*
189 * cpudl_init - initialize the cpudl structure 210 * cpudl_init - initialize the cpudl structure
190 * @cp: the cpudl max-heap context 211 * @cp: the cpudl max-heap context
191 */ 212 */
@@ -203,7 +224,7 @@ int cpudl_init(struct cpudl *cp)
203 if (!cp->elements) 224 if (!cp->elements)
204 return -ENOMEM; 225 return -ENOMEM;
205 226
206 if (!alloc_cpumask_var(&cp->free_cpus, GFP_KERNEL)) { 227 if (!zalloc_cpumask_var(&cp->free_cpus, GFP_KERNEL)) {
207 kfree(cp->elements); 228 kfree(cp->elements);
208 return -ENOMEM; 229 return -ENOMEM;
209 } 230 }
@@ -211,8 +232,6 @@ int cpudl_init(struct cpudl *cp)
211 for_each_possible_cpu(i) 232 for_each_possible_cpu(i)
212 cp->elements[i].idx = IDX_INVALID; 233 cp->elements[i].idx = IDX_INVALID;
213 234
214 cpumask_setall(cp->free_cpus);
215
216 return 0; 235 return 0;
217} 236}
218 237
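[Editor's note, not part of the patch] The cpudl free_cpus mask is now zero-initialized (zalloc_cpumask_var) and CPUs are added or removed explicitly as their runqueues go online or offline; cpudl_find() then intersects it with the task's affinity. A rough sketch with a plain bitmask standing in for the cpumask (set_freecpu, clear_freecpu and find_free_allowed are illustrative):

#include <stdio.h>

/* free_cpus starts out all-clear and CPUs are added or removed explicitly,
 * mirroring cpudl_set_freecpu()/cpudl_clear_freecpu(). */
static unsigned int free_cpus;

static void set_freecpu(int cpu)
{
	free_cpus |= 1u << cpu;
}

static void clear_freecpu(int cpu)
{
	free_cpus &= ~(1u << cpu);
}

/* cpudl_find() first intersects free_cpus with the task's affinity mask and
 * only falls back to the heap maximum if that intersection is empty. */
static int find_free_allowed(unsigned int allowed)
{
	unsigned int later = free_cpus & allowed;

	return later ? __builtin_ctz(later) : -1;
}

int main(void)
{
	set_freecpu(2);
	set_freecpu(5);
	printf("%d\n", find_free_allowed(1u << 5));	/* 5 */
	clear_freecpu(5);
	printf("%d\n", find_free_allowed(1u << 5));	/* -1: no free CPU left */
	return 0;
}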
diff --git a/kernel/sched/cpudeadline.h b/kernel/sched/cpudeadline.h
index 020039bd1326..1a0a6ef2fbe1 100644
--- a/kernel/sched/cpudeadline.h
+++ b/kernel/sched/cpudeadline.h
@@ -24,6 +24,8 @@ int cpudl_find(struct cpudl *cp, struct task_struct *p,
24 struct cpumask *later_mask); 24 struct cpumask *later_mask);
25void cpudl_set(struct cpudl *cp, int cpu, u64 dl, int is_valid); 25void cpudl_set(struct cpudl *cp, int cpu, u64 dl, int is_valid);
26int cpudl_init(struct cpudl *cp); 26int cpudl_init(struct cpudl *cp);
27void cpudl_set_freecpu(struct cpudl *cp, int cpu);
28void cpudl_clear_freecpu(struct cpudl *cp, int cpu);
27void cpudl_cleanup(struct cpudl *cp); 29void cpudl_cleanup(struct cpudl *cp);
28#endif /* CONFIG_SMP */ 30#endif /* CONFIG_SMP */
29 31
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index b52092f2636d..3fa8fa6d9403 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -350,6 +350,11 @@ static void replenish_dl_entity(struct sched_dl_entity *dl_se,
350 dl_se->deadline = rq_clock(rq) + pi_se->dl_deadline; 350 dl_se->deadline = rq_clock(rq) + pi_se->dl_deadline;
351 dl_se->runtime = pi_se->dl_runtime; 351 dl_se->runtime = pi_se->dl_runtime;
352 } 352 }
353
354 if (dl_se->dl_yielded)
355 dl_se->dl_yielded = 0;
356 if (dl_se->dl_throttled)
357 dl_se->dl_throttled = 0;
353} 358}
354 359
355/* 360/*
@@ -506,16 +511,10 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer *timer)
506 struct sched_dl_entity, 511 struct sched_dl_entity,
507 dl_timer); 512 dl_timer);
508 struct task_struct *p = dl_task_of(dl_se); 513 struct task_struct *p = dl_task_of(dl_se);
514 unsigned long flags;
509 struct rq *rq; 515 struct rq *rq;
510again:
511 rq = task_rq(p);
512 raw_spin_lock(&rq->lock);
513 516
514 if (rq != task_rq(p)) { 517 rq = task_rq_lock(current, &flags);
515 /* Task was moved, retrying. */
516 raw_spin_unlock(&rq->lock);
517 goto again;
518 }
519 518
520 /* 519 /*
521 * We need to take care of several possible races here: 520 * We need to take care of several possible races here:
@@ -536,25 +535,41 @@ again:
536 535
537 sched_clock_tick(); 536 sched_clock_tick();
538 update_rq_clock(rq); 537 update_rq_clock(rq);
539 dl_se->dl_throttled = 0; 538
540 dl_se->dl_yielded = 0; 539 /*
541 if (task_on_rq_queued(p)) { 540 * If the throttle happened during sched-out; like:
542 enqueue_task_dl(rq, p, ENQUEUE_REPLENISH); 541 *
543 if (dl_task(rq->curr)) 542 * schedule()
544 check_preempt_curr_dl(rq, p, 0); 543 * deactivate_task()
545 else 544 * dequeue_task_dl()
546 resched_curr(rq); 545 * update_curr_dl()
546 * start_dl_timer()
547 * __dequeue_task_dl()
548 * prev->on_rq = 0;
549 *
550 * We can be both throttled and !queued. Replenish the counter
551 * but do not enqueue -- wait for our wakeup to do that.
552 */
553 if (!task_on_rq_queued(p)) {
554 replenish_dl_entity(dl_se, dl_se);
555 goto unlock;
556 }
557
558 enqueue_task_dl(rq, p, ENQUEUE_REPLENISH);
559 if (dl_task(rq->curr))
560 check_preempt_curr_dl(rq, p, 0);
561 else
562 resched_curr(rq);
547#ifdef CONFIG_SMP 563#ifdef CONFIG_SMP
548 /* 564 /*
549 * Queueing this task back might have overloaded rq, 565 * Queueing this task back might have overloaded rq,
550 * check if we need to kick someone away. 566 * check if we need to kick someone away.
551 */ 567 */
552 if (has_pushable_dl_tasks(rq)) 568 if (has_pushable_dl_tasks(rq))
553 push_dl_task(rq); 569 push_dl_task(rq);
554#endif 570#endif
555 }
556unlock: 571unlock:
557 raw_spin_unlock(&rq->lock); 572 task_rq_unlock(rq, current, &flags);
558 573
559 return HRTIMER_NORESTART; 574 return HRTIMER_NORESTART;
560} 575}
@@ -613,10 +628,9 @@ static void update_curr_dl(struct rq *rq)
613 628
614 dl_se->runtime -= dl_se->dl_yielded ? 0 : delta_exec; 629 dl_se->runtime -= dl_se->dl_yielded ? 0 : delta_exec;
615 if (dl_runtime_exceeded(rq, dl_se)) { 630 if (dl_runtime_exceeded(rq, dl_se)) {
631 dl_se->dl_throttled = 1;
616 __dequeue_task_dl(rq, curr, 0); 632 __dequeue_task_dl(rq, curr, 0);
617 if (likely(start_dl_timer(dl_se, curr->dl.dl_boosted))) 633 if (unlikely(!start_dl_timer(dl_se, curr->dl.dl_boosted)))
618 dl_se->dl_throttled = 1;
619 else
620 enqueue_task_dl(rq, curr, ENQUEUE_REPLENISH); 634 enqueue_task_dl(rq, curr, ENQUEUE_REPLENISH);
621 635
622 if (!is_leftmost(curr, &rq->dl)) 636 if (!is_leftmost(curr, &rq->dl))
@@ -853,7 +867,7 @@ static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags)
853 * its rq, the bandwidth timer callback (which clearly has not 867 * its rq, the bandwidth timer callback (which clearly has not
854 * run yet) will take care of this. 868 * run yet) will take care of this.
855 */ 869 */
856 if (p->dl.dl_throttled) 870 if (p->dl.dl_throttled && !(flags & ENQUEUE_REPLENISH))
857 return; 871 return;
858 872
859 enqueue_dl_entity(&p->dl, pi_se, flags); 873 enqueue_dl_entity(&p->dl, pi_se, flags);
@@ -898,6 +912,7 @@ static void yield_task_dl(struct rq *rq)
898 rq->curr->dl.dl_yielded = 1; 912 rq->curr->dl.dl_yielded = 1;
899 p->dl.runtime = 0; 913 p->dl.runtime = 0;
900 } 914 }
915 update_rq_clock(rq);
901 update_curr_dl(rq); 916 update_curr_dl(rq);
902} 917}
903 918
@@ -1073,7 +1088,13 @@ static void task_tick_dl(struct rq *rq, struct task_struct *p, int queued)
1073{ 1088{
1074 update_curr_dl(rq); 1089 update_curr_dl(rq);
1075 1090
1076 if (hrtick_enabled(rq) && queued && p->dl.runtime > 0) 1091 /*
1092 * Even when we have runtime, update_curr_dl() might have resulted in us
1093 * not being the leftmost task anymore. In that case NEED_RESCHED will
1094 * be set and schedule() will start a new hrtick for the next task.
1095 */
1096 if (hrtick_enabled(rq) && queued && p->dl.runtime > 0 &&
1097 is_leftmost(p, &rq->dl))
1077 start_hrtick_dl(rq, p); 1098 start_hrtick_dl(rq, p);
1078} 1099}
1079 1100
@@ -1094,6 +1115,7 @@ static void task_dead_dl(struct task_struct *p)
1094 * Since we are TASK_DEAD we won't slip out of the domain! 1115 * Since we are TASK_DEAD we won't slip out of the domain!
1095 */ 1116 */
1096 raw_spin_lock_irq(&dl_b->lock); 1117 raw_spin_lock_irq(&dl_b->lock);
1118 /* XXX we should retain the bw until 0-lag */
1097 dl_b->total_bw -= p->dl.dl_bw; 1119 dl_b->total_bw -= p->dl.dl_bw;
1098 raw_spin_unlock_irq(&dl_b->lock); 1120 raw_spin_unlock_irq(&dl_b->lock);
1099 1121
@@ -1165,9 +1187,6 @@ static int find_later_rq(struct task_struct *task)
1165 * We have to consider system topology and task affinity 1187 * We have to consider system topology and task affinity
1166 * first, then we can look for a suitable cpu. 1188 * first, then we can look for a suitable cpu.
1167 */ 1189 */
1168 cpumask_copy(later_mask, task_rq(task)->rd->span);
1169 cpumask_and(later_mask, later_mask, cpu_active_mask);
1170 cpumask_and(later_mask, later_mask, &task->cpus_allowed);
1171 best_cpu = cpudl_find(&task_rq(task)->rd->cpudl, 1190 best_cpu = cpudl_find(&task_rq(task)->rd->cpudl,
1172 task, later_mask); 1191 task, later_mask);
1173 if (best_cpu == -1) 1192 if (best_cpu == -1)
@@ -1562,6 +1581,7 @@ static void rq_online_dl(struct rq *rq)
1562 if (rq->dl.overloaded) 1581 if (rq->dl.overloaded)
1563 dl_set_overload(rq); 1582 dl_set_overload(rq);
1564 1583
1584 cpudl_set_freecpu(&rq->rd->cpudl, rq->cpu);
1565 if (rq->dl.dl_nr_running > 0) 1585 if (rq->dl.dl_nr_running > 0)
1566 cpudl_set(&rq->rd->cpudl, rq->cpu, rq->dl.earliest_dl.curr, 1); 1586 cpudl_set(&rq->rd->cpudl, rq->cpu, rq->dl.earliest_dl.curr, 1);
1567} 1587}
@@ -1573,6 +1593,7 @@ static void rq_offline_dl(struct rq *rq)
1573 dl_clear_overload(rq); 1593 dl_clear_overload(rq);
1574 1594
1575 cpudl_set(&rq->rd->cpudl, rq->cpu, 0, 0); 1595 cpudl_set(&rq->rd->cpudl, rq->cpu, 0, 0);
1596 cpudl_clear_freecpu(&rq->rd->cpudl, rq->cpu);
1576} 1597}
1577 1598
1578void init_sched_dl_class(void) 1599void init_sched_dl_class(void)
@@ -1614,8 +1635,8 @@ static void cancel_dl_timer(struct rq *rq, struct task_struct *p)
1614 1635
1615static void switched_from_dl(struct rq *rq, struct task_struct *p) 1636static void switched_from_dl(struct rq *rq, struct task_struct *p)
1616{ 1637{
1638 /* XXX we should retain the bw until 0-lag */
1617 cancel_dl_timer(rq, p); 1639 cancel_dl_timer(rq, p);
1618
1619 __dl_clear_params(p); 1640 __dl_clear_params(p);
1620 1641
1621 /* 1642 /*
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 92cc52001e74..8baaf858d25c 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -305,6 +305,7 @@ do { \
305 PN(next_balance); 305 PN(next_balance);
306 SEQ_printf(m, " .%-30s: %ld\n", "curr->pid", (long)(task_pid_nr(rq->curr))); 306 SEQ_printf(m, " .%-30s: %ld\n", "curr->pid", (long)(task_pid_nr(rq->curr)));
307 PN(clock); 307 PN(clock);
308 PN(clock_task);
308 P(cpu_load[0]); 309 P(cpu_load[0]);
309 P(cpu_load[1]); 310 P(cpu_load[1]);
310 P(cpu_load[2]); 311 P(cpu_load[2]);
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 40667cbf371b..7ce18f3c097a 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -676,7 +676,6 @@ void init_task_runnable_average(struct task_struct *p)
676{ 676{
677 u32 slice; 677 u32 slice;
678 678
679 p->se.avg.decay_count = 0;
680 slice = sched_slice(task_cfs_rq(p), &p->se) >> 10; 679 slice = sched_slice(task_cfs_rq(p), &p->se) >> 10;
681 p->se.avg.runnable_avg_sum = slice; 680 p->se.avg.runnable_avg_sum = slice;
682 p->se.avg.runnable_avg_period = slice; 681 p->se.avg.runnable_avg_period = slice;
@@ -1730,7 +1729,7 @@ static int preferred_group_nid(struct task_struct *p, int nid)
1730 nodes = node_online_map; 1729 nodes = node_online_map;
1731 for (dist = sched_max_numa_distance; dist > LOCAL_DISTANCE; dist--) { 1730 for (dist = sched_max_numa_distance; dist > LOCAL_DISTANCE; dist--) {
1732 unsigned long max_faults = 0; 1731 unsigned long max_faults = 0;
1733 nodemask_t max_group; 1732 nodemask_t max_group = NODE_MASK_NONE;
1734 int a, b; 1733 int a, b;
1735 1734
1736 /* Are there nodes at this distance from each other? */ 1735 /* Are there nodes at this distance from each other? */
@@ -2574,11 +2573,11 @@ static inline u64 __synchronize_entity_decay(struct sched_entity *se)
2574 u64 decays = atomic64_read(&cfs_rq->decay_counter); 2573 u64 decays = atomic64_read(&cfs_rq->decay_counter);
2575 2574
2576 decays -= se->avg.decay_count; 2575 decays -= se->avg.decay_count;
2576 se->avg.decay_count = 0;
2577 if (!decays) 2577 if (!decays)
2578 return 0; 2578 return 0;
2579 2579
2580 se->avg.load_avg_contrib = decay_load(se->avg.load_avg_contrib, decays); 2580 se->avg.load_avg_contrib = decay_load(se->avg.load_avg_contrib, decays);
2581 se->avg.decay_count = 0;
2582 2581
2583 return decays; 2582 return decays;
2584} 2583}
@@ -5157,7 +5156,7 @@ static void yield_task_fair(struct rq *rq)
5157 * so we don't do microscopic update in schedule() 5156 * so we don't do microscopic update in schedule()
5158 * and double the fastpath cost. 5157 * and double the fastpath cost.
5159 */ 5158 */
5160 rq->skip_clock_update = 1; 5159 rq_clock_skip_update(rq, true);
5161 } 5160 }
5162 5161
5163 set_skip_buddy(se); 5162 set_skip_buddy(se);
@@ -5949,8 +5948,8 @@ static unsigned long scale_rt_capacity(int cpu)
5949 */ 5948 */
5950 age_stamp = ACCESS_ONCE(rq->age_stamp); 5949 age_stamp = ACCESS_ONCE(rq->age_stamp);
5951 avg = ACCESS_ONCE(rq->rt_avg); 5950 avg = ACCESS_ONCE(rq->rt_avg);
5951 delta = __rq_clock_broken(rq) - age_stamp;
5952 5952
5953 delta = rq_clock(rq) - age_stamp;
5954 if (unlikely(delta < 0)) 5953 if (unlikely(delta < 0))
5955 delta = 0; 5954 delta = 0;
5956 5955
diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c
index c47fce75e666..94b2d7b88a27 100644
--- a/kernel/sched/idle.c
+++ b/kernel/sched/idle.c
@@ -7,6 +7,7 @@
7#include <linux/tick.h> 7#include <linux/tick.h>
8#include <linux/mm.h> 8#include <linux/mm.h>
9#include <linux/stackprotector.h> 9#include <linux/stackprotector.h>
10#include <linux/suspend.h>
10 11
11#include <asm/tlb.h> 12#include <asm/tlb.h>
12 13
@@ -47,7 +48,8 @@ static inline int cpu_idle_poll(void)
47 rcu_idle_enter(); 48 rcu_idle_enter();
48 trace_cpu_idle_rcuidle(0, smp_processor_id()); 49 trace_cpu_idle_rcuidle(0, smp_processor_id());
49 local_irq_enable(); 50 local_irq_enable();
50 while (!tif_need_resched()) 51 while (!tif_need_resched() &&
52 (cpu_idle_force_poll || tick_check_broadcast_expired()))
51 cpu_relax(); 53 cpu_relax();
52 trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, smp_processor_id()); 54 trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, smp_processor_id());
53 rcu_idle_exit(); 55 rcu_idle_exit();
@@ -104,6 +106,21 @@ static void cpuidle_idle_call(void)
104 rcu_idle_enter(); 106 rcu_idle_enter();
105 107
106 /* 108 /*
109 * Suspend-to-idle ("freeze") is a system state in which all user space
110 * has been frozen, all I/O devices have been suspended and the only
111 * activity happens here and in iterrupts (if any). In that case bypass
 111 * activity happens here and in interrupts (if any). In that case bypass
 112 * the cpuidle governor and go straight for the deepest idle state
112 * the cpuidle governor and go stratight for the deepest idle state
113 * available. Possibly also suspend the local tick and the entire
114 * timekeeping to prevent timer interrupts from kicking us out of idle
115 * until a proper wakeup interrupt happens.
116 */
117 if (idle_should_freeze()) {
118 cpuidle_enter_freeze();
119 local_irq_enable();
120 goto exit_idle;
121 }
122
123 /*
107 * Ask the cpuidle framework to choose a convenient idle state. 124 * Ask the cpuidle framework to choose a convenient idle state.
108 * Fall back to the default arch idle method on errors. 125 * Fall back to the default arch idle method on errors.
109 */ 126 */
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index ee15f5a0d1c1..f4d4b077eba0 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -831,11 +831,14 @@ static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun)
831 enqueue = 1; 831 enqueue = 1;
832 832
833 /* 833 /*
834 * Force a clock update if the CPU was idle, 834 * When we're idle and a woken (rt) task is
835 * lest wakeup -> unthrottle time accumulate. 835 * throttled check_preempt_curr() will set
836 * skip_update and the time between the wakeup
837 * and this unthrottle will get accounted as
838 * 'runtime'.
836 */ 839 */
837 if (rt_rq->rt_nr_running && rq->curr == rq->idle) 840 if (rt_rq->rt_nr_running && rq->curr == rq->idle)
838 rq->skip_clock_update = -1; 841 rq_clock_skip_update(rq, false);
839 } 842 }
840 if (rt_rq->rt_time || rt_rq->rt_nr_running) 843 if (rt_rq->rt_time || rt_rq->rt_nr_running)
841 idle = 0; 844 idle = 0;
@@ -1337,7 +1340,12 @@ select_task_rq_rt(struct task_struct *p, int cpu, int sd_flag, int flags)
1337 curr->prio <= p->prio)) { 1340 curr->prio <= p->prio)) {
1338 int target = find_lowest_rq(p); 1341 int target = find_lowest_rq(p);
1339 1342
1340 if (target != -1) 1343 /*
1344 * Don't bother moving it if the destination CPU is
1345 * not running a lower priority task.
1346 */
1347 if (target != -1 &&
1348 p->prio < cpu_rq(target)->rt.highest_prio.curr)
1341 cpu = target; 1349 cpu = target;
1342 } 1350 }
1343 rcu_read_unlock(); 1351 rcu_read_unlock();
@@ -1614,6 +1622,16 @@ static struct rq *find_lock_lowest_rq(struct task_struct *task, struct rq *rq)
1614 1622
1615 lowest_rq = cpu_rq(cpu); 1623 lowest_rq = cpu_rq(cpu);
1616 1624
1625 if (lowest_rq->rt.highest_prio.curr <= task->prio) {
1626 /*
1627 * Target rq has tasks of equal or higher priority,
1628 * retrying does not release any lock and is unlikely
1629 * to yield a different result.
1630 */
1631 lowest_rq = NULL;
1632 break;
1633 }
1634
1617 /* if the prio of this runqueue changed, try again */ 1635 /* if the prio of this runqueue changed, try again */
1618 if (double_lock_balance(rq, lowest_rq)) { 1636 if (double_lock_balance(rq, lowest_rq)) {
1619 /* 1637 /*
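[Editor's note, not part of the patch] Both select_task_rq_rt() and find_lock_lowest_rq() now give up early unless the candidate CPU is running something of strictly lower priority than the task in hand. A trivial sketch of the check (lower numeric prio means higher priority, matching the kernel's convention; worth_pushing is an illustrative name):

#include <stdbool.h>
#include <stdio.h>

/* A wakeup is only redirected (and a push only attempted) when the task
 * beats whatever the target CPU is currently running. */
static bool worth_pushing(int task_prio, int target_highest_prio)
{
	return task_prio < target_highest_prio;
}

int main(void)
{
	printf("%d\n", worth_pushing(10, 20));	/* 1: target runs lower prio */
	printf("%d\n", worth_pushing(20, 10));	/* 0: leave the task in place */
	return 0;
}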
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 9a2a45c970e7..dc0f435a2779 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -558,8 +558,6 @@ struct rq {
558#ifdef CONFIG_NO_HZ_FULL 558#ifdef CONFIG_NO_HZ_FULL
559 unsigned long last_sched_tick; 559 unsigned long last_sched_tick;
560#endif 560#endif
561 int skip_clock_update;
562
563 /* capture load from *all* tasks on this cpu: */ 561 /* capture load from *all* tasks on this cpu: */
564 struct load_weight load; 562 struct load_weight load;
565 unsigned long nr_load_updates; 563 unsigned long nr_load_updates;
@@ -588,6 +586,7 @@ struct rq {
588 unsigned long next_balance; 586 unsigned long next_balance;
589 struct mm_struct *prev_mm; 587 struct mm_struct *prev_mm;
590 588
589 unsigned int clock_skip_update;
591 u64 clock; 590 u64 clock;
592 u64 clock_task; 591 u64 clock_task;
593 592
@@ -687,16 +686,35 @@ DECLARE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
687#define cpu_curr(cpu) (cpu_rq(cpu)->curr) 686#define cpu_curr(cpu) (cpu_rq(cpu)->curr)
688#define raw_rq() raw_cpu_ptr(&runqueues) 687#define raw_rq() raw_cpu_ptr(&runqueues)
689 688
689static inline u64 __rq_clock_broken(struct rq *rq)
690{
691 return ACCESS_ONCE(rq->clock);
692}
693
690static inline u64 rq_clock(struct rq *rq) 694static inline u64 rq_clock(struct rq *rq)
691{ 695{
696 lockdep_assert_held(&rq->lock);
692 return rq->clock; 697 return rq->clock;
693} 698}
694 699
695static inline u64 rq_clock_task(struct rq *rq) 700static inline u64 rq_clock_task(struct rq *rq)
696{ 701{
702 lockdep_assert_held(&rq->lock);
697 return rq->clock_task; 703 return rq->clock_task;
698} 704}
699 705
706#define RQCF_REQ_SKIP 0x01
707#define RQCF_ACT_SKIP 0x02
708
709static inline void rq_clock_skip_update(struct rq *rq, bool skip)
710{
711 lockdep_assert_held(&rq->lock);
712 if (skip)
713 rq->clock_skip_update |= RQCF_REQ_SKIP;
714 else
715 rq->clock_skip_update &= ~RQCF_REQ_SKIP;
716}
717
700#ifdef CONFIG_NUMA 718#ifdef CONFIG_NUMA
701enum numa_topology_type { 719enum numa_topology_type {
702 NUMA_DIRECT, 720 NUMA_DIRECT,
@@ -1362,6 +1380,82 @@ static inline void sched_avg_update(struct rq *rq) { }
1362 1380
1363extern void start_bandwidth_timer(struct hrtimer *period_timer, ktime_t period); 1381extern void start_bandwidth_timer(struct hrtimer *period_timer, ktime_t period);
1364 1382
1383/*
1384 * __task_rq_lock - lock the rq @p resides on.
1385 */
1386static inline struct rq *__task_rq_lock(struct task_struct *p)
1387 __acquires(rq->lock)
1388{
1389 struct rq *rq;
1390
1391 lockdep_assert_held(&p->pi_lock);
1392
1393 for (;;) {
1394 rq = task_rq(p);
1395 raw_spin_lock(&rq->lock);
1396 if (likely(rq == task_rq(p) && !task_on_rq_migrating(p)))
1397 return rq;
1398 raw_spin_unlock(&rq->lock);
1399
1400 while (unlikely(task_on_rq_migrating(p)))
1401 cpu_relax();
1402 }
1403}
1404
1405/*
1406 * task_rq_lock - lock p->pi_lock and lock the rq @p resides on.
1407 */
1408static inline struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags)
1409 __acquires(p->pi_lock)
1410 __acquires(rq->lock)
1411{
1412 struct rq *rq;
1413
1414 for (;;) {
1415 raw_spin_lock_irqsave(&p->pi_lock, *flags);
1416 rq = task_rq(p);
1417 raw_spin_lock(&rq->lock);
1418 /*
1419 * move_queued_task() task_rq_lock()
1420 *
1421 * ACQUIRE (rq->lock)
1422 * [S] ->on_rq = MIGRATING [L] rq = task_rq()
1423 * WMB (__set_task_cpu()) ACQUIRE (rq->lock);
1424 * [S] ->cpu = new_cpu [L] task_rq()
1425 * [L] ->on_rq
1426 * RELEASE (rq->lock)
1427 *
1428 * If we observe the old cpu in task_rq_lock, the acquire of
1429 * the old rq->lock will fully serialize against the stores.
1430 *
1431 * If we observe the new cpu in task_rq_lock, the acquire will
1432 * pair with the WMB to ensure we must then also see migrating.
1433 */
1434 if (likely(rq == task_rq(p) && !task_on_rq_migrating(p)))
1435 return rq;
1436 raw_spin_unlock(&rq->lock);
1437 raw_spin_unlock_irqrestore(&p->pi_lock, *flags);
1438
1439 while (unlikely(task_on_rq_migrating(p)))
1440 cpu_relax();
1441 }
1442}
1443
1444static inline void __task_rq_unlock(struct rq *rq)
1445 __releases(rq->lock)
1446{
1447 raw_spin_unlock(&rq->lock);
1448}
1449
1450static inline void
1451task_rq_unlock(struct rq *rq, struct task_struct *p, unsigned long *flags)
1452 __releases(rq->lock)
1453 __releases(p->pi_lock)
1454{
1455 raw_spin_unlock(&rq->lock);
1456 raw_spin_unlock_irqrestore(&p->pi_lock, *flags);
1457}
1458
1365#ifdef CONFIG_SMP 1459#ifdef CONFIG_SMP
1366#ifdef CONFIG_PREEMPT 1460#ifdef CONFIG_PREEMPT
1367 1461
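[Editor's note, not part of the patch] The single skip_clock_update flag becomes a two-bit scheme: rq_clock_skip_update() sets RQCF_REQ_SKIP, __schedule() promotes it to RQCF_ACT_SKIP with the "<<= 1" shown earlier, and only the active bit actually suppresses update_rq_clock(). A compact single-CPU sketch of that state machine (clock_val, update_rq_clock_sim and schedule_pass are stand-ins):

#include <stdio.h>

#define RQCF_REQ_SKIP	0x01	/* someone asked to skip the next update */
#define RQCF_ACT_SKIP	0x02	/* the skip is in force for this pass */

static unsigned int clock_skip_update;
static unsigned long long clock_val;

static void rq_clock_skip_update_req(int skip)
{
	if (skip)
		clock_skip_update |= RQCF_REQ_SKIP;
	else
		clock_skip_update &= ~RQCF_REQ_SKIP;
}

static void update_rq_clock_sim(void)
{
	/* only an *active* skip suppresses the update */
	if (clock_skip_update & RQCF_ACT_SKIP)
		return;
	clock_val++;
}

/* Inside __schedule(): promote a pending request to an active skip, so a
 * request made outside the scheduler covers exactly the next pass, then
 * clear everything before returning. */
static void schedule_pass(void)
{
	clock_skip_update <<= 1;	/* REQ -> ACT */
	update_rq_clock_sim();
	clock_skip_update = 0;
}

int main(void)
{
	schedule_pass();		/* clock advances */
	rq_clock_skip_update_req(1);
	update_rq_clock_sim();		/* a bare request does not block updates */
	schedule_pass();		/* skipped: REQ was promoted to ACT */
	schedule_pass();		/* advances again */
	printf("clock=%llu\n", clock_val);	/* prints clock=3 */
	return 0;
}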
diff --git a/kernel/sched/stats.c b/kernel/sched/stats.c
index a476bea17fbc..87e2c9f0c33e 100644
--- a/kernel/sched/stats.c
+++ b/kernel/sched/stats.c
@@ -15,11 +15,6 @@
15static int show_schedstat(struct seq_file *seq, void *v) 15static int show_schedstat(struct seq_file *seq, void *v)
16{ 16{
17 int cpu; 17 int cpu;
18 int mask_len = DIV_ROUND_UP(NR_CPUS, 32) * 9;
19 char *mask_str = kmalloc(mask_len, GFP_KERNEL);
20
21 if (mask_str == NULL)
22 return -ENOMEM;
23 18
24 if (v == (void *)1) { 19 if (v == (void *)1) {
25 seq_printf(seq, "version %d\n", SCHEDSTAT_VERSION); 20 seq_printf(seq, "version %d\n", SCHEDSTAT_VERSION);
@@ -50,9 +45,8 @@ static int show_schedstat(struct seq_file *seq, void *v)
50 for_each_domain(cpu, sd) { 45 for_each_domain(cpu, sd) {
51 enum cpu_idle_type itype; 46 enum cpu_idle_type itype;
52 47
53 cpumask_scnprintf(mask_str, mask_len, 48 seq_printf(seq, "domain%d %*pb", dcount++,
54 sched_domain_span(sd)); 49 cpumask_pr_args(sched_domain_span(sd)));
55 seq_printf(seq, "domain%d %s", dcount++, mask_str);
56 for (itype = CPU_IDLE; itype < CPU_MAX_IDLE_TYPES; 50 for (itype = CPU_IDLE; itype < CPU_MAX_IDLE_TYPES;
57 itype++) { 51 itype++) {
58 seq_printf(seq, " %u %u %u %u %u %u %u %u", 52 seq_printf(seq, " %u %u %u %u %u %u %u %u",
@@ -76,7 +70,6 @@ static int show_schedstat(struct seq_file *seq, void *v)
76 rcu_read_unlock(); 70 rcu_read_unlock();
77#endif 71#endif
78 } 72 }
79 kfree(mask_str);
80 return 0; 73 return 0;
81} 74}
82 75
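[Editor's note, not part of the patch] show_schedstat() (and sched_domain_debug_one() earlier) drop their temporary buffers in favour of the new %*pb/%*pbl printk extensions that format a cpumask directly. A rough userspace sketch of the kind of "cpu list" output %*pbl produces (print_cpulist is illustrative, not the kernel formatter):

#include <stdio.h>

/* Format a bitmask as a cpu list such as "0-2,5", roughly what %*pbl emits,
 * so callers no longer need to allocate a string buffer just to print it. */
static void print_cpulist(unsigned long mask)
{
	int cpu = 0, first = 1;

	while (mask) {
		if (mask & 1) {
			int start = cpu;

			while ((mask >> 1) & 1) {	/* extend the run */
				mask >>= 1;
				cpu++;
			}
			printf("%s%d", first ? "" : ",", start);
			if (cpu > start)
				printf("-%d", cpu);
			first = 0;
		}
		mask >>= 1;
		cpu++;
	}
	printf("\n");
}

int main(void)
{
	print_cpulist(0x27);	/* prints 0-2,5 */
	return 0;
}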
diff --git a/kernel/seccomp.c b/kernel/seccomp.c
index 4ef9687ac115..4f44028943e6 100644
--- a/kernel/seccomp.c
+++ b/kernel/seccomp.c
@@ -629,7 +629,9 @@ static u32 __seccomp_phase1_filter(int this_syscall, struct seccomp_data *sd)
629 629
630 switch (action) { 630 switch (action) {
631 case SECCOMP_RET_ERRNO: 631 case SECCOMP_RET_ERRNO:
632 /* Set the low-order 16-bits as a errno. */ 632 /* Set low-order bits as an errno, capped at MAX_ERRNO. */
633 if (data > MAX_ERRNO)
634 data = MAX_ERRNO;
633 syscall_set_return_value(current, task_pt_regs(current), 635 syscall_set_return_value(current, task_pt_regs(current),
634 -data, 0); 636 -data, 0);
635 goto skip; 637 goto skip;
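[Editor's note, not part of the patch] The SECCOMP_RET_ERRNO path now caps the filter-supplied value at MAX_ERRNO before negating it, so a filter cannot smuggle an out-of-range value into the syscall return. A small sketch of the clamp (seccomp_errno_return is an illustrative name; MAX_ERRNO is 4095 as in the kernel):

#include <stdio.h>

#define MAX_ERRNO 4095	/* largest value the kernel treats as a valid errno */

/* Clamp the filter data before negation so the result always stays inside
 * the range the rest of the kernel recognises as an error code. */
static long seccomp_errno_return(unsigned int data)
{
	if (data > MAX_ERRNO)
		data = MAX_ERRNO;
	return -(long)data;
}

int main(void)
{
	printf("%ld\n", seccomp_errno_return(1));	/* -1 */
	printf("%ld\n", seccomp_errno_return(100000));	/* -4095 */
	return 0;
}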
diff --git a/kernel/signal.c b/kernel/signal.c
index 16a305295256..a390499943e4 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -2501,7 +2501,7 @@ EXPORT_SYMBOL(unblock_all_signals);
2501 */ 2501 */
2502SYSCALL_DEFINE0(restart_syscall) 2502SYSCALL_DEFINE0(restart_syscall)
2503{ 2503{
2504 struct restart_block *restart = &current_thread_info()->restart_block; 2504 struct restart_block *restart = &current->restart_block;
2505 return restart->fn(restart); 2505 return restart->fn(restart);
2506} 2506}
2507 2507
@@ -3550,7 +3550,7 @@ SYSCALL_DEFINE2(signal, int, sig, __sighandler_t, handler)
3550SYSCALL_DEFINE0(pause) 3550SYSCALL_DEFINE0(pause)
3551{ 3551{
3552 while (!signal_pending(current)) { 3552 while (!signal_pending(current)) {
3553 current->state = TASK_INTERRUPTIBLE; 3553 __set_current_state(TASK_INTERRUPTIBLE);
3554 schedule(); 3554 schedule();
3555 } 3555 }
3556 return -ERESTARTNOHAND; 3556 return -ERESTARTNOHAND;
@@ -3563,7 +3563,7 @@ int sigsuspend(sigset_t *set)
3563 current->saved_sigmask = current->blocked; 3563 current->saved_sigmask = current->blocked;
3564 set_current_blocked(set); 3564 set_current_blocked(set);
3565 3565
3566 current->state = TASK_INTERRUPTIBLE; 3566 __set_current_state(TASK_INTERRUPTIBLE);
3567 schedule(); 3567 schedule();
3568 set_restore_sigmask(); 3568 set_restore_sigmask();
3569 return -ERESTARTNOHAND; 3569 return -ERESTARTNOHAND;
diff --git a/kernel/smpboot.c b/kernel/smpboot.c
index f032fb5284e3..40190f28db35 100644
--- a/kernel/smpboot.c
+++ b/kernel/smpboot.c
@@ -280,6 +280,7 @@ int smpboot_register_percpu_thread(struct smp_hotplug_thread *plug_thread)
280 unsigned int cpu; 280 unsigned int cpu;
281 int ret = 0; 281 int ret = 0;
282 282
283 get_online_cpus();
283 mutex_lock(&smpboot_threads_lock); 284 mutex_lock(&smpboot_threads_lock);
284 for_each_online_cpu(cpu) { 285 for_each_online_cpu(cpu) {
285 ret = __smpboot_create_thread(plug_thread, cpu); 286 ret = __smpboot_create_thread(plug_thread, cpu);
@@ -292,6 +293,7 @@ int smpboot_register_percpu_thread(struct smp_hotplug_thread *plug_thread)
292 list_add(&plug_thread->list, &hotplug_threads); 293 list_add(&plug_thread->list, &hotplug_threads);
293out: 294out:
294 mutex_unlock(&smpboot_threads_lock); 295 mutex_unlock(&smpboot_threads_lock);
296 put_online_cpus();
295 return ret; 297 return ret;
296} 298}
297EXPORT_SYMBOL_GPL(smpboot_register_percpu_thread); 299EXPORT_SYMBOL_GPL(smpboot_register_percpu_thread);
diff --git a/kernel/softirq.c b/kernel/softirq.c
index 501baa9ac1be..479e4436f787 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -114,8 +114,12 @@ void __local_bh_disable_ip(unsigned long ip, unsigned int cnt)
114 trace_softirqs_off(ip); 114 trace_softirqs_off(ip);
115 raw_local_irq_restore(flags); 115 raw_local_irq_restore(flags);
116 116
117 if (preempt_count() == cnt) 117 if (preempt_count() == cnt) {
118#ifdef CONFIG_DEBUG_PREEMPT
119 current->preempt_disable_ip = get_parent_ip(CALLER_ADDR1);
120#endif
118 trace_preempt_off(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1)); 121 trace_preempt_off(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1));
122 }
119} 123}
120EXPORT_SYMBOL(__local_bh_disable_ip); 124EXPORT_SYMBOL(__local_bh_disable_ip);
121#endif /* CONFIG_TRACE_IRQFLAGS */ 125#endif /* CONFIG_TRACE_IRQFLAGS */
@@ -656,9 +660,8 @@ static void run_ksoftirqd(unsigned int cpu)
656 * in the task stack here. 660 * in the task stack here.
657 */ 661 */
658 __do_softirq(); 662 __do_softirq();
659 rcu_note_context_switch();
660 local_irq_enable(); 663 local_irq_enable();
661 cond_resched(); 664 cond_resched_rcu_qs();
662 return; 665 return;
663 } 666 }
664 local_irq_enable(); 667 local_irq_enable();
diff --git a/kernel/sys.c b/kernel/sys.c
index ea9c88109894..667b2e62fad2 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -97,6 +97,12 @@
97#ifndef MPX_DISABLE_MANAGEMENT 97#ifndef MPX_DISABLE_MANAGEMENT
98# define MPX_DISABLE_MANAGEMENT(a) (-EINVAL) 98# define MPX_DISABLE_MANAGEMENT(a) (-EINVAL)
99#endif 99#endif
100#ifndef GET_FP_MODE
101# define GET_FP_MODE(a) (-EINVAL)
102#endif
103#ifndef SET_FP_MODE
104# define SET_FP_MODE(a,b) (-EINVAL)
105#endif
100 106
101/* 107/*
102 * this is where the system-wide overflow UID and GID are defined, for 108 * this is where the system-wide overflow UID and GID are defined, for
@@ -2219,6 +2225,12 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
2219 return -EINVAL; 2225 return -EINVAL;
2220 error = MPX_DISABLE_MANAGEMENT(me); 2226 error = MPX_DISABLE_MANAGEMENT(me);
2221 break; 2227 break;
2228 case PR_SET_FP_MODE:
2229 error = SET_FP_MODE(me, arg2);
2230 break;
2231 case PR_GET_FP_MODE:
2232 error = GET_FP_MODE(me);
2233 break;
2222 default: 2234 default:
2223 error = -EINVAL; 2235 error = -EINVAL;
2224 break; 2236 break;
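[Editor's note, not part of the patch] PR_SET_FP_MODE/PR_GET_FP_MODE are wired into sys_prctl() through macros with #ifndef fallbacks that return -EINVAL on architectures that don't provide them. A minimal sketch of that default-macro pattern (the argument here is just a placeholder for the task pointer):

#include <errno.h>
#include <stdio.h>

/* Generic code supplies stubs that fail with -EINVAL; an architecture that
 * supports the feature defines the macro before this point and wins. */
#ifndef GET_FP_MODE
# define GET_FP_MODE(task)		(-EINVAL)
#endif
#ifndef SET_FP_MODE
# define SET_FP_MODE(task, mode)	(-EINVAL)
#endif

int main(void)
{
	printf("GET_FP_MODE -> %d\n", GET_FP_MODE(/* current */ 0));
	printf("SET_FP_MODE -> %d\n", SET_FP_MODE(/* current */ 0, 1));
	return 0;
}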
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 137c7f69b264..88ea2d6e0031 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -1248,7 +1248,6 @@ static struct ctl_table vm_table[] = {
1248 .maxlen = sizeof(unsigned long), 1248 .maxlen = sizeof(unsigned long),
1249 .mode = 0644, 1249 .mode = 0644,
1250 .proc_handler = hugetlb_sysctl_handler, 1250 .proc_handler = hugetlb_sysctl_handler,
1251 .extra1 = &zero,
1252 }, 1251 },
1253#ifdef CONFIG_NUMA 1252#ifdef CONFIG_NUMA
1254 { 1253 {
@@ -1257,7 +1256,6 @@ static struct ctl_table vm_table[] = {
1257 .maxlen = sizeof(unsigned long), 1256 .maxlen = sizeof(unsigned long),
1258 .mode = 0644, 1257 .mode = 0644,
1259 .proc_handler = &hugetlb_mempolicy_sysctl_handler, 1258 .proc_handler = &hugetlb_mempolicy_sysctl_handler,
1260 .extra1 = &zero,
1261 }, 1259 },
1262#endif 1260#endif
1263 { 1261 {
@@ -1280,7 +1278,6 @@ static struct ctl_table vm_table[] = {
1280 .maxlen = sizeof(unsigned long), 1278 .maxlen = sizeof(unsigned long),
1281 .mode = 0644, 1279 .mode = 0644,
1282 .proc_handler = hugetlb_overcommit_handler, 1280 .proc_handler = hugetlb_overcommit_handler,
1283 .extra1 = &zero,
1284 }, 1281 },
1285#endif 1282#endif
1286 { 1283 {
diff --git a/kernel/taskstats.c b/kernel/taskstats.c
index 670fff88a961..21f82c29c914 100644
--- a/kernel/taskstats.c
+++ b/kernel/taskstats.c
@@ -111,13 +111,8 @@ static int send_reply(struct sk_buff *skb, struct genl_info *info)
111{ 111{
112 struct genlmsghdr *genlhdr = nlmsg_data(nlmsg_hdr(skb)); 112 struct genlmsghdr *genlhdr = nlmsg_data(nlmsg_hdr(skb));
113 void *reply = genlmsg_data(genlhdr); 113 void *reply = genlmsg_data(genlhdr);
114 int rc;
115 114
116 rc = genlmsg_end(skb, reply); 115 genlmsg_end(skb, reply);
117 if (rc < 0) {
118 nlmsg_free(skb);
119 return rc;
120 }
121 116
122 return genlmsg_reply(skb, info); 117 return genlmsg_reply(skb, info);
123} 118}
@@ -134,11 +129,7 @@ static void send_cpu_listeners(struct sk_buff *skb,
134 void *reply = genlmsg_data(genlhdr); 129 void *reply = genlmsg_data(genlhdr);
135 int rc, delcount = 0; 130 int rc, delcount = 0;
136 131
137 rc = genlmsg_end(skb, reply); 132 genlmsg_end(skb, reply);
138 if (rc < 0) {
139 nlmsg_free(skb);
140 return;
141 }
142 133
143 rc = 0; 134 rc = 0;
144 down_read(&listeners->sem); 135 down_read(&listeners->sem);
diff --git a/kernel/time/Makefile b/kernel/time/Makefile
index f622cf28628a..c09c07817d7a 100644
--- a/kernel/time/Makefile
+++ b/kernel/time/Makefile
@@ -1,6 +1,6 @@
1obj-y += time.o timer.o hrtimer.o itimer.o posix-timers.o posix-cpu-timers.o 1obj-y += time.o timer.o hrtimer.o itimer.o posix-timers.o posix-cpu-timers.o
2obj-y += timekeeping.o ntp.o clocksource.o jiffies.o timer_list.o 2obj-y += timekeeping.o ntp.o clocksource.o jiffies.o timer_list.o
3obj-y += timeconv.o posix-clock.o alarmtimer.o 3obj-y += timeconv.o timecounter.o posix-clock.o alarmtimer.o
4 4
5obj-$(CONFIG_GENERIC_CLOCKEVENTS_BUILD) += clockevents.o 5obj-$(CONFIG_GENERIC_CLOCKEVENTS_BUILD) += clockevents.o
6obj-$(CONFIG_GENERIC_CLOCKEVENTS) += tick-common.o 6obj-$(CONFIG_GENERIC_CLOCKEVENTS) += tick-common.o
diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c
index a7077d3ae52f..1b001ed1edb9 100644
--- a/kernel/time/alarmtimer.c
+++ b/kernel/time/alarmtimer.c
@@ -788,7 +788,7 @@ static int alarm_timer_nsleep(const clockid_t which_clock, int flags,
788 goto out; 788 goto out;
789 } 789 }
790 790
791 restart = &current_thread_info()->restart_block; 791 restart = &current->restart_block;
792 restart->fn = alarm_timer_nsleep_restart; 792 restart->fn = alarm_timer_nsleep_restart;
793 restart->nanosleep.clockid = type; 793 restart->nanosleep.clockid = type;
794 restart->nanosleep.expires = exp.tv64; 794 restart->nanosleep.expires = exp.tv64;
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index b79f39bda7e1..4892352f0e49 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -34,82 +34,6 @@
34#include "tick-internal.h" 34#include "tick-internal.h"
35#include "timekeeping_internal.h" 35#include "timekeeping_internal.h"
36 36
37void timecounter_init(struct timecounter *tc,
38 const struct cyclecounter *cc,
39 u64 start_tstamp)
40{
41 tc->cc = cc;
42 tc->cycle_last = cc->read(cc);
43 tc->nsec = start_tstamp;
44}
45EXPORT_SYMBOL_GPL(timecounter_init);
46
47/**
48 * timecounter_read_delta - get nanoseconds since last call of this function
49 * @tc: Pointer to time counter
50 *
51 * When the underlying cycle counter runs over, this will be handled
52 * correctly as long as it does not run over more than once between
53 * calls.
54 *
55 * The first call to this function for a new time counter initializes
56 * the time tracking and returns an undefined result.
57 */
58static u64 timecounter_read_delta(struct timecounter *tc)
59{
60 cycle_t cycle_now, cycle_delta;
61 u64 ns_offset;
62
63 /* read cycle counter: */
64 cycle_now = tc->cc->read(tc->cc);
65
66 /* calculate the delta since the last timecounter_read_delta(): */
67 cycle_delta = (cycle_now - tc->cycle_last) & tc->cc->mask;
68
69 /* convert to nanoseconds: */
70 ns_offset = cyclecounter_cyc2ns(tc->cc, cycle_delta);
71
72 /* update time stamp of timecounter_read_delta() call: */
73 tc->cycle_last = cycle_now;
74
75 return ns_offset;
76}
77
78u64 timecounter_read(struct timecounter *tc)
79{
80 u64 nsec;
81
82 /* increment time by nanoseconds since last call */
83 nsec = timecounter_read_delta(tc);
84 nsec += tc->nsec;
85 tc->nsec = nsec;
86
87 return nsec;
88}
89EXPORT_SYMBOL_GPL(timecounter_read);
90
91u64 timecounter_cyc2time(struct timecounter *tc,
92 cycle_t cycle_tstamp)
93{
94 u64 cycle_delta = (cycle_tstamp - tc->cycle_last) & tc->cc->mask;
95 u64 nsec;
96
97 /*
98 * Instead of always treating cycle_tstamp as more recent
99 * than tc->cycle_last, detect when it is too far in the
100 * future and treat it as old time stamp instead.
101 */
102 if (cycle_delta > tc->cc->mask / 2) {
103 cycle_delta = (tc->cycle_last - cycle_tstamp) & tc->cc->mask;
104 nsec = tc->nsec - cyclecounter_cyc2ns(tc->cc, cycle_delta);
105 } else {
106 nsec = cyclecounter_cyc2ns(tc->cc, cycle_delta) + tc->nsec;
107 }
108
109 return nsec;
110}
111EXPORT_SYMBOL_GPL(timecounter_cyc2time);
112
113/** 37/**
114 * clocks_calc_mult_shift - calculate mult/shift factors for scaled math of clocks 38 * clocks_calc_mult_shift - calculate mult/shift factors for scaled math of clocks
115 * @mult: pointer to mult variable 39 * @mult: pointer to mult variable
diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c
index 37e50aadd471..bee0c1f78091 100644
--- a/kernel/time/hrtimer.c
+++ b/kernel/time/hrtimer.c
@@ -122,7 +122,7 @@ static void hrtimer_get_softirq_time(struct hrtimer_cpu_base *base)
122 mono = ktime_get_update_offsets_tick(&off_real, &off_boot, &off_tai); 122 mono = ktime_get_update_offsets_tick(&off_real, &off_boot, &off_tai);
123 boot = ktime_add(mono, off_boot); 123 boot = ktime_add(mono, off_boot);
124 xtim = ktime_add(mono, off_real); 124 xtim = ktime_add(mono, off_real);
125 tai = ktime_add(xtim, off_tai); 125 tai = ktime_add(mono, off_tai);
126 126
127 base->clock_base[HRTIMER_BASE_REALTIME].softirq_time = xtim; 127 base->clock_base[HRTIMER_BASE_REALTIME].softirq_time = xtim;
128 base->clock_base[HRTIMER_BASE_MONOTONIC].softirq_time = mono; 128 base->clock_base[HRTIMER_BASE_MONOTONIC].softirq_time = mono;
@@ -266,7 +266,7 @@ lock_hrtimer_base(const struct hrtimer *timer, unsigned long *flags)
266/* 266/*
267 * Divide a ktime value by a nanosecond value 267 * Divide a ktime value by a nanosecond value
268 */ 268 */
269u64 ktime_divns(const ktime_t kt, s64 div) 269u64 __ktime_divns(const ktime_t kt, s64 div)
270{ 270{
271 u64 dclc; 271 u64 dclc;
272 int sft = 0; 272 int sft = 0;
@@ -282,7 +282,7 @@ u64 ktime_divns(const ktime_t kt, s64 div)
282 282
283 return dclc; 283 return dclc;
284} 284}
285EXPORT_SYMBOL_GPL(ktime_divns); 285EXPORT_SYMBOL_GPL(__ktime_divns);
286#endif /* BITS_PER_LONG >= 64 */ 286#endif /* BITS_PER_LONG >= 64 */
287 287
288/* 288/*
@@ -440,6 +440,37 @@ static inline void debug_deactivate(struct hrtimer *timer)
440 trace_hrtimer_cancel(timer); 440 trace_hrtimer_cancel(timer);
441} 441}
442 442
443#if defined(CONFIG_NO_HZ_COMMON) || defined(CONFIG_HIGH_RES_TIMERS)
444static ktime_t __hrtimer_get_next_event(struct hrtimer_cpu_base *cpu_base)
445{
446 struct hrtimer_clock_base *base = cpu_base->clock_base;
447 ktime_t expires, expires_next = { .tv64 = KTIME_MAX };
448 int i;
449
450 for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++, base++) {
451 struct timerqueue_node *next;
452 struct hrtimer *timer;
453
454 next = timerqueue_getnext(&base->active);
455 if (!next)
456 continue;
457
458 timer = container_of(next, struct hrtimer, node);
459 expires = ktime_sub(hrtimer_get_expires(timer), base->offset);
460 if (expires.tv64 < expires_next.tv64)
461 expires_next = expires;
462 }
463 /*
464 * clock_was_set() might have changed base->offset of any of
465 * the clock bases so the result might be negative. Fix it up
466 * to prevent a false positive in clockevents_program_event().
467 */
468 if (expires_next.tv64 < 0)
469 expires_next.tv64 = 0;
470 return expires_next;
471}
472#endif
473
443/* High resolution timer related functions */ 474/* High resolution timer related functions */
444#ifdef CONFIG_HIGH_RES_TIMERS 475#ifdef CONFIG_HIGH_RES_TIMERS
445 476
@@ -488,32 +519,7 @@ static inline int hrtimer_hres_active(void)
488static void 519static void
489hrtimer_force_reprogram(struct hrtimer_cpu_base *cpu_base, int skip_equal) 520hrtimer_force_reprogram(struct hrtimer_cpu_base *cpu_base, int skip_equal)
490{ 521{
491 int i; 522 ktime_t expires_next = __hrtimer_get_next_event(cpu_base);
492 struct hrtimer_clock_base *base = cpu_base->clock_base;
493 ktime_t expires, expires_next;
494
495 expires_next.tv64 = KTIME_MAX;
496
497 for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++, base++) {
498 struct hrtimer *timer;
499 struct timerqueue_node *next;
500
501 next = timerqueue_getnext(&base->active);
502 if (!next)
503 continue;
504 timer = container_of(next, struct hrtimer, node);
505
506 expires = ktime_sub(hrtimer_get_expires(timer), base->offset);
507 /*
508 * clock_was_set() has changed base->offset so the
509 * result might be negative. Fix it up to prevent a
510 * false positive in clockevents_program_event()
511 */
512 if (expires.tv64 < 0)
513 expires.tv64 = 0;
514 if (expires.tv64 < expires_next.tv64)
515 expires_next = expires;
516 }
517 523
518 if (skip_equal && expires_next.tv64 == cpu_base->expires_next.tv64) 524 if (skip_equal && expires_next.tv64 == cpu_base->expires_next.tv64)
519 return; 525 return;
@@ -587,6 +593,15 @@ static int hrtimer_reprogram(struct hrtimer *timer,
587 return 0; 593 return 0;
588 594
589 /* 595 /*
596 * When the target cpu of the timer is currently executing
597 * hrtimer_interrupt(), then we do not touch the clock event
598 * device. hrtimer_interrupt() will reevaluate all clock bases
599 * before reprogramming the device.
600 */
601 if (cpu_base->in_hrtirq)
602 return 0;
603
604 /*
590 * If a hang was detected in the last timer interrupt then we 605 * If a hang was detected in the last timer interrupt then we
591 * do not schedule a timer which is earlier than the expiry 606 * do not schedule a timer which is earlier than the expiry
592 * which we enforced in the hang detection. We want the system 607 * which we enforced in the hang detection. We want the system
@@ -1104,29 +1119,14 @@ EXPORT_SYMBOL_GPL(hrtimer_get_remaining);
1104ktime_t hrtimer_get_next_event(void) 1119ktime_t hrtimer_get_next_event(void)
1105{ 1120{
1106 struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases); 1121 struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases);
1107 struct hrtimer_clock_base *base = cpu_base->clock_base; 1122 ktime_t mindelta = { .tv64 = KTIME_MAX };
1108 ktime_t delta, mindelta = { .tv64 = KTIME_MAX };
1109 unsigned long flags; 1123 unsigned long flags;
1110 int i;
1111 1124
1112 raw_spin_lock_irqsave(&cpu_base->lock, flags); 1125 raw_spin_lock_irqsave(&cpu_base->lock, flags);
1113 1126
1114 if (!hrtimer_hres_active()) { 1127 if (!hrtimer_hres_active())
1115 for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++, base++) { 1128 mindelta = ktime_sub(__hrtimer_get_next_event(cpu_base),
1116 struct hrtimer *timer; 1129 ktime_get());
1117 struct timerqueue_node *next;
1118
1119 next = timerqueue_getnext(&base->active);
1120 if (!next)
1121 continue;
1122
1123 timer = container_of(next, struct hrtimer, node);
1124 delta.tv64 = hrtimer_get_expires_tv64(timer);
1125 delta = ktime_sub(delta, base->get_time());
1126 if (delta.tv64 < mindelta.tv64)
1127 mindelta.tv64 = delta.tv64;
1128 }
1129 }
1130 1130
1131 raw_spin_unlock_irqrestore(&cpu_base->lock, flags); 1131 raw_spin_unlock_irqrestore(&cpu_base->lock, flags);
1132 1132
@@ -1253,7 +1253,7 @@ void hrtimer_interrupt(struct clock_event_device *dev)
1253 raw_spin_lock(&cpu_base->lock); 1253 raw_spin_lock(&cpu_base->lock);
1254 entry_time = now = hrtimer_update_base(cpu_base); 1254 entry_time = now = hrtimer_update_base(cpu_base);
1255retry: 1255retry:
1256 expires_next.tv64 = KTIME_MAX; 1256 cpu_base->in_hrtirq = 1;
1257 /* 1257 /*
1258 * We set expires_next to KTIME_MAX here with cpu_base->lock 1258 * We set expires_next to KTIME_MAX here with cpu_base->lock
1259 * held to prevent that a timer is enqueued in our queue via 1259 * held to prevent that a timer is enqueued in our queue via
@@ -1291,28 +1291,20 @@ retry:
1291 * are right-of a not yet expired timer, because that 1291 * are right-of a not yet expired timer, because that
1292 * timer will have to trigger a wakeup anyway. 1292 * timer will have to trigger a wakeup anyway.
1293 */ 1293 */
1294 1294 if (basenow.tv64 < hrtimer_get_softexpires_tv64(timer))
1295 if (basenow.tv64 < hrtimer_get_softexpires_tv64(timer)) {
1296 ktime_t expires;
1297
1298 expires = ktime_sub(hrtimer_get_expires(timer),
1299 base->offset);
1300 if (expires.tv64 < 0)
1301 expires.tv64 = KTIME_MAX;
1302 if (expires.tv64 < expires_next.tv64)
1303 expires_next = expires;
1304 break; 1295 break;
1305 }
1306 1296
1307 __run_hrtimer(timer, &basenow); 1297 __run_hrtimer(timer, &basenow);
1308 } 1298 }
1309 } 1299 }
1310 1300 /* Reevaluate the clock bases for the next expiry */
1301 expires_next = __hrtimer_get_next_event(cpu_base);
1311 /* 1302 /*
1312 * Store the new expiry value so the migration code can verify 1303 * Store the new expiry value so the migration code can verify
1313 * against it. 1304 * against it.
1314 */ 1305 */
1315 cpu_base->expires_next = expires_next; 1306 cpu_base->expires_next = expires_next;
1307 cpu_base->in_hrtirq = 0;
1316 raw_spin_unlock(&cpu_base->lock); 1308 raw_spin_unlock(&cpu_base->lock);
1317 1309
1318 /* Reprogramming necessary ? */ 1310 /* Reprogramming necessary ? */
@@ -1591,7 +1583,7 @@ long hrtimer_nanosleep(struct timespec *rqtp, struct timespec __user *rmtp,
1591 goto out; 1583 goto out;
1592 } 1584 }
1593 1585
1594 restart = &current_thread_info()->restart_block; 1586 restart = &current->restart_block;
1595 restart->fn = hrtimer_nanosleep_restart; 1587 restart->fn = hrtimer_nanosleep_restart;
1596 restart->nanosleep.clockid = t.timer.base->clockid; 1588 restart->nanosleep.clockid = t.timer.base->clockid;
1597 restart->nanosleep.rmtp = rmtp; 1589 restart->nanosleep.rmtp = rmtp;
diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c
index 28bf91c60a0b..0f60b08a4f07 100644
--- a/kernel/time/ntp.c
+++ b/kernel/time/ntp.c
@@ -488,13 +488,13 @@ static void sync_cmos_clock(struct work_struct *work)
488 488
489 getnstimeofday64(&now); 489 getnstimeofday64(&now);
490 if (abs(now.tv_nsec - (NSEC_PER_SEC / 2)) <= tick_nsec * 5) { 490 if (abs(now.tv_nsec - (NSEC_PER_SEC / 2)) <= tick_nsec * 5) {
491 struct timespec adjust = timespec64_to_timespec(now); 491 struct timespec64 adjust = now;
492 492
493 fail = -ENODEV; 493 fail = -ENODEV;
494 if (persistent_clock_is_local) 494 if (persistent_clock_is_local)
495 adjust.tv_sec -= (sys_tz.tz_minuteswest * 60); 495 adjust.tv_sec -= (sys_tz.tz_minuteswest * 60);
496#ifdef CONFIG_GENERIC_CMOS_UPDATE 496#ifdef CONFIG_GENERIC_CMOS_UPDATE
497 fail = update_persistent_clock(adjust); 497 fail = update_persistent_clock(timespec64_to_timespec(adjust));
498#endif 498#endif
499#ifdef CONFIG_RTC_SYSTOHC 499#ifdef CONFIG_RTC_SYSTOHC
500 if (fail == -ENODEV) 500 if (fail == -ENODEV)
@@ -633,10 +633,14 @@ int ntp_validate_timex(struct timex *txc)
633 if ((txc->modes & ADJ_SETOFFSET) && (!capable(CAP_SYS_TIME))) 633 if ((txc->modes & ADJ_SETOFFSET) && (!capable(CAP_SYS_TIME)))
634 return -EPERM; 634 return -EPERM;
635 635
636 if (txc->modes & ADJ_FREQUENCY) { 636 /*
637 if (LONG_MIN / PPM_SCALE > txc->freq) 637 * Check for potential multiplication overflows that can
638 * only happen on 64-bit systems:
639 */
640 if ((txc->modes & ADJ_FREQUENCY) && (BITS_PER_LONG == 64)) {
641 if (LLONG_MIN / PPM_SCALE > txc->freq)
638 return -EINVAL; 642 return -EINVAL;
639 if (LONG_MAX / PPM_SCALE < txc->freq) 643 if (LLONG_MAX / PPM_SCALE < txc->freq)
640 return -EINVAL; 644 return -EINVAL;
641 } 645 }
642 646
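
The ntp_validate_timex() hunk widens the ADJ_FREQUENCY overflow check to LLONG_MIN/LLONG_MAX and limits it to 64-bit builds, where the PPM_SCALE multiplication can actually overflow. For reference, a sketch of the userspace request being validated (the frequency value is arbitrary):

#include <stdio.h>
#include <sys/timex.h>

int main(void)
{
        struct timex tx = {
                .modes = ADJ_FREQUENCY,
                .freq  = 10 << 16,      /* +10 ppm, scaled by 2^16 */
        };
        int state = adjtimex(&tx);      /* modifying the clock needs CAP_SYS_TIME */

        if (state < 0)
                perror("adjtimex");
        else
                printf("clock state %d, freq is now %ld\n", state, tx.freq);
        return 0;
}
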
diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c
index a16b67859e2a..0075da74abf0 100644
--- a/kernel/time/posix-cpu-timers.c
+++ b/kernel/time/posix-cpu-timers.c
@@ -1334,8 +1334,7 @@ static long posix_cpu_nsleep_restart(struct restart_block *restart_block);
1334static int posix_cpu_nsleep(const clockid_t which_clock, int flags, 1334static int posix_cpu_nsleep(const clockid_t which_clock, int flags,
1335 struct timespec *rqtp, struct timespec __user *rmtp) 1335 struct timespec *rqtp, struct timespec __user *rmtp)
1336{ 1336{
1337 struct restart_block *restart_block = 1337 struct restart_block *restart_block = &current->restart_block;
1338 &current_thread_info()->restart_block;
1339 struct itimerspec it; 1338 struct itimerspec it;
1340 int error; 1339 int error;
1341 1340
diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c
index 7efeedf53ebd..f7c515595b42 100644
--- a/kernel/time/tick-common.c
+++ b/kernel/time/tick-common.c
@@ -394,6 +394,56 @@ void tick_resume(void)
394 } 394 }
395} 395}
396 396
397static DEFINE_RAW_SPINLOCK(tick_freeze_lock);
398static unsigned int tick_freeze_depth;
399
400/**
401 * tick_freeze - Suspend the local tick and (possibly) timekeeping.
402 *
403 * Check if this is the last online CPU executing the function and if so,
404 * suspend timekeeping. Otherwise suspend the local tick.
405 *
406 * Call with interrupts disabled. Must be balanced with %tick_unfreeze().
407 * Interrupts must not be enabled before the subsequent %tick_unfreeze().
408 */
409void tick_freeze(void)
410{
411 raw_spin_lock(&tick_freeze_lock);
412
413 tick_freeze_depth++;
414 if (tick_freeze_depth == num_online_cpus()) {
415 timekeeping_suspend();
416 } else {
417 tick_suspend();
418 tick_suspend_broadcast();
419 }
420
421 raw_spin_unlock(&tick_freeze_lock);
422}
423
424/**
425 * tick_unfreeze - Resume the local tick and (possibly) timekeeping.
426 *
427 * Check if this is the first CPU executing the function and if so, resume
428 * timekeeping. Otherwise resume the local tick.
429 *
430 * Call with interrupts disabled. Must be balanced with %tick_freeze().
431 * Interrupts must not be enabled after the preceding %tick_freeze().
432 */
433void tick_unfreeze(void)
434{
435 raw_spin_lock(&tick_freeze_lock);
436
437 if (tick_freeze_depth == num_online_cpus())
438 timekeeping_resume();
439 else
440 tick_resume();
441
442 tick_freeze_depth--;
443
444 raw_spin_unlock(&tick_freeze_lock);
445}
446
397/** 447/**
398 * tick_init - initialize the tick control 448 * tick_init - initialize the tick control
399 */ 449 */
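
tick_freeze()/tick_unfreeze() above are built for the suspend-to-idle path: every CPU calls them with interrupts disabled, and only the last CPU in (first CPU out) suspends (resumes) timekeeping. A sketch of how a caller pairs them; the function name and the idle-entry details are illustrative, not part of this diff:

static void my_s2idle_step(void)
{
        local_irq_disable();

        tick_freeze();          /* last CPU to arrive also suspends timekeeping */

        /* ... enter the deepest available idle state and wait for a wakeup ... */

        tick_unfreeze();        /* first CPU to leave also resumes timekeeping */

        local_irq_enable();
}
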
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 1363d58f07e9..a4c4edac4528 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -326,13 +326,6 @@ static int tick_nohz_cpu_down_callback(struct notifier_block *nfb,
326 return NOTIFY_OK; 326 return NOTIFY_OK;
327} 327}
328 328
329/*
330 * Worst case string length in chunks of CPU range seems 2 steps
331 * separations: 0,2,4,6,...
332 * This is NR_CPUS + sizeof('\0')
333 */
334static char __initdata nohz_full_buf[NR_CPUS + 1];
335
336static int tick_nohz_init_all(void) 329static int tick_nohz_init_all(void)
337{ 330{
338 int err = -1; 331 int err = -1;
@@ -393,8 +386,8 @@ void __init tick_nohz_init(void)
393 context_tracking_cpu_set(cpu); 386 context_tracking_cpu_set(cpu);
394 387
395 cpu_notifier(tick_nohz_cpu_down_callback, 0); 388 cpu_notifier(tick_nohz_cpu_down_callback, 0);
396 cpulist_scnprintf(nohz_full_buf, sizeof(nohz_full_buf), tick_nohz_full_mask); 389 pr_info("NO_HZ: Full dynticks CPUs: %*pbl.\n",
397 pr_info("NO_HZ: Full dynticks CPUs: %s.\n", nohz_full_buf); 390 cpumask_pr_args(tick_nohz_full_mask));
398} 391}
399#endif 392#endif
400 393
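
The cpulist_scnprintf() removal above relies on the new %*pb/%*pbl vsnprintf extensions, which print a bitmap (as hex words or as a CPU list) straight from the mask and make the static NR_CPUS-sized buffer unnecessary. A sketch of the two forms; my_report_cpus() is illustrative:

#include <linux/cpumask.h>
#include <linux/printk.h>

static void my_report_cpus(const struct cpumask *mask)
{
        pr_info("cpus (hex bitmap): %*pb\n",  cpumask_pr_args(mask));   /* e.g. "f" */
        pr_info("cpus (range list): %*pbl\n", cpumask_pr_args(mask));   /* e.g. "0-3" */
}
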
diff --git a/kernel/time/timecounter.c b/kernel/time/timecounter.c
new file mode 100644
index 000000000000..4687b3104bae
--- /dev/null
+++ b/kernel/time/timecounter.c
@@ -0,0 +1,112 @@
1/*
2 * linux/kernel/time/timecounter.c
3 *
4 * based on code that migrated away from
5 * linux/kernel/time/clocksource.c
6 *
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation; either version 2 of the License, or
10 * (at your option) any later version.
11 *
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
16 */
17
18#include <linux/export.h>
19#include <linux/timecounter.h>
20
21void timecounter_init(struct timecounter *tc,
22 const struct cyclecounter *cc,
23 u64 start_tstamp)
24{
25 tc->cc = cc;
26 tc->cycle_last = cc->read(cc);
27 tc->nsec = start_tstamp;
28 tc->mask = (1ULL << cc->shift) - 1;
29 tc->frac = 0;
30}
31EXPORT_SYMBOL_GPL(timecounter_init);
32
33/**
34 * timecounter_read_delta - get nanoseconds since last call of this function
35 * @tc: Pointer to time counter
36 *
37 * When the underlying cycle counter runs over, this will be handled
38 * correctly as long as it does not run over more than once between
39 * calls.
40 *
41 * The first call to this function for a new time counter initializes
42 * the time tracking and returns an undefined result.
43 */
44static u64 timecounter_read_delta(struct timecounter *tc)
45{
46 cycle_t cycle_now, cycle_delta;
47 u64 ns_offset;
48
49 /* read cycle counter: */
50 cycle_now = tc->cc->read(tc->cc);
51
52 /* calculate the delta since the last timecounter_read_delta(): */
53 cycle_delta = (cycle_now - tc->cycle_last) & tc->cc->mask;
54
55 /* convert to nanoseconds: */
56 ns_offset = cyclecounter_cyc2ns(tc->cc, cycle_delta,
57 tc->mask, &tc->frac);
58
59 /* update time stamp of timecounter_read_delta() call: */
60 tc->cycle_last = cycle_now;
61
62 return ns_offset;
63}
64
65u64 timecounter_read(struct timecounter *tc)
66{
67 u64 nsec;
68
69 /* increment time by nanoseconds since last call */
70 nsec = timecounter_read_delta(tc);
71 nsec += tc->nsec;
72 tc->nsec = nsec;
73
74 return nsec;
75}
76EXPORT_SYMBOL_GPL(timecounter_read);
77
78/*
79 * This is like cyclecounter_cyc2ns(), but it is used for computing a
80 * time previous to the time stored in the cycle counter.
81 */
82static u64 cc_cyc2ns_backwards(const struct cyclecounter *cc,
83 cycle_t cycles, u64 mask, u64 frac)
84{
85 u64 ns = (u64) cycles;
86
87 ns = ((ns * cc->mult) - frac) >> cc->shift;
88
89 return ns;
90}
91
92u64 timecounter_cyc2time(struct timecounter *tc,
93 cycle_t cycle_tstamp)
94{
95 u64 delta = (cycle_tstamp - tc->cycle_last) & tc->cc->mask;
96 u64 nsec = tc->nsec, frac = tc->frac;
97
98 /*
99 * Instead of always treating cycle_tstamp as more recent
100 * than tc->cycle_last, detect when it is too far in the
101 * future and treat it as old time stamp instead.
102 */
103 if (delta > tc->cc->mask / 2) {
104 delta = (tc->cycle_last - cycle_tstamp) & tc->cc->mask;
105 nsec -= cc_cyc2ns_backwards(tc->cc, delta, tc->mask, frac);
106 } else {
107 nsec += cyclecounter_cyc2ns(tc->cc, delta, tc->mask, &frac);
108 }
109
110 return nsec;
111}
112EXPORT_SYMBOL_GPL(timecounter_cyc2time);
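
Besides moving out of clocksource.c, the timecounter code above gains a fractional-nanosecond remainder (tc->mask/tc->frac), and cyclecounter_cyc2ns() now takes those extra arguments so repeated conversions no longer drop sub-nanosecond precision. A sketch of the typical consumer, e.g. a driver turning a free-running hardware counter into nanoseconds for timestamping; my_hw_read_counter(), the 48-bit width and the mult/shift values are illustrative:

#include <linux/timecounter.h>

static cycle_t my_read_cycles(const struct cyclecounter *cc)
{
        return my_hw_read_counter();            /* hypothetical register read */
}

static struct cyclecounter my_cc = {
        .read  = my_read_cycles,
        .mask  = CYCLECOUNTER_MASK(48),         /* 48-bit free-running counter */
        .mult  = 1 << 24,                       /* chosen to match the counter's clock rate */
        .shift = 24,
};

static struct timecounter my_tc;

static void my_time_init(u64 wall_start_ns)
{
        timecounter_init(&my_tc, &my_cc, wall_start_ns);
}

static u64 my_time_now_ns(void)
{
        /* Must run often enough that the counter wraps at most once
         * between calls (see timecounter_read_delta() above). */
        return timecounter_read(&my_tc);
}
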
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index 6a931852082f..91db94136c10 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -230,9 +230,7 @@ static inline s64 timekeeping_get_ns_raw(struct timekeeper *tk)
230 230
231/** 231/**
232 * update_fast_timekeeper - Update the fast and NMI safe monotonic timekeeper. 232 * update_fast_timekeeper - Update the fast and NMI safe monotonic timekeeper.
233 * @tk: The timekeeper from which we take the update 233 * @tkr: Timekeeping readout base from which we take the update
234 * @tkf: The fast timekeeper to update
235 * @tbase: The time base for the fast timekeeper (mono/raw)
236 * 234 *
237 * We want to use this from any context including NMI and tracing / 235 * We want to use this from any context including NMI and tracing /
238 * instrumenting the timekeeping code itself. 236 * instrumenting the timekeeping code itself.
@@ -244,11 +242,11 @@ static inline s64 timekeeping_get_ns_raw(struct timekeeper *tk)
244 * smp_wmb(); <- Ensure that the last base[1] update is visible 242 * smp_wmb(); <- Ensure that the last base[1] update is visible
245 * tkf->seq++; 243 * tkf->seq++;
246 * smp_wmb(); <- Ensure that the seqcount update is visible 244 * smp_wmb(); <- Ensure that the seqcount update is visible
247 * update(tkf->base[0], tk); 245 * update(tkf->base[0], tkr);
248 * smp_wmb(); <- Ensure that the base[0] update is visible 246 * smp_wmb(); <- Ensure that the base[0] update is visible
249 * tkf->seq++; 247 * tkf->seq++;
250 * smp_wmb(); <- Ensure that the seqcount update is visible 248 * smp_wmb(); <- Ensure that the seqcount update is visible
251 * update(tkf->base[1], tk); 249 * update(tkf->base[1], tkr);
252 * 250 *
253 * The reader side does: 251 * The reader side does:
254 * 252 *
@@ -269,7 +267,7 @@ static inline s64 timekeeping_get_ns_raw(struct timekeeper *tk)
269 * slightly wrong timestamp (a few nanoseconds). See 267 * slightly wrong timestamp (a few nanoseconds). See
270 * @ktime_get_mono_fast_ns. 268 * @ktime_get_mono_fast_ns.
271 */ 269 */
272static void update_fast_timekeeper(struct timekeeper *tk) 270static void update_fast_timekeeper(struct tk_read_base *tkr)
273{ 271{
274 struct tk_read_base *base = tk_fast_mono.base; 272 struct tk_read_base *base = tk_fast_mono.base;
275 273
@@ -277,7 +275,7 @@ static void update_fast_timekeeper(struct timekeeper *tk)
277 raw_write_seqcount_latch(&tk_fast_mono.seq); 275 raw_write_seqcount_latch(&tk_fast_mono.seq);
278 276
279 /* Update base[0] */ 277 /* Update base[0] */
280 memcpy(base, &tk->tkr, sizeof(*base)); 278 memcpy(base, tkr, sizeof(*base));
281 279
282 /* Force readers back to base[0] */ 280 /* Force readers back to base[0] */
283 raw_write_seqcount_latch(&tk_fast_mono.seq); 281 raw_write_seqcount_latch(&tk_fast_mono.seq);
@@ -334,6 +332,35 @@ u64 notrace ktime_get_mono_fast_ns(void)
334} 332}
335EXPORT_SYMBOL_GPL(ktime_get_mono_fast_ns); 333EXPORT_SYMBOL_GPL(ktime_get_mono_fast_ns);
336 334
335/* Suspend-time cycles value for halted fast timekeeper. */
336static cycle_t cycles_at_suspend;
337
338static cycle_t dummy_clock_read(struct clocksource *cs)
339{
340 return cycles_at_suspend;
341}
342
343/**
344 * halt_fast_timekeeper - Prevent fast timekeeper from accessing clocksource.
345 * @tk: Timekeeper to snapshot.
346 *
347 * It generally is unsafe to access the clocksource after timekeeping has been
348 * suspended, so take a snapshot of the readout base of @tk and use it as the
349 * fast timekeeper's readout base while suspended. It will return the same
350 * number of cycles every time until timekeeping is resumed at which time the
351 * proper readout base for the fast timekeeper will be restored automatically.
352 */
353static void halt_fast_timekeeper(struct timekeeper *tk)
354{
355 static struct tk_read_base tkr_dummy;
356 struct tk_read_base *tkr = &tk->tkr;
357
358 memcpy(&tkr_dummy, tkr, sizeof(tkr_dummy));
359 cycles_at_suspend = tkr->read(tkr->clock);
360 tkr_dummy.read = dummy_clock_read;
361 update_fast_timekeeper(&tkr_dummy);
362}
363
337#ifdef CONFIG_GENERIC_TIME_VSYSCALL_OLD 364#ifdef CONFIG_GENERIC_TIME_VSYSCALL_OLD
338 365
339static inline void update_vsyscall(struct timekeeper *tk) 366static inline void update_vsyscall(struct timekeeper *tk)
@@ -462,7 +489,7 @@ static void timekeeping_update(struct timekeeper *tk, unsigned int action)
462 memcpy(&shadow_timekeeper, &tk_core.timekeeper, 489 memcpy(&shadow_timekeeper, &tk_core.timekeeper,
463 sizeof(tk_core.timekeeper)); 490 sizeof(tk_core.timekeeper));
464 491
465 update_fast_timekeeper(tk); 492 update_fast_timekeeper(&tk->tkr);
466} 493}
467 494
468/** 495/**
@@ -1170,7 +1197,7 @@ void timekeeping_inject_sleeptime64(struct timespec64 *delta)
1170 * xtime/wall_to_monotonic/jiffies/etc are 1197 * xtime/wall_to_monotonic/jiffies/etc are
1171 * still managed by arch specific suspend/resume code. 1198 * still managed by arch specific suspend/resume code.
1172 */ 1199 */
1173static void timekeeping_resume(void) 1200void timekeeping_resume(void)
1174{ 1201{
1175 struct timekeeper *tk = &tk_core.timekeeper; 1202 struct timekeeper *tk = &tk_core.timekeeper;
1176 struct clocksource *clock = tk->tkr.clock; 1203 struct clocksource *clock = tk->tkr.clock;
@@ -1251,7 +1278,7 @@ static void timekeeping_resume(void)
1251 hrtimers_resume(); 1278 hrtimers_resume();
1252} 1279}
1253 1280
1254static int timekeeping_suspend(void) 1281int timekeeping_suspend(void)
1255{ 1282{
1256 struct timekeeper *tk = &tk_core.timekeeper; 1283 struct timekeeper *tk = &tk_core.timekeeper;
1257 unsigned long flags; 1284 unsigned long flags;
@@ -1296,6 +1323,7 @@ static int timekeeping_suspend(void)
1296 } 1323 }
1297 1324
1298 timekeeping_update(tk, TK_MIRROR); 1325 timekeeping_update(tk, TK_MIRROR);
1326 halt_fast_timekeeper(tk);
1299 write_seqcount_end(&tk_core.seq); 1327 write_seqcount_end(&tk_core.seq);
1300 raw_spin_unlock_irqrestore(&timekeeper_lock, flags); 1328 raw_spin_unlock_irqrestore(&timekeeper_lock, flags);
1301 1329
@@ -1659,24 +1687,24 @@ out:
1659} 1687}
1660 1688
1661/** 1689/**
1662 * getboottime - Return the real time of system boot. 1690 * getboottime64 - Return the real time of system boot.
1663 * @ts: pointer to the timespec to be set 1691 * @ts: pointer to the timespec64 to be set
1664 * 1692 *
1665 * Returns the wall-time of boot in a timespec. 1693 * Returns the wall-time of boot in a timespec64.
1666 * 1694 *
1667 * This is based on the wall_to_monotonic offset and the total suspend 1695 * This is based on the wall_to_monotonic offset and the total suspend
1668 * time. Calls to settimeofday will affect the value returned (which 1696 * time. Calls to settimeofday will affect the value returned (which
1669 * basically means that however wrong your real time clock is at boot time, 1697 * basically means that however wrong your real time clock is at boot time,
1670 * you get the right time here). 1698 * you get the right time here).
1671 */ 1699 */
1672void getboottime(struct timespec *ts) 1700void getboottime64(struct timespec64 *ts)
1673{ 1701{
1674 struct timekeeper *tk = &tk_core.timekeeper; 1702 struct timekeeper *tk = &tk_core.timekeeper;
1675 ktime_t t = ktime_sub(tk->offs_real, tk->offs_boot); 1703 ktime_t t = ktime_sub(tk->offs_real, tk->offs_boot);
1676 1704
1677 *ts = ktime_to_timespec(t); 1705 *ts = ktime_to_timespec64(t);
1678} 1706}
1679EXPORT_SYMBOL_GPL(getboottime); 1707EXPORT_SYMBOL_GPL(getboottime64);
1680 1708
1681unsigned long get_seconds(void) 1709unsigned long get_seconds(void)
1682{ 1710{
diff --git a/kernel/time/timekeeping.h b/kernel/time/timekeeping.h
index adc1fc98bde3..1d91416055d5 100644
--- a/kernel/time/timekeeping.h
+++ b/kernel/time/timekeeping.h
@@ -16,5 +16,7 @@ extern int timekeeping_inject_offset(struct timespec *ts);
16extern s32 timekeeping_get_tai_offset(void); 16extern s32 timekeeping_get_tai_offset(void);
17extern void timekeeping_set_tai_offset(s32 tai_offset); 17extern void timekeeping_set_tai_offset(s32 tai_offset);
18extern void timekeeping_clocktai(struct timespec *ts); 18extern void timekeeping_clocktai(struct timespec *ts);
19extern int timekeeping_suspend(void);
20extern void timekeeping_resume(void);
19 21
20#endif 22#endif
diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile
index 979ccde26720..98f26588255e 100644
--- a/kernel/trace/Makefile
+++ b/kernel/trace/Makefile
@@ -3,11 +3,11 @@
3 3
4ifdef CONFIG_FUNCTION_TRACER 4ifdef CONFIG_FUNCTION_TRACER
5ORIG_CFLAGS := $(KBUILD_CFLAGS) 5ORIG_CFLAGS := $(KBUILD_CFLAGS)
6KBUILD_CFLAGS = $(subst -pg,,$(ORIG_CFLAGS)) 6KBUILD_CFLAGS = $(subst $(CC_FLAGS_FTRACE),,$(ORIG_CFLAGS))
7 7
8ifdef CONFIG_FTRACE_SELFTEST 8ifdef CONFIG_FTRACE_SELFTEST
9# selftest needs instrumentation 9# selftest needs instrumentation
10CFLAGS_trace_selftest_dynamic.o = -pg 10CFLAGS_trace_selftest_dynamic.o = $(CC_FLAGS_FTRACE)
11obj-y += trace_selftest_dynamic.o 11obj-y += trace_selftest_dynamic.o
12endif 12endif
13endif 13endif
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 224e768bdc73..45e5cb143d17 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -5456,7 +5456,7 @@ static __init int ftrace_init_debugfs(void)
5456 struct dentry *d_tracer; 5456 struct dentry *d_tracer;
5457 5457
5458 d_tracer = tracing_init_dentry(); 5458 d_tracer = tracing_init_dentry();
5459 if (!d_tracer) 5459 if (IS_ERR(d_tracer))
5460 return 0; 5460 return 0;
5461 5461
5462 ftrace_init_dyn_debugfs(d_tracer); 5462 ftrace_init_dyn_debugfs(d_tracer);
diff --git a/kernel/trace/power-traces.c b/kernel/trace/power-traces.c
index 1c71382b283d..eb4220a132ec 100644
--- a/kernel/trace/power-traces.c
+++ b/kernel/trace/power-traces.c
@@ -13,5 +13,6 @@
13#define CREATE_TRACE_POINTS 13#define CREATE_TRACE_POINTS
14#include <trace/events/power.h> 14#include <trace/events/power.h>
15 15
16EXPORT_TRACEPOINT_SYMBOL_GPL(suspend_resume);
16EXPORT_TRACEPOINT_SYMBOL_GPL(cpu_idle); 17EXPORT_TRACEPOINT_SYMBOL_GPL(cpu_idle);
17 18
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 7a4104cb95cb..5040d44fe5a3 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -9,7 +9,6 @@
9#include <linux/trace_seq.h> 9#include <linux/trace_seq.h>
10#include <linux/spinlock.h> 10#include <linux/spinlock.h>
11#include <linux/irq_work.h> 11#include <linux/irq_work.h>
12#include <linux/debugfs.h>
13#include <linux/uaccess.h> 12#include <linux/uaccess.h>
14#include <linux/hardirq.h> 13#include <linux/hardirq.h>
15#include <linux/kthread.h> /* for self test */ 14#include <linux/kthread.h> /* for self test */
@@ -23,7 +22,6 @@
23#include <linux/hash.h> 22#include <linux/hash.h>
24#include <linux/list.h> 23#include <linux/list.h>
25#include <linux/cpu.h> 24#include <linux/cpu.h>
26#include <linux/fs.h>
27 25
28#include <asm/local.h> 26#include <asm/local.h>
29 27
@@ -447,7 +445,10 @@ int ring_buffer_print_page_header(struct trace_seq *s)
447struct rb_irq_work { 445struct rb_irq_work {
448 struct irq_work work; 446 struct irq_work work;
449 wait_queue_head_t waiters; 447 wait_queue_head_t waiters;
448 wait_queue_head_t full_waiters;
450 bool waiters_pending; 449 bool waiters_pending;
450 bool full_waiters_pending;
451 bool wakeup_full;
451}; 452};
452 453
453/* 454/*
@@ -529,6 +530,10 @@ static void rb_wake_up_waiters(struct irq_work *work)
529 struct rb_irq_work *rbwork = container_of(work, struct rb_irq_work, work); 530 struct rb_irq_work *rbwork = container_of(work, struct rb_irq_work, work);
530 531
531 wake_up_all(&rbwork->waiters); 532 wake_up_all(&rbwork->waiters);
533 if (rbwork->wakeup_full) {
534 rbwork->wakeup_full = false;
535 wake_up_all(&rbwork->full_waiters);
536 }
532} 537}
533 538
534/** 539/**
@@ -553,9 +558,11 @@ int ring_buffer_wait(struct ring_buffer *buffer, int cpu, bool full)
553 * data in any cpu buffer, or a specific buffer, put the 558 * data in any cpu buffer, or a specific buffer, put the
554 * caller on the appropriate wait queue. 559 * caller on the appropriate wait queue.
555 */ 560 */
556 if (cpu == RING_BUFFER_ALL_CPUS) 561 if (cpu == RING_BUFFER_ALL_CPUS) {
557 work = &buffer->irq_work; 562 work = &buffer->irq_work;
558 else { 563 /* Full only makes sense on per cpu reads */
564 full = false;
565 } else {
559 if (!cpumask_test_cpu(cpu, buffer->cpumask)) 566 if (!cpumask_test_cpu(cpu, buffer->cpumask))
560 return -ENODEV; 567 return -ENODEV;
561 cpu_buffer = buffer->buffers[cpu]; 568 cpu_buffer = buffer->buffers[cpu];
@@ -564,7 +571,10 @@ int ring_buffer_wait(struct ring_buffer *buffer, int cpu, bool full)
564 571
565 572
566 while (true) { 573 while (true) {
567 prepare_to_wait(&work->waiters, &wait, TASK_INTERRUPTIBLE); 574 if (full)
575 prepare_to_wait(&work->full_waiters, &wait, TASK_INTERRUPTIBLE);
576 else
577 prepare_to_wait(&work->waiters, &wait, TASK_INTERRUPTIBLE);
568 578
569 /* 579 /*
570 * The events can happen in critical sections where 580 * The events can happen in critical sections where
@@ -586,7 +596,10 @@ int ring_buffer_wait(struct ring_buffer *buffer, int cpu, bool full)
586 * that is necessary is that the wake up happens after 596 * that is necessary is that the wake up happens after
587 * a task has been queued. It's OK for spurious wake ups. 597 * a task has been queued. It's OK for spurious wake ups.
588 */ 598 */
589 work->waiters_pending = true; 599 if (full)
600 work->full_waiters_pending = true;
601 else
602 work->waiters_pending = true;
590 603
591 if (signal_pending(current)) { 604 if (signal_pending(current)) {
592 ret = -EINTR; 605 ret = -EINTR;
@@ -615,7 +628,10 @@ int ring_buffer_wait(struct ring_buffer *buffer, int cpu, bool full)
615 schedule(); 628 schedule();
616 } 629 }
617 630
618 finish_wait(&work->waiters, &wait); 631 if (full)
632 finish_wait(&work->full_waiters, &wait);
633 else
634 finish_wait(&work->waiters, &wait);
619 635
620 return ret; 636 return ret;
621} 637}
@@ -1230,6 +1246,7 @@ rb_allocate_cpu_buffer(struct ring_buffer *buffer, int nr_pages, int cpu)
1230 init_completion(&cpu_buffer->update_done); 1246 init_completion(&cpu_buffer->update_done);
1231 init_irq_work(&cpu_buffer->irq_work.work, rb_wake_up_waiters); 1247 init_irq_work(&cpu_buffer->irq_work.work, rb_wake_up_waiters);
1232 init_waitqueue_head(&cpu_buffer->irq_work.waiters); 1248 init_waitqueue_head(&cpu_buffer->irq_work.waiters);
1249 init_waitqueue_head(&cpu_buffer->irq_work.full_waiters);
1233 1250
1234 bpage = kzalloc_node(ALIGN(sizeof(*bpage), cache_line_size()), 1251 bpage = kzalloc_node(ALIGN(sizeof(*bpage), cache_line_size()),
1235 GFP_KERNEL, cpu_to_node(cpu)); 1252 GFP_KERNEL, cpu_to_node(cpu));
@@ -2801,6 +2818,8 @@ static void rb_commit(struct ring_buffer_per_cpu *cpu_buffer,
2801static __always_inline void 2818static __always_inline void
2802rb_wakeups(struct ring_buffer *buffer, struct ring_buffer_per_cpu *cpu_buffer) 2819rb_wakeups(struct ring_buffer *buffer, struct ring_buffer_per_cpu *cpu_buffer)
2803{ 2820{
2821 bool pagebusy;
2822
2804 if (buffer->irq_work.waiters_pending) { 2823 if (buffer->irq_work.waiters_pending) {
2805 buffer->irq_work.waiters_pending = false; 2824 buffer->irq_work.waiters_pending = false;
2806 /* irq_work_queue() supplies it's own memory barriers */ 2825 /* irq_work_queue() supplies it's own memory barriers */
@@ -2812,6 +2831,15 @@ rb_wakeups(struct ring_buffer *buffer, struct ring_buffer_per_cpu *cpu_buffer)
2812 /* irq_work_queue() supplies it's own memory barriers */ 2831 /* irq_work_queue() supplies it's own memory barriers */
2813 irq_work_queue(&cpu_buffer->irq_work.work); 2832 irq_work_queue(&cpu_buffer->irq_work.work);
2814 } 2833 }
2834
2835 pagebusy = cpu_buffer->reader_page == cpu_buffer->commit_page;
2836
2837 if (!pagebusy && cpu_buffer->irq_work.full_waiters_pending) {
2838 cpu_buffer->irq_work.wakeup_full = true;
2839 cpu_buffer->irq_work.full_waiters_pending = false;
2840 /* irq_work_queue() supplies it's own memory barriers */
2841 irq_work_queue(&cpu_buffer->irq_work.work);
2842 }
2815} 2843}
2816 2844
2817/** 2845/**
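
The new full_waiters machinery above lets a per-CPU reader sleep until a whole page of data is ready instead of waking on every event; a wait across all CPUs forces full to false, since "full" only makes sense for per-cpu reads. A sketch of the consumer side this serves; my_wait_for_page() is illustrative:

static int my_wait_for_page(struct ring_buffer *buffer, int cpu)
{
        int ret;

        /* Block until the reader page on this CPU is no longer the
         * commit page, i.e. at least one full page can be consumed. */
        ret = ring_buffer_wait(buffer, cpu, true);
        if (ret)
                return ret;     /* -EINTR, -ENODEV, ... */

        /* ... now drain events, e.g. via the read-page interface ... */
        return 0;
}
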
diff --git a/kernel/trace/ring_buffer_benchmark.c b/kernel/trace/ring_buffer_benchmark.c
index 3f9e328c30b5..13d945c0d03f 100644
--- a/kernel/trace/ring_buffer_benchmark.c
+++ b/kernel/trace/ring_buffer_benchmark.c
@@ -7,7 +7,7 @@
7#include <linux/completion.h> 7#include <linux/completion.h>
8#include <linux/kthread.h> 8#include <linux/kthread.h>
9#include <linux/module.h> 9#include <linux/module.h>
10#include <linux/time.h> 10#include <linux/ktime.h>
11#include <asm/local.h> 11#include <asm/local.h>
12 12
13struct rb_page { 13struct rb_page {
@@ -17,7 +17,7 @@ struct rb_page {
17}; 17};
18 18
19/* run time and sleep time in seconds */ 19/* run time and sleep time in seconds */
20#define RUN_TIME 10 20#define RUN_TIME 10ULL
21#define SLEEP_TIME 10 21#define SLEEP_TIME 10
22 22
23/* number of events for writer to wake up the reader */ 23/* number of events for writer to wake up the reader */
@@ -212,8 +212,7 @@ static void ring_buffer_consumer(void)
212 212
213static void ring_buffer_producer(void) 213static void ring_buffer_producer(void)
214{ 214{
215 struct timeval start_tv; 215 ktime_t start_time, end_time, timeout;
216 struct timeval end_tv;
217 unsigned long long time; 216 unsigned long long time;
218 unsigned long long entries; 217 unsigned long long entries;
219 unsigned long long overruns; 218 unsigned long long overruns;
@@ -227,7 +226,8 @@ static void ring_buffer_producer(void)
227 * make the system stall) 226 * make the system stall)
228 */ 227 */
229 trace_printk("Starting ring buffer hammer\n"); 228 trace_printk("Starting ring buffer hammer\n");
230 do_gettimeofday(&start_tv); 229 start_time = ktime_get();
230 timeout = ktime_add_ns(start_time, RUN_TIME * NSEC_PER_SEC);
231 do { 231 do {
232 struct ring_buffer_event *event; 232 struct ring_buffer_event *event;
233 int *entry; 233 int *entry;
@@ -244,7 +244,7 @@ static void ring_buffer_producer(void)
244 ring_buffer_unlock_commit(buffer, event); 244 ring_buffer_unlock_commit(buffer, event);
245 } 245 }
246 } 246 }
247 do_gettimeofday(&end_tv); 247 end_time = ktime_get();
248 248
249 cnt++; 249 cnt++;
250 if (consumer && !(cnt % wakeup_interval)) 250 if (consumer && !(cnt % wakeup_interval))
@@ -264,7 +264,7 @@ static void ring_buffer_producer(void)
264 cond_resched(); 264 cond_resched();
265#endif 265#endif
266 266
267 } while (end_tv.tv_sec < (start_tv.tv_sec + RUN_TIME) && !kill_test); 267 } while (ktime_before(end_time, timeout) && !kill_test);
268 trace_printk("End ring buffer hammer\n"); 268 trace_printk("End ring buffer hammer\n");
269 269
270 if (consumer) { 270 if (consumer) {
@@ -280,9 +280,7 @@ static void ring_buffer_producer(void)
280 wait_for_completion(&read_done); 280 wait_for_completion(&read_done);
281 } 281 }
282 282
283 time = end_tv.tv_sec - start_tv.tv_sec; 283 time = ktime_us_delta(end_time, start_time);
284 time *= USEC_PER_SEC;
285 time += (long long)((long)end_tv.tv_usec - (long)start_tv.tv_usec);
286 284
287 entries = ring_buffer_entries(buffer); 285 entries = ring_buffer_entries(buffer);
288 overruns = ring_buffer_overruns(buffer); 286 overruns = ring_buffer_overruns(buffer);
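
The benchmark's switch from do_gettimeofday() to ktime_get()/ktime_us_delta() above trades the settable wall clock for the monotonic clock, so the measured interval cannot be skewed by clock steps. The timing idiom in isolation; my_time_section() is illustrative:

#include <linux/ktime.h>

static u64 my_time_section(void (*fn)(void))
{
        ktime_t start = ktime_get();

        fn();
        return ktime_us_delta(ktime_get(), start);      /* elapsed microseconds */
}
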
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 4a9079b9f082..62c6506d663f 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -2036,7 +2036,8 @@ void trace_printk_init_buffers(void)
2036 2036
2037 /* trace_printk() is for debug use only. Don't use it in production. */ 2037 /* trace_printk() is for debug use only. Don't use it in production. */
2038 2038
2039 pr_warning("\n**********************************************************\n"); 2039 pr_warning("\n");
2040 pr_warning("**********************************************************\n");
2040 pr_warning("** NOTICE NOTICE NOTICE NOTICE NOTICE NOTICE NOTICE **\n"); 2041 pr_warning("** NOTICE NOTICE NOTICE NOTICE NOTICE NOTICE NOTICE **\n");
2041 pr_warning("** **\n"); 2042 pr_warning("** **\n");
2042 pr_warning("** trace_printk() being used. Allocating extra memory. **\n"); 2043 pr_warning("** trace_printk() being used. Allocating extra memory. **\n");
@@ -3352,12 +3353,12 @@ tracing_cpumask_read(struct file *filp, char __user *ubuf,
3352 3353
3353 mutex_lock(&tracing_cpumask_update_lock); 3354 mutex_lock(&tracing_cpumask_update_lock);
3354 3355
3355 len = cpumask_scnprintf(mask_str, count, tr->tracing_cpumask); 3356 len = snprintf(mask_str, count, "%*pb\n",
3356 if (count - len < 2) { 3357 cpumask_pr_args(tr->tracing_cpumask));
3358 if (len >= count) {
3357 count = -EINVAL; 3359 count = -EINVAL;
3358 goto out_err; 3360 goto out_err;
3359 } 3361 }
3360 len += sprintf(mask_str + len, "\n");
3361 count = simple_read_from_buffer(ubuf, count, ppos, mask_str, NR_CPUS+1); 3362 count = simple_read_from_buffer(ubuf, count, ppos, mask_str, NR_CPUS+1);
3362 3363
3363out_err: 3364out_err:
@@ -4140,6 +4141,12 @@ static int tracing_set_tracer(struct trace_array *tr, const char *buf)
4140 goto out; 4141 goto out;
4141 } 4142 }
4142 4143
4144 /* If trace pipe files are being read, we can't change the tracer */
4145 if (tr->current_trace->ref) {
4146 ret = -EBUSY;
4147 goto out;
4148 }
4149
4143 trace_branch_disable(); 4150 trace_branch_disable();
4144 4151
4145 tr->current_trace->enabled--; 4152 tr->current_trace->enabled--;
@@ -4326,17 +4333,7 @@ static int tracing_open_pipe(struct inode *inode, struct file *filp)
4326 } 4333 }
4327 4334
4328 trace_seq_init(&iter->seq); 4335 trace_seq_init(&iter->seq);
4329 4336 iter->trace = tr->current_trace;
4330 /*
4331 * We make a copy of the current tracer to avoid concurrent
4332 * changes on it while we are reading.
4333 */
4334 iter->trace = kmalloc(sizeof(*iter->trace), GFP_KERNEL);
4335 if (!iter->trace) {
4336 ret = -ENOMEM;
4337 goto fail;
4338 }
4339 *iter->trace = *tr->current_trace;
4340 4337
4341 if (!alloc_cpumask_var(&iter->started, GFP_KERNEL)) { 4338 if (!alloc_cpumask_var(&iter->started, GFP_KERNEL)) {
4342 ret = -ENOMEM; 4339 ret = -ENOMEM;
@@ -4363,6 +4360,8 @@ static int tracing_open_pipe(struct inode *inode, struct file *filp)
4363 iter->trace->pipe_open(iter); 4360 iter->trace->pipe_open(iter);
4364 4361
4365 nonseekable_open(inode, filp); 4362 nonseekable_open(inode, filp);
4363
4364 tr->current_trace->ref++;
4366out: 4365out:
4367 mutex_unlock(&trace_types_lock); 4366 mutex_unlock(&trace_types_lock);
4368 return ret; 4367 return ret;
@@ -4382,6 +4381,8 @@ static int tracing_release_pipe(struct inode *inode, struct file *file)
4382 4381
4383 mutex_lock(&trace_types_lock); 4382 mutex_lock(&trace_types_lock);
4384 4383
4384 tr->current_trace->ref--;
4385
4385 if (iter->trace->pipe_close) 4386 if (iter->trace->pipe_close)
4386 iter->trace->pipe_close(iter); 4387 iter->trace->pipe_close(iter);
4387 4388
@@ -4389,7 +4390,6 @@ static int tracing_release_pipe(struct inode *inode, struct file *file)
4389 4390
4390 free_cpumask_var(iter->started); 4391 free_cpumask_var(iter->started);
4391 mutex_destroy(&iter->mutex); 4392 mutex_destroy(&iter->mutex);
4392 kfree(iter->trace);
4393 kfree(iter); 4393 kfree(iter);
4394 4394
4395 trace_array_put(tr); 4395 trace_array_put(tr);
@@ -4422,7 +4422,7 @@ tracing_poll_pipe(struct file *filp, poll_table *poll_table)
4422 return trace_poll(iter, filp, poll_table); 4422 return trace_poll(iter, filp, poll_table);
4423} 4423}
4424 4424
4425/* Must be called with trace_types_lock mutex held. */ 4425/* Must be called with iter->mutex held. */
4426static int tracing_wait_pipe(struct file *filp) 4426static int tracing_wait_pipe(struct file *filp)
4427{ 4427{
4428 struct trace_iterator *iter = filp->private_data; 4428 struct trace_iterator *iter = filp->private_data;
@@ -4467,7 +4467,6 @@ tracing_read_pipe(struct file *filp, char __user *ubuf,
4467 size_t cnt, loff_t *ppos) 4467 size_t cnt, loff_t *ppos)
4468{ 4468{
4469 struct trace_iterator *iter = filp->private_data; 4469 struct trace_iterator *iter = filp->private_data;
4470 struct trace_array *tr = iter->tr;
4471 ssize_t sret; 4470 ssize_t sret;
4472 4471
4473 /* return any leftover data */ 4472 /* return any leftover data */
@@ -4477,12 +4476,6 @@ tracing_read_pipe(struct file *filp, char __user *ubuf,
4477 4476
4478 trace_seq_init(&iter->seq); 4477 trace_seq_init(&iter->seq);
4479 4478
4480 /* copy the tracer to avoid using a global lock all around */
4481 mutex_lock(&trace_types_lock);
4482 if (unlikely(iter->trace->name != tr->current_trace->name))
4483 *iter->trace = *tr->current_trace;
4484 mutex_unlock(&trace_types_lock);
4485
4486 /* 4479 /*
4487 * Avoid more than one consumer on a single file descriptor 4480 * Avoid more than one consumer on a single file descriptor
4488 * This is just a matter of traces coherency, the ring buffer itself 4481 * This is just a matter of traces coherency, the ring buffer itself
@@ -4642,7 +4635,6 @@ static ssize_t tracing_splice_read_pipe(struct file *filp,
4642 .ops = &tracing_pipe_buf_ops, 4635 .ops = &tracing_pipe_buf_ops,
4643 .spd_release = tracing_spd_release_pipe, 4636 .spd_release = tracing_spd_release_pipe,
4644 }; 4637 };
4645 struct trace_array *tr = iter->tr;
4646 ssize_t ret; 4638 ssize_t ret;
4647 size_t rem; 4639 size_t rem;
4648 unsigned int i; 4640 unsigned int i;
@@ -4650,12 +4642,6 @@ static ssize_t tracing_splice_read_pipe(struct file *filp,
4650 if (splice_grow_spd(pipe, &spd)) 4642 if (splice_grow_spd(pipe, &spd))
4651 return -ENOMEM; 4643 return -ENOMEM;
4652 4644
4653 /* copy the tracer to avoid using a global lock all around */
4654 mutex_lock(&trace_types_lock);
4655 if (unlikely(iter->trace->name != tr->current_trace->name))
4656 *iter->trace = *tr->current_trace;
4657 mutex_unlock(&trace_types_lock);
4658
4659 mutex_lock(&iter->mutex); 4645 mutex_lock(&iter->mutex);
4660 4646
4661 if (iter->trace->splice_read) { 4647 if (iter->trace->splice_read) {
@@ -4942,7 +4928,7 @@ tracing_mark_write(struct file *filp, const char __user *ubuf,
4942 *fpos += written; 4928 *fpos += written;
4943 4929
4944 out_unlock: 4930 out_unlock:
4945 for (i = 0; i < nr_pages; i++){ 4931 for (i = nr_pages - 1; i >= 0; i--) {
4946 kunmap_atomic(map_page[i]); 4932 kunmap_atomic(map_page[i]);
4947 put_page(pages[i]); 4933 put_page(pages[i]);
4948 } 4934 }
@@ -5331,6 +5317,8 @@ static int tracing_buffers_open(struct inode *inode, struct file *filp)
5331 5317
5332 filp->private_data = info; 5318 filp->private_data = info;
5333 5319
5320 tr->current_trace->ref++;
5321
5334 mutex_unlock(&trace_types_lock); 5322 mutex_unlock(&trace_types_lock);
5335 5323
5336 ret = nonseekable_open(inode, filp); 5324 ret = nonseekable_open(inode, filp);
@@ -5361,21 +5349,16 @@ tracing_buffers_read(struct file *filp, char __user *ubuf,
5361 if (!count) 5349 if (!count)
5362 return 0; 5350 return 0;
5363 5351
5364 mutex_lock(&trace_types_lock);
5365
5366#ifdef CONFIG_TRACER_MAX_TRACE 5352#ifdef CONFIG_TRACER_MAX_TRACE
5367 if (iter->snapshot && iter->tr->current_trace->use_max_tr) { 5353 if (iter->snapshot && iter->tr->current_trace->use_max_tr)
5368 size = -EBUSY; 5354 return -EBUSY;
5369 goto out_unlock;
5370 }
5371#endif 5355#endif
5372 5356
5373 if (!info->spare) 5357 if (!info->spare)
5374 info->spare = ring_buffer_alloc_read_page(iter->trace_buffer->buffer, 5358 info->spare = ring_buffer_alloc_read_page(iter->trace_buffer->buffer,
5375 iter->cpu_file); 5359 iter->cpu_file);
5376 size = -ENOMEM;
5377 if (!info->spare) 5360 if (!info->spare)
5378 goto out_unlock; 5361 return -ENOMEM;
5379 5362
5380 /* Do we have previous read data to read? */ 5363 /* Do we have previous read data to read? */
5381 if (info->read < PAGE_SIZE) 5364 if (info->read < PAGE_SIZE)
@@ -5391,21 +5374,16 @@ tracing_buffers_read(struct file *filp, char __user *ubuf,
5391 5374
5392 if (ret < 0) { 5375 if (ret < 0) {
5393 if (trace_empty(iter)) { 5376 if (trace_empty(iter)) {
5394 if ((filp->f_flags & O_NONBLOCK)) { 5377 if ((filp->f_flags & O_NONBLOCK))
5395 size = -EAGAIN; 5378 return -EAGAIN;
5396 goto out_unlock; 5379
5397 }
5398 mutex_unlock(&trace_types_lock);
5399 ret = wait_on_pipe(iter, false); 5380 ret = wait_on_pipe(iter, false);
5400 mutex_lock(&trace_types_lock); 5381 if (ret)
5401 if (ret) { 5382 return ret;
5402 size = ret; 5383
5403 goto out_unlock;
5404 }
5405 goto again; 5384 goto again;
5406 } 5385 }
5407 size = 0; 5386 return 0;
5408 goto out_unlock;
5409 } 5387 }
5410 5388
5411 info->read = 0; 5389 info->read = 0;
@@ -5415,18 +5393,14 @@ tracing_buffers_read(struct file *filp, char __user *ubuf,
5415 size = count; 5393 size = count;
5416 5394
5417 ret = copy_to_user(ubuf, info->spare + info->read, size); 5395 ret = copy_to_user(ubuf, info->spare + info->read, size);
5418 if (ret == size) { 5396 if (ret == size)
5419 size = -EFAULT; 5397 return -EFAULT;
5420 goto out_unlock; 5398
5421 }
5422 size -= ret; 5399 size -= ret;
5423 5400
5424 *ppos += size; 5401 *ppos += size;
5425 info->read += size; 5402 info->read += size;
5426 5403
5427 out_unlock:
5428 mutex_unlock(&trace_types_lock);
5429
5430 return size; 5404 return size;
5431} 5405}
5432 5406
@@ -5437,6 +5411,8 @@ static int tracing_buffers_release(struct inode *inode, struct file *file)
5437 5411
5438 mutex_lock(&trace_types_lock); 5412 mutex_lock(&trace_types_lock);
5439 5413
5414 iter->tr->current_trace->ref--;
5415
5440 __trace_array_put(iter->tr); 5416 __trace_array_put(iter->tr);
5441 5417
5442 if (info->spare) 5418 if (info->spare)
@@ -5522,30 +5498,20 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
5522 int entries, size, i; 5498 int entries, size, i;
5523 ssize_t ret = 0; 5499 ssize_t ret = 0;
5524 5500
5525 mutex_lock(&trace_types_lock);
5526
5527#ifdef CONFIG_TRACER_MAX_TRACE 5501#ifdef CONFIG_TRACER_MAX_TRACE
5528 if (iter->snapshot && iter->tr->current_trace->use_max_tr) { 5502 if (iter->snapshot && iter->tr->current_trace->use_max_tr)
5529 ret = -EBUSY; 5503 return -EBUSY;
5530 goto out;
5531 }
5532#endif 5504#endif
5533 5505
5534 if (splice_grow_spd(pipe, &spd)) { 5506 if (splice_grow_spd(pipe, &spd))
5535 ret = -ENOMEM; 5507 return -ENOMEM;
5536 goto out;
5537 }
5538 5508
5539 if (*ppos & (PAGE_SIZE - 1)) { 5509 if (*ppos & (PAGE_SIZE - 1))
5540 ret = -EINVAL; 5510 return -EINVAL;
5541 goto out;
5542 }
5543 5511
5544 if (len & (PAGE_SIZE - 1)) { 5512 if (len & (PAGE_SIZE - 1)) {
5545 if (len < PAGE_SIZE) { 5513 if (len < PAGE_SIZE)
5546 ret = -EINVAL; 5514 return -EINVAL;
5547 goto out;
5548 }
5549 len &= PAGE_MASK; 5515 len &= PAGE_MASK;
5550 } 5516 }
5551 5517
@@ -5606,25 +5572,20 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
5606 /* did we read anything? */ 5572 /* did we read anything? */
5607 if (!spd.nr_pages) { 5573 if (!spd.nr_pages) {
5608 if (ret) 5574 if (ret)
5609 goto out; 5575 return ret;
5576
5577 if ((file->f_flags & O_NONBLOCK) || (flags & SPLICE_F_NONBLOCK))
5578 return -EAGAIN;
5610 5579
5611 if ((file->f_flags & O_NONBLOCK) || (flags & SPLICE_F_NONBLOCK)) {
5612 ret = -EAGAIN;
5613 goto out;
5614 }
5615 mutex_unlock(&trace_types_lock);
5616 ret = wait_on_pipe(iter, true); 5580 ret = wait_on_pipe(iter, true);
5617 mutex_lock(&trace_types_lock);
5618 if (ret) 5581 if (ret)
5619 goto out; 5582 return ret;
5620 5583
5621 goto again; 5584 goto again;
5622 } 5585 }
5623 5586
5624 ret = splice_to_pipe(pipe, &spd); 5587 ret = splice_to_pipe(pipe, &spd);
5625 splice_shrink_spd(&spd); 5588 splice_shrink_spd(&spd);
5626out:
5627 mutex_unlock(&trace_types_lock);
5628 5589
5629 return ret; 5590 return ret;
5630} 5591}
@@ -5854,28 +5815,11 @@ static __init int register_snapshot_cmd(void)
5854static inline __init int register_snapshot_cmd(void) { return 0; } 5815static inline __init int register_snapshot_cmd(void) { return 0; }
5855#endif /* defined(CONFIG_TRACER_SNAPSHOT) && defined(CONFIG_DYNAMIC_FTRACE) */ 5816#endif /* defined(CONFIG_TRACER_SNAPSHOT) && defined(CONFIG_DYNAMIC_FTRACE) */
5856 5817
5857struct dentry *tracing_init_dentry_tr(struct trace_array *tr) 5818static struct dentry *tracing_get_dentry(struct trace_array *tr)
5858{ 5819{
5859 if (tr->dir)
5860 return tr->dir;
5861
5862 if (!debugfs_initialized())
5863 return NULL;
5864
5865 if (tr->flags & TRACE_ARRAY_FL_GLOBAL)
5866 tr->dir = debugfs_create_dir("tracing", NULL);
5867
5868 if (!tr->dir)
5869 pr_warn_once("Could not create debugfs directory 'tracing'\n");
5870
5871 return tr->dir; 5820 return tr->dir;
5872} 5821}
5873 5822
5874struct dentry *tracing_init_dentry(void)
5875{
5876 return tracing_init_dentry_tr(&global_trace);
5877}
5878
5879static struct dentry *tracing_dentry_percpu(struct trace_array *tr, int cpu) 5823static struct dentry *tracing_dentry_percpu(struct trace_array *tr, int cpu)
5880{ 5824{
5881 struct dentry *d_tracer; 5825 struct dentry *d_tracer;
@@ -5883,8 +5827,8 @@ static struct dentry *tracing_dentry_percpu(struct trace_array *tr, int cpu)
5883 if (tr->percpu_dir) 5827 if (tr->percpu_dir)
5884 return tr->percpu_dir; 5828 return tr->percpu_dir;
5885 5829
5886 d_tracer = tracing_init_dentry_tr(tr); 5830 d_tracer = tracing_get_dentry(tr);
5887 if (!d_tracer) 5831 if (IS_ERR(d_tracer))
5888 return NULL; 5832 return NULL;
5889 5833
5890 tr->percpu_dir = debugfs_create_dir("per_cpu", d_tracer); 5834 tr->percpu_dir = debugfs_create_dir("per_cpu", d_tracer);
@@ -6086,8 +6030,8 @@ static struct dentry *trace_options_init_dentry(struct trace_array *tr)
6086 if (tr->options) 6030 if (tr->options)
6087 return tr->options; 6031 return tr->options;
6088 6032
6089 d_tracer = tracing_init_dentry_tr(tr); 6033 d_tracer = tracing_get_dentry(tr);
6090 if (!d_tracer) 6034 if (IS_ERR(d_tracer))
6091 return NULL; 6035 return NULL;
6092 6036
6093 tr->options = debugfs_create_dir("options", d_tracer); 6037 tr->options = debugfs_create_dir("options", d_tracer);
@@ -6416,7 +6360,7 @@ static int instance_delete(const char *name)
6416 goto out_unlock; 6360 goto out_unlock;
6417 6361
6418 ret = -EBUSY; 6362 ret = -EBUSY;
6419 if (tr->ref) 6363 if (tr->ref || (tr->current_trace && tr->current_trace->ref))
6420 goto out_unlock; 6364 goto out_unlock;
6421 6365
6422 list_del(&tr->list); 6366 list_del(&tr->list);
@@ -6571,6 +6515,33 @@ init_tracer_debugfs(struct trace_array *tr, struct dentry *d_tracer)
 
 }
 
+/**
+ * tracing_init_dentry - initialize top level trace array
+ *
+ * This is called when creating files or directories in the tracing
+ * directory. It is called via fs_initcall() by any of the boot up code
+ * and expects to return the dentry of the top level tracing directory.
+ */
+struct dentry *tracing_init_dentry(void)
+{
+	struct trace_array *tr = &global_trace;
+
+	if (tr->dir)
+		return tr->dir;
+
+	if (WARN_ON(!debugfs_initialized()))
+		return ERR_PTR(-ENODEV);
+
+	tr->dir = debugfs_create_dir("tracing", NULL);
+
+	if (!tr->dir) {
+		pr_warn_once("Could not create debugfs directory 'tracing'\n");
+		return ERR_PTR(-ENOMEM);
+	}
+
+	return tr->dir;
+}
+
 static __init int tracer_init_debugfs(void)
 {
 	struct dentry *d_tracer;
@@ -6578,7 +6549,7 @@ static __init int tracer_init_debugfs(void)
 	trace_access_lock_init();
 
 	d_tracer = tracing_init_dentry();
-	if (!d_tracer)
+	if (IS_ERR(d_tracer))
 		return 0;
 
 	init_tracer_debugfs(&global_trace, d_tracer);
@@ -6811,7 +6782,6 @@ __init static int tracer_alloc_buffers(void)
 	int ring_buf_size;
 	int ret = -ENOMEM;
 
-
 	if (!alloc_cpumask_var(&tracing_buffer_mask, GFP_KERNEL))
 		goto out;
 
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 8de48bac1ce2..dd8205a35760 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -388,6 +388,7 @@ struct tracer {
 	struct tracer *next;
 	struct tracer_flags *flags;
 	int enabled;
+	int ref;
 	bool print_max;
 	bool allow_instances;
 #ifdef CONFIG_TRACER_MAX_TRACE
@@ -541,7 +542,6 @@ struct dentry *trace_create_file(const char *name,
 				 void *data,
 				 const struct file_operations *fops);
 
-struct dentry *tracing_init_dentry_tr(struct trace_array *tr);
 struct dentry *tracing_init_dentry(void);
 
 struct ring_buffer_event;
diff --git a/kernel/trace/trace_branch.c b/kernel/trace/trace_branch.c
index 7d6e2afde669..57cbf1efdd44 100644
--- a/kernel/trace/trace_branch.c
+++ b/kernel/trace/trace_branch.c
@@ -7,7 +7,6 @@
 #include <linux/seq_file.h>
 #include <linux/spinlock.h>
 #include <linux/irqflags.h>
-#include <linux/debugfs.h>
 #include <linux/uaccess.h>
 #include <linux/module.h>
 #include <linux/ftrace.h>
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index b03a0ea77b99..db54dda10ccc 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -2531,7 +2531,7 @@ static __init int event_trace_init(void)
 		return -ENODEV;
 
 	d_tracer = tracing_init_dentry();
-	if (!d_tracer)
+	if (IS_ERR(d_tracer))
 		return 0;
 
 	entry = debugfs_create_file("available_events", 0444, d_tracer,
diff --git a/kernel/trace/trace_export.c b/kernel/trace/trace_export.c
index d4ddde28a81a..12e2b99be862 100644
--- a/kernel/trace/trace_export.c
+++ b/kernel/trace/trace_export.c
@@ -6,12 +6,10 @@
 #include <linux/stringify.h>
 #include <linux/kallsyms.h>
 #include <linux/seq_file.h>
-#include <linux/debugfs.h>
 #include <linux/uaccess.h>
 #include <linux/ftrace.h>
 #include <linux/module.h>
 #include <linux/init.h>
-#include <linux/fs.h>
 
 #include "trace_output.h"
 
diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c
index ba476009e5de..2d25ad1526bb 100644
--- a/kernel/trace/trace_functions_graph.c
+++ b/kernel/trace/trace_functions_graph.c
@@ -1437,7 +1437,7 @@ static __init int init_graph_debugfs(void)
 	struct dentry *d_tracer;
 
 	d_tracer = tracing_init_dentry();
-	if (!d_tracer)
+	if (IS_ERR(d_tracer))
 		return 0;
 
 	trace_create_file("max_graph_depth", 0644, d_tracer,
diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c
index 9bb104f748d0..8523ea345f2b 100644
--- a/kernel/trace/trace_irqsoff.c
+++ b/kernel/trace/trace_irqsoff.c
@@ -10,11 +10,9 @@
  * Copyright (C) 2004 Nadia Yvette Chambers
  */
 #include <linux/kallsyms.h>
-#include <linux/debugfs.h>
 #include <linux/uaccess.h>
 #include <linux/module.h>
 #include <linux/ftrace.h>
-#include <linux/fs.h>
 
 #include "trace.h"
 
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index 296079ae6583..d73f565b4e06 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -1320,7 +1320,7 @@ static __init int init_kprobe_trace(void)
 		return -EINVAL;
 
 	d_tracer = tracing_init_dentry();
-	if (!d_tracer)
+	if (IS_ERR(d_tracer))
 		return 0;
 
 	entry = debugfs_create_file("kprobe_events", 0644, d_tracer,
diff --git a/kernel/trace/trace_nop.c b/kernel/trace/trace_nop.c
index fcf0a9e48916..8bb2071474dd 100644
--- a/kernel/trace/trace_nop.c
+++ b/kernel/trace/trace_nop.c
@@ -6,8 +6,6 @@
  */
 
 #include <linux/module.h>
-#include <linux/fs.h>
-#include <linux/debugfs.h>
 #include <linux/ftrace.h>
 
 #include "trace.h"
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index b77b9a697619..692bf7184c8c 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -177,6 +177,50 @@ ftrace_print_hex_seq(struct trace_seq *p, const unsigned char *buf, int buf_len)
 }
 EXPORT_SYMBOL(ftrace_print_hex_seq);
 
+const char *
+ftrace_print_array_seq(struct trace_seq *p, const void *buf, int buf_len,
+		       size_t el_size)
+{
+	const char *ret = trace_seq_buffer_ptr(p);
+	const char *prefix = "";
+	void *ptr = (void *)buf;
+
+	trace_seq_putc(p, '{');
+
+	while (ptr < buf + buf_len) {
+		switch (el_size) {
+		case 1:
+			trace_seq_printf(p, "%s0x%x", prefix,
+					 *(u8 *)ptr);
+			break;
+		case 2:
+			trace_seq_printf(p, "%s0x%x", prefix,
+					 *(u16 *)ptr);
+			break;
+		case 4:
+			trace_seq_printf(p, "%s0x%x", prefix,
+					 *(u32 *)ptr);
+			break;
+		case 8:
+			trace_seq_printf(p, "%s0x%llx", prefix,
+					 *(u64 *)ptr);
+			break;
+		default:
+			trace_seq_printf(p, "BAD SIZE:%zu 0x%x", el_size,
+					 *(u8 *)ptr);
+			el_size = 1;
+		}
+		prefix = ",";
+		ptr += el_size;
+	}
+
+	trace_seq_putc(p, '}');
+	trace_seq_putc(p, 0);
+
+	return ret;
+}
+EXPORT_SYMBOL(ftrace_print_array_seq);
+
 int ftrace_raw_output_prep(struct trace_iterator *iter,
 			   struct trace_event *trace_event)
 {
diff --git a/kernel/trace/trace_printk.c b/kernel/trace/trace_printk.c
index c4e70b6bd7fa..36c1455b7567 100644
--- a/kernel/trace/trace_printk.c
+++ b/kernel/trace/trace_printk.c
@@ -5,7 +5,6 @@
  *
  */
 #include <linux/seq_file.h>
-#include <linux/debugfs.h>
 #include <linux/uaccess.h>
 #include <linux/kernel.h>
 #include <linux/ftrace.h>
@@ -15,7 +14,6 @@
 #include <linux/ctype.h>
 #include <linux/list.h>
 #include <linux/slab.h>
-#include <linux/fs.h>
 
 #include "trace.h"
 
@@ -349,7 +347,7 @@ static __init int init_trace_printk_function_export(void)
 	struct dentry *d_tracer;
 
 	d_tracer = tracing_init_dentry();
-	if (!d_tracer)
+	if (IS_ERR(d_tracer))
 		return 0;
 
 	trace_create_file("printk_formats", 0444, d_tracer,
diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c
index 2e293beb186e..419ca37e72c9 100644
--- a/kernel/trace/trace_sched_switch.c
+++ b/kernel/trace/trace_sched_switch.c
@@ -5,8 +5,6 @@
  *
  */
 #include <linux/module.h>
-#include <linux/fs.h>
-#include <linux/debugfs.h>
 #include <linux/kallsyms.h>
 #include <linux/uaccess.h>
 #include <linux/ftrace.h>
diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c
index 8fb84b362816..d6e1003724e9 100644
--- a/kernel/trace/trace_sched_wakeup.c
+++ b/kernel/trace/trace_sched_wakeup.c
@@ -10,8 +10,6 @@
  * Copyright (C) 2004 Nadia Yvette Chambers
  */
 #include <linux/module.h>
-#include <linux/fs.h>
-#include <linux/debugfs.h>
 #include <linux/kallsyms.h>
 #include <linux/uaccess.h>
 #include <linux/ftrace.h>
diff --git a/kernel/trace/trace_seq.c b/kernel/trace/trace_seq.c
index f8b45d8792f9..e694c9f9efa4 100644
--- a/kernel/trace/trace_seq.c
+++ b/kernel/trace/trace_seq.c
@@ -120,7 +120,7 @@ void trace_seq_bitmask(struct trace_seq *s, const unsigned long *maskp,
 
 	__trace_seq_init(s);
 
-	seq_buf_bitmask(&s->seq, maskp, nmaskbits);
+	seq_buf_printf(&s->seq, "%*pb", nmaskbits, maskp);
 
 	if (unlikely(seq_buf_has_overflowed(&s->seq))) {
 		s->seq.len = save_len;
diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c
index 16eddb308c33..c3e4fcfddd45 100644
--- a/kernel/trace/trace_stack.c
+++ b/kernel/trace/trace_stack.c
@@ -7,12 +7,10 @@
 #include <linux/seq_file.h>
 #include <linux/spinlock.h>
 #include <linux/uaccess.h>
-#include <linux/debugfs.h>
 #include <linux/ftrace.h>
 #include <linux/module.h>
 #include <linux/sysctl.h>
 #include <linux/init.h>
-#include <linux/fs.h>
 
 #include <asm/setup.h>
 
@@ -462,7 +460,7 @@ static __init int stack_trace_init(void)
 	struct dentry *d_tracer;
 
 	d_tracer = tracing_init_dentry();
-	if (!d_tracer)
+	if (IS_ERR(d_tracer))
 		return 0;
 
 	trace_create_file("stack_max_size", 0644, d_tracer,
diff --git a/kernel/trace/trace_stat.c b/kernel/trace/trace_stat.c
index 7af67360b330..75e19e86c954 100644
--- a/kernel/trace/trace_stat.c
+++ b/kernel/trace/trace_stat.c
@@ -276,7 +276,7 @@ static int tracing_stat_init(void)
 	struct dentry *d_tracing;
 
 	d_tracing = tracing_init_dentry();
-	if (!d_tracing)
+	if (IS_ERR(d_tracing))
 		return 0;
 
 	stat_dir = debugfs_create_dir("trace_stat", d_tracing);
diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c
index b11441321e7a..7dc1c8abecd6 100644
--- a/kernel/trace/trace_uprobe.c
+++ b/kernel/trace/trace_uprobe.c
@@ -1321,7 +1321,7 @@ static __init int init_uprobe_trace(void)
 	struct dentry *d_tracer;
 
 	d_tracer = tracing_init_dentry();
-	if (!d_tracer)
+	if (IS_ERR(d_tracer))
 		return 0;
 
 	trace_create_file("uprobe_events", 0644, d_tracer,
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index 70bf11815f84..3174bf8e3538 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -154,7 +154,7 @@ static int get_softlockup_thresh(void)
  */
 static unsigned long get_timestamp(void)
 {
-	return local_clock() >> 30LL;  /* 2^30 ~= 10^9 */
+	return running_clock() >> 30LL;  /* 2^30 ~= 10^9 */
 }
 
 static void set_sample_period(void)
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index beeeac9e0e3e..f28849394791 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -3083,10 +3083,9 @@ static ssize_t wq_cpumask_show(struct device *dev,
 	int written;
 
 	mutex_lock(&wq->mutex);
-	written = cpumask_scnprintf(buf, PAGE_SIZE, wq->unbound_attrs->cpumask);
+	written = scnprintf(buf, PAGE_SIZE, "%*pb\n",
+			    cpumask_pr_args(wq->unbound_attrs->cpumask));
 	mutex_unlock(&wq->mutex);
-
-	written += scnprintf(buf + written, PAGE_SIZE - written, "\n");
 	return written;
 }
 