aboutsummaryrefslogtreecommitdiffstats
path: root/kernel
diff options
context:
space:
mode:
Diffstat (limited to 'kernel')
-rw-r--r--kernel/Kconfig.locks4
-rw-r--r--kernel/Makefile7
-rw-r--r--kernel/auditfilter.c2
-rw-r--r--kernel/bpf/syscall.c25
-rw-r--r--kernel/cgroup.c12
-rw-r--r--kernel/compat.c5
-rw-r--r--kernel/cpu.c56
-rw-r--r--kernel/cpuset.c2
-rw-r--r--kernel/events/Makefile2
-rw-r--r--kernel/events/core.c491
-rw-r--r--kernel/events/ring_buffer.c3
-rw-r--r--kernel/exit.c3
-rw-r--r--kernel/fork.c17
-rw-r--r--kernel/futex.c8
-rw-r--r--kernel/kexec.c2
-rw-r--r--kernel/kprobes.c4
-rw-r--r--kernel/livepatch/Kconfig18
-rw-r--r--kernel/livepatch/Makefile3
-rw-r--r--kernel/livepatch/core.c1015
-rw-r--r--kernel/locking/Makefile11
-rw-r--r--kernel/locking/mcs_spinlock.h16
-rw-r--r--kernel/locking/mutex.c62
-rw-r--r--kernel/locking/osq_lock.c (renamed from kernel/locking/mcs_spinlock.c)9
-rw-r--r--kernel/locking/rtmutex.c7
-rw-r--r--kernel/locking/rwsem-spinlock.c2
-rw-r--r--kernel/locking/rwsem-xadd.c3
-rw-r--r--kernel/locking/spinlock.c8
-rw-r--r--kernel/notifier.c3
-rw-r--r--kernel/panic.c2
-rw-r--r--kernel/power/Kconfig1
-rw-r--r--kernel/power/process.c75
-rw-r--r--kernel/power/qos.c91
-rw-r--r--kernel/power/snapshot.c11
-rw-r--r--kernel/printk/printk.c12
-rw-r--r--kernel/rcu/Makefile3
-rw-r--r--kernel/rcu/rcu.h6
-rw-r--r--kernel/rcu/rcutorture.c66
-rw-r--r--kernel/rcu/srcu.c2
-rw-r--r--kernel/rcu/tiny.c113
-rw-r--r--kernel/rcu/tiny_plugin.h9
-rw-r--r--kernel/rcu/tree.c355
-rw-r--r--kernel/rcu/tree.h62
-rw-r--r--kernel/rcu/tree_plugin.h271
-rw-r--r--kernel/rcu/tree_trace.c8
-rw-r--r--kernel/resource.c25
-rw-r--r--kernel/sched/Makefile2
-rw-r--r--kernel/sched/clock.c13
-rw-r--r--kernel/sched/completion.c18
-rw-r--r--kernel/sched/core.c150
-rw-r--r--kernel/sched/cpudeadline.c27
-rw-r--r--kernel/sched/cpudeadline.h2
-rw-r--r--kernel/sched/deadline.c54
-rw-r--r--kernel/sched/debug.c1
-rw-r--r--kernel/sched/fair.c9
-rw-r--r--kernel/sched/idle.c3
-rw-r--r--kernel/sched/rt.c26
-rw-r--r--kernel/sched/sched.h22
-rw-r--r--kernel/signal.c2
-rw-r--r--kernel/smpboot.c2
-rw-r--r--kernel/softirq.c9
-rw-r--r--kernel/sys.c4
-rw-r--r--kernel/sysctl.c3
-rw-r--r--kernel/taskstats.c13
-rw-r--r--kernel/time/Makefile2
-rw-r--r--kernel/time/alarmtimer.c2
-rw-r--r--kernel/time/clocksource.c76
-rw-r--r--kernel/time/hrtimer.c116
-rw-r--r--kernel/time/ntp.c11
-rw-r--r--kernel/time/posix-cpu-timers.c3
-rw-r--r--kernel/time/time.c4
-rw-r--r--kernel/time/timecounter.c112
-rw-r--r--kernel/time/timekeeping.c12
-rw-r--r--kernel/trace/Makefile4
-rw-r--r--kernel/trace/ftrace.c2
-rw-r--r--kernel/trace/power-traces.c1
-rw-r--r--kernel/trace/ring_buffer.c42
-rw-r--r--kernel/trace/ring_buffer_benchmark.c18
-rw-r--r--kernel/trace/trace.c188
-rw-r--r--kernel/trace/trace.h2
-rw-r--r--kernel/trace/trace_branch.c1
-rw-r--r--kernel/trace/trace_event_perf.c4
-rw-r--r--kernel/trace/trace_events.c2
-rw-r--r--kernel/trace/trace_export.c2
-rw-r--r--kernel/trace/trace_functions_graph.c2
-rw-r--r--kernel/trace/trace_irqsoff.c2
-rw-r--r--kernel/trace/trace_kprobe.c6
-rw-r--r--kernel/trace/trace_nop.c2
-rw-r--r--kernel/trace/trace_output.c44
-rw-r--r--kernel/trace/trace_printk.c4
-rw-r--r--kernel/trace/trace_sched_switch.c2
-rw-r--r--kernel/trace/trace_sched_wakeup.c2
-rw-r--r--kernel/trace/trace_stack.c4
-rw-r--r--kernel/trace/trace_stat.c2
-rw-r--r--kernel/trace/trace_syscalls.c4
-rw-r--r--kernel/trace/trace_uprobe.c4
-rw-r--r--kernel/watchdog.c2
96 files changed, 2728 insertions, 1228 deletions
diff --git a/kernel/Kconfig.locks b/kernel/Kconfig.locks
index 76768ee812b2..08561f1acd13 100644
--- a/kernel/Kconfig.locks
+++ b/kernel/Kconfig.locks
@@ -231,6 +231,10 @@ config RWSEM_SPIN_ON_OWNER
231 def_bool y 231 def_bool y
232 depends on SMP && RWSEM_XCHGADD_ALGORITHM && ARCH_SUPPORTS_ATOMIC_RMW 232 depends on SMP && RWSEM_XCHGADD_ALGORITHM && ARCH_SUPPORTS_ATOMIC_RMW
233 233
234config LOCK_SPIN_ON_OWNER
235 def_bool y
236 depends on MUTEX_SPIN_ON_OWNER || RWSEM_SPIN_ON_OWNER
237
234config ARCH_USE_QUEUE_RWLOCK 238config ARCH_USE_QUEUE_RWLOCK
235 bool 239 bool
236 240
diff --git a/kernel/Makefile b/kernel/Makefile
index a59481a3fa6c..1408b3353a3c 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -13,8 +13,8 @@ obj-y = fork.o exec_domain.o panic.o \
13 13
14ifdef CONFIG_FUNCTION_TRACER 14ifdef CONFIG_FUNCTION_TRACER
15# Do not trace debug files and internal ftrace files 15# Do not trace debug files and internal ftrace files
16CFLAGS_REMOVE_cgroup-debug.o = -pg 16CFLAGS_REMOVE_cgroup-debug.o = $(CC_FLAGS_FTRACE)
17CFLAGS_REMOVE_irq_work.o = -pg 17CFLAGS_REMOVE_irq_work.o = $(CC_FLAGS_FTRACE)
18endif 18endif
19 19
20# cond_syscall is currently not LTO compatible 20# cond_syscall is currently not LTO compatible
@@ -26,6 +26,7 @@ obj-y += power/
26obj-y += printk/ 26obj-y += printk/
27obj-y += irq/ 27obj-y += irq/
28obj-y += rcu/ 28obj-y += rcu/
29obj-y += livepatch/
29 30
30obj-$(CONFIG_CHECKPOINT_RESTORE) += kcmp.o 31obj-$(CONFIG_CHECKPOINT_RESTORE) += kcmp.o
31obj-$(CONFIG_FREEZER) += freezer.o 32obj-$(CONFIG_FREEZER) += freezer.o
@@ -142,7 +143,7 @@ endif
142kernel/system_certificates.o: $(obj)/x509_certificate_list 143kernel/system_certificates.o: $(obj)/x509_certificate_list
143 144
144quiet_cmd_x509certs = CERTS $@ 145quiet_cmd_x509certs = CERTS $@
145 cmd_x509certs = cat $(X509_CERTIFICATES) /dev/null >$@ $(foreach X509,$(X509_CERTIFICATES),; echo " - Including cert $(X509)") 146 cmd_x509certs = cat $(X509_CERTIFICATES) /dev/null >$@ $(foreach X509,$(X509_CERTIFICATES),; $(kecho) " - Including cert $(X509)")
146 147
147targets += $(obj)/x509_certificate_list 148targets += $(obj)/x509_certificate_list
148$(obj)/x509_certificate_list: $(X509_CERTIFICATES) $(obj)/.x509.list 149$(obj)/x509_certificate_list: $(X509_CERTIFICATES) $(obj)/.x509.list
diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c
index 4f68a326d92e..72e1660a79a3 100644
--- a/kernel/auditfilter.c
+++ b/kernel/auditfilter.c
@@ -425,7 +425,6 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data,
425 goto exit_nofree; 425 goto exit_nofree;
426 426
427 bufp = data->buf; 427 bufp = data->buf;
428 entry->rule.vers_ops = 2;
429 for (i = 0; i < data->field_count; i++) { 428 for (i = 0; i < data->field_count; i++) {
430 struct audit_field *f = &entry->rule.fields[i]; 429 struct audit_field *f = &entry->rule.fields[i];
431 430
@@ -758,7 +757,6 @@ struct audit_entry *audit_dupe_rule(struct audit_krule *old)
758 return ERR_PTR(-ENOMEM); 757 return ERR_PTR(-ENOMEM);
759 758
760 new = &entry->rule; 759 new = &entry->rule;
761 new->vers_ops = old->vers_ops;
762 new->flags = old->flags; 760 new->flags = old->flags;
763 new->pflags = old->pflags; 761 new->pflags = old->pflags;
764 new->listnr = old->listnr; 762 new->listnr = old->listnr;
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 088ac0b1b106..536edc2be307 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -150,7 +150,7 @@ static int map_lookup_elem(union bpf_attr *attr)
150 int ufd = attr->map_fd; 150 int ufd = attr->map_fd;
151 struct fd f = fdget(ufd); 151 struct fd f = fdget(ufd);
152 struct bpf_map *map; 152 struct bpf_map *map;
153 void *key, *value; 153 void *key, *value, *ptr;
154 int err; 154 int err;
155 155
156 if (CHECK_ATTR(BPF_MAP_LOOKUP_ELEM)) 156 if (CHECK_ATTR(BPF_MAP_LOOKUP_ELEM))
@@ -169,20 +169,29 @@ static int map_lookup_elem(union bpf_attr *attr)
169 if (copy_from_user(key, ukey, map->key_size) != 0) 169 if (copy_from_user(key, ukey, map->key_size) != 0)
170 goto free_key; 170 goto free_key;
171 171
172 err = -ENOENT; 172 err = -ENOMEM;
173 rcu_read_lock(); 173 value = kmalloc(map->value_size, GFP_USER);
174 value = map->ops->map_lookup_elem(map, key);
175 if (!value) 174 if (!value)
176 goto err_unlock; 175 goto free_key;
176
177 rcu_read_lock();
178 ptr = map->ops->map_lookup_elem(map, key);
179 if (ptr)
180 memcpy(value, ptr, map->value_size);
181 rcu_read_unlock();
182
183 err = -ENOENT;
184 if (!ptr)
185 goto free_value;
177 186
178 err = -EFAULT; 187 err = -EFAULT;
179 if (copy_to_user(uvalue, value, map->value_size) != 0) 188 if (copy_to_user(uvalue, value, map->value_size) != 0)
180 goto err_unlock; 189 goto free_value;
181 190
182 err = 0; 191 err = 0;
183 192
184err_unlock: 193free_value:
185 rcu_read_unlock(); 194 kfree(value);
186free_key: 195free_key:
187 kfree(key); 196 kfree(key);
188err_put: 197err_put:
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index bb263d0caab3..d5f6ec251fb2 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -1909,7 +1909,7 @@ static void cgroup_kill_sb(struct super_block *sb)
1909 * 1909 *
1910 * And don't kill the default root. 1910 * And don't kill the default root.
1911 */ 1911 */
1912 if (css_has_online_children(&root->cgrp.self) || 1912 if (!list_empty(&root->cgrp.self.children) ||
1913 root == &cgrp_dfl_root) 1913 root == &cgrp_dfl_root)
1914 cgroup_put(&root->cgrp); 1914 cgroup_put(&root->cgrp);
1915 else 1915 else
@@ -4373,16 +4373,20 @@ static void css_free_work_fn(struct work_struct *work)
4373{ 4373{
4374 struct cgroup_subsys_state *css = 4374 struct cgroup_subsys_state *css =
4375 container_of(work, struct cgroup_subsys_state, destroy_work); 4375 container_of(work, struct cgroup_subsys_state, destroy_work);
4376 struct cgroup_subsys *ss = css->ss;
4376 struct cgroup *cgrp = css->cgroup; 4377 struct cgroup *cgrp = css->cgroup;
4377 4378
4378 percpu_ref_exit(&css->refcnt); 4379 percpu_ref_exit(&css->refcnt);
4379 4380
4380 if (css->ss) { 4381 if (ss) {
4381 /* css free path */ 4382 /* css free path */
4383 int id = css->id;
4384
4382 if (css->parent) 4385 if (css->parent)
4383 css_put(css->parent); 4386 css_put(css->parent);
4384 4387
4385 css->ss->css_free(css); 4388 ss->css_free(css);
4389 cgroup_idr_remove(&ss->css_idr, id);
4386 cgroup_put(cgrp); 4390 cgroup_put(cgrp);
4387 } else { 4391 } else {
4388 /* cgroup free path */ 4392 /* cgroup free path */
@@ -4434,7 +4438,7 @@ static void css_release_work_fn(struct work_struct *work)
4434 4438
4435 if (ss) { 4439 if (ss) {
4436 /* css release path */ 4440 /* css release path */
4437 cgroup_idr_remove(&ss->css_idr, css->id); 4441 cgroup_idr_replace(&ss->css_idr, NULL, css->id);
4438 if (ss->css_released) 4442 if (ss->css_released)
4439 ss->css_released(css); 4443 ss->css_released(css);
4440 } else { 4444 } else {
diff --git a/kernel/compat.c b/kernel/compat.c
index ebb3c369d03d..24f00610c575 100644
--- a/kernel/compat.c
+++ b/kernel/compat.c
@@ -276,8 +276,7 @@ COMPAT_SYSCALL_DEFINE2(nanosleep, struct compat_timespec __user *, rqtp,
276 * core implementation decides to return random nonsense. 276 * core implementation decides to return random nonsense.
277 */ 277 */
278 if (ret == -ERESTART_RESTARTBLOCK) { 278 if (ret == -ERESTART_RESTARTBLOCK) {
279 struct restart_block *restart 279 struct restart_block *restart = &current->restart_block;
280 = &current_thread_info()->restart_block;
281 280
282 restart->fn = compat_nanosleep_restart; 281 restart->fn = compat_nanosleep_restart;
283 restart->nanosleep.compat_rmtp = rmtp; 282 restart->nanosleep.compat_rmtp = rmtp;
@@ -860,7 +859,7 @@ COMPAT_SYSCALL_DEFINE4(clock_nanosleep, clockid_t, which_clock, int, flags,
860 return -EFAULT; 859 return -EFAULT;
861 860
862 if (err == -ERESTART_RESTARTBLOCK) { 861 if (err == -ERESTART_RESTARTBLOCK) {
863 restart = &current_thread_info()->restart_block; 862 restart = &current->restart_block;
864 restart->fn = compat_clock_nanosleep_restart; 863 restart->fn = compat_clock_nanosleep_restart;
865 restart->nanosleep.compat_rmtp = rmtp; 864 restart->nanosleep.compat_rmtp = rmtp;
866 } 865 }
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 5d220234b3ca..1972b161c61e 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -58,22 +58,23 @@ static int cpu_hotplug_disabled;
58 58
59static struct { 59static struct {
60 struct task_struct *active_writer; 60 struct task_struct *active_writer;
61 struct mutex lock; /* Synchronizes accesses to refcount, */ 61 /* wait queue to wake up the active_writer */
62 wait_queue_head_t wq;
63 /* verifies that no writer will get active while readers are active */
64 struct mutex lock;
62 /* 65 /*
63 * Also blocks the new readers during 66 * Also blocks the new readers during
64 * an ongoing cpu hotplug operation. 67 * an ongoing cpu hotplug operation.
65 */ 68 */
66 int refcount; 69 atomic_t refcount;
67 /* And allows lockless put_online_cpus(). */
68 atomic_t puts_pending;
69 70
70#ifdef CONFIG_DEBUG_LOCK_ALLOC 71#ifdef CONFIG_DEBUG_LOCK_ALLOC
71 struct lockdep_map dep_map; 72 struct lockdep_map dep_map;
72#endif 73#endif
73} cpu_hotplug = { 74} cpu_hotplug = {
74 .active_writer = NULL, 75 .active_writer = NULL,
76 .wq = __WAIT_QUEUE_HEAD_INITIALIZER(cpu_hotplug.wq),
75 .lock = __MUTEX_INITIALIZER(cpu_hotplug.lock), 77 .lock = __MUTEX_INITIALIZER(cpu_hotplug.lock),
76 .refcount = 0,
77#ifdef CONFIG_DEBUG_LOCK_ALLOC 78#ifdef CONFIG_DEBUG_LOCK_ALLOC
78 .dep_map = {.name = "cpu_hotplug.lock" }, 79 .dep_map = {.name = "cpu_hotplug.lock" },
79#endif 80#endif
@@ -86,15 +87,6 @@ static struct {
86#define cpuhp_lock_acquire() lock_map_acquire(&cpu_hotplug.dep_map) 87#define cpuhp_lock_acquire() lock_map_acquire(&cpu_hotplug.dep_map)
87#define cpuhp_lock_release() lock_map_release(&cpu_hotplug.dep_map) 88#define cpuhp_lock_release() lock_map_release(&cpu_hotplug.dep_map)
88 89
89static void apply_puts_pending(int max)
90{
91 int delta;
92
93 if (atomic_read(&cpu_hotplug.puts_pending) >= max) {
94 delta = atomic_xchg(&cpu_hotplug.puts_pending, 0);
95 cpu_hotplug.refcount -= delta;
96 }
97}
98 90
99void get_online_cpus(void) 91void get_online_cpus(void)
100{ 92{
@@ -103,8 +95,7 @@ void get_online_cpus(void)
103 return; 95 return;
104 cpuhp_lock_acquire_read(); 96 cpuhp_lock_acquire_read();
105 mutex_lock(&cpu_hotplug.lock); 97 mutex_lock(&cpu_hotplug.lock);
106 apply_puts_pending(65536); 98 atomic_inc(&cpu_hotplug.refcount);
107 cpu_hotplug.refcount++;
108 mutex_unlock(&cpu_hotplug.lock); 99 mutex_unlock(&cpu_hotplug.lock);
109} 100}
110EXPORT_SYMBOL_GPL(get_online_cpus); 101EXPORT_SYMBOL_GPL(get_online_cpus);
@@ -116,8 +107,7 @@ bool try_get_online_cpus(void)
116 if (!mutex_trylock(&cpu_hotplug.lock)) 107 if (!mutex_trylock(&cpu_hotplug.lock))
117 return false; 108 return false;
118 cpuhp_lock_acquire_tryread(); 109 cpuhp_lock_acquire_tryread();
119 apply_puts_pending(65536); 110 atomic_inc(&cpu_hotplug.refcount);
120 cpu_hotplug.refcount++;
121 mutex_unlock(&cpu_hotplug.lock); 111 mutex_unlock(&cpu_hotplug.lock);
122 return true; 112 return true;
123} 113}
@@ -125,20 +115,18 @@ EXPORT_SYMBOL_GPL(try_get_online_cpus);
125 115
126void put_online_cpus(void) 116void put_online_cpus(void)
127{ 117{
118 int refcount;
119
128 if (cpu_hotplug.active_writer == current) 120 if (cpu_hotplug.active_writer == current)
129 return; 121 return;
130 if (!mutex_trylock(&cpu_hotplug.lock)) {
131 atomic_inc(&cpu_hotplug.puts_pending);
132 cpuhp_lock_release();
133 return;
134 }
135 122
136 if (WARN_ON(!cpu_hotplug.refcount)) 123 refcount = atomic_dec_return(&cpu_hotplug.refcount);
137 cpu_hotplug.refcount++; /* try to fix things up */ 124 if (WARN_ON(refcount < 0)) /* try to fix things up */
125 atomic_inc(&cpu_hotplug.refcount);
126
127 if (refcount <= 0 && waitqueue_active(&cpu_hotplug.wq))
128 wake_up(&cpu_hotplug.wq);
138 129
139 if (!--cpu_hotplug.refcount && unlikely(cpu_hotplug.active_writer))
140 wake_up_process(cpu_hotplug.active_writer);
141 mutex_unlock(&cpu_hotplug.lock);
142 cpuhp_lock_release(); 130 cpuhp_lock_release();
143 131
144} 132}
@@ -168,18 +156,20 @@ EXPORT_SYMBOL_GPL(put_online_cpus);
168 */ 156 */
169void cpu_hotplug_begin(void) 157void cpu_hotplug_begin(void)
170{ 158{
171 cpu_hotplug.active_writer = current; 159 DEFINE_WAIT(wait);
172 160
161 cpu_hotplug.active_writer = current;
173 cpuhp_lock_acquire(); 162 cpuhp_lock_acquire();
163
174 for (;;) { 164 for (;;) {
175 mutex_lock(&cpu_hotplug.lock); 165 mutex_lock(&cpu_hotplug.lock);
176 apply_puts_pending(1); 166 prepare_to_wait(&cpu_hotplug.wq, &wait, TASK_UNINTERRUPTIBLE);
177 if (likely(!cpu_hotplug.refcount)) 167 if (likely(!atomic_read(&cpu_hotplug.refcount)))
178 break; 168 break;
179 __set_current_state(TASK_UNINTERRUPTIBLE);
180 mutex_unlock(&cpu_hotplug.lock); 169 mutex_unlock(&cpu_hotplug.lock);
181 schedule(); 170 schedule();
182 } 171 }
172 finish_wait(&cpu_hotplug.wq, &wait);
183} 173}
184 174
185void cpu_hotplug_done(void) 175void cpu_hotplug_done(void)
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 64b257f6bca2..c54a1dae6c11 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -2400,7 +2400,7 @@ void cpuset_cpus_allowed_fallback(struct task_struct *tsk)
2400 */ 2400 */
2401} 2401}
2402 2402
2403void cpuset_init_current_mems_allowed(void) 2403void __init cpuset_init_current_mems_allowed(void)
2404{ 2404{
2405 nodes_setall(current->mems_allowed); 2405 nodes_setall(current->mems_allowed);
2406} 2406}
diff --git a/kernel/events/Makefile b/kernel/events/Makefile
index 103f5d147b2f..2925188f50ea 100644
--- a/kernel/events/Makefile
+++ b/kernel/events/Makefile
@@ -1,5 +1,5 @@
1ifdef CONFIG_FUNCTION_TRACER 1ifdef CONFIG_FUNCTION_TRACER
2CFLAGS_REMOVE_core.o = -pg 2CFLAGS_REMOVE_core.o = $(CC_FLAGS_FTRACE)
3endif 3endif
4 4
5obj-y := core.o ring_buffer.o callchain.o 5obj-y := core.o ring_buffer.o callchain.o
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 882f835a0d85..8812d8e35f5b 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -872,22 +872,32 @@ void perf_pmu_enable(struct pmu *pmu)
872 pmu->pmu_enable(pmu); 872 pmu->pmu_enable(pmu);
873} 873}
874 874
875static DEFINE_PER_CPU(struct list_head, rotation_list); 875static DEFINE_PER_CPU(struct list_head, active_ctx_list);
876 876
877/* 877/*
878 * perf_pmu_rotate_start() and perf_rotate_context() are fully serialized 878 * perf_event_ctx_activate(), perf_event_ctx_deactivate(), and
879 * because they're strictly cpu affine and rotate_start is called with IRQs 879 * perf_event_task_tick() are fully serialized because they're strictly cpu
880 * disabled, while rotate_context is called from IRQ context. 880 * affine and perf_event_ctx{activate,deactivate} are called with IRQs
881 * disabled, while perf_event_task_tick is called from IRQ context.
881 */ 882 */
882static void perf_pmu_rotate_start(struct pmu *pmu) 883static void perf_event_ctx_activate(struct perf_event_context *ctx)
883{ 884{
884 struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context); 885 struct list_head *head = this_cpu_ptr(&active_ctx_list);
885 struct list_head *head = this_cpu_ptr(&rotation_list);
886 886
887 WARN_ON(!irqs_disabled()); 887 WARN_ON(!irqs_disabled());
888 888
889 if (list_empty(&cpuctx->rotation_list)) 889 WARN_ON(!list_empty(&ctx->active_ctx_list));
890 list_add(&cpuctx->rotation_list, head); 890
891 list_add(&ctx->active_ctx_list, head);
892}
893
894static void perf_event_ctx_deactivate(struct perf_event_context *ctx)
895{
896 WARN_ON(!irqs_disabled());
897
898 WARN_ON(list_empty(&ctx->active_ctx_list));
899
900 list_del_init(&ctx->active_ctx_list);
891} 901}
892 902
893static void get_ctx(struct perf_event_context *ctx) 903static void get_ctx(struct perf_event_context *ctx)
@@ -907,6 +917,84 @@ static void put_ctx(struct perf_event_context *ctx)
907} 917}
908 918
909/* 919/*
920 * Because of perf_event::ctx migration in sys_perf_event_open::move_group and
921 * perf_pmu_migrate_context() we need some magic.
922 *
923 * Those places that change perf_event::ctx will hold both
924 * perf_event_ctx::mutex of the 'old' and 'new' ctx value.
925 *
926 * Lock ordering is by mutex address. There is one other site where
927 * perf_event_context::mutex nests and that is put_event(). But remember that
928 * that is a parent<->child context relation, and migration does not affect
929 * children, therefore these two orderings should not interact.
930 *
931 * The change in perf_event::ctx does not affect children (as claimed above)
932 * because the sys_perf_event_open() case will install a new event and break
933 * the ctx parent<->child relation, and perf_pmu_migrate_context() is only
934 * concerned with cpuctx and that doesn't have children.
935 *
936 * The places that change perf_event::ctx will issue:
937 *
938 * perf_remove_from_context();
939 * synchronize_rcu();
940 * perf_install_in_context();
941 *
942 * to affect the change. The remove_from_context() + synchronize_rcu() should
943 * quiesce the event, after which we can install it in the new location. This
944 * means that only external vectors (perf_fops, prctl) can perturb the event
945 * while in transit. Therefore all such accessors should also acquire
946 * perf_event_context::mutex to serialize against this.
947 *
948 * However; because event->ctx can change while we're waiting to acquire
949 * ctx->mutex we must be careful and use the below perf_event_ctx_lock()
950 * function.
951 *
952 * Lock order:
953 * task_struct::perf_event_mutex
954 * perf_event_context::mutex
955 * perf_event_context::lock
956 * perf_event::child_mutex;
957 * perf_event::mmap_mutex
958 * mmap_sem
959 */
960static struct perf_event_context *
961perf_event_ctx_lock_nested(struct perf_event *event, int nesting)
962{
963 struct perf_event_context *ctx;
964
965again:
966 rcu_read_lock();
967 ctx = ACCESS_ONCE(event->ctx);
968 if (!atomic_inc_not_zero(&ctx->refcount)) {
969 rcu_read_unlock();
970 goto again;
971 }
972 rcu_read_unlock();
973
974 mutex_lock_nested(&ctx->mutex, nesting);
975 if (event->ctx != ctx) {
976 mutex_unlock(&ctx->mutex);
977 put_ctx(ctx);
978 goto again;
979 }
980
981 return ctx;
982}
983
984static inline struct perf_event_context *
985perf_event_ctx_lock(struct perf_event *event)
986{
987 return perf_event_ctx_lock_nested(event, 0);
988}
989
990static void perf_event_ctx_unlock(struct perf_event *event,
991 struct perf_event_context *ctx)
992{
993 mutex_unlock(&ctx->mutex);
994 put_ctx(ctx);
995}
996
997/*
910 * This must be done under the ctx->lock, such as to serialize against 998 * This must be done under the ctx->lock, such as to serialize against
911 * context_equiv(), therefore we cannot call put_ctx() since that might end up 999 * context_equiv(), therefore we cannot call put_ctx() since that might end up
912 * calling scheduler related locks and ctx->lock nests inside those. 1000 * calling scheduler related locks and ctx->lock nests inside those.
@@ -1155,8 +1243,6 @@ list_add_event(struct perf_event *event, struct perf_event_context *ctx)
1155 ctx->nr_branch_stack++; 1243 ctx->nr_branch_stack++;
1156 1244
1157 list_add_rcu(&event->event_entry, &ctx->event_list); 1245 list_add_rcu(&event->event_entry, &ctx->event_list);
1158 if (!ctx->nr_events)
1159 perf_pmu_rotate_start(ctx->pmu);
1160 ctx->nr_events++; 1246 ctx->nr_events++;
1161 if (event->attr.inherit_stat) 1247 if (event->attr.inherit_stat)
1162 ctx->nr_stat++; 1248 ctx->nr_stat++;
@@ -1275,6 +1361,8 @@ static void perf_group_attach(struct perf_event *event)
1275 if (group_leader == event) 1361 if (group_leader == event)
1276 return; 1362 return;
1277 1363
1364 WARN_ON_ONCE(group_leader->ctx != event->ctx);
1365
1278 if (group_leader->group_flags & PERF_GROUP_SOFTWARE && 1366 if (group_leader->group_flags & PERF_GROUP_SOFTWARE &&
1279 !is_software_event(event)) 1367 !is_software_event(event))
1280 group_leader->group_flags &= ~PERF_GROUP_SOFTWARE; 1368 group_leader->group_flags &= ~PERF_GROUP_SOFTWARE;
@@ -1296,6 +1384,10 @@ static void
1296list_del_event(struct perf_event *event, struct perf_event_context *ctx) 1384list_del_event(struct perf_event *event, struct perf_event_context *ctx)
1297{ 1385{
1298 struct perf_cpu_context *cpuctx; 1386 struct perf_cpu_context *cpuctx;
1387
1388 WARN_ON_ONCE(event->ctx != ctx);
1389 lockdep_assert_held(&ctx->lock);
1390
1299 /* 1391 /*
1300 * We can have double detach due to exit/hot-unplug + close. 1392 * We can have double detach due to exit/hot-unplug + close.
1301 */ 1393 */
@@ -1380,6 +1472,8 @@ static void perf_group_detach(struct perf_event *event)
1380 1472
1381 /* Inherit group flags from the previous leader */ 1473 /* Inherit group flags from the previous leader */
1382 sibling->group_flags = event->group_flags; 1474 sibling->group_flags = event->group_flags;
1475
1476 WARN_ON_ONCE(sibling->ctx != event->ctx);
1383 } 1477 }
1384 1478
1385out: 1479out:
@@ -1442,6 +1536,10 @@ event_sched_out(struct perf_event *event,
1442{ 1536{
1443 u64 tstamp = perf_event_time(event); 1537 u64 tstamp = perf_event_time(event);
1444 u64 delta; 1538 u64 delta;
1539
1540 WARN_ON_ONCE(event->ctx != ctx);
1541 lockdep_assert_held(&ctx->lock);
1542
1445 /* 1543 /*
1446 * An event which could not be activated because of 1544 * An event which could not be activated because of
1447 * filter mismatch still needs to have its timings 1545 * filter mismatch still needs to have its timings
@@ -1471,7 +1569,8 @@ event_sched_out(struct perf_event *event,
1471 1569
1472 if (!is_software_event(event)) 1570 if (!is_software_event(event))
1473 cpuctx->active_oncpu--; 1571 cpuctx->active_oncpu--;
1474 ctx->nr_active--; 1572 if (!--ctx->nr_active)
1573 perf_event_ctx_deactivate(ctx);
1475 if (event->attr.freq && event->attr.sample_freq) 1574 if (event->attr.freq && event->attr.sample_freq)
1476 ctx->nr_freq--; 1575 ctx->nr_freq--;
1477 if (event->attr.exclusive || !cpuctx->active_oncpu) 1576 if (event->attr.exclusive || !cpuctx->active_oncpu)
@@ -1654,7 +1753,7 @@ int __perf_event_disable(void *info)
1654 * is the current context on this CPU and preemption is disabled, 1753 * is the current context on this CPU and preemption is disabled,
1655 * hence we can't get into perf_event_task_sched_out for this context. 1754 * hence we can't get into perf_event_task_sched_out for this context.
1656 */ 1755 */
1657void perf_event_disable(struct perf_event *event) 1756static void _perf_event_disable(struct perf_event *event)
1658{ 1757{
1659 struct perf_event_context *ctx = event->ctx; 1758 struct perf_event_context *ctx = event->ctx;
1660 struct task_struct *task = ctx->task; 1759 struct task_struct *task = ctx->task;
@@ -1695,6 +1794,19 @@ retry:
1695 } 1794 }
1696 raw_spin_unlock_irq(&ctx->lock); 1795 raw_spin_unlock_irq(&ctx->lock);
1697} 1796}
1797
1798/*
1799 * Strictly speaking kernel users cannot create groups and therefore this
1800 * interface does not need the perf_event_ctx_lock() magic.
1801 */
1802void perf_event_disable(struct perf_event *event)
1803{
1804 struct perf_event_context *ctx;
1805
1806 ctx = perf_event_ctx_lock(event);
1807 _perf_event_disable(event);
1808 perf_event_ctx_unlock(event, ctx);
1809}
1698EXPORT_SYMBOL_GPL(perf_event_disable); 1810EXPORT_SYMBOL_GPL(perf_event_disable);
1699 1811
1700static void perf_set_shadow_time(struct perf_event *event, 1812static void perf_set_shadow_time(struct perf_event *event,
@@ -1782,7 +1894,8 @@ event_sched_in(struct perf_event *event,
1782 1894
1783 if (!is_software_event(event)) 1895 if (!is_software_event(event))
1784 cpuctx->active_oncpu++; 1896 cpuctx->active_oncpu++;
1785 ctx->nr_active++; 1897 if (!ctx->nr_active++)
1898 perf_event_ctx_activate(ctx);
1786 if (event->attr.freq && event->attr.sample_freq) 1899 if (event->attr.freq && event->attr.sample_freq)
1787 ctx->nr_freq++; 1900 ctx->nr_freq++;
1788 1901
@@ -2158,7 +2271,7 @@ unlock:
2158 * perf_event_for_each_child or perf_event_for_each as described 2271 * perf_event_for_each_child or perf_event_for_each as described
2159 * for perf_event_disable. 2272 * for perf_event_disable.
2160 */ 2273 */
2161void perf_event_enable(struct perf_event *event) 2274static void _perf_event_enable(struct perf_event *event)
2162{ 2275{
2163 struct perf_event_context *ctx = event->ctx; 2276 struct perf_event_context *ctx = event->ctx;
2164 struct task_struct *task = ctx->task; 2277 struct task_struct *task = ctx->task;
@@ -2214,9 +2327,21 @@ retry:
2214out: 2327out:
2215 raw_spin_unlock_irq(&ctx->lock); 2328 raw_spin_unlock_irq(&ctx->lock);
2216} 2329}
2330
2331/*
2332 * See perf_event_disable();
2333 */
2334void perf_event_enable(struct perf_event *event)
2335{
2336 struct perf_event_context *ctx;
2337
2338 ctx = perf_event_ctx_lock(event);
2339 _perf_event_enable(event);
2340 perf_event_ctx_unlock(event, ctx);
2341}
2217EXPORT_SYMBOL_GPL(perf_event_enable); 2342EXPORT_SYMBOL_GPL(perf_event_enable);
2218 2343
2219int perf_event_refresh(struct perf_event *event, int refresh) 2344static int _perf_event_refresh(struct perf_event *event, int refresh)
2220{ 2345{
2221 /* 2346 /*
2222 * not supported on inherited events 2347 * not supported on inherited events
@@ -2225,10 +2350,25 @@ int perf_event_refresh(struct perf_event *event, int refresh)
2225 return -EINVAL; 2350 return -EINVAL;
2226 2351
2227 atomic_add(refresh, &event->event_limit); 2352 atomic_add(refresh, &event->event_limit);
2228 perf_event_enable(event); 2353 _perf_event_enable(event);
2229 2354
2230 return 0; 2355 return 0;
2231} 2356}
2357
2358/*
2359 * See perf_event_disable()
2360 */
2361int perf_event_refresh(struct perf_event *event, int refresh)
2362{
2363 struct perf_event_context *ctx;
2364 int ret;
2365
2366 ctx = perf_event_ctx_lock(event);
2367 ret = _perf_event_refresh(event, refresh);
2368 perf_event_ctx_unlock(event, ctx);
2369
2370 return ret;
2371}
2232EXPORT_SYMBOL_GPL(perf_event_refresh); 2372EXPORT_SYMBOL_GPL(perf_event_refresh);
2233 2373
2234static void ctx_sched_out(struct perf_event_context *ctx, 2374static void ctx_sched_out(struct perf_event_context *ctx,
@@ -2612,12 +2752,6 @@ static void perf_event_context_sched_in(struct perf_event_context *ctx,
2612 2752
2613 perf_pmu_enable(ctx->pmu); 2753 perf_pmu_enable(ctx->pmu);
2614 perf_ctx_unlock(cpuctx, ctx); 2754 perf_ctx_unlock(cpuctx, ctx);
2615
2616 /*
2617 * Since these rotations are per-cpu, we need to ensure the
2618 * cpu-context we got scheduled on is actually rotating.
2619 */
2620 perf_pmu_rotate_start(ctx->pmu);
2621} 2755}
2622 2756
2623/* 2757/*
@@ -2905,25 +3039,18 @@ static void rotate_ctx(struct perf_event_context *ctx)
2905 list_rotate_left(&ctx->flexible_groups); 3039 list_rotate_left(&ctx->flexible_groups);
2906} 3040}
2907 3041
2908/*
2909 * perf_pmu_rotate_start() and perf_rotate_context() are fully serialized
2910 * because they're strictly cpu affine and rotate_start is called with IRQs
2911 * disabled, while rotate_context is called from IRQ context.
2912 */
2913static int perf_rotate_context(struct perf_cpu_context *cpuctx) 3042static int perf_rotate_context(struct perf_cpu_context *cpuctx)
2914{ 3043{
2915 struct perf_event_context *ctx = NULL; 3044 struct perf_event_context *ctx = NULL;
2916 int rotate = 0, remove = 1; 3045 int rotate = 0;
2917 3046
2918 if (cpuctx->ctx.nr_events) { 3047 if (cpuctx->ctx.nr_events) {
2919 remove = 0;
2920 if (cpuctx->ctx.nr_events != cpuctx->ctx.nr_active) 3048 if (cpuctx->ctx.nr_events != cpuctx->ctx.nr_active)
2921 rotate = 1; 3049 rotate = 1;
2922 } 3050 }
2923 3051
2924 ctx = cpuctx->task_ctx; 3052 ctx = cpuctx->task_ctx;
2925 if (ctx && ctx->nr_events) { 3053 if (ctx && ctx->nr_events) {
2926 remove = 0;
2927 if (ctx->nr_events != ctx->nr_active) 3054 if (ctx->nr_events != ctx->nr_active)
2928 rotate = 1; 3055 rotate = 1;
2929 } 3056 }
@@ -2947,8 +3074,6 @@ static int perf_rotate_context(struct perf_cpu_context *cpuctx)
2947 perf_pmu_enable(cpuctx->ctx.pmu); 3074 perf_pmu_enable(cpuctx->ctx.pmu);
2948 perf_ctx_unlock(cpuctx, cpuctx->task_ctx); 3075 perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
2949done: 3076done:
2950 if (remove)
2951 list_del_init(&cpuctx->rotation_list);
2952 3077
2953 return rotate; 3078 return rotate;
2954} 3079}
@@ -2966,9 +3091,8 @@ bool perf_event_can_stop_tick(void)
2966 3091
2967void perf_event_task_tick(void) 3092void perf_event_task_tick(void)
2968{ 3093{
2969 struct list_head *head = this_cpu_ptr(&rotation_list); 3094 struct list_head *head = this_cpu_ptr(&active_ctx_list);
2970 struct perf_cpu_context *cpuctx, *tmp; 3095 struct perf_event_context *ctx, *tmp;
2971 struct perf_event_context *ctx;
2972 int throttled; 3096 int throttled;
2973 3097
2974 WARN_ON(!irqs_disabled()); 3098 WARN_ON(!irqs_disabled());
@@ -2976,14 +3100,8 @@ void perf_event_task_tick(void)
2976 __this_cpu_inc(perf_throttled_seq); 3100 __this_cpu_inc(perf_throttled_seq);
2977 throttled = __this_cpu_xchg(perf_throttled_count, 0); 3101 throttled = __this_cpu_xchg(perf_throttled_count, 0);
2978 3102
2979 list_for_each_entry_safe(cpuctx, tmp, head, rotation_list) { 3103 list_for_each_entry_safe(ctx, tmp, head, active_ctx_list)
2980 ctx = &cpuctx->ctx;
2981 perf_adjust_freq_unthr_context(ctx, throttled); 3104 perf_adjust_freq_unthr_context(ctx, throttled);
2982
2983 ctx = cpuctx->task_ctx;
2984 if (ctx)
2985 perf_adjust_freq_unthr_context(ctx, throttled);
2986 }
2987} 3105}
2988 3106
2989static int event_enable_on_exec(struct perf_event *event, 3107static int event_enable_on_exec(struct perf_event *event,
@@ -3142,6 +3260,7 @@ static void __perf_event_init_context(struct perf_event_context *ctx)
3142{ 3260{
3143 raw_spin_lock_init(&ctx->lock); 3261 raw_spin_lock_init(&ctx->lock);
3144 mutex_init(&ctx->mutex); 3262 mutex_init(&ctx->mutex);
3263 INIT_LIST_HEAD(&ctx->active_ctx_list);
3145 INIT_LIST_HEAD(&ctx->pinned_groups); 3264 INIT_LIST_HEAD(&ctx->pinned_groups);
3146 INIT_LIST_HEAD(&ctx->flexible_groups); 3265 INIT_LIST_HEAD(&ctx->flexible_groups);
3147 INIT_LIST_HEAD(&ctx->event_list); 3266 INIT_LIST_HEAD(&ctx->event_list);
@@ -3421,7 +3540,16 @@ static void perf_remove_from_owner(struct perf_event *event)
3421 rcu_read_unlock(); 3540 rcu_read_unlock();
3422 3541
3423 if (owner) { 3542 if (owner) {
3424 mutex_lock(&owner->perf_event_mutex); 3543 /*
3544 * If we're here through perf_event_exit_task() we're already
3545 * holding ctx->mutex which would be an inversion wrt. the
3546 * normal lock order.
3547 *
3548 * However we can safely take this lock because its the child
3549 * ctx->mutex.
3550 */
3551 mutex_lock_nested(&owner->perf_event_mutex, SINGLE_DEPTH_NESTING);
3552
3425 /* 3553 /*
3426 * We have to re-check the event->owner field, if it is cleared 3554 * We have to re-check the event->owner field, if it is cleared
3427 * we raced with perf_event_exit_task(), acquiring the mutex 3555 * we raced with perf_event_exit_task(), acquiring the mutex
@@ -3440,7 +3568,7 @@ static void perf_remove_from_owner(struct perf_event *event)
3440 */ 3568 */
3441static void put_event(struct perf_event *event) 3569static void put_event(struct perf_event *event)
3442{ 3570{
3443 struct perf_event_context *ctx = event->ctx; 3571 struct perf_event_context *ctx;
3444 3572
3445 if (!atomic_long_dec_and_test(&event->refcount)) 3573 if (!atomic_long_dec_and_test(&event->refcount))
3446 return; 3574 return;
@@ -3448,7 +3576,6 @@ static void put_event(struct perf_event *event)
3448 if (!is_kernel_event(event)) 3576 if (!is_kernel_event(event))
3449 perf_remove_from_owner(event); 3577 perf_remove_from_owner(event);
3450 3578
3451 WARN_ON_ONCE(ctx->parent_ctx);
3452 /* 3579 /*
3453 * There are two ways this annotation is useful: 3580 * There are two ways this annotation is useful:
3454 * 3581 *
@@ -3461,7 +3588,8 @@ static void put_event(struct perf_event *event)
3461 * the last filedesc died, so there is no possibility 3588 * the last filedesc died, so there is no possibility
3462 * to trigger the AB-BA case. 3589 * to trigger the AB-BA case.
3463 */ 3590 */
3464 mutex_lock_nested(&ctx->mutex, SINGLE_DEPTH_NESTING); 3591 ctx = perf_event_ctx_lock_nested(event, SINGLE_DEPTH_NESTING);
3592 WARN_ON_ONCE(ctx->parent_ctx);
3465 perf_remove_from_context(event, true); 3593 perf_remove_from_context(event, true);
3466 mutex_unlock(&ctx->mutex); 3594 mutex_unlock(&ctx->mutex);
3467 3595
@@ -3547,12 +3675,13 @@ static int perf_event_read_group(struct perf_event *event,
3547 u64 read_format, char __user *buf) 3675 u64 read_format, char __user *buf)
3548{ 3676{
3549 struct perf_event *leader = event->group_leader, *sub; 3677 struct perf_event *leader = event->group_leader, *sub;
3550 int n = 0, size = 0, ret = -EFAULT;
3551 struct perf_event_context *ctx = leader->ctx; 3678 struct perf_event_context *ctx = leader->ctx;
3552 u64 values[5]; 3679 int n = 0, size = 0, ret;
3553 u64 count, enabled, running; 3680 u64 count, enabled, running;
3681 u64 values[5];
3682
3683 lockdep_assert_held(&ctx->mutex);
3554 3684
3555 mutex_lock(&ctx->mutex);
3556 count = perf_event_read_value(leader, &enabled, &running); 3685 count = perf_event_read_value(leader, &enabled, &running);
3557 3686
3558 values[n++] = 1 + leader->nr_siblings; 3687 values[n++] = 1 + leader->nr_siblings;
@@ -3567,7 +3696,7 @@ static int perf_event_read_group(struct perf_event *event,
3567 size = n * sizeof(u64); 3696 size = n * sizeof(u64);
3568 3697
3569 if (copy_to_user(buf, values, size)) 3698 if (copy_to_user(buf, values, size))
3570 goto unlock; 3699 return -EFAULT;
3571 3700
3572 ret = size; 3701 ret = size;
3573 3702
@@ -3581,14 +3710,11 @@ static int perf_event_read_group(struct perf_event *event,
3581 size = n * sizeof(u64); 3710 size = n * sizeof(u64);
3582 3711
3583 if (copy_to_user(buf + ret, values, size)) { 3712 if (copy_to_user(buf + ret, values, size)) {
3584 ret = -EFAULT; 3713 return -EFAULT;
3585 goto unlock;
3586 } 3714 }
3587 3715
3588 ret += size; 3716 ret += size;
3589 } 3717 }
3590unlock:
3591 mutex_unlock(&ctx->mutex);
3592 3718
3593 return ret; 3719 return ret;
3594} 3720}
@@ -3660,8 +3786,14 @@ static ssize_t
3660perf_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) 3786perf_read(struct file *file, char __user *buf, size_t count, loff_t *ppos)
3661{ 3787{
3662 struct perf_event *event = file->private_data; 3788 struct perf_event *event = file->private_data;
3789 struct perf_event_context *ctx;
3790 int ret;
3663 3791
3664 return perf_read_hw(event, buf, count); 3792 ctx = perf_event_ctx_lock(event);
3793 ret = perf_read_hw(event, buf, count);
3794 perf_event_ctx_unlock(event, ctx);
3795
3796 return ret;
3665} 3797}
3666 3798
3667static unsigned int perf_poll(struct file *file, poll_table *wait) 3799static unsigned int perf_poll(struct file *file, poll_table *wait)
@@ -3687,7 +3819,7 @@ static unsigned int perf_poll(struct file *file, poll_table *wait)
3687 return events; 3819 return events;
3688} 3820}
3689 3821
3690static void perf_event_reset(struct perf_event *event) 3822static void _perf_event_reset(struct perf_event *event)
3691{ 3823{
3692 (void)perf_event_read(event); 3824 (void)perf_event_read(event);
3693 local64_set(&event->count, 0); 3825 local64_set(&event->count, 0);
@@ -3706,6 +3838,7 @@ static void perf_event_for_each_child(struct perf_event *event,
3706 struct perf_event *child; 3838 struct perf_event *child;
3707 3839
3708 WARN_ON_ONCE(event->ctx->parent_ctx); 3840 WARN_ON_ONCE(event->ctx->parent_ctx);
3841
3709 mutex_lock(&event->child_mutex); 3842 mutex_lock(&event->child_mutex);
3710 func(event); 3843 func(event);
3711 list_for_each_entry(child, &event->child_list, child_list) 3844 list_for_each_entry(child, &event->child_list, child_list)
@@ -3719,14 +3852,13 @@ static void perf_event_for_each(struct perf_event *event,
3719 struct perf_event_context *ctx = event->ctx; 3852 struct perf_event_context *ctx = event->ctx;
3720 struct perf_event *sibling; 3853 struct perf_event *sibling;
3721 3854
3722 WARN_ON_ONCE(ctx->parent_ctx); 3855 lockdep_assert_held(&ctx->mutex);
3723 mutex_lock(&ctx->mutex); 3856
3724 event = event->group_leader; 3857 event = event->group_leader;
3725 3858
3726 perf_event_for_each_child(event, func); 3859 perf_event_for_each_child(event, func);
3727 list_for_each_entry(sibling, &event->sibling_list, group_entry) 3860 list_for_each_entry(sibling, &event->sibling_list, group_entry)
3728 perf_event_for_each_child(sibling, func); 3861 perf_event_for_each_child(sibling, func);
3729 mutex_unlock(&ctx->mutex);
3730} 3862}
3731 3863
3732static int perf_event_period(struct perf_event *event, u64 __user *arg) 3864static int perf_event_period(struct perf_event *event, u64 __user *arg)
@@ -3796,25 +3928,24 @@ static int perf_event_set_output(struct perf_event *event,
3796 struct perf_event *output_event); 3928 struct perf_event *output_event);
3797static int perf_event_set_filter(struct perf_event *event, void __user *arg); 3929static int perf_event_set_filter(struct perf_event *event, void __user *arg);
3798 3930
3799static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg) 3931static long _perf_ioctl(struct perf_event *event, unsigned int cmd, unsigned long arg)
3800{ 3932{
3801 struct perf_event *event = file->private_data;
3802 void (*func)(struct perf_event *); 3933 void (*func)(struct perf_event *);
3803 u32 flags = arg; 3934 u32 flags = arg;
3804 3935
3805 switch (cmd) { 3936 switch (cmd) {
3806 case PERF_EVENT_IOC_ENABLE: 3937 case PERF_EVENT_IOC_ENABLE:
3807 func = perf_event_enable; 3938 func = _perf_event_enable;
3808 break; 3939 break;
3809 case PERF_EVENT_IOC_DISABLE: 3940 case PERF_EVENT_IOC_DISABLE:
3810 func = perf_event_disable; 3941 func = _perf_event_disable;
3811 break; 3942 break;
3812 case PERF_EVENT_IOC_RESET: 3943 case PERF_EVENT_IOC_RESET:
3813 func = perf_event_reset; 3944 func = _perf_event_reset;
3814 break; 3945 break;
3815 3946
3816 case PERF_EVENT_IOC_REFRESH: 3947 case PERF_EVENT_IOC_REFRESH:
3817 return perf_event_refresh(event, arg); 3948 return _perf_event_refresh(event, arg);
3818 3949
3819 case PERF_EVENT_IOC_PERIOD: 3950 case PERF_EVENT_IOC_PERIOD:
3820 return perf_event_period(event, (u64 __user *)arg); 3951 return perf_event_period(event, (u64 __user *)arg);
@@ -3861,6 +3992,19 @@ static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
3861 return 0; 3992 return 0;
3862} 3993}
3863 3994
3995static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
3996{
3997 struct perf_event *event = file->private_data;
3998 struct perf_event_context *ctx;
3999 long ret;
4000
4001 ctx = perf_event_ctx_lock(event);
4002 ret = _perf_ioctl(event, cmd, arg);
4003 perf_event_ctx_unlock(event, ctx);
4004
4005 return ret;
4006}
4007
3864#ifdef CONFIG_COMPAT 4008#ifdef CONFIG_COMPAT
3865static long perf_compat_ioctl(struct file *file, unsigned int cmd, 4009static long perf_compat_ioctl(struct file *file, unsigned int cmd,
3866 unsigned long arg) 4010 unsigned long arg)
@@ -3883,11 +4027,15 @@ static long perf_compat_ioctl(struct file *file, unsigned int cmd,
3883 4027
3884int perf_event_task_enable(void) 4028int perf_event_task_enable(void)
3885{ 4029{
4030 struct perf_event_context *ctx;
3886 struct perf_event *event; 4031 struct perf_event *event;
3887 4032
3888 mutex_lock(&current->perf_event_mutex); 4033 mutex_lock(&current->perf_event_mutex);
3889 list_for_each_entry(event, &current->perf_event_list, owner_entry) 4034 list_for_each_entry(event, &current->perf_event_list, owner_entry) {
3890 perf_event_for_each_child(event, perf_event_enable); 4035 ctx = perf_event_ctx_lock(event);
4036 perf_event_for_each_child(event, _perf_event_enable);
4037 perf_event_ctx_unlock(event, ctx);
4038 }
3891 mutex_unlock(&current->perf_event_mutex); 4039 mutex_unlock(&current->perf_event_mutex);
3892 4040
3893 return 0; 4041 return 0;
@@ -3895,11 +4043,15 @@ int perf_event_task_enable(void)
3895 4043
3896int perf_event_task_disable(void) 4044int perf_event_task_disable(void)
3897{ 4045{
4046 struct perf_event_context *ctx;
3898 struct perf_event *event; 4047 struct perf_event *event;
3899 4048
3900 mutex_lock(&current->perf_event_mutex); 4049 mutex_lock(&current->perf_event_mutex);
3901 list_for_each_entry(event, &current->perf_event_list, owner_entry) 4050 list_for_each_entry(event, &current->perf_event_list, owner_entry) {
3902 perf_event_for_each_child(event, perf_event_disable); 4051 ctx = perf_event_ctx_lock(event);
4052 perf_event_for_each_child(event, _perf_event_disable);
4053 perf_event_ctx_unlock(event, ctx);
4054 }
3903 mutex_unlock(&current->perf_event_mutex); 4055 mutex_unlock(&current->perf_event_mutex);
3904 4056
3905 return 0; 4057 return 0;
@@ -5889,6 +6041,8 @@ end:
5889 rcu_read_unlock(); 6041 rcu_read_unlock();
5890} 6042}
5891 6043
6044DEFINE_PER_CPU(struct pt_regs, __perf_regs[4]);
6045
5892int perf_swevent_get_recursion_context(void) 6046int perf_swevent_get_recursion_context(void)
5893{ 6047{
5894 struct swevent_htable *swhash = this_cpu_ptr(&swevent_htable); 6048 struct swevent_htable *swhash = this_cpu_ptr(&swevent_htable);
@@ -5904,21 +6058,30 @@ inline void perf_swevent_put_recursion_context(int rctx)
5904 put_recursion_context(swhash->recursion, rctx); 6058 put_recursion_context(swhash->recursion, rctx);
5905} 6059}
5906 6060
5907void __perf_sw_event(u32 event_id, u64 nr, struct pt_regs *regs, u64 addr) 6061void ___perf_sw_event(u32 event_id, u64 nr, struct pt_regs *regs, u64 addr)
5908{ 6062{
5909 struct perf_sample_data data; 6063 struct perf_sample_data data;
5910 int rctx;
5911 6064
5912 preempt_disable_notrace(); 6065 if (WARN_ON_ONCE(!regs))
5913 rctx = perf_swevent_get_recursion_context();
5914 if (rctx < 0)
5915 return; 6066 return;
5916 6067
5917 perf_sample_data_init(&data, addr, 0); 6068 perf_sample_data_init(&data, addr, 0);
5918
5919 do_perf_sw_event(PERF_TYPE_SOFTWARE, event_id, nr, &data, regs); 6069 do_perf_sw_event(PERF_TYPE_SOFTWARE, event_id, nr, &data, regs);
6070}
6071
6072void __perf_sw_event(u32 event_id, u64 nr, struct pt_regs *regs, u64 addr)
6073{
6074 int rctx;
6075
6076 preempt_disable_notrace();
6077 rctx = perf_swevent_get_recursion_context();
6078 if (unlikely(rctx < 0))
6079 goto fail;
6080
6081 ___perf_sw_event(event_id, nr, regs, addr);
5920 6082
5921 perf_swevent_put_recursion_context(rctx); 6083 perf_swevent_put_recursion_context(rctx);
6084fail:
5922 preempt_enable_notrace(); 6085 preempt_enable_notrace();
5923} 6086}
5924 6087
@@ -6776,12 +6939,10 @@ skip_type:
6776 __perf_event_init_context(&cpuctx->ctx); 6939 __perf_event_init_context(&cpuctx->ctx);
6777 lockdep_set_class(&cpuctx->ctx.mutex, &cpuctx_mutex); 6940 lockdep_set_class(&cpuctx->ctx.mutex, &cpuctx_mutex);
6778 lockdep_set_class(&cpuctx->ctx.lock, &cpuctx_lock); 6941 lockdep_set_class(&cpuctx->ctx.lock, &cpuctx_lock);
6779 cpuctx->ctx.type = cpu_context;
6780 cpuctx->ctx.pmu = pmu; 6942 cpuctx->ctx.pmu = pmu;
6781 6943
6782 __perf_cpu_hrtimer_init(cpuctx, cpu); 6944 __perf_cpu_hrtimer_init(cpuctx, cpu);
6783 6945
6784 INIT_LIST_HEAD(&cpuctx->rotation_list);
6785 cpuctx->unique_pmu = pmu; 6946 cpuctx->unique_pmu = pmu;
6786 } 6947 }
6787 6948
@@ -6854,6 +7015,20 @@ void perf_pmu_unregister(struct pmu *pmu)
6854} 7015}
6855EXPORT_SYMBOL_GPL(perf_pmu_unregister); 7016EXPORT_SYMBOL_GPL(perf_pmu_unregister);
6856 7017
7018static int perf_try_init_event(struct pmu *pmu, struct perf_event *event)
7019{
7020 int ret;
7021
7022 if (!try_module_get(pmu->module))
7023 return -ENODEV;
7024 event->pmu = pmu;
7025 ret = pmu->event_init(event);
7026 if (ret)
7027 module_put(pmu->module);
7028
7029 return ret;
7030}
7031
6857struct pmu *perf_init_event(struct perf_event *event) 7032struct pmu *perf_init_event(struct perf_event *event)
6858{ 7033{
6859 struct pmu *pmu = NULL; 7034 struct pmu *pmu = NULL;
@@ -6866,24 +7041,14 @@ struct pmu *perf_init_event(struct perf_event *event)
6866 pmu = idr_find(&pmu_idr, event->attr.type); 7041 pmu = idr_find(&pmu_idr, event->attr.type);
6867 rcu_read_unlock(); 7042 rcu_read_unlock();
6868 if (pmu) { 7043 if (pmu) {
6869 if (!try_module_get(pmu->module)) { 7044 ret = perf_try_init_event(pmu, event);
6870 pmu = ERR_PTR(-ENODEV);
6871 goto unlock;
6872 }
6873 event->pmu = pmu;
6874 ret = pmu->event_init(event);
6875 if (ret) 7045 if (ret)
6876 pmu = ERR_PTR(ret); 7046 pmu = ERR_PTR(ret);
6877 goto unlock; 7047 goto unlock;
6878 } 7048 }
6879 7049
6880 list_for_each_entry_rcu(pmu, &pmus, entry) { 7050 list_for_each_entry_rcu(pmu, &pmus, entry) {
6881 if (!try_module_get(pmu->module)) { 7051 ret = perf_try_init_event(pmu, event);
6882 pmu = ERR_PTR(-ENODEV);
6883 goto unlock;
6884 }
6885 event->pmu = pmu;
6886 ret = pmu->event_init(event);
6887 if (!ret) 7052 if (!ret)
6888 goto unlock; 7053 goto unlock;
6889 7054
@@ -7247,6 +7412,15 @@ out:
7247 return ret; 7412 return ret;
7248} 7413}
7249 7414
7415static void mutex_lock_double(struct mutex *a, struct mutex *b)
7416{
7417 if (b < a)
7418 swap(a, b);
7419
7420 mutex_lock(a);
7421 mutex_lock_nested(b, SINGLE_DEPTH_NESTING);
7422}
7423
7250/** 7424/**
7251 * sys_perf_event_open - open a performance event, associate it to a task/cpu 7425 * sys_perf_event_open - open a performance event, associate it to a task/cpu
7252 * 7426 *
@@ -7262,7 +7436,7 @@ SYSCALL_DEFINE5(perf_event_open,
7262 struct perf_event *group_leader = NULL, *output_event = NULL; 7436 struct perf_event *group_leader = NULL, *output_event = NULL;
7263 struct perf_event *event, *sibling; 7437 struct perf_event *event, *sibling;
7264 struct perf_event_attr attr; 7438 struct perf_event_attr attr;
7265 struct perf_event_context *ctx; 7439 struct perf_event_context *ctx, *uninitialized_var(gctx);
7266 struct file *event_file = NULL; 7440 struct file *event_file = NULL;
7267 struct fd group = {NULL, 0}; 7441 struct fd group = {NULL, 0};
7268 struct task_struct *task = NULL; 7442 struct task_struct *task = NULL;
@@ -7420,7 +7594,19 @@ SYSCALL_DEFINE5(perf_event_open,
7420 * task or CPU context: 7594 * task or CPU context:
7421 */ 7595 */
7422 if (move_group) { 7596 if (move_group) {
7423 if (group_leader->ctx->type != ctx->type) 7597 /*
7598 * Make sure we're both on the same task, or both
7599 * per-cpu events.
7600 */
7601 if (group_leader->ctx->task != ctx->task)
7602 goto err_context;
7603
7604 /*
7605 * Make sure we're both events for the same CPU;
7606 * grouping events for different CPUs is broken; since
7607 * you can never concurrently schedule them anyhow.
7608 */
7609 if (group_leader->cpu != event->cpu)
7424 goto err_context; 7610 goto err_context;
7425 } else { 7611 } else {
7426 if (group_leader->ctx != ctx) 7612 if (group_leader->ctx != ctx)
@@ -7448,43 +7634,68 @@ SYSCALL_DEFINE5(perf_event_open,
7448 } 7634 }
7449 7635
7450 if (move_group) { 7636 if (move_group) {
7451 struct perf_event_context *gctx = group_leader->ctx; 7637 gctx = group_leader->ctx;
7452
7453 mutex_lock(&gctx->mutex);
7454 perf_remove_from_context(group_leader, false);
7455 7638
7456 /* 7639 /*
7457 * Removing from the context ends up with disabled 7640 * See perf_event_ctx_lock() for comments on the details
7458 * event. What we want here is event in the initial 7641 * of swizzling perf_event::ctx.
7459 * startup state, ready to be add into new context.
7460 */ 7642 */
7461 perf_event__state_init(group_leader); 7643 mutex_lock_double(&gctx->mutex, &ctx->mutex);
7644
7645 perf_remove_from_context(group_leader, false);
7646
7462 list_for_each_entry(sibling, &group_leader->sibling_list, 7647 list_for_each_entry(sibling, &group_leader->sibling_list,
7463 group_entry) { 7648 group_entry) {
7464 perf_remove_from_context(sibling, false); 7649 perf_remove_from_context(sibling, false);
7465 perf_event__state_init(sibling);
7466 put_ctx(gctx); 7650 put_ctx(gctx);
7467 } 7651 }
7468 mutex_unlock(&gctx->mutex); 7652 } else {
7469 put_ctx(gctx); 7653 mutex_lock(&ctx->mutex);
7470 } 7654 }
7471 7655
7472 WARN_ON_ONCE(ctx->parent_ctx); 7656 WARN_ON_ONCE(ctx->parent_ctx);
7473 mutex_lock(&ctx->mutex);
7474 7657
7475 if (move_group) { 7658 if (move_group) {
7659 /*
7660 * Wait for everybody to stop referencing the events through
7661 * the old lists, before installing it on new lists.
7662 */
7476 synchronize_rcu(); 7663 synchronize_rcu();
7477 perf_install_in_context(ctx, group_leader, group_leader->cpu); 7664
7478 get_ctx(ctx); 7665 /*
7666 * Install the group siblings before the group leader.
7667 *
7668 * Because a group leader will try and install the entire group
7669 * (through the sibling list, which is still in-tact), we can
7670 * end up with siblings installed in the wrong context.
7671 *
7672 * By installing siblings first we NO-OP because they're not
7673 * reachable through the group lists.
7674 */
7479 list_for_each_entry(sibling, &group_leader->sibling_list, 7675 list_for_each_entry(sibling, &group_leader->sibling_list,
7480 group_entry) { 7676 group_entry) {
7677 perf_event__state_init(sibling);
7481 perf_install_in_context(ctx, sibling, sibling->cpu); 7678 perf_install_in_context(ctx, sibling, sibling->cpu);
7482 get_ctx(ctx); 7679 get_ctx(ctx);
7483 } 7680 }
7681
7682 /*
7683 * Removing from the context ends up with disabled
7684 * event. What we want here is event in the initial
7685 * startup state, ready to be add into new context.
7686 */
7687 perf_event__state_init(group_leader);
7688 perf_install_in_context(ctx, group_leader, group_leader->cpu);
7689 get_ctx(ctx);
7484 } 7690 }
7485 7691
7486 perf_install_in_context(ctx, event, event->cpu); 7692 perf_install_in_context(ctx, event, event->cpu);
7487 perf_unpin_context(ctx); 7693 perf_unpin_context(ctx);
7694
7695 if (move_group) {
7696 mutex_unlock(&gctx->mutex);
7697 put_ctx(gctx);
7698 }
7488 mutex_unlock(&ctx->mutex); 7699 mutex_unlock(&ctx->mutex);
7489 7700
7490 put_online_cpus(); 7701 put_online_cpus();
@@ -7592,7 +7803,11 @@ void perf_pmu_migrate_context(struct pmu *pmu, int src_cpu, int dst_cpu)
7592 src_ctx = &per_cpu_ptr(pmu->pmu_cpu_context, src_cpu)->ctx; 7803 src_ctx = &per_cpu_ptr(pmu->pmu_cpu_context, src_cpu)->ctx;
7593 dst_ctx = &per_cpu_ptr(pmu->pmu_cpu_context, dst_cpu)->ctx; 7804 dst_ctx = &per_cpu_ptr(pmu->pmu_cpu_context, dst_cpu)->ctx;
7594 7805
7595 mutex_lock(&src_ctx->mutex); 7806 /*
7807 * See perf_event_ctx_lock() for comments on the details
7808 * of swizzling perf_event::ctx.
7809 */
7810 mutex_lock_double(&src_ctx->mutex, &dst_ctx->mutex);
7596 list_for_each_entry_safe(event, tmp, &src_ctx->event_list, 7811 list_for_each_entry_safe(event, tmp, &src_ctx->event_list,
7597 event_entry) { 7812 event_entry) {
7598 perf_remove_from_context(event, false); 7813 perf_remove_from_context(event, false);
@@ -7600,11 +7815,36 @@ void perf_pmu_migrate_context(struct pmu *pmu, int src_cpu, int dst_cpu)
7600 put_ctx(src_ctx); 7815 put_ctx(src_ctx);
7601 list_add(&event->migrate_entry, &events); 7816 list_add(&event->migrate_entry, &events);
7602 } 7817 }
7603 mutex_unlock(&src_ctx->mutex);
7604 7818
7819 /*
7820 * Wait for the events to quiesce before re-instating them.
7821 */
7605 synchronize_rcu(); 7822 synchronize_rcu();
7606 7823
7607 mutex_lock(&dst_ctx->mutex); 7824 /*
7825 * Re-instate events in 2 passes.
7826 *
7827 * Skip over group leaders and only install siblings on this first
7828 * pass, siblings will not get enabled without a leader, however a
7829 * leader will enable its siblings, even if those are still on the old
7830 * context.
7831 */
7832 list_for_each_entry_safe(event, tmp, &events, migrate_entry) {
7833 if (event->group_leader == event)
7834 continue;
7835
7836 list_del(&event->migrate_entry);
7837 if (event->state >= PERF_EVENT_STATE_OFF)
7838 event->state = PERF_EVENT_STATE_INACTIVE;
7839 account_event_cpu(event, dst_cpu);
7840 perf_install_in_context(dst_ctx, event, dst_cpu);
7841 get_ctx(dst_ctx);
7842 }
7843
7844 /*
7845 * Once all the siblings are setup properly, install the group leaders
7846 * to make it go.
7847 */
7608 list_for_each_entry_safe(event, tmp, &events, migrate_entry) { 7848 list_for_each_entry_safe(event, tmp, &events, migrate_entry) {
7609 list_del(&event->migrate_entry); 7849 list_del(&event->migrate_entry);
7610 if (event->state >= PERF_EVENT_STATE_OFF) 7850 if (event->state >= PERF_EVENT_STATE_OFF)
@@ -7614,6 +7854,7 @@ void perf_pmu_migrate_context(struct pmu *pmu, int src_cpu, int dst_cpu)
7614 get_ctx(dst_ctx); 7854 get_ctx(dst_ctx);
7615 } 7855 }
7616 mutex_unlock(&dst_ctx->mutex); 7856 mutex_unlock(&dst_ctx->mutex);
7857 mutex_unlock(&src_ctx->mutex);
7617} 7858}
7618EXPORT_SYMBOL_GPL(perf_pmu_migrate_context); 7859EXPORT_SYMBOL_GPL(perf_pmu_migrate_context);
7619 7860
@@ -7800,14 +8041,19 @@ static void perf_free_event(struct perf_event *event,
7800 8041
7801 put_event(parent); 8042 put_event(parent);
7802 8043
8044 raw_spin_lock_irq(&ctx->lock);
7803 perf_group_detach(event); 8045 perf_group_detach(event);
7804 list_del_event(event, ctx); 8046 list_del_event(event, ctx);
8047 raw_spin_unlock_irq(&ctx->lock);
7805 free_event(event); 8048 free_event(event);
7806} 8049}
7807 8050
7808/* 8051/*
7809 * free an unexposed, unused context as created by inheritance by 8052 * Free an unexposed, unused context as created by inheritance by
7810 * perf_event_init_task below, used by fork() in case of fail. 8053 * perf_event_init_task below, used by fork() in case of fail.
8054 *
8055 * Not all locks are strictly required, but take them anyway to be nice and
8056 * help out with the lockdep assertions.
7811 */ 8057 */
7812void perf_event_free_task(struct task_struct *task) 8058void perf_event_free_task(struct task_struct *task)
7813{ 8059{
@@ -8126,7 +8372,7 @@ static void __init perf_event_init_all_cpus(void)
8126 for_each_possible_cpu(cpu) { 8372 for_each_possible_cpu(cpu) {
8127 swhash = &per_cpu(swevent_htable, cpu); 8373 swhash = &per_cpu(swevent_htable, cpu);
8128 mutex_init(&swhash->hlist_mutex); 8374 mutex_init(&swhash->hlist_mutex);
8129 INIT_LIST_HEAD(&per_cpu(rotation_list, cpu)); 8375 INIT_LIST_HEAD(&per_cpu(active_ctx_list, cpu));
8130 } 8376 }
8131} 8377}
8132 8378
@@ -8147,22 +8393,11 @@ static void perf_event_init_cpu(int cpu)
8147} 8393}
8148 8394
8149#if defined CONFIG_HOTPLUG_CPU || defined CONFIG_KEXEC 8395#if defined CONFIG_HOTPLUG_CPU || defined CONFIG_KEXEC
8150static void perf_pmu_rotate_stop(struct pmu *pmu)
8151{
8152 struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
8153
8154 WARN_ON(!irqs_disabled());
8155
8156 list_del_init(&cpuctx->rotation_list);
8157}
8158
8159static void __perf_event_exit_context(void *__info) 8396static void __perf_event_exit_context(void *__info)
8160{ 8397{
8161 struct remove_event re = { .detach_group = true }; 8398 struct remove_event re = { .detach_group = true };
8162 struct perf_event_context *ctx = __info; 8399 struct perf_event_context *ctx = __info;
8163 8400
8164 perf_pmu_rotate_stop(ctx->pmu);
8165
8166 rcu_read_lock(); 8401 rcu_read_lock();
8167 list_for_each_entry_rcu(re.event, &ctx->event_list, event_entry) 8402 list_for_each_entry_rcu(re.event, &ctx->event_list, event_entry)
8168 __perf_remove_from_context(&re); 8403 __perf_remove_from_context(&re);
@@ -8273,6 +8508,18 @@ void __init perf_event_init(void)
8273 != 1024); 8508 != 1024);
8274} 8509}
8275 8510
8511ssize_t perf_event_sysfs_show(struct device *dev, struct device_attribute *attr,
8512 char *page)
8513{
8514 struct perf_pmu_events_attr *pmu_attr =
8515 container_of(attr, struct perf_pmu_events_attr, attr);
8516
8517 if (pmu_attr->event_str)
8518 return sprintf(page, "%s\n", pmu_attr->event_str);
8519
8520 return 0;
8521}
8522
8276static int __init perf_event_sysfs_init(void) 8523static int __init perf_event_sysfs_init(void)
8277{ 8524{
8278 struct pmu *pmu; 8525 struct pmu *pmu;
diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c
index 146a5792b1d2..eadb95ce7aac 100644
--- a/kernel/events/ring_buffer.c
+++ b/kernel/events/ring_buffer.c
@@ -13,12 +13,13 @@
13#include <linux/vmalloc.h> 13#include <linux/vmalloc.h>
14#include <linux/slab.h> 14#include <linux/slab.h>
15#include <linux/circ_buf.h> 15#include <linux/circ_buf.h>
16#include <linux/poll.h>
16 17
17#include "internal.h" 18#include "internal.h"
18 19
19static void perf_output_wakeup(struct perf_output_handle *handle) 20static void perf_output_wakeup(struct perf_output_handle *handle)
20{ 21{
21 atomic_set(&handle->rb->poll, POLL_IN); 22 atomic_set(&handle->rb->poll, POLLIN);
22 23
23 handle->event->pending_wakeup = 1; 24 handle->event->pending_wakeup = 1;
24 irq_work_queue(&handle->event->pending); 25 irq_work_queue(&handle->event->pending);
diff --git a/kernel/exit.c b/kernel/exit.c
index 6806c55475ee..feff10bbb307 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -435,7 +435,8 @@ static void exit_mm(struct task_struct *tsk)
435 task_unlock(tsk); 435 task_unlock(tsk);
436 mm_update_next_owner(mm); 436 mm_update_next_owner(mm);
437 mmput(mm); 437 mmput(mm);
438 clear_thread_flag(TIF_MEMDIE); 438 if (test_thread_flag(TIF_MEMDIE))
439 unmark_oom_victim();
439} 440}
440 441
441static struct task_struct *find_alive_thread(struct task_struct *p) 442static struct task_struct *find_alive_thread(struct task_struct *p)
diff --git a/kernel/fork.c b/kernel/fork.c
index 4dc2ddade9f1..cf65139615a0 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -438,12 +438,8 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
438 atomic_inc(&mapping->i_mmap_writable); 438 atomic_inc(&mapping->i_mmap_writable);
439 flush_dcache_mmap_lock(mapping); 439 flush_dcache_mmap_lock(mapping);
440 /* insert tmp into the share list, just after mpnt */ 440 /* insert tmp into the share list, just after mpnt */
441 if (unlikely(tmp->vm_flags & VM_NONLINEAR)) 441 vma_interval_tree_insert_after(tmp, mpnt,
442 vma_nonlinear_insert(tmp, 442 &mapping->i_mmap);
443 &mapping->i_mmap_nonlinear);
444 else
445 vma_interval_tree_insert_after(tmp, mpnt,
446 &mapping->i_mmap);
447 flush_dcache_mmap_unlock(mapping); 443 flush_dcache_mmap_unlock(mapping);
448 i_mmap_unlock_write(mapping); 444 i_mmap_unlock_write(mapping);
449 } 445 }
@@ -559,6 +555,7 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p)
559 INIT_LIST_HEAD(&mm->mmlist); 555 INIT_LIST_HEAD(&mm->mmlist);
560 mm->core_state = NULL; 556 mm->core_state = NULL;
561 atomic_long_set(&mm->nr_ptes, 0); 557 atomic_long_set(&mm->nr_ptes, 0);
558 mm_nr_pmds_init(mm);
562 mm->map_count = 0; 559 mm->map_count = 0;
563 mm->locked_vm = 0; 560 mm->locked_vm = 0;
564 mm->pinned_vm = 0; 561 mm->pinned_vm = 0;
@@ -607,6 +604,14 @@ static void check_mm(struct mm_struct *mm)
607 printk(KERN_ALERT "BUG: Bad rss-counter state " 604 printk(KERN_ALERT "BUG: Bad rss-counter state "
608 "mm:%p idx:%d val:%ld\n", mm, i, x); 605 "mm:%p idx:%d val:%ld\n", mm, i, x);
609 } 606 }
607
608 if (atomic_long_read(&mm->nr_ptes))
609 pr_alert("BUG: non-zero nr_ptes on freeing mm: %ld\n",
610 atomic_long_read(&mm->nr_ptes));
611 if (mm_nr_pmds(mm))
612 pr_alert("BUG: non-zero nr_pmds on freeing mm: %ld\n",
613 mm_nr_pmds(mm));
614
610#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS 615#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS
611 VM_BUG_ON_MM(mm->pmd_huge_pte, mm); 616 VM_BUG_ON_MM(mm->pmd_huge_pte, mm);
612#endif 617#endif
diff --git a/kernel/futex.c b/kernel/futex.c
index 63678b573d61..2a5e3830e953 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -2217,7 +2217,7 @@ retry:
2217 if (!abs_time) 2217 if (!abs_time)
2218 goto out; 2218 goto out;
2219 2219
2220 restart = &current_thread_info()->restart_block; 2220 restart = &current->restart_block;
2221 restart->fn = futex_wait_restart; 2221 restart->fn = futex_wait_restart;
2222 restart->futex.uaddr = uaddr; 2222 restart->futex.uaddr = uaddr;
2223 restart->futex.val = val; 2223 restart->futex.val = val;
@@ -2258,7 +2258,7 @@ static long futex_wait_restart(struct restart_block *restart)
2258 * if there are waiters then it will block, it does PI, etc. (Due to 2258 * if there are waiters then it will block, it does PI, etc. (Due to
2259 * races the kernel might see a 0 value of the futex too.) 2259 * races the kernel might see a 0 value of the futex too.)
2260 */ 2260 */
2261static int futex_lock_pi(u32 __user *uaddr, unsigned int flags, int detect, 2261static int futex_lock_pi(u32 __user *uaddr, unsigned int flags,
2262 ktime_t *time, int trylock) 2262 ktime_t *time, int trylock)
2263{ 2263{
2264 struct hrtimer_sleeper timeout, *to = NULL; 2264 struct hrtimer_sleeper timeout, *to = NULL;
@@ -2953,11 +2953,11 @@ long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout,
2953 case FUTEX_WAKE_OP: 2953 case FUTEX_WAKE_OP:
2954 return futex_wake_op(uaddr, flags, uaddr2, val, val2, val3); 2954 return futex_wake_op(uaddr, flags, uaddr2, val, val2, val3);
2955 case FUTEX_LOCK_PI: 2955 case FUTEX_LOCK_PI:
2956 return futex_lock_pi(uaddr, flags, val, timeout, 0); 2956 return futex_lock_pi(uaddr, flags, timeout, 0);
2957 case FUTEX_UNLOCK_PI: 2957 case FUTEX_UNLOCK_PI:
2958 return futex_unlock_pi(uaddr, flags); 2958 return futex_unlock_pi(uaddr, flags);
2959 case FUTEX_TRYLOCK_PI: 2959 case FUTEX_TRYLOCK_PI:
2960 return futex_lock_pi(uaddr, flags, 0, timeout, 1); 2960 return futex_lock_pi(uaddr, flags, NULL, 1);
2961 case FUTEX_WAIT_REQUEUE_PI: 2961 case FUTEX_WAIT_REQUEUE_PI:
2962 val3 = FUTEX_BITSET_MATCH_ANY; 2962 val3 = FUTEX_BITSET_MATCH_ANY;
2963 return futex_wait_requeue_pi(uaddr, flags, val, timeout, val3, 2963 return futex_wait_requeue_pi(uaddr, flags, val, timeout, val3,
diff --git a/kernel/kexec.c b/kernel/kexec.c
index 9a8a01abbaed..c85277639b34 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -2512,7 +2512,7 @@ static int kexec_apply_relocations(struct kimage *image)
2512 continue; 2512 continue;
2513 2513
2514 /* 2514 /*
2515 * Respective archicture needs to provide support for applying 2515 * Respective architecture needs to provide support for applying
2516 * relocations of type SHT_RELA/SHT_REL. 2516 * relocations of type SHT_RELA/SHT_REL.
2517 */ 2517 */
2518 if (sechdrs[i].sh_type == SHT_RELA) 2518 if (sechdrs[i].sh_type == SHT_RELA)
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index ee619929cf90..2ca272f8f62e 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -717,7 +717,7 @@ static void prepare_optimized_kprobe(struct kprobe *p)
717 struct optimized_kprobe *op; 717 struct optimized_kprobe *op;
718 718
719 op = container_of(p, struct optimized_kprobe, kp); 719 op = container_of(p, struct optimized_kprobe, kp);
720 arch_prepare_optimized_kprobe(op); 720 arch_prepare_optimized_kprobe(op, p);
721} 721}
722 722
723/* Allocate new optimized_kprobe and try to prepare optimized instructions */ 723/* Allocate new optimized_kprobe and try to prepare optimized instructions */
@@ -731,7 +731,7 @@ static struct kprobe *alloc_aggr_kprobe(struct kprobe *p)
731 731
732 INIT_LIST_HEAD(&op->list); 732 INIT_LIST_HEAD(&op->list);
733 op->kp.addr = p->addr; 733 op->kp.addr = p->addr;
734 arch_prepare_optimized_kprobe(op); 734 arch_prepare_optimized_kprobe(op, p);
735 735
736 return &op->kp; 736 return &op->kp;
737} 737}
diff --git a/kernel/livepatch/Kconfig b/kernel/livepatch/Kconfig
new file mode 100644
index 000000000000..045022557936
--- /dev/null
+++ b/kernel/livepatch/Kconfig
@@ -0,0 +1,18 @@
1config HAVE_LIVEPATCH
2 bool
3 help
4 Arch supports kernel live patching
5
6config LIVEPATCH
7 bool "Kernel Live Patching"
8 depends on DYNAMIC_FTRACE_WITH_REGS
9 depends on MODULES
10 depends on SYSFS
11 depends on KALLSYMS_ALL
12 depends on HAVE_LIVEPATCH
13 help
14 Say Y here if you want to support kernel live patching.
15 This option has no runtime impact until a kernel "patch"
16 module uses the interface provided by this option to register
17 a patch, causing calls to patched functions to be redirected
18 to new function code contained in the patch module.
diff --git a/kernel/livepatch/Makefile b/kernel/livepatch/Makefile
new file mode 100644
index 000000000000..e8780c0901d9
--- /dev/null
+++ b/kernel/livepatch/Makefile
@@ -0,0 +1,3 @@
1obj-$(CONFIG_LIVEPATCH) += livepatch.o
2
3livepatch-objs := core.o
diff --git a/kernel/livepatch/core.c b/kernel/livepatch/core.c
new file mode 100644
index 000000000000..ff7f47d026ac
--- /dev/null
+++ b/kernel/livepatch/core.c
@@ -0,0 +1,1015 @@
1/*
2 * core.c - Kernel Live Patching Core
3 *
4 * Copyright (C) 2014 Seth Jennings <sjenning@redhat.com>
5 * Copyright (C) 2014 SUSE
6 *
7 * This program is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License
9 * as published by the Free Software Foundation; either version 2
10 * of the License, or (at your option) any later version.
11 *
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, see <http://www.gnu.org/licenses/>.
19 */
20
21#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
22
23#include <linux/module.h>
24#include <linux/kernel.h>
25#include <linux/mutex.h>
26#include <linux/slab.h>
27#include <linux/ftrace.h>
28#include <linux/list.h>
29#include <linux/kallsyms.h>
30#include <linux/livepatch.h>
31
32/**
33 * struct klp_ops - structure for tracking registered ftrace ops structs
34 *
35 * A single ftrace_ops is shared between all enabled replacement functions
36 * (klp_func structs) which have the same old_addr. This allows the switch
37 * between function versions to happen instantaneously by updating the klp_ops
38 * struct's func_stack list. The winner is the klp_func at the top of the
39 * func_stack (front of the list).
40 *
41 * @node: node for the global klp_ops list
42 * @func_stack: list head for the stack of klp_func's (active func is on top)
43 * @fops: registered ftrace ops struct
44 */
45struct klp_ops {
46 struct list_head node;
47 struct list_head func_stack;
48 struct ftrace_ops fops;
49};
50
51/*
52 * The klp_mutex protects the global lists and state transitions of any
53 * structure reachable from them. References to any structure must be obtained
54 * under mutex protection (except in klp_ftrace_handler(), which uses RCU to
55 * ensure it gets consistent data).
56 */
57static DEFINE_MUTEX(klp_mutex);
58
59static LIST_HEAD(klp_patches);
60static LIST_HEAD(klp_ops);
61
62static struct kobject *klp_root_kobj;
63
64static struct klp_ops *klp_find_ops(unsigned long old_addr)
65{
66 struct klp_ops *ops;
67 struct klp_func *func;
68
69 list_for_each_entry(ops, &klp_ops, node) {
70 func = list_first_entry(&ops->func_stack, struct klp_func,
71 stack_node);
72 if (func->old_addr == old_addr)
73 return ops;
74 }
75
76 return NULL;
77}
78
79static bool klp_is_module(struct klp_object *obj)
80{
81 return obj->name;
82}
83
84static bool klp_is_object_loaded(struct klp_object *obj)
85{
86 return !obj->name || obj->mod;
87}
88
89/* sets obj->mod if object is not vmlinux and module is found */
90static void klp_find_object_module(struct klp_object *obj)
91{
92 if (!klp_is_module(obj))
93 return;
94
95 mutex_lock(&module_mutex);
96 /*
97 * We don't need to take a reference on the module here because we have
98 * the klp_mutex, which is also taken by the module notifier. This
99 * prevents any module from unloading until we release the klp_mutex.
100 */
101 obj->mod = find_module(obj->name);
102 mutex_unlock(&module_mutex);
103}
104
105/* klp_mutex must be held by caller */
106static bool klp_is_patch_registered(struct klp_patch *patch)
107{
108 struct klp_patch *mypatch;
109
110 list_for_each_entry(mypatch, &klp_patches, list)
111 if (mypatch == patch)
112 return true;
113
114 return false;
115}
116
117static bool klp_initialized(void)
118{
119 return klp_root_kobj;
120}
121
122struct klp_find_arg {
123 const char *objname;
124 const char *name;
125 unsigned long addr;
126 /*
127 * If count == 0, the symbol was not found. If count == 1, a unique
128 * match was found and addr is set. If count > 1, there is
129 * unresolvable ambiguity among "count" number of symbols with the same
130 * name in the same object.
131 */
132 unsigned long count;
133};
134
135static int klp_find_callback(void *data, const char *name,
136 struct module *mod, unsigned long addr)
137{
138 struct klp_find_arg *args = data;
139
140 if ((mod && !args->objname) || (!mod && args->objname))
141 return 0;
142
143 if (strcmp(args->name, name))
144 return 0;
145
146 if (args->objname && strcmp(args->objname, mod->name))
147 return 0;
148
149 /*
150 * args->addr might be overwritten if another match is found
151 * but klp_find_object_symbol() handles this and only returns the
152 * addr if count == 1.
153 */
154 args->addr = addr;
155 args->count++;
156
157 return 0;
158}
159
160static int klp_find_object_symbol(const char *objname, const char *name,
161 unsigned long *addr)
162{
163 struct klp_find_arg args = {
164 .objname = objname,
165 .name = name,
166 .addr = 0,
167 .count = 0
168 };
169
170 kallsyms_on_each_symbol(klp_find_callback, &args);
171
172 if (args.count == 0)
173 pr_err("symbol '%s' not found in symbol table\n", name);
174 else if (args.count > 1)
175 pr_err("unresolvable ambiguity (%lu matches) on symbol '%s' in object '%s'\n",
176 args.count, name, objname);
177 else {
178 *addr = args.addr;
179 return 0;
180 }
181
182 *addr = 0;
183 return -EINVAL;
184}
185
186struct klp_verify_args {
187 const char *name;
188 const unsigned long addr;
189};
190
191static int klp_verify_callback(void *data, const char *name,
192 struct module *mod, unsigned long addr)
193{
194 struct klp_verify_args *args = data;
195
196 if (!mod &&
197 !strcmp(args->name, name) &&
198 args->addr == addr)
199 return 1;
200
201 return 0;
202}
203
204static int klp_verify_vmlinux_symbol(const char *name, unsigned long addr)
205{
206 struct klp_verify_args args = {
207 .name = name,
208 .addr = addr,
209 };
210
211 if (kallsyms_on_each_symbol(klp_verify_callback, &args))
212 return 0;
213
214 pr_err("symbol '%s' not found at specified address 0x%016lx, kernel mismatch?\n",
215 name, addr);
216 return -EINVAL;
217}
218
219static int klp_find_verify_func_addr(struct klp_object *obj,
220 struct klp_func *func)
221{
222 int ret;
223
224#if defined(CONFIG_RANDOMIZE_BASE)
225 /* KASLR is enabled, disregard old_addr from user */
226 func->old_addr = 0;
227#endif
228
229 if (!func->old_addr || klp_is_module(obj))
230 ret = klp_find_object_symbol(obj->name, func->old_name,
231 &func->old_addr);
232 else
233 ret = klp_verify_vmlinux_symbol(func->old_name,
234 func->old_addr);
235
236 return ret;
237}
238
239/*
240 * external symbols are located outside the parent object (where the parent
241 * object is either vmlinux or the kmod being patched).
242 */
243static int klp_find_external_symbol(struct module *pmod, const char *name,
244 unsigned long *addr)
245{
246 const struct kernel_symbol *sym;
247
248 /* first, check if it's an exported symbol */
249 preempt_disable();
250 sym = find_symbol(name, NULL, NULL, true, true);
251 preempt_enable();
252 if (sym) {
253 *addr = sym->value;
254 return 0;
255 }
256
257 /* otherwise check if it's in another .o within the patch module */
258 return klp_find_object_symbol(pmod->name, name, addr);
259}
260
261static int klp_write_object_relocations(struct module *pmod,
262 struct klp_object *obj)
263{
264 int ret;
265 struct klp_reloc *reloc;
266
267 if (WARN_ON(!klp_is_object_loaded(obj)))
268 return -EINVAL;
269
270 if (WARN_ON(!obj->relocs))
271 return -EINVAL;
272
273 for (reloc = obj->relocs; reloc->name; reloc++) {
274 if (!klp_is_module(obj)) {
275 ret = klp_verify_vmlinux_symbol(reloc->name,
276 reloc->val);
277 if (ret)
278 return ret;
279 } else {
280 /* module, reloc->val needs to be discovered */
281 if (reloc->external)
282 ret = klp_find_external_symbol(pmod,
283 reloc->name,
284 &reloc->val);
285 else
286 ret = klp_find_object_symbol(obj->mod->name,
287 reloc->name,
288 &reloc->val);
289 if (ret)
290 return ret;
291 }
292 ret = klp_write_module_reloc(pmod, reloc->type, reloc->loc,
293 reloc->val + reloc->addend);
294 if (ret) {
295 pr_err("relocation failed for symbol '%s' at 0x%016lx (%d)\n",
296 reloc->name, reloc->val, ret);
297 return ret;
298 }
299 }
300
301 return 0;
302}
303
304static void notrace klp_ftrace_handler(unsigned long ip,
305 unsigned long parent_ip,
306 struct ftrace_ops *fops,
307 struct pt_regs *regs)
308{
309 struct klp_ops *ops;
310 struct klp_func *func;
311
312 ops = container_of(fops, struct klp_ops, fops);
313
314 rcu_read_lock();
315 func = list_first_or_null_rcu(&ops->func_stack, struct klp_func,
316 stack_node);
317 rcu_read_unlock();
318
319 if (WARN_ON_ONCE(!func))
320 return;
321
322 klp_arch_set_pc(regs, (unsigned long)func->new_func);
323}
324
325static int klp_disable_func(struct klp_func *func)
326{
327 struct klp_ops *ops;
328 int ret;
329
330 if (WARN_ON(func->state != KLP_ENABLED))
331 return -EINVAL;
332
333 if (WARN_ON(!func->old_addr))
334 return -EINVAL;
335
336 ops = klp_find_ops(func->old_addr);
337 if (WARN_ON(!ops))
338 return -EINVAL;
339
340 if (list_is_singular(&ops->func_stack)) {
341 ret = unregister_ftrace_function(&ops->fops);
342 if (ret) {
343 pr_err("failed to unregister ftrace handler for function '%s' (%d)\n",
344 func->old_name, ret);
345 return ret;
346 }
347
348 ret = ftrace_set_filter_ip(&ops->fops, func->old_addr, 1, 0);
349 if (ret)
350 pr_warn("function unregister succeeded but failed to clear the filter\n");
351
352 list_del_rcu(&func->stack_node);
353 list_del(&ops->node);
354 kfree(ops);
355 } else {
356 list_del_rcu(&func->stack_node);
357 }
358
359 func->state = KLP_DISABLED;
360
361 return 0;
362}
363
364static int klp_enable_func(struct klp_func *func)
365{
366 struct klp_ops *ops;
367 int ret;
368
369 if (WARN_ON(!func->old_addr))
370 return -EINVAL;
371
372 if (WARN_ON(func->state != KLP_DISABLED))
373 return -EINVAL;
374
375 ops = klp_find_ops(func->old_addr);
376 if (!ops) {
377 ops = kzalloc(sizeof(*ops), GFP_KERNEL);
378 if (!ops)
379 return -ENOMEM;
380
381 ops->fops.func = klp_ftrace_handler;
382 ops->fops.flags = FTRACE_OPS_FL_SAVE_REGS |
383 FTRACE_OPS_FL_DYNAMIC |
384 FTRACE_OPS_FL_IPMODIFY;
385
386 list_add(&ops->node, &klp_ops);
387
388 INIT_LIST_HEAD(&ops->func_stack);
389 list_add_rcu(&func->stack_node, &ops->func_stack);
390
391 ret = ftrace_set_filter_ip(&ops->fops, func->old_addr, 0, 0);
392 if (ret) {
393 pr_err("failed to set ftrace filter for function '%s' (%d)\n",
394 func->old_name, ret);
395 goto err;
396 }
397
398 ret = register_ftrace_function(&ops->fops);
399 if (ret) {
400 pr_err("failed to register ftrace handler for function '%s' (%d)\n",
401 func->old_name, ret);
402 ftrace_set_filter_ip(&ops->fops, func->old_addr, 1, 0);
403 goto err;
404 }
405
406
407 } else {
408 list_add_rcu(&func->stack_node, &ops->func_stack);
409 }
410
411 func->state = KLP_ENABLED;
412
413 return 0;
414
415err:
416 list_del_rcu(&func->stack_node);
417 list_del(&ops->node);
418 kfree(ops);
419 return ret;
420}
421
422static int klp_disable_object(struct klp_object *obj)
423{
424 struct klp_func *func;
425 int ret;
426
427 for (func = obj->funcs; func->old_name; func++) {
428 if (func->state != KLP_ENABLED)
429 continue;
430
431 ret = klp_disable_func(func);
432 if (ret)
433 return ret;
434 }
435
436 obj->state = KLP_DISABLED;
437
438 return 0;
439}
440
441static int klp_enable_object(struct klp_object *obj)
442{
443 struct klp_func *func;
444 int ret;
445
446 if (WARN_ON(obj->state != KLP_DISABLED))
447 return -EINVAL;
448
449 if (WARN_ON(!klp_is_object_loaded(obj)))
450 return -EINVAL;
451
452 for (func = obj->funcs; func->old_name; func++) {
453 ret = klp_enable_func(func);
454 if (ret)
455 goto unregister;
456 }
457 obj->state = KLP_ENABLED;
458
459 return 0;
460
461unregister:
462 WARN_ON(klp_disable_object(obj));
463 return ret;
464}
465
466static int __klp_disable_patch(struct klp_patch *patch)
467{
468 struct klp_object *obj;
469 int ret;
470
471 /* enforce stacking: only the last enabled patch can be disabled */
472 if (!list_is_last(&patch->list, &klp_patches) &&
473 list_next_entry(patch, list)->state == KLP_ENABLED)
474 return -EBUSY;
475
476 pr_notice("disabling patch '%s'\n", patch->mod->name);
477
478 for (obj = patch->objs; obj->funcs; obj++) {
479 if (obj->state != KLP_ENABLED)
480 continue;
481
482 ret = klp_disable_object(obj);
483 if (ret)
484 return ret;
485 }
486
487 patch->state = KLP_DISABLED;
488
489 return 0;
490}
491
492/**
493 * klp_disable_patch() - disables a registered patch
494 * @patch: The registered, enabled patch to be disabled
495 *
496 * Unregisters the patched functions from ftrace.
497 *
498 * Return: 0 on success, otherwise error
499 */
500int klp_disable_patch(struct klp_patch *patch)
501{
502 int ret;
503
504 mutex_lock(&klp_mutex);
505
506 if (!klp_is_patch_registered(patch)) {
507 ret = -EINVAL;
508 goto err;
509 }
510
511 if (patch->state == KLP_DISABLED) {
512 ret = -EINVAL;
513 goto err;
514 }
515
516 ret = __klp_disable_patch(patch);
517
518err:
519 mutex_unlock(&klp_mutex);
520 return ret;
521}
522EXPORT_SYMBOL_GPL(klp_disable_patch);
523
524static int __klp_enable_patch(struct klp_patch *patch)
525{
526 struct klp_object *obj;
527 int ret;
528
529 if (WARN_ON(patch->state != KLP_DISABLED))
530 return -EINVAL;
531
532 /* enforce stacking: only the first disabled patch can be enabled */
533 if (patch->list.prev != &klp_patches &&
534 list_prev_entry(patch, list)->state == KLP_DISABLED)
535 return -EBUSY;
536
537 pr_notice_once("tainting kernel with TAINT_LIVEPATCH\n");
538 add_taint(TAINT_LIVEPATCH, LOCKDEP_STILL_OK);
539
540 pr_notice("enabling patch '%s'\n", patch->mod->name);
541
542 for (obj = patch->objs; obj->funcs; obj++) {
543 klp_find_object_module(obj);
544
545 if (!klp_is_object_loaded(obj))
546 continue;
547
548 ret = klp_enable_object(obj);
549 if (ret)
550 goto unregister;
551 }
552
553 patch->state = KLP_ENABLED;
554
555 return 0;
556
557unregister:
558 WARN_ON(__klp_disable_patch(patch));
559 return ret;
560}
561
562/**
563 * klp_enable_patch() - enables a registered patch
564 * @patch: The registered, disabled patch to be enabled
565 *
566 * Performs the needed symbol lookups and code relocations,
567 * then registers the patched functions with ftrace.
568 *
569 * Return: 0 on success, otherwise error
570 */
571int klp_enable_patch(struct klp_patch *patch)
572{
573 int ret;
574
575 mutex_lock(&klp_mutex);
576
577 if (!klp_is_patch_registered(patch)) {
578 ret = -EINVAL;
579 goto err;
580 }
581
582 ret = __klp_enable_patch(patch);
583
584err:
585 mutex_unlock(&klp_mutex);
586 return ret;
587}
588EXPORT_SYMBOL_GPL(klp_enable_patch);
589
590/*
591 * Sysfs Interface
592 *
593 * /sys/kernel/livepatch
594 * /sys/kernel/livepatch/<patch>
595 * /sys/kernel/livepatch/<patch>/enabled
596 * /sys/kernel/livepatch/<patch>/<object>
597 * /sys/kernel/livepatch/<patch>/<object>/<func>
598 */
599
600static ssize_t enabled_store(struct kobject *kobj, struct kobj_attribute *attr,
601 const char *buf, size_t count)
602{
603 struct klp_patch *patch;
604 int ret;
605 unsigned long val;
606
607 ret = kstrtoul(buf, 10, &val);
608 if (ret)
609 return -EINVAL;
610
611 if (val != KLP_DISABLED && val != KLP_ENABLED)
612 return -EINVAL;
613
614 patch = container_of(kobj, struct klp_patch, kobj);
615
616 mutex_lock(&klp_mutex);
617
618 if (val == patch->state) {
619 /* already in requested state */
620 ret = -EINVAL;
621 goto err;
622 }
623
624 if (val == KLP_ENABLED) {
625 ret = __klp_enable_patch(patch);
626 if (ret)
627 goto err;
628 } else {
629 ret = __klp_disable_patch(patch);
630 if (ret)
631 goto err;
632 }
633
634 mutex_unlock(&klp_mutex);
635
636 return count;
637
638err:
639 mutex_unlock(&klp_mutex);
640 return ret;
641}
642
643static ssize_t enabled_show(struct kobject *kobj,
644 struct kobj_attribute *attr, char *buf)
645{
646 struct klp_patch *patch;
647
648 patch = container_of(kobj, struct klp_patch, kobj);
649 return snprintf(buf, PAGE_SIZE-1, "%d\n", patch->state);
650}
651
652static struct kobj_attribute enabled_kobj_attr = __ATTR_RW(enabled);
653static struct attribute *klp_patch_attrs[] = {
654 &enabled_kobj_attr.attr,
655 NULL
656};
657
658static void klp_kobj_release_patch(struct kobject *kobj)
659{
660 /*
661 * Once we have a consistency model we'll need to module_put() the
662 * patch module here. See klp_register_patch() for more details.
663 */
664}
665
666static struct kobj_type klp_ktype_patch = {
667 .release = klp_kobj_release_patch,
668 .sysfs_ops = &kobj_sysfs_ops,
669 .default_attrs = klp_patch_attrs,
670};
671
672static void klp_kobj_release_func(struct kobject *kobj)
673{
674}
675
676static struct kobj_type klp_ktype_func = {
677 .release = klp_kobj_release_func,
678 .sysfs_ops = &kobj_sysfs_ops,
679};
680
681/*
682 * Free all functions' kobjects in the array up to some limit. When limit is
683 * NULL, all kobjects are freed.
684 */
685static void klp_free_funcs_limited(struct klp_object *obj,
686 struct klp_func *limit)
687{
688 struct klp_func *func;
689
690 for (func = obj->funcs; func->old_name && func != limit; func++)
691 kobject_put(&func->kobj);
692}
693
694/* Clean up when a patched object is unloaded */
695static void klp_free_object_loaded(struct klp_object *obj)
696{
697 struct klp_func *func;
698
699 obj->mod = NULL;
700
701 for (func = obj->funcs; func->old_name; func++)
702 func->old_addr = 0;
703}
704
705/*
706 * Free all objects' kobjects in the array up to some limit. When limit is
707 * NULL, all kobjects are freed.
708 */
709static void klp_free_objects_limited(struct klp_patch *patch,
710 struct klp_object *limit)
711{
712 struct klp_object *obj;
713
714 for (obj = patch->objs; obj->funcs && obj != limit; obj++) {
715 klp_free_funcs_limited(obj, NULL);
716 kobject_put(obj->kobj);
717 }
718}
719
720static void klp_free_patch(struct klp_patch *patch)
721{
722 klp_free_objects_limited(patch, NULL);
723 if (!list_empty(&patch->list))
724 list_del(&patch->list);
725 kobject_put(&patch->kobj);
726}
727
728static int klp_init_func(struct klp_object *obj, struct klp_func *func)
729{
730 INIT_LIST_HEAD(&func->stack_node);
731 func->state = KLP_DISABLED;
732
733 return kobject_init_and_add(&func->kobj, &klp_ktype_func,
734 obj->kobj, func->old_name);
735}
736
737/* parts of the initialization that is done only when the object is loaded */
738static int klp_init_object_loaded(struct klp_patch *patch,
739 struct klp_object *obj)
740{
741 struct klp_func *func;
742 int ret;
743
744 if (obj->relocs) {
745 ret = klp_write_object_relocations(patch->mod, obj);
746 if (ret)
747 return ret;
748 }
749
750 for (func = obj->funcs; func->old_name; func++) {
751 ret = klp_find_verify_func_addr(obj, func);
752 if (ret)
753 return ret;
754 }
755
756 return 0;
757}
758
759static int klp_init_object(struct klp_patch *patch, struct klp_object *obj)
760{
761 struct klp_func *func;
762 int ret;
763 const char *name;
764
765 if (!obj->funcs)
766 return -EINVAL;
767
768 obj->state = KLP_DISABLED;
769
770 klp_find_object_module(obj);
771
772 name = klp_is_module(obj) ? obj->name : "vmlinux";
773 obj->kobj = kobject_create_and_add(name, &patch->kobj);
774 if (!obj->kobj)
775 return -ENOMEM;
776
777 for (func = obj->funcs; func->old_name; func++) {
778 ret = klp_init_func(obj, func);
779 if (ret)
780 goto free;
781 }
782
783 if (klp_is_object_loaded(obj)) {
784 ret = klp_init_object_loaded(patch, obj);
785 if (ret)
786 goto free;
787 }
788
789 return 0;
790
791free:
792 klp_free_funcs_limited(obj, func);
793 kobject_put(obj->kobj);
794 return ret;
795}
796
797static int klp_init_patch(struct klp_patch *patch)
798{
799 struct klp_object *obj;
800 int ret;
801
802 if (!patch->objs)
803 return -EINVAL;
804
805 mutex_lock(&klp_mutex);
806
807 patch->state = KLP_DISABLED;
808
809 ret = kobject_init_and_add(&patch->kobj, &klp_ktype_patch,
810 klp_root_kobj, patch->mod->name);
811 if (ret)
812 goto unlock;
813
814 for (obj = patch->objs; obj->funcs; obj++) {
815 ret = klp_init_object(patch, obj);
816 if (ret)
817 goto free;
818 }
819
820 list_add_tail(&patch->list, &klp_patches);
821
822 mutex_unlock(&klp_mutex);
823
824 return 0;
825
826free:
827 klp_free_objects_limited(patch, obj);
828 kobject_put(&patch->kobj);
829unlock:
830 mutex_unlock(&klp_mutex);
831 return ret;
832}
833
834/**
835 * klp_unregister_patch() - unregisters a patch
836 * @patch: Disabled patch to be unregistered
837 *
838 * Frees the data structures and removes the sysfs interface.
839 *
840 * Return: 0 on success, otherwise error
841 */
842int klp_unregister_patch(struct klp_patch *patch)
843{
844 int ret = 0;
845
846 mutex_lock(&klp_mutex);
847
848 if (!klp_is_patch_registered(patch)) {
849 ret = -EINVAL;
850 goto out;
851 }
852
853 if (patch->state == KLP_ENABLED) {
854 ret = -EBUSY;
855 goto out;
856 }
857
858 klp_free_patch(patch);
859
860out:
861 mutex_unlock(&klp_mutex);
862 return ret;
863}
864EXPORT_SYMBOL_GPL(klp_unregister_patch);
865
866/**
867 * klp_register_patch() - registers a patch
868 * @patch: Patch to be registered
869 *
870 * Initializes the data structure associated with the patch and
871 * creates the sysfs interface.
872 *
873 * Return: 0 on success, otherwise error
874 */
875int klp_register_patch(struct klp_patch *patch)
876{
877 int ret;
878
879 if (!klp_initialized())
880 return -ENODEV;
881
882 if (!patch || !patch->mod)
883 return -EINVAL;
884
885 /*
886 * A reference is taken on the patch module to prevent it from being
887 * unloaded. Right now, we don't allow patch modules to unload since
888 * there is currently no method to determine if a thread is still
889 * running in the patched code contained in the patch module once
890 * the ftrace registration is successful.
891 */
892 if (!try_module_get(patch->mod))
893 return -ENODEV;
894
895 ret = klp_init_patch(patch);
896 if (ret)
897 module_put(patch->mod);
898
899 return ret;
900}
901EXPORT_SYMBOL_GPL(klp_register_patch);
902
903static void klp_module_notify_coming(struct klp_patch *patch,
904 struct klp_object *obj)
905{
906 struct module *pmod = patch->mod;
907 struct module *mod = obj->mod;
908 int ret;
909
910 ret = klp_init_object_loaded(patch, obj);
911 if (ret)
912 goto err;
913
914 if (patch->state == KLP_DISABLED)
915 return;
916
917 pr_notice("applying patch '%s' to loading module '%s'\n",
918 pmod->name, mod->name);
919
920 ret = klp_enable_object(obj);
921 if (!ret)
922 return;
923
924err:
925 pr_warn("failed to apply patch '%s' to module '%s' (%d)\n",
926 pmod->name, mod->name, ret);
927}
928
929static void klp_module_notify_going(struct klp_patch *patch,
930 struct klp_object *obj)
931{
932 struct module *pmod = patch->mod;
933 struct module *mod = obj->mod;
934 int ret;
935
936 if (patch->state == KLP_DISABLED)
937 goto disabled;
938
939 pr_notice("reverting patch '%s' on unloading module '%s'\n",
940 pmod->name, mod->name);
941
942 ret = klp_disable_object(obj);
943 if (ret)
944 pr_warn("failed to revert patch '%s' on module '%s' (%d)\n",
945 pmod->name, mod->name, ret);
946
947disabled:
948 klp_free_object_loaded(obj);
949}
950
951static int klp_module_notify(struct notifier_block *nb, unsigned long action,
952 void *data)
953{
954 struct module *mod = data;
955 struct klp_patch *patch;
956 struct klp_object *obj;
957
958 if (action != MODULE_STATE_COMING && action != MODULE_STATE_GOING)
959 return 0;
960
961 mutex_lock(&klp_mutex);
962
963 list_for_each_entry(patch, &klp_patches, list) {
964 for (obj = patch->objs; obj->funcs; obj++) {
965 if (!klp_is_module(obj) || strcmp(obj->name, mod->name))
966 continue;
967
968 if (action == MODULE_STATE_COMING) {
969 obj->mod = mod;
970 klp_module_notify_coming(patch, obj);
971 } else /* MODULE_STATE_GOING */
972 klp_module_notify_going(patch, obj);
973
974 break;
975 }
976 }
977
978 mutex_unlock(&klp_mutex);
979
980 return 0;
981}
982
983static struct notifier_block klp_module_nb = {
984 .notifier_call = klp_module_notify,
985 .priority = INT_MIN+1, /* called late but before ftrace notifier */
986};
987
988static int klp_init(void)
989{
990 int ret;
991
992 ret = klp_check_compiler_support();
993 if (ret) {
994 pr_info("Your compiler is too old; turning off.\n");
995 return -EINVAL;
996 }
997
998 ret = register_module_notifier(&klp_module_nb);
999 if (ret)
1000 return ret;
1001
1002 klp_root_kobj = kobject_create_and_add("livepatch", kernel_kobj);
1003 if (!klp_root_kobj) {
1004 ret = -ENOMEM;
1005 goto unregister;
1006 }
1007
1008 return 0;
1009
1010unregister:
1011 unregister_module_notifier(&klp_module_nb);
1012 return ret;
1013}
1014
1015module_init(klp_init);
diff --git a/kernel/locking/Makefile b/kernel/locking/Makefile
index 8541bfdfd232..de7a416cca2a 100644
--- a/kernel/locking/Makefile
+++ b/kernel/locking/Makefile
@@ -1,11 +1,11 @@
1 1
2obj-y += mutex.o semaphore.o rwsem.o mcs_spinlock.o 2obj-y += mutex.o semaphore.o rwsem.o
3 3
4ifdef CONFIG_FUNCTION_TRACER 4ifdef CONFIG_FUNCTION_TRACER
5CFLAGS_REMOVE_lockdep.o = -pg 5CFLAGS_REMOVE_lockdep.o = $(CC_FLAGS_FTRACE)
6CFLAGS_REMOVE_lockdep_proc.o = -pg 6CFLAGS_REMOVE_lockdep_proc.o = $(CC_FLAGS_FTRACE)
7CFLAGS_REMOVE_mutex-debug.o = -pg 7CFLAGS_REMOVE_mutex-debug.o = $(CC_FLAGS_FTRACE)
8CFLAGS_REMOVE_rtmutex-debug.o = -pg 8CFLAGS_REMOVE_rtmutex-debug.o = $(CC_FLAGS_FTRACE)
9endif 9endif
10 10
11obj-$(CONFIG_DEBUG_MUTEXES) += mutex-debug.o 11obj-$(CONFIG_DEBUG_MUTEXES) += mutex-debug.o
@@ -14,6 +14,7 @@ ifeq ($(CONFIG_PROC_FS),y)
14obj-$(CONFIG_LOCKDEP) += lockdep_proc.o 14obj-$(CONFIG_LOCKDEP) += lockdep_proc.o
15endif 15endif
16obj-$(CONFIG_SMP) += spinlock.o 16obj-$(CONFIG_SMP) += spinlock.o
17obj-$(CONFIG_LOCK_SPIN_ON_OWNER) += osq_lock.o
17obj-$(CONFIG_SMP) += lglock.o 18obj-$(CONFIG_SMP) += lglock.o
18obj-$(CONFIG_PROVE_LOCKING) += spinlock.o 19obj-$(CONFIG_PROVE_LOCKING) += spinlock.o
19obj-$(CONFIG_RT_MUTEXES) += rtmutex.o 20obj-$(CONFIG_RT_MUTEXES) += rtmutex.o
diff --git a/kernel/locking/mcs_spinlock.h b/kernel/locking/mcs_spinlock.h
index 4d60986fcbee..d1fe2ba5bac9 100644
--- a/kernel/locking/mcs_spinlock.h
+++ b/kernel/locking/mcs_spinlock.h
@@ -108,20 +108,4 @@ void mcs_spin_unlock(struct mcs_spinlock **lock, struct mcs_spinlock *node)
108 arch_mcs_spin_unlock_contended(&next->locked); 108 arch_mcs_spin_unlock_contended(&next->locked);
109} 109}
110 110
111/*
112 * Cancellable version of the MCS lock above.
113 *
114 * Intended for adaptive spinning of sleeping locks:
115 * mutex_lock()/rwsem_down_{read,write}() etc.
116 */
117
118struct optimistic_spin_node {
119 struct optimistic_spin_node *next, *prev;
120 int locked; /* 1 if lock acquired */
121 int cpu; /* encoded CPU # value */
122};
123
124extern bool osq_lock(struct optimistic_spin_queue *lock);
125extern void osq_unlock(struct optimistic_spin_queue *lock);
126
127#endif /* __LINUX_MCS_SPINLOCK_H */ 111#endif /* __LINUX_MCS_SPINLOCK_H */
diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c
index 454195194d4a..94674e5919cb 100644
--- a/kernel/locking/mutex.c
+++ b/kernel/locking/mutex.c
@@ -81,7 +81,7 @@ __visible void __sched __mutex_lock_slowpath(atomic_t *lock_count);
81 * The mutex must later on be released by the same task that 81 * The mutex must later on be released by the same task that
82 * acquired it. Recursive locking is not allowed. The task 82 * acquired it. Recursive locking is not allowed. The task
83 * may not exit without first unlocking the mutex. Also, kernel 83 * may not exit without first unlocking the mutex. Also, kernel
84 * memory where the mutex resides mutex must not be freed with 84 * memory where the mutex resides must not be freed with
85 * the mutex still locked. The mutex must first be initialized 85 * the mutex still locked. The mutex must first be initialized
86 * (or statically defined) before it can be locked. memset()-ing 86 * (or statically defined) before it can be locked. memset()-ing
87 * the mutex to 0 is not allowed. 87 * the mutex to 0 is not allowed.
@@ -147,7 +147,7 @@ static __always_inline void ww_mutex_lock_acquired(struct ww_mutex *ww,
147} 147}
148 148
149/* 149/*
150 * after acquiring lock with fastpath or when we lost out in contested 150 * After acquiring lock with fastpath or when we lost out in contested
151 * slowpath, set ctx and wake up any waiters so they can recheck. 151 * slowpath, set ctx and wake up any waiters so they can recheck.
152 * 152 *
153 * This function is never called when CONFIG_DEBUG_LOCK_ALLOC is set, 153 * This function is never called when CONFIG_DEBUG_LOCK_ALLOC is set,
@@ -191,19 +191,32 @@ ww_mutex_set_context_fastpath(struct ww_mutex *lock,
191 spin_unlock_mutex(&lock->base.wait_lock, flags); 191 spin_unlock_mutex(&lock->base.wait_lock, flags);
192} 192}
193 193
194
195#ifdef CONFIG_MUTEX_SPIN_ON_OWNER
196/* 194/*
197 * In order to avoid a stampede of mutex spinners from acquiring the mutex 195 * After acquiring lock in the slowpath set ctx and wake up any
198 * more or less simultaneously, the spinners need to acquire a MCS lock 196 * waiters so they can recheck.
199 * first before spinning on the owner field.
200 * 197 *
198 * Callers must hold the mutex wait_lock.
201 */ 199 */
200static __always_inline void
201ww_mutex_set_context_slowpath(struct ww_mutex *lock,
202 struct ww_acquire_ctx *ctx)
203{
204 struct mutex_waiter *cur;
202 205
203/* 206 ww_mutex_lock_acquired(lock, ctx);
204 * Mutex spinning code migrated from kernel/sched/core.c 207 lock->ctx = ctx;
205 */ 208
209 /*
210 * Give any possible sleeping processes the chance to wake up,
211 * so they can recheck if they have to back off.
212 */
213 list_for_each_entry(cur, &lock->base.wait_list, list) {
214 debug_mutex_wake_waiter(&lock->base, cur);
215 wake_up_process(cur->task);
216 }
217}
206 218
219#ifdef CONFIG_MUTEX_SPIN_ON_OWNER
207static inline bool owner_running(struct mutex *lock, struct task_struct *owner) 220static inline bool owner_running(struct mutex *lock, struct task_struct *owner)
208{ 221{
209 if (lock->owner != owner) 222 if (lock->owner != owner)
@@ -307,6 +320,11 @@ static bool mutex_optimistic_spin(struct mutex *lock,
307 if (!mutex_can_spin_on_owner(lock)) 320 if (!mutex_can_spin_on_owner(lock))
308 goto done; 321 goto done;
309 322
323 /*
324 * In order to avoid a stampede of mutex spinners trying to
325 * acquire the mutex all at once, the spinners need to take a
326 * MCS (queued) lock first before spinning on the owner field.
327 */
310 if (!osq_lock(&lock->osq)) 328 if (!osq_lock(&lock->osq))
311 goto done; 329 goto done;
312 330
@@ -469,7 +487,7 @@ void __sched ww_mutex_unlock(struct ww_mutex *lock)
469EXPORT_SYMBOL(ww_mutex_unlock); 487EXPORT_SYMBOL(ww_mutex_unlock);
470 488
471static inline int __sched 489static inline int __sched
472__mutex_lock_check_stamp(struct mutex *lock, struct ww_acquire_ctx *ctx) 490__ww_mutex_lock_check_stamp(struct mutex *lock, struct ww_acquire_ctx *ctx)
473{ 491{
474 struct ww_mutex *ww = container_of(lock, struct ww_mutex, base); 492 struct ww_mutex *ww = container_of(lock, struct ww_mutex, base);
475 struct ww_acquire_ctx *hold_ctx = ACCESS_ONCE(ww->ctx); 493 struct ww_acquire_ctx *hold_ctx = ACCESS_ONCE(ww->ctx);
@@ -557,7 +575,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
557 } 575 }
558 576
559 if (use_ww_ctx && ww_ctx->acquired > 0) { 577 if (use_ww_ctx && ww_ctx->acquired > 0) {
560 ret = __mutex_lock_check_stamp(lock, ww_ctx); 578 ret = __ww_mutex_lock_check_stamp(lock, ww_ctx);
561 if (ret) 579 if (ret)
562 goto err; 580 goto err;
563 } 581 }
@@ -569,6 +587,8 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
569 schedule_preempt_disabled(); 587 schedule_preempt_disabled();
570 spin_lock_mutex(&lock->wait_lock, flags); 588 spin_lock_mutex(&lock->wait_lock, flags);
571 } 589 }
590 __set_task_state(task, TASK_RUNNING);
591
572 mutex_remove_waiter(lock, &waiter, current_thread_info()); 592 mutex_remove_waiter(lock, &waiter, current_thread_info());
573 /* set it to 0 if there are no waiters left: */ 593 /* set it to 0 if there are no waiters left: */
574 if (likely(list_empty(&lock->wait_list))) 594 if (likely(list_empty(&lock->wait_list)))
@@ -582,23 +602,7 @@ skip_wait:
582 602
583 if (use_ww_ctx) { 603 if (use_ww_ctx) {
584 struct ww_mutex *ww = container_of(lock, struct ww_mutex, base); 604 struct ww_mutex *ww = container_of(lock, struct ww_mutex, base);
585 struct mutex_waiter *cur; 605 ww_mutex_set_context_slowpath(ww, ww_ctx);
586
587 /*
588 * This branch gets optimized out for the common case,
589 * and is only important for ww_mutex_lock.
590 */
591 ww_mutex_lock_acquired(ww, ww_ctx);
592 ww->ctx = ww_ctx;
593
594 /*
595 * Give any possible sleeping processes the chance to wake up,
596 * so they can recheck if they have to back off.
597 */
598 list_for_each_entry(cur, &lock->wait_list, list) {
599 debug_mutex_wake_waiter(lock, cur);
600 wake_up_process(cur->task);
601 }
602 } 606 }
603 607
604 spin_unlock_mutex(&lock->wait_lock, flags); 608 spin_unlock_mutex(&lock->wait_lock, flags);
diff --git a/kernel/locking/mcs_spinlock.c b/kernel/locking/osq_lock.c
index 9887a905a762..c112d00341b0 100644
--- a/kernel/locking/mcs_spinlock.c
+++ b/kernel/locking/osq_lock.c
@@ -1,8 +1,6 @@
1#include <linux/percpu.h> 1#include <linux/percpu.h>
2#include <linux/sched.h> 2#include <linux/sched.h>
3#include "mcs_spinlock.h" 3#include <linux/osq_lock.h>
4
5#ifdef CONFIG_SMP
6 4
7/* 5/*
8 * An MCS like lock especially tailored for optimistic spinning for sleeping 6 * An MCS like lock especially tailored for optimistic spinning for sleeping
@@ -111,7 +109,7 @@ bool osq_lock(struct optimistic_spin_queue *lock)
111 * cmpxchg in an attempt to undo our queueing. 109 * cmpxchg in an attempt to undo our queueing.
112 */ 110 */
113 111
114 while (!smp_load_acquire(&node->locked)) { 112 while (!ACCESS_ONCE(node->locked)) {
115 /* 113 /*
116 * If we need to reschedule bail... so we can block. 114 * If we need to reschedule bail... so we can block.
117 */ 115 */
@@ -203,6 +201,3 @@ void osq_unlock(struct optimistic_spin_queue *lock)
203 if (next) 201 if (next)
204 ACCESS_ONCE(next->locked) = 1; 202 ACCESS_ONCE(next->locked) = 1;
205} 203}
206
207#endif
208
diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c
index 7c98873a3077..3059bc2f022d 100644
--- a/kernel/locking/rtmutex.c
+++ b/kernel/locking/rtmutex.c
@@ -1130,6 +1130,7 @@ __rt_mutex_slowlock(struct rt_mutex *lock, int state,
1130 set_current_state(state); 1130 set_current_state(state);
1131 } 1131 }
1132 1132
1133 __set_current_state(TASK_RUNNING);
1133 return ret; 1134 return ret;
1134} 1135}
1135 1136
@@ -1188,10 +1189,9 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state,
1188 ret = task_blocks_on_rt_mutex(lock, &waiter, current, chwalk); 1189 ret = task_blocks_on_rt_mutex(lock, &waiter, current, chwalk);
1189 1190
1190 if (likely(!ret)) 1191 if (likely(!ret))
1192 /* sleep on the mutex */
1191 ret = __rt_mutex_slowlock(lock, state, timeout, &waiter); 1193 ret = __rt_mutex_slowlock(lock, state, timeout, &waiter);
1192 1194
1193 set_current_state(TASK_RUNNING);
1194
1195 if (unlikely(ret)) { 1195 if (unlikely(ret)) {
1196 remove_waiter(lock, &waiter); 1196 remove_waiter(lock, &waiter);
1197 rt_mutex_handle_deadlock(ret, chwalk, &waiter); 1197 rt_mutex_handle_deadlock(ret, chwalk, &waiter);
@@ -1626,10 +1626,9 @@ int rt_mutex_finish_proxy_lock(struct rt_mutex *lock,
1626 1626
1627 set_current_state(TASK_INTERRUPTIBLE); 1627 set_current_state(TASK_INTERRUPTIBLE);
1628 1628
1629 /* sleep on the mutex */
1629 ret = __rt_mutex_slowlock(lock, TASK_INTERRUPTIBLE, to, waiter); 1630 ret = __rt_mutex_slowlock(lock, TASK_INTERRUPTIBLE, to, waiter);
1630 1631
1631 set_current_state(TASK_RUNNING);
1632
1633 if (unlikely(ret)) 1632 if (unlikely(ret))
1634 remove_waiter(lock, waiter); 1633 remove_waiter(lock, waiter);
1635 1634
diff --git a/kernel/locking/rwsem-spinlock.c b/kernel/locking/rwsem-spinlock.c
index 2c93571162cb..2555ae15ec14 100644
--- a/kernel/locking/rwsem-spinlock.c
+++ b/kernel/locking/rwsem-spinlock.c
@@ -154,7 +154,7 @@ void __sched __down_read(struct rw_semaphore *sem)
154 set_task_state(tsk, TASK_UNINTERRUPTIBLE); 154 set_task_state(tsk, TASK_UNINTERRUPTIBLE);
155 } 155 }
156 156
157 tsk->state = TASK_RUNNING; 157 __set_task_state(tsk, TASK_RUNNING);
158 out: 158 out:
159 ; 159 ;
160} 160}
diff --git a/kernel/locking/rwsem-xadd.c b/kernel/locking/rwsem-xadd.c
index 7628c3fc37ca..2f7cc4076f50 100644
--- a/kernel/locking/rwsem-xadd.c
+++ b/kernel/locking/rwsem-xadd.c
@@ -242,8 +242,7 @@ struct rw_semaphore __sched *rwsem_down_read_failed(struct rw_semaphore *sem)
242 schedule(); 242 schedule();
243 } 243 }
244 244
245 tsk->state = TASK_RUNNING; 245 __set_task_state(tsk, TASK_RUNNING);
246
247 return sem; 246 return sem;
248} 247}
249EXPORT_SYMBOL(rwsem_down_read_failed); 248EXPORT_SYMBOL(rwsem_down_read_failed);
diff --git a/kernel/locking/spinlock.c b/kernel/locking/spinlock.c
index 4b082b5cac9e..db3ccb1dd614 100644
--- a/kernel/locking/spinlock.c
+++ b/kernel/locking/spinlock.c
@@ -363,6 +363,14 @@ void __lockfunc _raw_spin_lock_nested(raw_spinlock_t *lock, int subclass)
363} 363}
364EXPORT_SYMBOL(_raw_spin_lock_nested); 364EXPORT_SYMBOL(_raw_spin_lock_nested);
365 365
366void __lockfunc _raw_spin_lock_bh_nested(raw_spinlock_t *lock, int subclass)
367{
368 __local_bh_disable_ip(_RET_IP_, SOFTIRQ_LOCK_OFFSET);
369 spin_acquire(&lock->dep_map, subclass, 0, _RET_IP_);
370 LOCK_CONTENDED(lock, do_raw_spin_trylock, do_raw_spin_lock);
371}
372EXPORT_SYMBOL(_raw_spin_lock_bh_nested);
373
366unsigned long __lockfunc _raw_spin_lock_irqsave_nested(raw_spinlock_t *lock, 374unsigned long __lockfunc _raw_spin_lock_irqsave_nested(raw_spinlock_t *lock,
367 int subclass) 375 int subclass)
368{ 376{
diff --git a/kernel/notifier.c b/kernel/notifier.c
index 4803da6eab62..ae9fc7cc360e 100644
--- a/kernel/notifier.c
+++ b/kernel/notifier.c
@@ -402,6 +402,7 @@ int raw_notifier_call_chain(struct raw_notifier_head *nh,
402} 402}
403EXPORT_SYMBOL_GPL(raw_notifier_call_chain); 403EXPORT_SYMBOL_GPL(raw_notifier_call_chain);
404 404
405#ifdef CONFIG_SRCU
405/* 406/*
406 * SRCU notifier chain routines. Registration and unregistration 407 * SRCU notifier chain routines. Registration and unregistration
407 * use a mutex, and call_chain is synchronized by SRCU (no locks). 408 * use a mutex, and call_chain is synchronized by SRCU (no locks).
@@ -528,6 +529,8 @@ void srcu_init_notifier_head(struct srcu_notifier_head *nh)
528} 529}
529EXPORT_SYMBOL_GPL(srcu_init_notifier_head); 530EXPORT_SYMBOL_GPL(srcu_init_notifier_head);
530 531
532#endif /* CONFIG_SRCU */
533
531static ATOMIC_NOTIFIER_HEAD(die_chain); 534static ATOMIC_NOTIFIER_HEAD(die_chain);
532 535
533int notrace notify_die(enum die_val val, const char *str, 536int notrace notify_die(enum die_val val, const char *str,
diff --git a/kernel/panic.c b/kernel/panic.c
index 4d8d6f906dec..8136ad76e5fd 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -226,6 +226,7 @@ static const struct tnt tnts[] = {
226 { TAINT_OOT_MODULE, 'O', ' ' }, 226 { TAINT_OOT_MODULE, 'O', ' ' },
227 { TAINT_UNSIGNED_MODULE, 'E', ' ' }, 227 { TAINT_UNSIGNED_MODULE, 'E', ' ' },
228 { TAINT_SOFTLOCKUP, 'L', ' ' }, 228 { TAINT_SOFTLOCKUP, 'L', ' ' },
229 { TAINT_LIVEPATCH, 'K', ' ' },
229}; 230};
230 231
231/** 232/**
@@ -246,6 +247,7 @@ static const struct tnt tnts[] = {
246 * 'O' - Out-of-tree module has been loaded. 247 * 'O' - Out-of-tree module has been loaded.
247 * 'E' - Unsigned module has been loaded. 248 * 'E' - Unsigned module has been loaded.
248 * 'L' - A soft lockup has previously occurred. 249 * 'L' - A soft lockup has previously occurred.
250 * 'K' - Kernel has been live patched.
249 * 251 *
250 * The string is overwritten by the next call to print_tainted(). 252 * The string is overwritten by the next call to print_tainted().
251 */ 253 */
diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig
index 48b28d387c7f..7e01f78f0417 100644
--- a/kernel/power/Kconfig
+++ b/kernel/power/Kconfig
@@ -251,6 +251,7 @@ config APM_EMULATION
251 251
252config PM_OPP 252config PM_OPP
253 bool 253 bool
254 select SRCU
254 ---help--- 255 ---help---
255 SOCs have a standard set of tuples consisting of frequency and 256 SOCs have a standard set of tuples consisting of frequency and
256 voltage pairs that the device will support per voltage domain. This 257 voltage pairs that the device will support per voltage domain. This
diff --git a/kernel/power/process.c b/kernel/power/process.c
index 5a6ec8678b9a..564f786df470 100644
--- a/kernel/power/process.c
+++ b/kernel/power/process.c
@@ -84,8 +84,8 @@ static int try_to_freeze_tasks(bool user_only)
84 elapsed_msecs = elapsed_msecs64; 84 elapsed_msecs = elapsed_msecs64;
85 85
86 if (todo) { 86 if (todo) {
87 printk("\n"); 87 pr_cont("\n");
88 printk(KERN_ERR "Freezing of tasks %s after %d.%03d seconds " 88 pr_err("Freezing of tasks %s after %d.%03d seconds "
89 "(%d tasks refusing to freeze, wq_busy=%d):\n", 89 "(%d tasks refusing to freeze, wq_busy=%d):\n",
90 wakeup ? "aborted" : "failed", 90 wakeup ? "aborted" : "failed",
91 elapsed_msecs / 1000, elapsed_msecs % 1000, 91 elapsed_msecs / 1000, elapsed_msecs % 1000,
@@ -101,37 +101,13 @@ static int try_to_freeze_tasks(bool user_only)
101 read_unlock(&tasklist_lock); 101 read_unlock(&tasklist_lock);
102 } 102 }
103 } else { 103 } else {
104 printk("(elapsed %d.%03d seconds) ", elapsed_msecs / 1000, 104 pr_cont("(elapsed %d.%03d seconds) ", elapsed_msecs / 1000,
105 elapsed_msecs % 1000); 105 elapsed_msecs % 1000);
106 } 106 }
107 107
108 return todo ? -EBUSY : 0; 108 return todo ? -EBUSY : 0;
109} 109}
110 110
111static bool __check_frozen_processes(void)
112{
113 struct task_struct *g, *p;
114
115 for_each_process_thread(g, p)
116 if (p != current && !freezer_should_skip(p) && !frozen(p))
117 return false;
118
119 return true;
120}
121
122/*
123 * Returns true if all freezable tasks (except for current) are frozen already
124 */
125static bool check_frozen_processes(void)
126{
127 bool ret;
128
129 read_lock(&tasklist_lock);
130 ret = __check_frozen_processes();
131 read_unlock(&tasklist_lock);
132 return ret;
133}
134
135/** 111/**
136 * freeze_processes - Signal user space processes to enter the refrigerator. 112 * freeze_processes - Signal user space processes to enter the refrigerator.
137 * The current thread will not be frozen. The same process that calls 113 * The current thread will not be frozen. The same process that calls
@@ -142,7 +118,6 @@ static bool check_frozen_processes(void)
142int freeze_processes(void) 118int freeze_processes(void)
143{ 119{
144 int error; 120 int error;
145 int oom_kills_saved;
146 121
147 error = __usermodehelper_disable(UMH_FREEZING); 122 error = __usermodehelper_disable(UMH_FREEZING);
148 if (error) 123 if (error)
@@ -155,31 +130,24 @@ int freeze_processes(void)
155 atomic_inc(&system_freezing_cnt); 130 atomic_inc(&system_freezing_cnt);
156 131
157 pm_wakeup_clear(); 132 pm_wakeup_clear();
158 printk("Freezing user space processes ... "); 133 pr_info("Freezing user space processes ... ");
159 pm_freezing = true; 134 pm_freezing = true;
160 oom_kills_saved = oom_kills_count();
161 error = try_to_freeze_tasks(true); 135 error = try_to_freeze_tasks(true);
162 if (!error) { 136 if (!error) {
163 __usermodehelper_set_disable_depth(UMH_DISABLED); 137 __usermodehelper_set_disable_depth(UMH_DISABLED);
164 oom_killer_disable(); 138 pr_cont("done.");
165
166 /*
167 * There might have been an OOM kill while we were
168 * freezing tasks and the killed task might be still
169 * on the way out so we have to double check for race.
170 */
171 if (oom_kills_count() != oom_kills_saved &&
172 !check_frozen_processes()) {
173 __usermodehelper_set_disable_depth(UMH_ENABLED);
174 printk("OOM in progress.");
175 error = -EBUSY;
176 } else {
177 printk("done.");
178 }
179 } 139 }
180 printk("\n"); 140 pr_cont("\n");
181 BUG_ON(in_atomic()); 141 BUG_ON(in_atomic());
182 142
143 /*
144 * Now that the whole userspace is frozen we need to disbale
145 * the OOM killer to disallow any further interference with
146 * killable tasks.
147 */
148 if (!error && !oom_killer_disable())
149 error = -EBUSY;
150
183 if (error) 151 if (error)
184 thaw_processes(); 152 thaw_processes();
185 return error; 153 return error;
@@ -197,13 +165,14 @@ int freeze_kernel_threads(void)
197{ 165{
198 int error; 166 int error;
199 167
200 printk("Freezing remaining freezable tasks ... "); 168 pr_info("Freezing remaining freezable tasks ... ");
169
201 pm_nosig_freezing = true; 170 pm_nosig_freezing = true;
202 error = try_to_freeze_tasks(false); 171 error = try_to_freeze_tasks(false);
203 if (!error) 172 if (!error)
204 printk("done."); 173 pr_cont("done.");
205 174
206 printk("\n"); 175 pr_cont("\n");
207 BUG_ON(in_atomic()); 176 BUG_ON(in_atomic());
208 177
209 if (error) 178 if (error)
@@ -224,7 +193,7 @@ void thaw_processes(void)
224 193
225 oom_killer_enable(); 194 oom_killer_enable();
226 195
227 printk("Restarting tasks ... "); 196 pr_info("Restarting tasks ... ");
228 197
229 __usermodehelper_set_disable_depth(UMH_FREEZING); 198 __usermodehelper_set_disable_depth(UMH_FREEZING);
230 thaw_workqueues(); 199 thaw_workqueues();
@@ -243,7 +212,7 @@ void thaw_processes(void)
243 usermodehelper_enable(); 212 usermodehelper_enable();
244 213
245 schedule(); 214 schedule();
246 printk("done.\n"); 215 pr_cont("done.\n");
247 trace_suspend_resume(TPS("thaw_processes"), 0, false); 216 trace_suspend_resume(TPS("thaw_processes"), 0, false);
248} 217}
249 218
@@ -252,7 +221,7 @@ void thaw_kernel_threads(void)
252 struct task_struct *g, *p; 221 struct task_struct *g, *p;
253 222
254 pm_nosig_freezing = false; 223 pm_nosig_freezing = false;
255 printk("Restarting kernel threads ... "); 224 pr_info("Restarting kernel threads ... ");
256 225
257 thaw_workqueues(); 226 thaw_workqueues();
258 227
@@ -264,5 +233,5 @@ void thaw_kernel_threads(void)
264 read_unlock(&tasklist_lock); 233 read_unlock(&tasklist_lock);
265 234
266 schedule(); 235 schedule();
267 printk("done.\n"); 236 pr_cont("done.\n");
268} 237}
diff --git a/kernel/power/qos.c b/kernel/power/qos.c
index 5f4c006c4b1e..97b0df71303e 100644
--- a/kernel/power/qos.c
+++ b/kernel/power/qos.c
@@ -41,6 +41,8 @@
41#include <linux/platform_device.h> 41#include <linux/platform_device.h>
42#include <linux/init.h> 42#include <linux/init.h>
43#include <linux/kernel.h> 43#include <linux/kernel.h>
44#include <linux/debugfs.h>
45#include <linux/seq_file.h>
44 46
45#include <linux/uaccess.h> 47#include <linux/uaccess.h>
46#include <linux/export.h> 48#include <linux/export.h>
@@ -182,6 +184,81 @@ static inline void pm_qos_set_value(struct pm_qos_constraints *c, s32 value)
182 c->target_value = value; 184 c->target_value = value;
183} 185}
184 186
187static inline int pm_qos_get_value(struct pm_qos_constraints *c);
188static int pm_qos_dbg_show_requests(struct seq_file *s, void *unused)
189{
190 struct pm_qos_object *qos = (struct pm_qos_object *)s->private;
191 struct pm_qos_constraints *c;
192 struct pm_qos_request *req;
193 char *type;
194 unsigned long flags;
195 int tot_reqs = 0;
196 int active_reqs = 0;
197
198 if (IS_ERR_OR_NULL(qos)) {
199 pr_err("%s: bad qos param!\n", __func__);
200 return -EINVAL;
201 }
202 c = qos->constraints;
203 if (IS_ERR_OR_NULL(c)) {
204 pr_err("%s: Bad constraints on qos?\n", __func__);
205 return -EINVAL;
206 }
207
208 /* Lock to ensure we have a snapshot */
209 spin_lock_irqsave(&pm_qos_lock, flags);
210 if (plist_head_empty(&c->list)) {
211 seq_puts(s, "Empty!\n");
212 goto out;
213 }
214
215 switch (c->type) {
216 case PM_QOS_MIN:
217 type = "Minimum";
218 break;
219 case PM_QOS_MAX:
220 type = "Maximum";
221 break;
222 case PM_QOS_SUM:
223 type = "Sum";
224 break;
225 default:
226 type = "Unknown";
227 }
228
229 plist_for_each_entry(req, &c->list, node) {
230 char *state = "Default";
231
232 if ((req->node).prio != c->default_value) {
233 active_reqs++;
234 state = "Active";
235 }
236 tot_reqs++;
237 seq_printf(s, "%d: %d: %s\n", tot_reqs,
238 (req->node).prio, state);
239 }
240
241 seq_printf(s, "Type=%s, Value=%d, Requests: active=%d / total=%d\n",
242 type, pm_qos_get_value(c), active_reqs, tot_reqs);
243
244out:
245 spin_unlock_irqrestore(&pm_qos_lock, flags);
246 return 0;
247}
248
249static int pm_qos_dbg_open(struct inode *inode, struct file *file)
250{
251 return single_open(file, pm_qos_dbg_show_requests,
252 inode->i_private);
253}
254
255static const struct file_operations pm_qos_debug_fops = {
256 .open = pm_qos_dbg_open,
257 .read = seq_read,
258 .llseek = seq_lseek,
259 .release = single_release,
260};
261
185/** 262/**
186 * pm_qos_update_target - manages the constraints list and calls the notifiers 263 * pm_qos_update_target - manages the constraints list and calls the notifiers
187 * if needed 264 * if needed
@@ -509,12 +586,17 @@ int pm_qos_remove_notifier(int pm_qos_class, struct notifier_block *notifier)
509EXPORT_SYMBOL_GPL(pm_qos_remove_notifier); 586EXPORT_SYMBOL_GPL(pm_qos_remove_notifier);
510 587
511/* User space interface to PM QoS classes via misc devices */ 588/* User space interface to PM QoS classes via misc devices */
512static int register_pm_qos_misc(struct pm_qos_object *qos) 589static int register_pm_qos_misc(struct pm_qos_object *qos, struct dentry *d)
513{ 590{
514 qos->pm_qos_power_miscdev.minor = MISC_DYNAMIC_MINOR; 591 qos->pm_qos_power_miscdev.minor = MISC_DYNAMIC_MINOR;
515 qos->pm_qos_power_miscdev.name = qos->name; 592 qos->pm_qos_power_miscdev.name = qos->name;
516 qos->pm_qos_power_miscdev.fops = &pm_qos_power_fops; 593 qos->pm_qos_power_miscdev.fops = &pm_qos_power_fops;
517 594
595 if (d) {
596 (void)debugfs_create_file(qos->name, S_IRUGO, d,
597 (void *)qos, &pm_qos_debug_fops);
598 }
599
518 return misc_register(&qos->pm_qos_power_miscdev); 600 return misc_register(&qos->pm_qos_power_miscdev);
519} 601}
520 602
@@ -608,11 +690,16 @@ static int __init pm_qos_power_init(void)
608{ 690{
609 int ret = 0; 691 int ret = 0;
610 int i; 692 int i;
693 struct dentry *d;
611 694
612 BUILD_BUG_ON(ARRAY_SIZE(pm_qos_array) != PM_QOS_NUM_CLASSES); 695 BUILD_BUG_ON(ARRAY_SIZE(pm_qos_array) != PM_QOS_NUM_CLASSES);
613 696
697 d = debugfs_create_dir("pm_qos", NULL);
698 if (IS_ERR_OR_NULL(d))
699 d = NULL;
700
614 for (i = PM_QOS_CPU_DMA_LATENCY; i < PM_QOS_NUM_CLASSES; i++) { 701 for (i = PM_QOS_CPU_DMA_LATENCY; i < PM_QOS_NUM_CLASSES; i++) {
615 ret = register_pm_qos_misc(pm_qos_array[i]); 702 ret = register_pm_qos_misc(pm_qos_array[i], d);
616 if (ret < 0) { 703 if (ret < 0) {
617 printk(KERN_ERR "pm_qos_param: %s setup failed\n", 704 printk(KERN_ERR "pm_qos_param: %s setup failed\n",
618 pm_qos_array[i]->name); 705 pm_qos_array[i]->name);
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index 0c40c16174b4..c24d5a23bf93 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -1472,9 +1472,9 @@ static inline unsigned long preallocate_highmem_fraction(unsigned long nr_pages,
1472/** 1472/**
1473 * free_unnecessary_pages - Release preallocated pages not needed for the image 1473 * free_unnecessary_pages - Release preallocated pages not needed for the image
1474 */ 1474 */
1475static void free_unnecessary_pages(void) 1475static unsigned long free_unnecessary_pages(void)
1476{ 1476{
1477 unsigned long save, to_free_normal, to_free_highmem; 1477 unsigned long save, to_free_normal, to_free_highmem, free;
1478 1478
1479 save = count_data_pages(); 1479 save = count_data_pages();
1480 if (alloc_normal >= save) { 1480 if (alloc_normal >= save) {
@@ -1495,6 +1495,7 @@ static void free_unnecessary_pages(void)
1495 else 1495 else
1496 to_free_normal = 0; 1496 to_free_normal = 0;
1497 } 1497 }
1498 free = to_free_normal + to_free_highmem;
1498 1499
1499 memory_bm_position_reset(&copy_bm); 1500 memory_bm_position_reset(&copy_bm);
1500 1501
@@ -1518,6 +1519,8 @@ static void free_unnecessary_pages(void)
1518 swsusp_unset_page_free(page); 1519 swsusp_unset_page_free(page);
1519 __free_page(page); 1520 __free_page(page);
1520 } 1521 }
1522
1523 return free;
1521} 1524}
1522 1525
1523/** 1526/**
@@ -1707,7 +1710,7 @@ int hibernate_preallocate_memory(void)
1707 * pages in memory, but we have allocated more. Release the excessive 1710 * pages in memory, but we have allocated more. Release the excessive
1708 * ones now. 1711 * ones now.
1709 */ 1712 */
1710 free_unnecessary_pages(); 1713 pages -= free_unnecessary_pages();
1711 1714
1712 out: 1715 out:
1713 stop = ktime_get(); 1716 stop = ktime_get();
@@ -2310,8 +2313,6 @@ static inline void free_highmem_data(void)
2310 free_image_page(buffer, PG_UNSAFE_CLEAR); 2313 free_image_page(buffer, PG_UNSAFE_CLEAR);
2311} 2314}
2312#else 2315#else
2313static inline int get_safe_write_buffer(void) { return 0; }
2314
2315static unsigned int 2316static unsigned int
2316count_highmem_image_pages(struct memory_bitmap *bm) { return 0; } 2317count_highmem_image_pages(struct memory_bitmap *bm) { return 0; }
2317 2318
diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index 02d6b6d28796..c06df7de0963 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -935,8 +935,8 @@ static int __init ignore_loglevel_setup(char *str)
935 935
936early_param("ignore_loglevel", ignore_loglevel_setup); 936early_param("ignore_loglevel", ignore_loglevel_setup);
937module_param(ignore_loglevel, bool, S_IRUGO | S_IWUSR); 937module_param(ignore_loglevel, bool, S_IRUGO | S_IWUSR);
938MODULE_PARM_DESC(ignore_loglevel, "ignore loglevel setting, to" 938MODULE_PARM_DESC(ignore_loglevel,
939 "print all kernel messages to the console."); 939 "ignore loglevel setting (prints all kernel messages to the console)");
940 940
941#ifdef CONFIG_BOOT_PRINTK_DELAY 941#ifdef CONFIG_BOOT_PRINTK_DELAY
942 942
@@ -1419,16 +1419,16 @@ static void call_console_drivers(int level, const char *text, size_t len)
1419} 1419}
1420 1420
1421/* 1421/*
1422 * Zap console related locks when oopsing. Only zap at most once 1422 * Zap console related locks when oopsing.
1423 * every 10 seconds, to leave time for slow consoles to print a 1423 * To leave time for slow consoles to print a full oops,
1424 * full oops. 1424 * only zap at most once every 30 seconds.
1425 */ 1425 */
1426static void zap_locks(void) 1426static void zap_locks(void)
1427{ 1427{
1428 static unsigned long oops_timestamp; 1428 static unsigned long oops_timestamp;
1429 1429
1430 if (time_after_eq(jiffies, oops_timestamp) && 1430 if (time_after_eq(jiffies, oops_timestamp) &&
1431 !time_after(jiffies, oops_timestamp + 30 * HZ)) 1431 !time_after(jiffies, oops_timestamp + 30 * HZ))
1432 return; 1432 return;
1433 1433
1434 oops_timestamp = jiffies; 1434 oops_timestamp = jiffies;
diff --git a/kernel/rcu/Makefile b/kernel/rcu/Makefile
index e6fae503d1bc..50a808424b06 100644
--- a/kernel/rcu/Makefile
+++ b/kernel/rcu/Makefile
@@ -1,4 +1,5 @@
1obj-y += update.o srcu.o 1obj-y += update.o
2obj-$(CONFIG_SRCU) += srcu.o
2obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o 3obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o
3obj-$(CONFIG_TREE_RCU) += tree.o 4obj-$(CONFIG_TREE_RCU) += tree.o
4obj-$(CONFIG_PREEMPT_RCU) += tree.o 5obj-$(CONFIG_PREEMPT_RCU) += tree.o
diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h
index 07bb02eda844..80adef7d4c3d 100644
--- a/kernel/rcu/rcu.h
+++ b/kernel/rcu/rcu.h
@@ -137,4 +137,10 @@ int rcu_jiffies_till_stall_check(void);
137 137
138void rcu_early_boot_tests(void); 138void rcu_early_boot_tests(void);
139 139
140/*
141 * This function really isn't for public consumption, but RCU is special in
142 * that context switches can allow the state machine to make progress.
143 */
144extern void resched_cpu(int cpu);
145
140#endif /* __LINUX_RCU_H */ 146#endif /* __LINUX_RCU_H */
diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c
index 4d559baf06e0..30d42aa55d83 100644
--- a/kernel/rcu/rcutorture.c
+++ b/kernel/rcu/rcutorture.c
@@ -244,7 +244,8 @@ struct rcu_torture_ops {
244 int (*readlock)(void); 244 int (*readlock)(void);
245 void (*read_delay)(struct torture_random_state *rrsp); 245 void (*read_delay)(struct torture_random_state *rrsp);
246 void (*readunlock)(int idx); 246 void (*readunlock)(int idx);
247 int (*completed)(void); 247 unsigned long (*started)(void);
248 unsigned long (*completed)(void);
248 void (*deferred_free)(struct rcu_torture *p); 249 void (*deferred_free)(struct rcu_torture *p);
249 void (*sync)(void); 250 void (*sync)(void);
250 void (*exp_sync)(void); 251 void (*exp_sync)(void);
@@ -296,11 +297,6 @@ static void rcu_torture_read_unlock(int idx) __releases(RCU)
296 rcu_read_unlock(); 297 rcu_read_unlock();
297} 298}
298 299
299static int rcu_torture_completed(void)
300{
301 return rcu_batches_completed();
302}
303
304/* 300/*
305 * Update callback in the pipe. This should be invoked after a grace period. 301 * Update callback in the pipe. This should be invoked after a grace period.
306 */ 302 */
@@ -356,7 +352,7 @@ rcu_torture_cb(struct rcu_head *p)
356 cur_ops->deferred_free(rp); 352 cur_ops->deferred_free(rp);
357} 353}
358 354
359static int rcu_no_completed(void) 355static unsigned long rcu_no_completed(void)
360{ 356{
361 return 0; 357 return 0;
362} 358}
@@ -377,7 +373,8 @@ static struct rcu_torture_ops rcu_ops = {
377 .readlock = rcu_torture_read_lock, 373 .readlock = rcu_torture_read_lock,
378 .read_delay = rcu_read_delay, 374 .read_delay = rcu_read_delay,
379 .readunlock = rcu_torture_read_unlock, 375 .readunlock = rcu_torture_read_unlock,
380 .completed = rcu_torture_completed, 376 .started = rcu_batches_started,
377 .completed = rcu_batches_completed,
381 .deferred_free = rcu_torture_deferred_free, 378 .deferred_free = rcu_torture_deferred_free,
382 .sync = synchronize_rcu, 379 .sync = synchronize_rcu,
383 .exp_sync = synchronize_rcu_expedited, 380 .exp_sync = synchronize_rcu_expedited,
@@ -407,11 +404,6 @@ static void rcu_bh_torture_read_unlock(int idx) __releases(RCU_BH)
407 rcu_read_unlock_bh(); 404 rcu_read_unlock_bh();
408} 405}
409 406
410static int rcu_bh_torture_completed(void)
411{
412 return rcu_batches_completed_bh();
413}
414
415static void rcu_bh_torture_deferred_free(struct rcu_torture *p) 407static void rcu_bh_torture_deferred_free(struct rcu_torture *p)
416{ 408{
417 call_rcu_bh(&p->rtort_rcu, rcu_torture_cb); 409 call_rcu_bh(&p->rtort_rcu, rcu_torture_cb);
@@ -423,7 +415,8 @@ static struct rcu_torture_ops rcu_bh_ops = {
423 .readlock = rcu_bh_torture_read_lock, 415 .readlock = rcu_bh_torture_read_lock,
424 .read_delay = rcu_read_delay, /* just reuse rcu's version. */ 416 .read_delay = rcu_read_delay, /* just reuse rcu's version. */
425 .readunlock = rcu_bh_torture_read_unlock, 417 .readunlock = rcu_bh_torture_read_unlock,
426 .completed = rcu_bh_torture_completed, 418 .started = rcu_batches_started_bh,
419 .completed = rcu_batches_completed_bh,
427 .deferred_free = rcu_bh_torture_deferred_free, 420 .deferred_free = rcu_bh_torture_deferred_free,
428 .sync = synchronize_rcu_bh, 421 .sync = synchronize_rcu_bh,
429 .exp_sync = synchronize_rcu_bh_expedited, 422 .exp_sync = synchronize_rcu_bh_expedited,
@@ -466,6 +459,7 @@ static struct rcu_torture_ops rcu_busted_ops = {
466 .readlock = rcu_torture_read_lock, 459 .readlock = rcu_torture_read_lock,
467 .read_delay = rcu_read_delay, /* just reuse rcu's version. */ 460 .read_delay = rcu_read_delay, /* just reuse rcu's version. */
468 .readunlock = rcu_torture_read_unlock, 461 .readunlock = rcu_torture_read_unlock,
462 .started = rcu_no_completed,
469 .completed = rcu_no_completed, 463 .completed = rcu_no_completed,
470 .deferred_free = rcu_busted_torture_deferred_free, 464 .deferred_free = rcu_busted_torture_deferred_free,
471 .sync = synchronize_rcu_busted, 465 .sync = synchronize_rcu_busted,
@@ -510,7 +504,7 @@ static void srcu_torture_read_unlock(int idx) __releases(&srcu_ctl)
510 srcu_read_unlock(&srcu_ctl, idx); 504 srcu_read_unlock(&srcu_ctl, idx);
511} 505}
512 506
513static int srcu_torture_completed(void) 507static unsigned long srcu_torture_completed(void)
514{ 508{
515 return srcu_batches_completed(&srcu_ctl); 509 return srcu_batches_completed(&srcu_ctl);
516} 510}
@@ -564,6 +558,7 @@ static struct rcu_torture_ops srcu_ops = {
564 .readlock = srcu_torture_read_lock, 558 .readlock = srcu_torture_read_lock,
565 .read_delay = srcu_read_delay, 559 .read_delay = srcu_read_delay,
566 .readunlock = srcu_torture_read_unlock, 560 .readunlock = srcu_torture_read_unlock,
561 .started = NULL,
567 .completed = srcu_torture_completed, 562 .completed = srcu_torture_completed,
568 .deferred_free = srcu_torture_deferred_free, 563 .deferred_free = srcu_torture_deferred_free,
569 .sync = srcu_torture_synchronize, 564 .sync = srcu_torture_synchronize,
@@ -600,7 +595,8 @@ static struct rcu_torture_ops sched_ops = {
600 .readlock = sched_torture_read_lock, 595 .readlock = sched_torture_read_lock,
601 .read_delay = rcu_read_delay, /* just reuse rcu's version. */ 596 .read_delay = rcu_read_delay, /* just reuse rcu's version. */
602 .readunlock = sched_torture_read_unlock, 597 .readunlock = sched_torture_read_unlock,
603 .completed = rcu_no_completed, 598 .started = rcu_batches_started_sched,
599 .completed = rcu_batches_completed_sched,
604 .deferred_free = rcu_sched_torture_deferred_free, 600 .deferred_free = rcu_sched_torture_deferred_free,
605 .sync = synchronize_sched, 601 .sync = synchronize_sched,
606 .exp_sync = synchronize_sched_expedited, 602 .exp_sync = synchronize_sched_expedited,
@@ -638,6 +634,7 @@ static struct rcu_torture_ops tasks_ops = {
638 .readlock = tasks_torture_read_lock, 634 .readlock = tasks_torture_read_lock,
639 .read_delay = rcu_read_delay, /* just reuse rcu's version. */ 635 .read_delay = rcu_read_delay, /* just reuse rcu's version. */
640 .readunlock = tasks_torture_read_unlock, 636 .readunlock = tasks_torture_read_unlock,
637 .started = rcu_no_completed,
641 .completed = rcu_no_completed, 638 .completed = rcu_no_completed,
642 .deferred_free = rcu_tasks_torture_deferred_free, 639 .deferred_free = rcu_tasks_torture_deferred_free,
643 .sync = synchronize_rcu_tasks, 640 .sync = synchronize_rcu_tasks,
@@ -1015,8 +1012,8 @@ static void rcutorture_trace_dump(void)
1015static void rcu_torture_timer(unsigned long unused) 1012static void rcu_torture_timer(unsigned long unused)
1016{ 1013{
1017 int idx; 1014 int idx;
1018 int completed; 1015 unsigned long started;
1019 int completed_end; 1016 unsigned long completed;
1020 static DEFINE_TORTURE_RANDOM(rand); 1017 static DEFINE_TORTURE_RANDOM(rand);
1021 static DEFINE_SPINLOCK(rand_lock); 1018 static DEFINE_SPINLOCK(rand_lock);
1022 struct rcu_torture *p; 1019 struct rcu_torture *p;
@@ -1024,7 +1021,10 @@ static void rcu_torture_timer(unsigned long unused)
1024 unsigned long long ts; 1021 unsigned long long ts;
1025 1022
1026 idx = cur_ops->readlock(); 1023 idx = cur_ops->readlock();
1027 completed = cur_ops->completed(); 1024 if (cur_ops->started)
1025 started = cur_ops->started();
1026 else
1027 started = cur_ops->completed();
1028 ts = rcu_trace_clock_local(); 1028 ts = rcu_trace_clock_local();
1029 p = rcu_dereference_check(rcu_torture_current, 1029 p = rcu_dereference_check(rcu_torture_current,
1030 rcu_read_lock_bh_held() || 1030 rcu_read_lock_bh_held() ||
@@ -1047,14 +1047,16 @@ static void rcu_torture_timer(unsigned long unused)
1047 /* Should not happen, but... */ 1047 /* Should not happen, but... */
1048 pipe_count = RCU_TORTURE_PIPE_LEN; 1048 pipe_count = RCU_TORTURE_PIPE_LEN;
1049 } 1049 }
1050 completed_end = cur_ops->completed(); 1050 completed = cur_ops->completed();
1051 if (pipe_count > 1) { 1051 if (pipe_count > 1) {
1052 do_trace_rcu_torture_read(cur_ops->name, &p->rtort_rcu, ts, 1052 do_trace_rcu_torture_read(cur_ops->name, &p->rtort_rcu, ts,
1053 completed, completed_end); 1053 started, completed);
1054 rcutorture_trace_dump(); 1054 rcutorture_trace_dump();
1055 } 1055 }
1056 __this_cpu_inc(rcu_torture_count[pipe_count]); 1056 __this_cpu_inc(rcu_torture_count[pipe_count]);
1057 completed = completed_end - completed; 1057 completed = completed - started;
1058 if (cur_ops->started)
1059 completed++;
1058 if (completed > RCU_TORTURE_PIPE_LEN) { 1060 if (completed > RCU_TORTURE_PIPE_LEN) {
1059 /* Should not happen, but... */ 1061 /* Should not happen, but... */
1060 completed = RCU_TORTURE_PIPE_LEN; 1062 completed = RCU_TORTURE_PIPE_LEN;
@@ -1073,8 +1075,8 @@ static void rcu_torture_timer(unsigned long unused)
1073static int 1075static int
1074rcu_torture_reader(void *arg) 1076rcu_torture_reader(void *arg)
1075{ 1077{
1076 int completed; 1078 unsigned long started;
1077 int completed_end; 1079 unsigned long completed;
1078 int idx; 1080 int idx;
1079 DEFINE_TORTURE_RANDOM(rand); 1081 DEFINE_TORTURE_RANDOM(rand);
1080 struct rcu_torture *p; 1082 struct rcu_torture *p;
@@ -1093,7 +1095,10 @@ rcu_torture_reader(void *arg)
1093 mod_timer(&t, jiffies + 1); 1095 mod_timer(&t, jiffies + 1);
1094 } 1096 }
1095 idx = cur_ops->readlock(); 1097 idx = cur_ops->readlock();
1096 completed = cur_ops->completed(); 1098 if (cur_ops->started)
1099 started = cur_ops->started();
1100 else
1101 started = cur_ops->completed();
1097 ts = rcu_trace_clock_local(); 1102 ts = rcu_trace_clock_local();
1098 p = rcu_dereference_check(rcu_torture_current, 1103 p = rcu_dereference_check(rcu_torture_current,
1099 rcu_read_lock_bh_held() || 1104 rcu_read_lock_bh_held() ||
@@ -1114,14 +1119,16 @@ rcu_torture_reader(void *arg)
1114 /* Should not happen, but... */ 1119 /* Should not happen, but... */
1115 pipe_count = RCU_TORTURE_PIPE_LEN; 1120 pipe_count = RCU_TORTURE_PIPE_LEN;
1116 } 1121 }
1117 completed_end = cur_ops->completed(); 1122 completed = cur_ops->completed();
1118 if (pipe_count > 1) { 1123 if (pipe_count > 1) {
1119 do_trace_rcu_torture_read(cur_ops->name, &p->rtort_rcu, 1124 do_trace_rcu_torture_read(cur_ops->name, &p->rtort_rcu,
1120 ts, completed, completed_end); 1125 ts, started, completed);
1121 rcutorture_trace_dump(); 1126 rcutorture_trace_dump();
1122 } 1127 }
1123 __this_cpu_inc(rcu_torture_count[pipe_count]); 1128 __this_cpu_inc(rcu_torture_count[pipe_count]);
1124 completed = completed_end - completed; 1129 completed = completed - started;
1130 if (cur_ops->started)
1131 completed++;
1125 if (completed > RCU_TORTURE_PIPE_LEN) { 1132 if (completed > RCU_TORTURE_PIPE_LEN) {
1126 /* Should not happen, but... */ 1133 /* Should not happen, but... */
1127 completed = RCU_TORTURE_PIPE_LEN; 1134 completed = RCU_TORTURE_PIPE_LEN;
@@ -1420,6 +1427,9 @@ static int rcu_torture_barrier(void *arg)
1420 cur_ops->cb_barrier(); /* Implies smp_mb() for wait_event(). */ 1427 cur_ops->cb_barrier(); /* Implies smp_mb() for wait_event(). */
1421 if (atomic_read(&barrier_cbs_invoked) != n_barrier_cbs) { 1428 if (atomic_read(&barrier_cbs_invoked) != n_barrier_cbs) {
1422 n_rcu_torture_barrier_error++; 1429 n_rcu_torture_barrier_error++;
1430 pr_err("barrier_cbs_invoked = %d, n_barrier_cbs = %d\n",
1431 atomic_read(&barrier_cbs_invoked),
1432 n_barrier_cbs);
1423 WARN_ON_ONCE(1); 1433 WARN_ON_ONCE(1);
1424 } 1434 }
1425 n_barrier_successes++; 1435 n_barrier_successes++;
diff --git a/kernel/rcu/srcu.c b/kernel/rcu/srcu.c
index e037f3eb2f7b..445bf8ffe3fb 100644
--- a/kernel/rcu/srcu.c
+++ b/kernel/rcu/srcu.c
@@ -546,7 +546,7 @@ EXPORT_SYMBOL_GPL(srcu_barrier);
546 * Report the number of batches, correlated with, but not necessarily 546 * Report the number of batches, correlated with, but not necessarily
547 * precisely the same as, the number of grace periods that have elapsed. 547 * precisely the same as, the number of grace periods that have elapsed.
548 */ 548 */
549long srcu_batches_completed(struct srcu_struct *sp) 549unsigned long srcu_batches_completed(struct srcu_struct *sp)
550{ 550{
551 return sp->completed; 551 return sp->completed;
552} 552}
diff --git a/kernel/rcu/tiny.c b/kernel/rcu/tiny.c
index 0db5649f8817..cc9ceca7bde1 100644
--- a/kernel/rcu/tiny.c
+++ b/kernel/rcu/tiny.c
@@ -47,54 +47,14 @@ static void __call_rcu(struct rcu_head *head,
47 void (*func)(struct rcu_head *rcu), 47 void (*func)(struct rcu_head *rcu),
48 struct rcu_ctrlblk *rcp); 48 struct rcu_ctrlblk *rcp);
49 49
50static long long rcu_dynticks_nesting = DYNTICK_TASK_EXIT_IDLE;
51
52#include "tiny_plugin.h" 50#include "tiny_plugin.h"
53 51
54/* Common code for rcu_idle_enter() and rcu_irq_exit(), see kernel/rcu/tree.c. */
55static void rcu_idle_enter_common(long long newval)
56{
57 if (newval) {
58 RCU_TRACE(trace_rcu_dyntick(TPS("--="),
59 rcu_dynticks_nesting, newval));
60 rcu_dynticks_nesting = newval;
61 return;
62 }
63 RCU_TRACE(trace_rcu_dyntick(TPS("Start"),
64 rcu_dynticks_nesting, newval));
65 if (IS_ENABLED(CONFIG_RCU_TRACE) && !is_idle_task(current)) {
66 struct task_struct *idle __maybe_unused = idle_task(smp_processor_id());
67
68 RCU_TRACE(trace_rcu_dyntick(TPS("Entry error: not idle task"),
69 rcu_dynticks_nesting, newval));
70 ftrace_dump(DUMP_ALL);
71 WARN_ONCE(1, "Current pid: %d comm: %s / Idle pid: %d comm: %s",
72 current->pid, current->comm,
73 idle->pid, idle->comm); /* must be idle task! */
74 }
75 rcu_sched_qs(); /* implies rcu_bh_inc() */
76 barrier();
77 rcu_dynticks_nesting = newval;
78}
79
80/* 52/*
81 * Enter idle, which is an extended quiescent state if we have fully 53 * Enter idle, which is an extended quiescent state if we have fully
82 * entered that mode (i.e., if the new value of dynticks_nesting is zero). 54 * entered that mode.
83 */ 55 */
84void rcu_idle_enter(void) 56void rcu_idle_enter(void)
85{ 57{
86 unsigned long flags;
87 long long newval;
88
89 local_irq_save(flags);
90 WARN_ON_ONCE((rcu_dynticks_nesting & DYNTICK_TASK_NEST_MASK) == 0);
91 if ((rcu_dynticks_nesting & DYNTICK_TASK_NEST_MASK) ==
92 DYNTICK_TASK_NEST_VALUE)
93 newval = 0;
94 else
95 newval = rcu_dynticks_nesting - DYNTICK_TASK_NEST_VALUE;
96 rcu_idle_enter_common(newval);
97 local_irq_restore(flags);
98} 58}
99EXPORT_SYMBOL_GPL(rcu_idle_enter); 59EXPORT_SYMBOL_GPL(rcu_idle_enter);
100 60
@@ -103,55 +63,14 @@ EXPORT_SYMBOL_GPL(rcu_idle_enter);
103 */ 63 */
104void rcu_irq_exit(void) 64void rcu_irq_exit(void)
105{ 65{
106 unsigned long flags;
107 long long newval;
108
109 local_irq_save(flags);
110 newval = rcu_dynticks_nesting - 1;
111 WARN_ON_ONCE(newval < 0);
112 rcu_idle_enter_common(newval);
113 local_irq_restore(flags);
114} 66}
115EXPORT_SYMBOL_GPL(rcu_irq_exit); 67EXPORT_SYMBOL_GPL(rcu_irq_exit);
116 68
117/* Common code for rcu_idle_exit() and rcu_irq_enter(), see kernel/rcu/tree.c. */
118static void rcu_idle_exit_common(long long oldval)
119{
120 if (oldval) {
121 RCU_TRACE(trace_rcu_dyntick(TPS("++="),
122 oldval, rcu_dynticks_nesting));
123 return;
124 }
125 RCU_TRACE(trace_rcu_dyntick(TPS("End"), oldval, rcu_dynticks_nesting));
126 if (IS_ENABLED(CONFIG_RCU_TRACE) && !is_idle_task(current)) {
127 struct task_struct *idle __maybe_unused = idle_task(smp_processor_id());
128
129 RCU_TRACE(trace_rcu_dyntick(TPS("Exit error: not idle task"),
130 oldval, rcu_dynticks_nesting));
131 ftrace_dump(DUMP_ALL);
132 WARN_ONCE(1, "Current pid: %d comm: %s / Idle pid: %d comm: %s",
133 current->pid, current->comm,
134 idle->pid, idle->comm); /* must be idle task! */
135 }
136}
137
138/* 69/*
139 * Exit idle, so that we are no longer in an extended quiescent state. 70 * Exit idle, so that we are no longer in an extended quiescent state.
140 */ 71 */
141void rcu_idle_exit(void) 72void rcu_idle_exit(void)
142{ 73{
143 unsigned long flags;
144 long long oldval;
145
146 local_irq_save(flags);
147 oldval = rcu_dynticks_nesting;
148 WARN_ON_ONCE(rcu_dynticks_nesting < 0);
149 if (rcu_dynticks_nesting & DYNTICK_TASK_NEST_MASK)
150 rcu_dynticks_nesting += DYNTICK_TASK_NEST_VALUE;
151 else
152 rcu_dynticks_nesting = DYNTICK_TASK_EXIT_IDLE;
153 rcu_idle_exit_common(oldval);
154 local_irq_restore(flags);
155} 74}
156EXPORT_SYMBOL_GPL(rcu_idle_exit); 75EXPORT_SYMBOL_GPL(rcu_idle_exit);
157 76
@@ -160,15 +79,6 @@ EXPORT_SYMBOL_GPL(rcu_idle_exit);
160 */ 79 */
161void rcu_irq_enter(void) 80void rcu_irq_enter(void)
162{ 81{
163 unsigned long flags;
164 long long oldval;
165
166 local_irq_save(flags);
167 oldval = rcu_dynticks_nesting;
168 rcu_dynticks_nesting++;
169 WARN_ON_ONCE(rcu_dynticks_nesting == 0);
170 rcu_idle_exit_common(oldval);
171 local_irq_restore(flags);
172} 82}
173EXPORT_SYMBOL_GPL(rcu_irq_enter); 83EXPORT_SYMBOL_GPL(rcu_irq_enter);
174 84
@@ -179,23 +89,13 @@ EXPORT_SYMBOL_GPL(rcu_irq_enter);
179 */ 89 */
180bool notrace __rcu_is_watching(void) 90bool notrace __rcu_is_watching(void)
181{ 91{
182 return rcu_dynticks_nesting; 92 return true;
183} 93}
184EXPORT_SYMBOL(__rcu_is_watching); 94EXPORT_SYMBOL(__rcu_is_watching);
185 95
186#endif /* defined(CONFIG_DEBUG_LOCK_ALLOC) || defined(CONFIG_RCU_TRACE) */ 96#endif /* defined(CONFIG_DEBUG_LOCK_ALLOC) || defined(CONFIG_RCU_TRACE) */
187 97
188/* 98/*
189 * Test whether the current CPU was interrupted from idle. Nested
190 * interrupts don't count, we must be running at the first interrupt
191 * level.
192 */
193static int rcu_is_cpu_rrupt_from_idle(void)
194{
195 return rcu_dynticks_nesting <= 1;
196}
197
198/*
199 * Helper function for rcu_sched_qs() and rcu_bh_qs(). 99 * Helper function for rcu_sched_qs() and rcu_bh_qs().
200 * Also irqs are disabled to avoid confusion due to interrupt handlers 100 * Also irqs are disabled to avoid confusion due to interrupt handlers
201 * invoking call_rcu(). 101 * invoking call_rcu().
@@ -250,7 +150,7 @@ void rcu_bh_qs(void)
250void rcu_check_callbacks(int user) 150void rcu_check_callbacks(int user)
251{ 151{
252 RCU_TRACE(check_cpu_stalls()); 152 RCU_TRACE(check_cpu_stalls());
253 if (user || rcu_is_cpu_rrupt_from_idle()) 153 if (user)
254 rcu_sched_qs(); 154 rcu_sched_qs();
255 else if (!in_softirq()) 155 else if (!in_softirq())
256 rcu_bh_qs(); 156 rcu_bh_qs();
@@ -357,6 +257,11 @@ static void __call_rcu(struct rcu_head *head,
357 rcp->curtail = &head->next; 257 rcp->curtail = &head->next;
358 RCU_TRACE(rcp->qlen++); 258 RCU_TRACE(rcp->qlen++);
359 local_irq_restore(flags); 259 local_irq_restore(flags);
260
261 if (unlikely(is_idle_task(current))) {
262 /* force scheduling for rcu_sched_qs() */
263 resched_cpu(0);
264 }
360} 265}
361 266
362/* 267/*
@@ -383,6 +288,8 @@ EXPORT_SYMBOL_GPL(call_rcu_bh);
383void __init rcu_init(void) 288void __init rcu_init(void)
384{ 289{
385 open_softirq(RCU_SOFTIRQ, rcu_process_callbacks); 290 open_softirq(RCU_SOFTIRQ, rcu_process_callbacks);
291 RCU_TRACE(reset_cpu_stall_ticks(&rcu_sched_ctrlblk));
292 RCU_TRACE(reset_cpu_stall_ticks(&rcu_bh_ctrlblk));
386 293
387 rcu_early_boot_tests(); 294 rcu_early_boot_tests();
388} 295}
diff --git a/kernel/rcu/tiny_plugin.h b/kernel/rcu/tiny_plugin.h
index 858c56569127..f94e209a10d6 100644
--- a/kernel/rcu/tiny_plugin.h
+++ b/kernel/rcu/tiny_plugin.h
@@ -145,17 +145,16 @@ static void check_cpu_stall(struct rcu_ctrlblk *rcp)
145 rcp->ticks_this_gp++; 145 rcp->ticks_this_gp++;
146 j = jiffies; 146 j = jiffies;
147 js = ACCESS_ONCE(rcp->jiffies_stall); 147 js = ACCESS_ONCE(rcp->jiffies_stall);
148 if (*rcp->curtail && ULONG_CMP_GE(j, js)) { 148 if (rcp->rcucblist && ULONG_CMP_GE(j, js)) {
149 pr_err("INFO: %s stall on CPU (%lu ticks this GP) idle=%llx (t=%lu jiffies q=%ld)\n", 149 pr_err("INFO: %s stall on CPU (%lu ticks this GP) idle=%llx (t=%lu jiffies q=%ld)\n",
150 rcp->name, rcp->ticks_this_gp, rcu_dynticks_nesting, 150 rcp->name, rcp->ticks_this_gp, DYNTICK_TASK_EXIT_IDLE,
151 jiffies - rcp->gp_start, rcp->qlen); 151 jiffies - rcp->gp_start, rcp->qlen);
152 dump_stack(); 152 dump_stack();
153 }
154 if (*rcp->curtail && ULONG_CMP_GE(j, js))
155 ACCESS_ONCE(rcp->jiffies_stall) = jiffies + 153 ACCESS_ONCE(rcp->jiffies_stall) = jiffies +
156 3 * rcu_jiffies_till_stall_check() + 3; 154 3 * rcu_jiffies_till_stall_check() + 3;
157 else if (ULONG_CMP_GE(j, js)) 155 } else if (ULONG_CMP_GE(j, js)) {
158 ACCESS_ONCE(rcp->jiffies_stall) = jiffies + rcu_jiffies_till_stall_check(); 156 ACCESS_ONCE(rcp->jiffies_stall) = jiffies + rcu_jiffies_till_stall_check();
157 }
159} 158}
160 159
161static void reset_cpu_stall_ticks(struct rcu_ctrlblk *rcp) 160static void reset_cpu_stall_ticks(struct rcu_ctrlblk *rcp)
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 7680fc275036..48d640ca1a05 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -156,6 +156,10 @@ static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu)
156static void invoke_rcu_core(void); 156static void invoke_rcu_core(void);
157static void invoke_rcu_callbacks(struct rcu_state *rsp, struct rcu_data *rdp); 157static void invoke_rcu_callbacks(struct rcu_state *rsp, struct rcu_data *rdp);
158 158
159/* rcuc/rcub kthread realtime priority */
160static int kthread_prio = CONFIG_RCU_KTHREAD_PRIO;
161module_param(kthread_prio, int, 0644);
162
159/* 163/*
160 * Track the rcutorture test sequence number and the update version 164 * Track the rcutorture test sequence number and the update version
161 * number within a given test. The rcutorture_testseq is incremented 165 * number within a given test. The rcutorture_testseq is incremented
@@ -215,6 +219,9 @@ static DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks) = {
215#endif /* #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */ 219#endif /* #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */
216}; 220};
217 221
222DEFINE_PER_CPU_SHARED_ALIGNED(unsigned long, rcu_qs_ctr);
223EXPORT_PER_CPU_SYMBOL_GPL(rcu_qs_ctr);
224
218/* 225/*
219 * Let the RCU core know that this CPU has gone through the scheduler, 226 * Let the RCU core know that this CPU has gone through the scheduler,
220 * which is a quiescent state. This is called when the need for a 227 * which is a quiescent state. This is called when the need for a
@@ -284,6 +291,22 @@ void rcu_note_context_switch(void)
284} 291}
285EXPORT_SYMBOL_GPL(rcu_note_context_switch); 292EXPORT_SYMBOL_GPL(rcu_note_context_switch);
286 293
294/*
295 * Register a quiesecent state for all RCU flavors. If there is an
296 * emergency, invoke rcu_momentary_dyntick_idle() to do a heavy-weight
297 * dyntick-idle quiescent state visible to other CPUs (but only for those
298 * RCU flavors in desparate need of a quiescent state, which will normally
299 * be none of them). Either way, do a lightweight quiescent state for
300 * all RCU flavors.
301 */
302void rcu_all_qs(void)
303{
304 if (unlikely(raw_cpu_read(rcu_sched_qs_mask)))
305 rcu_momentary_dyntick_idle();
306 this_cpu_inc(rcu_qs_ctr);
307}
308EXPORT_SYMBOL_GPL(rcu_all_qs);
309
287static long blimit = 10; /* Maximum callbacks per rcu_do_batch. */ 310static long blimit = 10; /* Maximum callbacks per rcu_do_batch. */
288static long qhimark = 10000; /* If this many pending, ignore blimit. */ 311static long qhimark = 10000; /* If this many pending, ignore blimit. */
289static long qlowmark = 100; /* Once only this many pending, use blimit. */ 312static long qlowmark = 100; /* Once only this many pending, use blimit. */
@@ -315,18 +338,54 @@ static void force_quiescent_state(struct rcu_state *rsp);
315static int rcu_pending(void); 338static int rcu_pending(void);
316 339
317/* 340/*
318 * Return the number of RCU-sched batches processed thus far for debug & stats. 341 * Return the number of RCU batches started thus far for debug & stats.
342 */
343unsigned long rcu_batches_started(void)
344{
345 return rcu_state_p->gpnum;
346}
347EXPORT_SYMBOL_GPL(rcu_batches_started);
348
349/*
350 * Return the number of RCU-sched batches started thus far for debug & stats.
351 */
352unsigned long rcu_batches_started_sched(void)
353{
354 return rcu_sched_state.gpnum;
355}
356EXPORT_SYMBOL_GPL(rcu_batches_started_sched);
357
358/*
359 * Return the number of RCU BH batches started thus far for debug & stats.
319 */ 360 */
320long rcu_batches_completed_sched(void) 361unsigned long rcu_batches_started_bh(void)
362{
363 return rcu_bh_state.gpnum;
364}
365EXPORT_SYMBOL_GPL(rcu_batches_started_bh);
366
367/*
368 * Return the number of RCU batches completed thus far for debug & stats.
369 */
370unsigned long rcu_batches_completed(void)
371{
372 return rcu_state_p->completed;
373}
374EXPORT_SYMBOL_GPL(rcu_batches_completed);
375
376/*
377 * Return the number of RCU-sched batches completed thus far for debug & stats.
378 */
379unsigned long rcu_batches_completed_sched(void)
321{ 380{
322 return rcu_sched_state.completed; 381 return rcu_sched_state.completed;
323} 382}
324EXPORT_SYMBOL_GPL(rcu_batches_completed_sched); 383EXPORT_SYMBOL_GPL(rcu_batches_completed_sched);
325 384
326/* 385/*
327 * Return the number of RCU BH batches processed thus far for debug & stats. 386 * Return the number of RCU BH batches completed thus far for debug & stats.
328 */ 387 */
329long rcu_batches_completed_bh(void) 388unsigned long rcu_batches_completed_bh(void)
330{ 389{
331 return rcu_bh_state.completed; 390 return rcu_bh_state.completed;
332} 391}
@@ -759,39 +818,71 @@ void rcu_irq_enter(void)
759/** 818/**
760 * rcu_nmi_enter - inform RCU of entry to NMI context 819 * rcu_nmi_enter - inform RCU of entry to NMI context
761 * 820 *
762 * If the CPU was idle with dynamic ticks active, and there is no 821 * If the CPU was idle from RCU's viewpoint, update rdtp->dynticks and
763 * irq handler running, this updates rdtp->dynticks_nmi to let the 822 * rdtp->dynticks_nmi_nesting to let the RCU grace-period handling know
764 * RCU grace-period handling know that the CPU is active. 823 * that the CPU is active. This implementation permits nested NMIs, as
824 * long as the nesting level does not overflow an int. (You will probably
825 * run out of stack space first.)
765 */ 826 */
766void rcu_nmi_enter(void) 827void rcu_nmi_enter(void)
767{ 828{
768 struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks); 829 struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks);
830 int incby = 2;
769 831
770 if (rdtp->dynticks_nmi_nesting == 0 && 832 /* Complain about underflow. */
771 (atomic_read(&rdtp->dynticks) & 0x1)) 833 WARN_ON_ONCE(rdtp->dynticks_nmi_nesting < 0);
772 return; 834
773 rdtp->dynticks_nmi_nesting++; 835 /*
774 smp_mb__before_atomic(); /* Force delay from prior write. */ 836 * If idle from RCU viewpoint, atomically increment ->dynticks
775 atomic_inc(&rdtp->dynticks); 837 * to mark non-idle and increment ->dynticks_nmi_nesting by one.
776 /* CPUs seeing atomic_inc() must see later RCU read-side crit sects */ 838 * Otherwise, increment ->dynticks_nmi_nesting by two. This means
777 smp_mb__after_atomic(); /* See above. */ 839 * if ->dynticks_nmi_nesting is equal to one, we are guaranteed
778 WARN_ON_ONCE(!(atomic_read(&rdtp->dynticks) & 0x1)); 840 * to be in the outermost NMI handler that interrupted an RCU-idle
841 * period (observation due to Andy Lutomirski).
842 */
843 if (!(atomic_read(&rdtp->dynticks) & 0x1)) {
844 smp_mb__before_atomic(); /* Force delay from prior write. */
845 atomic_inc(&rdtp->dynticks);
846 /* atomic_inc() before later RCU read-side crit sects */
847 smp_mb__after_atomic(); /* See above. */
848 WARN_ON_ONCE(!(atomic_read(&rdtp->dynticks) & 0x1));
849 incby = 1;
850 }
851 rdtp->dynticks_nmi_nesting += incby;
852 barrier();
779} 853}
780 854
781/** 855/**
782 * rcu_nmi_exit - inform RCU of exit from NMI context 856 * rcu_nmi_exit - inform RCU of exit from NMI context
783 * 857 *
784 * If the CPU was idle with dynamic ticks active, and there is no 858 * If we are returning from the outermost NMI handler that interrupted an
785 * irq handler running, this updates rdtp->dynticks_nmi to let the 859 * RCU-idle period, update rdtp->dynticks and rdtp->dynticks_nmi_nesting
786 * RCU grace-period handling know that the CPU is no longer active. 860 * to let the RCU grace-period handling know that the CPU is back to
861 * being RCU-idle.
787 */ 862 */
788void rcu_nmi_exit(void) 863void rcu_nmi_exit(void)
789{ 864{
790 struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks); 865 struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks);
791 866
792 if (rdtp->dynticks_nmi_nesting == 0 || 867 /*
793 --rdtp->dynticks_nmi_nesting != 0) 868 * Check for ->dynticks_nmi_nesting underflow and bad ->dynticks.
869 * (We are exiting an NMI handler, so RCU better be paying attention
870 * to us!)
871 */
872 WARN_ON_ONCE(rdtp->dynticks_nmi_nesting <= 0);
873 WARN_ON_ONCE(!(atomic_read(&rdtp->dynticks) & 0x1));
874
875 /*
876 * If the nesting level is not 1, the CPU wasn't RCU-idle, so
877 * leave it in non-RCU-idle state.
878 */
879 if (rdtp->dynticks_nmi_nesting != 1) {
880 rdtp->dynticks_nmi_nesting -= 2;
794 return; 881 return;
882 }
883
884 /* This NMI interrupted an RCU-idle CPU, restore RCU-idleness. */
885 rdtp->dynticks_nmi_nesting = 0;
795 /* CPUs seeing atomic_inc() must see prior RCU read-side crit sects */ 886 /* CPUs seeing atomic_inc() must see prior RCU read-side crit sects */
796 smp_mb__before_atomic(); /* See above. */ 887 smp_mb__before_atomic(); /* See above. */
797 atomic_inc(&rdtp->dynticks); 888 atomic_inc(&rdtp->dynticks);
@@ -898,17 +989,14 @@ static int dyntick_save_progress_counter(struct rcu_data *rdp,
898 trace_rcu_fqs(rdp->rsp->name, rdp->gpnum, rdp->cpu, TPS("dti")); 989 trace_rcu_fqs(rdp->rsp->name, rdp->gpnum, rdp->cpu, TPS("dti"));
899 return 1; 990 return 1;
900 } else { 991 } else {
992 if (ULONG_CMP_LT(ACCESS_ONCE(rdp->gpnum) + ULONG_MAX / 4,
993 rdp->mynode->gpnum))
994 ACCESS_ONCE(rdp->gpwrap) = true;
901 return 0; 995 return 0;
902 } 996 }
903} 997}
904 998
905/* 999/*
906 * This function really isn't for public consumption, but RCU is special in
907 * that context switches can allow the state machine to make progress.
908 */
909extern void resched_cpu(int cpu);
910
911/*
912 * Return true if the specified CPU has passed through a quiescent 1000 * Return true if the specified CPU has passed through a quiescent
913 * state by virtue of being in or having passed through an dynticks 1001 * state by virtue of being in or having passed through an dynticks
914 * idle state since the last call to dyntick_save_progress_counter() 1002 * idle state since the last call to dyntick_save_progress_counter()
@@ -1011,6 +1099,22 @@ static void record_gp_stall_check_time(struct rcu_state *rsp)
1011 j1 = rcu_jiffies_till_stall_check(); 1099 j1 = rcu_jiffies_till_stall_check();
1012 ACCESS_ONCE(rsp->jiffies_stall) = j + j1; 1100 ACCESS_ONCE(rsp->jiffies_stall) = j + j1;
1013 rsp->jiffies_resched = j + j1 / 2; 1101 rsp->jiffies_resched = j + j1 / 2;
1102 rsp->n_force_qs_gpstart = ACCESS_ONCE(rsp->n_force_qs);
1103}
1104
1105/*
1106 * Complain about starvation of grace-period kthread.
1107 */
1108static void rcu_check_gp_kthread_starvation(struct rcu_state *rsp)
1109{
1110 unsigned long gpa;
1111 unsigned long j;
1112
1113 j = jiffies;
1114 gpa = ACCESS_ONCE(rsp->gp_activity);
1115 if (j - gpa > 2 * HZ)
1116 pr_err("%s kthread starved for %ld jiffies!\n",
1117 rsp->name, j - gpa);
1014} 1118}
1015 1119
1016/* 1120/*
@@ -1033,11 +1137,13 @@ static void rcu_dump_cpu_stacks(struct rcu_state *rsp)
1033 } 1137 }
1034} 1138}
1035 1139
1036static void print_other_cpu_stall(struct rcu_state *rsp) 1140static void print_other_cpu_stall(struct rcu_state *rsp, unsigned long gpnum)
1037{ 1141{
1038 int cpu; 1142 int cpu;
1039 long delta; 1143 long delta;
1040 unsigned long flags; 1144 unsigned long flags;
1145 unsigned long gpa;
1146 unsigned long j;
1041 int ndetected = 0; 1147 int ndetected = 0;
1042 struct rcu_node *rnp = rcu_get_root(rsp); 1148 struct rcu_node *rnp = rcu_get_root(rsp);
1043 long totqlen = 0; 1149 long totqlen = 0;
@@ -1075,30 +1181,34 @@ static void print_other_cpu_stall(struct rcu_state *rsp)
1075 raw_spin_unlock_irqrestore(&rnp->lock, flags); 1181 raw_spin_unlock_irqrestore(&rnp->lock, flags);
1076 } 1182 }
1077 1183
1078 /*
1079 * Now rat on any tasks that got kicked up to the root rcu_node
1080 * due to CPU offlining.
1081 */
1082 rnp = rcu_get_root(rsp);
1083 raw_spin_lock_irqsave(&rnp->lock, flags);
1084 ndetected += rcu_print_task_stall(rnp);
1085 raw_spin_unlock_irqrestore(&rnp->lock, flags);
1086
1087 print_cpu_stall_info_end(); 1184 print_cpu_stall_info_end();
1088 for_each_possible_cpu(cpu) 1185 for_each_possible_cpu(cpu)
1089 totqlen += per_cpu_ptr(rsp->rda, cpu)->qlen; 1186 totqlen += per_cpu_ptr(rsp->rda, cpu)->qlen;
1090 pr_cont("(detected by %d, t=%ld jiffies, g=%ld, c=%ld, q=%lu)\n", 1187 pr_cont("(detected by %d, t=%ld jiffies, g=%ld, c=%ld, q=%lu)\n",
1091 smp_processor_id(), (long)(jiffies - rsp->gp_start), 1188 smp_processor_id(), (long)(jiffies - rsp->gp_start),
1092 (long)rsp->gpnum, (long)rsp->completed, totqlen); 1189 (long)rsp->gpnum, (long)rsp->completed, totqlen);
1093 if (ndetected == 0) 1190 if (ndetected) {
1094 pr_err("INFO: Stall ended before state dump start\n");
1095 else
1096 rcu_dump_cpu_stacks(rsp); 1191 rcu_dump_cpu_stacks(rsp);
1192 } else {
1193 if (ACCESS_ONCE(rsp->gpnum) != gpnum ||
1194 ACCESS_ONCE(rsp->completed) == gpnum) {
1195 pr_err("INFO: Stall ended before state dump start\n");
1196 } else {
1197 j = jiffies;
1198 gpa = ACCESS_ONCE(rsp->gp_activity);
1199 pr_err("All QSes seen, last %s kthread activity %ld (%ld-%ld), jiffies_till_next_fqs=%ld\n",
1200 rsp->name, j - gpa, j, gpa,
1201 jiffies_till_next_fqs);
1202 /* In this case, the current CPU might be at fault. */
1203 sched_show_task(current);
1204 }
1205 }
1097 1206
1098 /* Complain about tasks blocking the grace period. */ 1207 /* Complain about tasks blocking the grace period. */
1099
1100 rcu_print_detail_task_stall(rsp); 1208 rcu_print_detail_task_stall(rsp);
1101 1209
1210 rcu_check_gp_kthread_starvation(rsp);
1211
1102 force_quiescent_state(rsp); /* Kick them all. */ 1212 force_quiescent_state(rsp); /* Kick them all. */
1103} 1213}
1104 1214
@@ -1123,6 +1233,9 @@ static void print_cpu_stall(struct rcu_state *rsp)
1123 pr_cont(" (t=%lu jiffies g=%ld c=%ld q=%lu)\n", 1233 pr_cont(" (t=%lu jiffies g=%ld c=%ld q=%lu)\n",
1124 jiffies - rsp->gp_start, 1234 jiffies - rsp->gp_start,
1125 (long)rsp->gpnum, (long)rsp->completed, totqlen); 1235 (long)rsp->gpnum, (long)rsp->completed, totqlen);
1236
1237 rcu_check_gp_kthread_starvation(rsp);
1238
1126 rcu_dump_cpu_stacks(rsp); 1239 rcu_dump_cpu_stacks(rsp);
1127 1240
1128 raw_spin_lock_irqsave(&rnp->lock, flags); 1241 raw_spin_lock_irqsave(&rnp->lock, flags);
@@ -1193,7 +1306,7 @@ static void check_cpu_stall(struct rcu_state *rsp, struct rcu_data *rdp)
1193 ULONG_CMP_GE(j, js + RCU_STALL_RAT_DELAY)) { 1306 ULONG_CMP_GE(j, js + RCU_STALL_RAT_DELAY)) {
1194 1307
1195 /* They had a few time units to dump stack, so complain. */ 1308 /* They had a few time units to dump stack, so complain. */
1196 print_other_cpu_stall(rsp); 1309 print_other_cpu_stall(rsp, gpnum);
1197 } 1310 }
1198} 1311}
1199 1312
@@ -1530,7 +1643,8 @@ static bool __note_gp_changes(struct rcu_state *rsp, struct rcu_node *rnp,
1530 bool ret; 1643 bool ret;
1531 1644
1532 /* Handle the ends of any preceding grace periods first. */ 1645 /* Handle the ends of any preceding grace periods first. */
1533 if (rdp->completed == rnp->completed) { 1646 if (rdp->completed == rnp->completed &&
1647 !unlikely(ACCESS_ONCE(rdp->gpwrap))) {
1534 1648
1535 /* No grace period end, so just accelerate recent callbacks. */ 1649 /* No grace period end, so just accelerate recent callbacks. */
1536 ret = rcu_accelerate_cbs(rsp, rnp, rdp); 1650 ret = rcu_accelerate_cbs(rsp, rnp, rdp);
@@ -1545,7 +1659,7 @@ static bool __note_gp_changes(struct rcu_state *rsp, struct rcu_node *rnp,
1545 trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("cpuend")); 1659 trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("cpuend"));
1546 } 1660 }
1547 1661
1548 if (rdp->gpnum != rnp->gpnum) { 1662 if (rdp->gpnum != rnp->gpnum || unlikely(ACCESS_ONCE(rdp->gpwrap))) {
1549 /* 1663 /*
1550 * If the current grace period is waiting for this CPU, 1664 * If the current grace period is waiting for this CPU,
1551 * set up to detect a quiescent state, otherwise don't 1665 * set up to detect a quiescent state, otherwise don't
@@ -1554,8 +1668,10 @@ static bool __note_gp_changes(struct rcu_state *rsp, struct rcu_node *rnp,
1554 rdp->gpnum = rnp->gpnum; 1668 rdp->gpnum = rnp->gpnum;
1555 trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("cpustart")); 1669 trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("cpustart"));
1556 rdp->passed_quiesce = 0; 1670 rdp->passed_quiesce = 0;
1671 rdp->rcu_qs_ctr_snap = __this_cpu_read(rcu_qs_ctr);
1557 rdp->qs_pending = !!(rnp->qsmask & rdp->grpmask); 1672 rdp->qs_pending = !!(rnp->qsmask & rdp->grpmask);
1558 zero_cpu_stall_ticks(rdp); 1673 zero_cpu_stall_ticks(rdp);
1674 ACCESS_ONCE(rdp->gpwrap) = false;
1559 } 1675 }
1560 return ret; 1676 return ret;
1561} 1677}
@@ -1569,7 +1685,8 @@ static void note_gp_changes(struct rcu_state *rsp, struct rcu_data *rdp)
1569 local_irq_save(flags); 1685 local_irq_save(flags);
1570 rnp = rdp->mynode; 1686 rnp = rdp->mynode;
1571 if ((rdp->gpnum == ACCESS_ONCE(rnp->gpnum) && 1687 if ((rdp->gpnum == ACCESS_ONCE(rnp->gpnum) &&
1572 rdp->completed == ACCESS_ONCE(rnp->completed)) || /* w/out lock. */ 1688 rdp->completed == ACCESS_ONCE(rnp->completed) &&
1689 !unlikely(ACCESS_ONCE(rdp->gpwrap))) || /* w/out lock. */
1573 !raw_spin_trylock(&rnp->lock)) { /* irqs already off, so later. */ 1690 !raw_spin_trylock(&rnp->lock)) { /* irqs already off, so later. */
1574 local_irq_restore(flags); 1691 local_irq_restore(flags);
1575 return; 1692 return;
@@ -1589,6 +1706,7 @@ static int rcu_gp_init(struct rcu_state *rsp)
1589 struct rcu_data *rdp; 1706 struct rcu_data *rdp;
1590 struct rcu_node *rnp = rcu_get_root(rsp); 1707 struct rcu_node *rnp = rcu_get_root(rsp);
1591 1708
1709 ACCESS_ONCE(rsp->gp_activity) = jiffies;
1592 rcu_bind_gp_kthread(); 1710 rcu_bind_gp_kthread();
1593 raw_spin_lock_irq(&rnp->lock); 1711 raw_spin_lock_irq(&rnp->lock);
1594 smp_mb__after_unlock_lock(); 1712 smp_mb__after_unlock_lock();
@@ -1649,6 +1767,7 @@ static int rcu_gp_init(struct rcu_state *rsp)
1649 rnp->grphi, rnp->qsmask); 1767 rnp->grphi, rnp->qsmask);
1650 raw_spin_unlock_irq(&rnp->lock); 1768 raw_spin_unlock_irq(&rnp->lock);
1651 cond_resched_rcu_qs(); 1769 cond_resched_rcu_qs();
1770 ACCESS_ONCE(rsp->gp_activity) = jiffies;
1652 } 1771 }
1653 1772
1654 mutex_unlock(&rsp->onoff_mutex); 1773 mutex_unlock(&rsp->onoff_mutex);
@@ -1665,6 +1784,7 @@ static int rcu_gp_fqs(struct rcu_state *rsp, int fqs_state_in)
1665 unsigned long maxj; 1784 unsigned long maxj;
1666 struct rcu_node *rnp = rcu_get_root(rsp); 1785 struct rcu_node *rnp = rcu_get_root(rsp);
1667 1786
1787 ACCESS_ONCE(rsp->gp_activity) = jiffies;
1668 rsp->n_force_qs++; 1788 rsp->n_force_qs++;
1669 if (fqs_state == RCU_SAVE_DYNTICK) { 1789 if (fqs_state == RCU_SAVE_DYNTICK) {
1670 /* Collect dyntick-idle snapshots. */ 1790 /* Collect dyntick-idle snapshots. */
@@ -1703,6 +1823,7 @@ static void rcu_gp_cleanup(struct rcu_state *rsp)
1703 struct rcu_data *rdp; 1823 struct rcu_data *rdp;
1704 struct rcu_node *rnp = rcu_get_root(rsp); 1824 struct rcu_node *rnp = rcu_get_root(rsp);
1705 1825
1826 ACCESS_ONCE(rsp->gp_activity) = jiffies;
1706 raw_spin_lock_irq(&rnp->lock); 1827 raw_spin_lock_irq(&rnp->lock);
1707 smp_mb__after_unlock_lock(); 1828 smp_mb__after_unlock_lock();
1708 gp_duration = jiffies - rsp->gp_start; 1829 gp_duration = jiffies - rsp->gp_start;
@@ -1739,6 +1860,7 @@ static void rcu_gp_cleanup(struct rcu_state *rsp)
1739 nocb += rcu_future_gp_cleanup(rsp, rnp); 1860 nocb += rcu_future_gp_cleanup(rsp, rnp);
1740 raw_spin_unlock_irq(&rnp->lock); 1861 raw_spin_unlock_irq(&rnp->lock);
1741 cond_resched_rcu_qs(); 1862 cond_resched_rcu_qs();
1863 ACCESS_ONCE(rsp->gp_activity) = jiffies;
1742 } 1864 }
1743 rnp = rcu_get_root(rsp); 1865 rnp = rcu_get_root(rsp);
1744 raw_spin_lock_irq(&rnp->lock); 1866 raw_spin_lock_irq(&rnp->lock);
@@ -1788,6 +1910,7 @@ static int __noreturn rcu_gp_kthread(void *arg)
1788 if (rcu_gp_init(rsp)) 1910 if (rcu_gp_init(rsp))
1789 break; 1911 break;
1790 cond_resched_rcu_qs(); 1912 cond_resched_rcu_qs();
1913 ACCESS_ONCE(rsp->gp_activity) = jiffies;
1791 WARN_ON(signal_pending(current)); 1914 WARN_ON(signal_pending(current));
1792 trace_rcu_grace_period(rsp->name, 1915 trace_rcu_grace_period(rsp->name,
1793 ACCESS_ONCE(rsp->gpnum), 1916 ACCESS_ONCE(rsp->gpnum),
@@ -1831,9 +1954,11 @@ static int __noreturn rcu_gp_kthread(void *arg)
1831 ACCESS_ONCE(rsp->gpnum), 1954 ACCESS_ONCE(rsp->gpnum),
1832 TPS("fqsend")); 1955 TPS("fqsend"));
1833 cond_resched_rcu_qs(); 1956 cond_resched_rcu_qs();
1957 ACCESS_ONCE(rsp->gp_activity) = jiffies;
1834 } else { 1958 } else {
1835 /* Deal with stray signal. */ 1959 /* Deal with stray signal. */
1836 cond_resched_rcu_qs(); 1960 cond_resched_rcu_qs();
1961 ACCESS_ONCE(rsp->gp_activity) = jiffies;
1837 WARN_ON(signal_pending(current)); 1962 WARN_ON(signal_pending(current));
1838 trace_rcu_grace_period(rsp->name, 1963 trace_rcu_grace_period(rsp->name,
1839 ACCESS_ONCE(rsp->gpnum), 1964 ACCESS_ONCE(rsp->gpnum),
@@ -2010,8 +2135,10 @@ rcu_report_qs_rdp(int cpu, struct rcu_state *rsp, struct rcu_data *rdp)
2010 rnp = rdp->mynode; 2135 rnp = rdp->mynode;
2011 raw_spin_lock_irqsave(&rnp->lock, flags); 2136 raw_spin_lock_irqsave(&rnp->lock, flags);
2012 smp_mb__after_unlock_lock(); 2137 smp_mb__after_unlock_lock();
2013 if (rdp->passed_quiesce == 0 || rdp->gpnum != rnp->gpnum || 2138 if ((rdp->passed_quiesce == 0 &&
2014 rnp->completed == rnp->gpnum) { 2139 rdp->rcu_qs_ctr_snap == __this_cpu_read(rcu_qs_ctr)) ||
2140 rdp->gpnum != rnp->gpnum || rnp->completed == rnp->gpnum ||
2141 rdp->gpwrap) {
2015 2142
2016 /* 2143 /*
2017 * The grace period in which this quiescent state was 2144 * The grace period in which this quiescent state was
@@ -2020,6 +2147,7 @@ rcu_report_qs_rdp(int cpu, struct rcu_state *rsp, struct rcu_data *rdp)
2020 * within the current grace period. 2147 * within the current grace period.
2021 */ 2148 */
2022 rdp->passed_quiesce = 0; /* need qs for new gp. */ 2149 rdp->passed_quiesce = 0; /* need qs for new gp. */
2150 rdp->rcu_qs_ctr_snap = __this_cpu_read(rcu_qs_ctr);
2023 raw_spin_unlock_irqrestore(&rnp->lock, flags); 2151 raw_spin_unlock_irqrestore(&rnp->lock, flags);
2024 return; 2152 return;
2025 } 2153 }
@@ -2064,7 +2192,8 @@ rcu_check_quiescent_state(struct rcu_state *rsp, struct rcu_data *rdp)
2064 * Was there a quiescent state since the beginning of the grace 2192 * Was there a quiescent state since the beginning of the grace
2065 * period? If no, then exit and wait for the next call. 2193 * period? If no, then exit and wait for the next call.
2066 */ 2194 */
2067 if (!rdp->passed_quiesce) 2195 if (!rdp->passed_quiesce &&
2196 rdp->rcu_qs_ctr_snap == __this_cpu_read(rcu_qs_ctr))
2068 return; 2197 return;
2069 2198
2070 /* 2199 /*
@@ -2195,6 +2324,46 @@ static void rcu_cleanup_dying_cpu(struct rcu_state *rsp)
2195} 2324}
2196 2325
2197/* 2326/*
2327 * All CPUs for the specified rcu_node structure have gone offline,
2328 * and all tasks that were preempted within an RCU read-side critical
2329 * section while running on one of those CPUs have since exited their RCU
2330 * read-side critical section. Some other CPU is reporting this fact with
2331 * the specified rcu_node structure's ->lock held and interrupts disabled.
2332 * This function therefore goes up the tree of rcu_node structures,
2333 * clearing the corresponding bits in the ->qsmaskinit fields. Note that
2334 * the leaf rcu_node structure's ->qsmaskinit field has already been
2335 * updated
2336 *
2337 * This function does check that the specified rcu_node structure has
2338 * all CPUs offline and no blocked tasks, so it is OK to invoke it
2339 * prematurely. That said, invoking it after the fact will cost you
2340 * a needless lock acquisition. So once it has done its work, don't
2341 * invoke it again.
2342 */
2343static void rcu_cleanup_dead_rnp(struct rcu_node *rnp_leaf)
2344{
2345 long mask;
2346 struct rcu_node *rnp = rnp_leaf;
2347
2348 if (rnp->qsmaskinit || rcu_preempt_has_tasks(rnp))
2349 return;
2350 for (;;) {
2351 mask = rnp->grpmask;
2352 rnp = rnp->parent;
2353 if (!rnp)
2354 break;
2355 raw_spin_lock(&rnp->lock); /* irqs already disabled. */
2356 smp_mb__after_unlock_lock(); /* GP memory ordering. */
2357 rnp->qsmaskinit &= ~mask;
2358 if (rnp->qsmaskinit) {
2359 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
2360 return;
2361 }
2362 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
2363 }
2364}
2365
2366/*
2198 * The CPU has been completely removed, and some other CPU is reporting 2367 * The CPU has been completely removed, and some other CPU is reporting
2199 * this fact from process context. Do the remainder of the cleanup, 2368 * this fact from process context. Do the remainder of the cleanup,
2200 * including orphaning the outgoing CPU's RCU callbacks, and also 2369 * including orphaning the outgoing CPU's RCU callbacks, and also
@@ -2204,8 +2373,6 @@ static void rcu_cleanup_dying_cpu(struct rcu_state *rsp)
2204static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp) 2373static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp)
2205{ 2374{
2206 unsigned long flags; 2375 unsigned long flags;
2207 unsigned long mask;
2208 int need_report = 0;
2209 struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); 2376 struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu);
2210 struct rcu_node *rnp = rdp->mynode; /* Outgoing CPU's rdp & rnp. */ 2377 struct rcu_node *rnp = rdp->mynode; /* Outgoing CPU's rdp & rnp. */
2211 2378
@@ -2219,40 +2386,15 @@ static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp)
2219 /* Orphan the dead CPU's callbacks, and adopt them if appropriate. */ 2386 /* Orphan the dead CPU's callbacks, and adopt them if appropriate. */
2220 rcu_send_cbs_to_orphanage(cpu, rsp, rnp, rdp); 2387 rcu_send_cbs_to_orphanage(cpu, rsp, rnp, rdp);
2221 rcu_adopt_orphan_cbs(rsp, flags); 2388 rcu_adopt_orphan_cbs(rsp, flags);
2389 raw_spin_unlock_irqrestore(&rsp->orphan_lock, flags);
2222 2390
2223 /* Remove the outgoing CPU from the masks in the rcu_node hierarchy. */ 2391 /* Remove outgoing CPU from mask in the leaf rcu_node structure. */
2224 mask = rdp->grpmask; /* rnp->grplo is constant. */ 2392 raw_spin_lock_irqsave(&rnp->lock, flags);
2225 do { 2393 smp_mb__after_unlock_lock(); /* Enforce GP memory-order guarantee. */
2226 raw_spin_lock(&rnp->lock); /* irqs already disabled. */ 2394 rnp->qsmaskinit &= ~rdp->grpmask;
2227 smp_mb__after_unlock_lock(); 2395 if (rnp->qsmaskinit == 0 && !rcu_preempt_has_tasks(rnp))
2228 rnp->qsmaskinit &= ~mask; 2396 rcu_cleanup_dead_rnp(rnp);
2229 if (rnp->qsmaskinit != 0) { 2397 rcu_report_qs_rnp(rdp->grpmask, rsp, rnp, flags); /* Rlses rnp->lock. */
2230 if (rnp != rdp->mynode)
2231 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
2232 break;
2233 }
2234 if (rnp == rdp->mynode)
2235 need_report = rcu_preempt_offline_tasks(rsp, rnp, rdp);
2236 else
2237 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
2238 mask = rnp->grpmask;
2239 rnp = rnp->parent;
2240 } while (rnp != NULL);
2241
2242 /*
2243 * We still hold the leaf rcu_node structure lock here, and
2244 * irqs are still disabled. The reason for this subterfuge is
2245 * because invoking rcu_report_unblock_qs_rnp() with ->orphan_lock
2246 * held leads to deadlock.
2247 */
2248 raw_spin_unlock(&rsp->orphan_lock); /* irqs remain disabled. */
2249 rnp = rdp->mynode;
2250 if (need_report & RCU_OFL_TASKS_NORM_GP)
2251 rcu_report_unblock_qs_rnp(rnp, flags);
2252 else
2253 raw_spin_unlock_irqrestore(&rnp->lock, flags);
2254 if (need_report & RCU_OFL_TASKS_EXP_GP)
2255 rcu_report_exp_rnp(rsp, rnp, true);
2256 WARN_ONCE(rdp->qlen != 0 || rdp->nxtlist != NULL, 2398 WARN_ONCE(rdp->qlen != 0 || rdp->nxtlist != NULL,
2257 "rcu_cleanup_dead_cpu: Callbacks on offline CPU %d: qlen=%lu, nxtlist=%p\n", 2399 "rcu_cleanup_dead_cpu: Callbacks on offline CPU %d: qlen=%lu, nxtlist=%p\n",
2258 cpu, rdp->qlen, rdp->nxtlist); 2400 cpu, rdp->qlen, rdp->nxtlist);
@@ -2268,6 +2410,10 @@ static void rcu_cleanup_dying_cpu(struct rcu_state *rsp)
2268{ 2410{
2269} 2411}
2270 2412
2413static void __maybe_unused rcu_cleanup_dead_rnp(struct rcu_node *rnp_leaf)
2414{
2415}
2416
2271static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp) 2417static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp)
2272{ 2418{
2273} 2419}
@@ -2464,12 +2610,6 @@ static void force_qs_rnp(struct rcu_state *rsp,
2464 } 2610 }
2465 raw_spin_unlock_irqrestore(&rnp->lock, flags); 2611 raw_spin_unlock_irqrestore(&rnp->lock, flags);
2466 } 2612 }
2467 rnp = rcu_get_root(rsp);
2468 if (rnp->qsmask == 0) {
2469 raw_spin_lock_irqsave(&rnp->lock, flags);
2470 smp_mb__after_unlock_lock();
2471 rcu_initiate_boost(rnp, flags); /* releases rnp->lock. */
2472 }
2473} 2613}
2474 2614
2475/* 2615/*
@@ -2569,7 +2709,7 @@ static void rcu_process_callbacks(struct softirq_action *unused)
2569 * Schedule RCU callback invocation. If the specified type of RCU 2709 * Schedule RCU callback invocation. If the specified type of RCU
2570 * does not support RCU priority boosting, just do a direct call, 2710 * does not support RCU priority boosting, just do a direct call,
2571 * otherwise wake up the per-CPU kernel kthread. Note that because we 2711 * otherwise wake up the per-CPU kernel kthread. Note that because we
2572 * are running on the current CPU with interrupts disabled, the 2712 * are running on the current CPU with softirqs disabled, the
2573 * rcu_cpu_kthread_task cannot disappear out from under us. 2713 * rcu_cpu_kthread_task cannot disappear out from under us.
2574 */ 2714 */
2575static void invoke_rcu_callbacks(struct rcu_state *rsp, struct rcu_data *rdp) 2715static void invoke_rcu_callbacks(struct rcu_state *rsp, struct rcu_data *rdp)
@@ -3109,9 +3249,12 @@ static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp)
3109 3249
3110 /* Is the RCU core waiting for a quiescent state from this CPU? */ 3250 /* Is the RCU core waiting for a quiescent state from this CPU? */
3111 if (rcu_scheduler_fully_active && 3251 if (rcu_scheduler_fully_active &&
3112 rdp->qs_pending && !rdp->passed_quiesce) { 3252 rdp->qs_pending && !rdp->passed_quiesce &&
3253 rdp->rcu_qs_ctr_snap == __this_cpu_read(rcu_qs_ctr)) {
3113 rdp->n_rp_qs_pending++; 3254 rdp->n_rp_qs_pending++;
3114 } else if (rdp->qs_pending && rdp->passed_quiesce) { 3255 } else if (rdp->qs_pending &&
3256 (rdp->passed_quiesce ||
3257 rdp->rcu_qs_ctr_snap != __this_cpu_read(rcu_qs_ctr))) {
3115 rdp->n_rp_report_qs++; 3258 rdp->n_rp_report_qs++;
3116 return 1; 3259 return 1;
3117 } 3260 }
@@ -3135,7 +3278,8 @@ static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp)
3135 } 3278 }
3136 3279
3137 /* Has a new RCU grace period started? */ 3280 /* Has a new RCU grace period started? */
3138 if (ACCESS_ONCE(rnp->gpnum) != rdp->gpnum) { /* outside lock */ 3281 if (ACCESS_ONCE(rnp->gpnum) != rdp->gpnum ||
3282 unlikely(ACCESS_ONCE(rdp->gpwrap))) { /* outside lock */
3139 rdp->n_rp_gp_started++; 3283 rdp->n_rp_gp_started++;
3140 return 1; 3284 return 1;
3141 } 3285 }
@@ -3318,6 +3462,7 @@ static void _rcu_barrier(struct rcu_state *rsp)
3318 } else { 3462 } else {
3319 _rcu_barrier_trace(rsp, "OnlineNoCB", cpu, 3463 _rcu_barrier_trace(rsp, "OnlineNoCB", cpu,
3320 rsp->n_barrier_done); 3464 rsp->n_barrier_done);
3465 smp_mb__before_atomic();
3321 atomic_inc(&rsp->barrier_cpu_count); 3466 atomic_inc(&rsp->barrier_cpu_count);
3322 __call_rcu(&rdp->barrier_head, 3467 __call_rcu(&rdp->barrier_head,
3323 rcu_barrier_callback, rsp, cpu, 0); 3468 rcu_barrier_callback, rsp, cpu, 0);
@@ -3385,9 +3530,6 @@ rcu_boot_init_percpu_data(int cpu, struct rcu_state *rsp)
3385 /* Set up local state, ensuring consistent view of global state. */ 3530 /* Set up local state, ensuring consistent view of global state. */
3386 raw_spin_lock_irqsave(&rnp->lock, flags); 3531 raw_spin_lock_irqsave(&rnp->lock, flags);
3387 rdp->grpmask = 1UL << (cpu - rdp->mynode->grplo); 3532 rdp->grpmask = 1UL << (cpu - rdp->mynode->grplo);
3388 init_callback_list(rdp);
3389 rdp->qlen_lazy = 0;
3390 ACCESS_ONCE(rdp->qlen) = 0;
3391 rdp->dynticks = &per_cpu(rcu_dynticks, cpu); 3533 rdp->dynticks = &per_cpu(rcu_dynticks, cpu);
3392 WARN_ON_ONCE(rdp->dynticks->dynticks_nesting != DYNTICK_TASK_EXIT_IDLE); 3534 WARN_ON_ONCE(rdp->dynticks->dynticks_nesting != DYNTICK_TASK_EXIT_IDLE);
3393 WARN_ON_ONCE(atomic_read(&rdp->dynticks->dynticks) != 1); 3535 WARN_ON_ONCE(atomic_read(&rdp->dynticks->dynticks) != 1);
@@ -3444,6 +3586,7 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp)
3444 rdp->gpnum = rnp->completed; 3586 rdp->gpnum = rnp->completed;
3445 rdp->completed = rnp->completed; 3587 rdp->completed = rnp->completed;
3446 rdp->passed_quiesce = 0; 3588 rdp->passed_quiesce = 0;
3589 rdp->rcu_qs_ctr_snap = __this_cpu_read(rcu_qs_ctr);
3447 rdp->qs_pending = 0; 3590 rdp->qs_pending = 0;
3448 trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("cpuonl")); 3591 trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("cpuonl"));
3449 } 3592 }
@@ -3535,17 +3678,35 @@ static int rcu_pm_notify(struct notifier_block *self,
3535static int __init rcu_spawn_gp_kthread(void) 3678static int __init rcu_spawn_gp_kthread(void)
3536{ 3679{
3537 unsigned long flags; 3680 unsigned long flags;
3681 int kthread_prio_in = kthread_prio;
3538 struct rcu_node *rnp; 3682 struct rcu_node *rnp;
3539 struct rcu_state *rsp; 3683 struct rcu_state *rsp;
3684 struct sched_param sp;
3540 struct task_struct *t; 3685 struct task_struct *t;
3541 3686
3687 /* Force priority into range. */
3688 if (IS_ENABLED(CONFIG_RCU_BOOST) && kthread_prio < 1)
3689 kthread_prio = 1;
3690 else if (kthread_prio < 0)
3691 kthread_prio = 0;
3692 else if (kthread_prio > 99)
3693 kthread_prio = 99;
3694 if (kthread_prio != kthread_prio_in)
3695 pr_alert("rcu_spawn_gp_kthread(): Limited prio to %d from %d\n",
3696 kthread_prio, kthread_prio_in);
3697
3542 rcu_scheduler_fully_active = 1; 3698 rcu_scheduler_fully_active = 1;
3543 for_each_rcu_flavor(rsp) { 3699 for_each_rcu_flavor(rsp) {
3544 t = kthread_run(rcu_gp_kthread, rsp, "%s", rsp->name); 3700 t = kthread_create(rcu_gp_kthread, rsp, "%s", rsp->name);
3545 BUG_ON(IS_ERR(t)); 3701 BUG_ON(IS_ERR(t));
3546 rnp = rcu_get_root(rsp); 3702 rnp = rcu_get_root(rsp);
3547 raw_spin_lock_irqsave(&rnp->lock, flags); 3703 raw_spin_lock_irqsave(&rnp->lock, flags);
3548 rsp->gp_kthread = t; 3704 rsp->gp_kthread = t;
3705 if (kthread_prio) {
3706 sp.sched_priority = kthread_prio;
3707 sched_setscheduler_nocheck(t, SCHED_FIFO, &sp);
3708 }
3709 wake_up_process(t);
3549 raw_spin_unlock_irqrestore(&rnp->lock, flags); 3710 raw_spin_unlock_irqrestore(&rnp->lock, flags);
3550 } 3711 }
3551 rcu_spawn_nocb_kthreads(); 3712 rcu_spawn_nocb_kthreads();
diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h
index 8e7b1843896e..119de399eb2f 100644
--- a/kernel/rcu/tree.h
+++ b/kernel/rcu/tree.h
@@ -27,7 +27,6 @@
27#include <linux/threads.h> 27#include <linux/threads.h>
28#include <linux/cpumask.h> 28#include <linux/cpumask.h>
29#include <linux/seqlock.h> 29#include <linux/seqlock.h>
30#include <linux/irq_work.h>
31 30
32/* 31/*
33 * Define shape of hierarchy based on NR_CPUS, CONFIG_RCU_FANOUT, and 32 * Define shape of hierarchy based on NR_CPUS, CONFIG_RCU_FANOUT, and
@@ -172,11 +171,6 @@ struct rcu_node {
172 /* queued on this rcu_node structure that */ 171 /* queued on this rcu_node structure that */
173 /* are blocking the current grace period, */ 172 /* are blocking the current grace period, */
174 /* there can be no such task. */ 173 /* there can be no such task. */
175 struct completion boost_completion;
176 /* Used to ensure that the rt_mutex used */
177 /* to carry out the boosting is fully */
178 /* released with no future boostee accesses */
179 /* before that rt_mutex is re-initialized. */
180 struct rt_mutex boost_mtx; 174 struct rt_mutex boost_mtx;
181 /* Used only for the priority-boosting */ 175 /* Used only for the priority-boosting */
182 /* side effect, not as a lock. */ 176 /* side effect, not as a lock. */
@@ -257,9 +251,12 @@ struct rcu_data {
257 /* in order to detect GP end. */ 251 /* in order to detect GP end. */
258 unsigned long gpnum; /* Highest gp number that this CPU */ 252 unsigned long gpnum; /* Highest gp number that this CPU */
259 /* is aware of having started. */ 253 /* is aware of having started. */
254 unsigned long rcu_qs_ctr_snap;/* Snapshot of rcu_qs_ctr to check */
255 /* for rcu_all_qs() invocations. */
260 bool passed_quiesce; /* User-mode/idle loop etc. */ 256 bool passed_quiesce; /* User-mode/idle loop etc. */
261 bool qs_pending; /* Core waits for quiesc state. */ 257 bool qs_pending; /* Core waits for quiesc state. */
262 bool beenonline; /* CPU online at least once. */ 258 bool beenonline; /* CPU online at least once. */
259 bool gpwrap; /* Possible gpnum/completed wrap. */
263 struct rcu_node *mynode; /* This CPU's leaf of hierarchy */ 260 struct rcu_node *mynode; /* This CPU's leaf of hierarchy */
264 unsigned long grpmask; /* Mask to apply to leaf qsmask. */ 261 unsigned long grpmask; /* Mask to apply to leaf qsmask. */
265#ifdef CONFIG_RCU_CPU_STALL_INFO 262#ifdef CONFIG_RCU_CPU_STALL_INFO
@@ -340,14 +337,10 @@ struct rcu_data {
340#ifdef CONFIG_RCU_NOCB_CPU 337#ifdef CONFIG_RCU_NOCB_CPU
341 struct rcu_head *nocb_head; /* CBs waiting for kthread. */ 338 struct rcu_head *nocb_head; /* CBs waiting for kthread. */
342 struct rcu_head **nocb_tail; 339 struct rcu_head **nocb_tail;
343 atomic_long_t nocb_q_count; /* # CBs waiting for kthread */ 340 atomic_long_t nocb_q_count; /* # CBs waiting for nocb */
344 atomic_long_t nocb_q_count_lazy; /* (approximate). */ 341 atomic_long_t nocb_q_count_lazy; /* invocation (all stages). */
345 struct rcu_head *nocb_follower_head; /* CBs ready to invoke. */ 342 struct rcu_head *nocb_follower_head; /* CBs ready to invoke. */
346 struct rcu_head **nocb_follower_tail; 343 struct rcu_head **nocb_follower_tail;
347 atomic_long_t nocb_follower_count; /* # CBs ready to invoke. */
348 atomic_long_t nocb_follower_count_lazy; /* (approximate). */
349 int nocb_p_count; /* # CBs being invoked by kthread */
350 int nocb_p_count_lazy; /* (approximate). */
351 wait_queue_head_t nocb_wq; /* For nocb kthreads to sleep on. */ 344 wait_queue_head_t nocb_wq; /* For nocb kthreads to sleep on. */
352 struct task_struct *nocb_kthread; 345 struct task_struct *nocb_kthread;
353 int nocb_defer_wakeup; /* Defer wakeup of nocb_kthread. */ 346 int nocb_defer_wakeup; /* Defer wakeup of nocb_kthread. */
@@ -356,8 +349,6 @@ struct rcu_data {
356 struct rcu_head *nocb_gp_head ____cacheline_internodealigned_in_smp; 349 struct rcu_head *nocb_gp_head ____cacheline_internodealigned_in_smp;
357 /* CBs waiting for GP. */ 350 /* CBs waiting for GP. */
358 struct rcu_head **nocb_gp_tail; 351 struct rcu_head **nocb_gp_tail;
359 long nocb_gp_count;
360 long nocb_gp_count_lazy;
361 bool nocb_leader_sleep; /* Is the nocb leader thread asleep? */ 352 bool nocb_leader_sleep; /* Is the nocb leader thread asleep? */
362 struct rcu_data *nocb_next_follower; 353 struct rcu_data *nocb_next_follower;
363 /* Next follower in wakeup chain. */ 354 /* Next follower in wakeup chain. */
@@ -488,10 +479,14 @@ struct rcu_state {
488 /* due to no GP active. */ 479 /* due to no GP active. */
489 unsigned long gp_start; /* Time at which GP started, */ 480 unsigned long gp_start; /* Time at which GP started, */
490 /* but in jiffies. */ 481 /* but in jiffies. */
482 unsigned long gp_activity; /* Time of last GP kthread */
483 /* activity in jiffies. */
491 unsigned long jiffies_stall; /* Time at which to check */ 484 unsigned long jiffies_stall; /* Time at which to check */
492 /* for CPU stalls. */ 485 /* for CPU stalls. */
493 unsigned long jiffies_resched; /* Time at which to resched */ 486 unsigned long jiffies_resched; /* Time at which to resched */
494 /* a reluctant CPU. */ 487 /* a reluctant CPU. */
488 unsigned long n_force_qs_gpstart; /* Snapshot of n_force_qs at */
489 /* GP start. */
495 unsigned long gp_max; /* Maximum GP duration in */ 490 unsigned long gp_max; /* Maximum GP duration in */
496 /* jiffies. */ 491 /* jiffies. */
497 const char *name; /* Name of structure. */ 492 const char *name; /* Name of structure. */
@@ -514,13 +509,6 @@ extern struct list_head rcu_struct_flavors;
514#define for_each_rcu_flavor(rsp) \ 509#define for_each_rcu_flavor(rsp) \
515 list_for_each_entry((rsp), &rcu_struct_flavors, flavors) 510 list_for_each_entry((rsp), &rcu_struct_flavors, flavors)
516 511
517/* Return values for rcu_preempt_offline_tasks(). */
518
519#define RCU_OFL_TASKS_NORM_GP 0x1 /* Tasks blocking normal */
520 /* GP were moved to root. */
521#define RCU_OFL_TASKS_EXP_GP 0x2 /* Tasks blocking expedited */
522 /* GP were moved to root. */
523
524/* 512/*
525 * RCU implementation internal declarations: 513 * RCU implementation internal declarations:
526 */ 514 */
@@ -546,27 +534,16 @@ DECLARE_PER_CPU(char, rcu_cpu_has_work);
546 534
547/* Forward declarations for rcutree_plugin.h */ 535/* Forward declarations for rcutree_plugin.h */
548static void rcu_bootup_announce(void); 536static void rcu_bootup_announce(void);
549long rcu_batches_completed(void);
550static void rcu_preempt_note_context_switch(void); 537static void rcu_preempt_note_context_switch(void);
551static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp); 538static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp);
552#ifdef CONFIG_HOTPLUG_CPU 539#ifdef CONFIG_HOTPLUG_CPU
553static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp, 540static bool rcu_preempt_has_tasks(struct rcu_node *rnp);
554 unsigned long flags);
555#endif /* #ifdef CONFIG_HOTPLUG_CPU */ 541#endif /* #ifdef CONFIG_HOTPLUG_CPU */
556static void rcu_print_detail_task_stall(struct rcu_state *rsp); 542static void rcu_print_detail_task_stall(struct rcu_state *rsp);
557static int rcu_print_task_stall(struct rcu_node *rnp); 543static int rcu_print_task_stall(struct rcu_node *rnp);
558static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp); 544static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp);
559#ifdef CONFIG_HOTPLUG_CPU
560static int rcu_preempt_offline_tasks(struct rcu_state *rsp,
561 struct rcu_node *rnp,
562 struct rcu_data *rdp);
563#endif /* #ifdef CONFIG_HOTPLUG_CPU */
564static void rcu_preempt_check_callbacks(void); 545static void rcu_preempt_check_callbacks(void);
565void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu)); 546void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu));
566#if defined(CONFIG_HOTPLUG_CPU) || defined(CONFIG_PREEMPT_RCU)
567static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp,
568 bool wake);
569#endif /* #if defined(CONFIG_HOTPLUG_CPU) || defined(CONFIG_PREEMPT_RCU) */
570static void __init __rcu_init_preempt(void); 547static void __init __rcu_init_preempt(void);
571static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags); 548static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags);
572static void rcu_preempt_boost_start_gp(struct rcu_node *rnp); 549static void rcu_preempt_boost_start_gp(struct rcu_node *rnp);
@@ -622,24 +599,15 @@ static void rcu_dynticks_task_exit(void);
622#endif /* #ifndef RCU_TREE_NONCORE */ 599#endif /* #ifndef RCU_TREE_NONCORE */
623 600
624#ifdef CONFIG_RCU_TRACE 601#ifdef CONFIG_RCU_TRACE
625#ifdef CONFIG_RCU_NOCB_CPU 602/* Read out queue lengths for tracing. */
626/* Sum up queue lengths for tracing. */
627static inline void rcu_nocb_q_lengths(struct rcu_data *rdp, long *ql, long *qll) 603static inline void rcu_nocb_q_lengths(struct rcu_data *rdp, long *ql, long *qll)
628{ 604{
629 *ql = atomic_long_read(&rdp->nocb_q_count) + 605#ifdef CONFIG_RCU_NOCB_CPU
630 rdp->nocb_p_count + 606 *ql = atomic_long_read(&rdp->nocb_q_count);
631 atomic_long_read(&rdp->nocb_follower_count) + 607 *qll = atomic_long_read(&rdp->nocb_q_count_lazy);
632 rdp->nocb_p_count + rdp->nocb_gp_count;
633 *qll = atomic_long_read(&rdp->nocb_q_count_lazy) +
634 rdp->nocb_p_count_lazy +
635 atomic_long_read(&rdp->nocb_follower_count_lazy) +
636 rdp->nocb_p_count_lazy + rdp->nocb_gp_count_lazy;
637}
638#else /* #ifdef CONFIG_RCU_NOCB_CPU */ 608#else /* #ifdef CONFIG_RCU_NOCB_CPU */
639static inline void rcu_nocb_q_lengths(struct rcu_data *rdp, long *ql, long *qll)
640{
641 *ql = 0; 609 *ql = 0;
642 *qll = 0; 610 *qll = 0;
643}
644#endif /* #else #ifdef CONFIG_RCU_NOCB_CPU */ 611#endif /* #else #ifdef CONFIG_RCU_NOCB_CPU */
612}
645#endif /* #ifdef CONFIG_RCU_TRACE */ 613#endif /* #ifdef CONFIG_RCU_TRACE */
diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
index 3ec85cb5d544..2e850a51bb8f 100644
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -34,10 +34,6 @@
34 34
35#include "../locking/rtmutex_common.h" 35#include "../locking/rtmutex_common.h"
36 36
37/* rcuc/rcub kthread realtime priority */
38static int kthread_prio = CONFIG_RCU_KTHREAD_PRIO;
39module_param(kthread_prio, int, 0644);
40
41/* 37/*
42 * Control variables for per-CPU and per-rcu_node kthreads. These 38 * Control variables for per-CPU and per-rcu_node kthreads. These
43 * handle all flavors of RCU. 39 * handle all flavors of RCU.
@@ -103,6 +99,8 @@ RCU_STATE_INITIALIZER(rcu_preempt, 'p', call_rcu);
103static struct rcu_state *rcu_state_p = &rcu_preempt_state; 99static struct rcu_state *rcu_state_p = &rcu_preempt_state;
104 100
105static int rcu_preempted_readers_exp(struct rcu_node *rnp); 101static int rcu_preempted_readers_exp(struct rcu_node *rnp);
102static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp,
103 bool wake);
106 104
107/* 105/*
108 * Tell them what RCU they are running. 106 * Tell them what RCU they are running.
@@ -114,25 +112,6 @@ static void __init rcu_bootup_announce(void)
114} 112}
115 113
116/* 114/*
117 * Return the number of RCU-preempt batches processed thus far
118 * for debug and statistics.
119 */
120static long rcu_batches_completed_preempt(void)
121{
122 return rcu_preempt_state.completed;
123}
124EXPORT_SYMBOL_GPL(rcu_batches_completed_preempt);
125
126/*
127 * Return the number of RCU batches processed thus far for debug & stats.
128 */
129long rcu_batches_completed(void)
130{
131 return rcu_batches_completed_preempt();
132}
133EXPORT_SYMBOL_GPL(rcu_batches_completed);
134
135/*
136 * Record a preemptible-RCU quiescent state for the specified CPU. Note 115 * Record a preemptible-RCU quiescent state for the specified CPU. Note
137 * that this just means that the task currently running on the CPU is 116 * that this just means that the task currently running on the CPU is
138 * not in a quiescent state. There might be any number of tasks blocked 117 * not in a quiescent state. There might be any number of tasks blocked
@@ -307,15 +286,25 @@ static struct list_head *rcu_next_node_entry(struct task_struct *t,
307} 286}
308 287
309/* 288/*
289 * Return true if the specified rcu_node structure has tasks that were
290 * preempted within an RCU read-side critical section.
291 */
292static bool rcu_preempt_has_tasks(struct rcu_node *rnp)
293{
294 return !list_empty(&rnp->blkd_tasks);
295}
296
297/*
310 * Handle special cases during rcu_read_unlock(), such as needing to 298 * Handle special cases during rcu_read_unlock(), such as needing to
311 * notify RCU core processing or task having blocked during the RCU 299 * notify RCU core processing or task having blocked during the RCU
312 * read-side critical section. 300 * read-side critical section.
313 */ 301 */
314void rcu_read_unlock_special(struct task_struct *t) 302void rcu_read_unlock_special(struct task_struct *t)
315{ 303{
316 int empty; 304 bool empty;
317 int empty_exp; 305 bool empty_exp;
318 int empty_exp_now; 306 bool empty_norm;
307 bool empty_exp_now;
319 unsigned long flags; 308 unsigned long flags;
320 struct list_head *np; 309 struct list_head *np;
321#ifdef CONFIG_RCU_BOOST 310#ifdef CONFIG_RCU_BOOST
@@ -367,7 +356,8 @@ void rcu_read_unlock_special(struct task_struct *t)
367 break; 356 break;
368 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ 357 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
369 } 358 }
370 empty = !rcu_preempt_blocked_readers_cgp(rnp); 359 empty = !rcu_preempt_has_tasks(rnp);
360 empty_norm = !rcu_preempt_blocked_readers_cgp(rnp);
371 empty_exp = !rcu_preempted_readers_exp(rnp); 361 empty_exp = !rcu_preempted_readers_exp(rnp);
372 smp_mb(); /* ensure expedited fastpath sees end of RCU c-s. */ 362 smp_mb(); /* ensure expedited fastpath sees end of RCU c-s. */
373 np = rcu_next_node_entry(t, rnp); 363 np = rcu_next_node_entry(t, rnp);
@@ -387,13 +377,21 @@ void rcu_read_unlock_special(struct task_struct *t)
387#endif /* #ifdef CONFIG_RCU_BOOST */ 377#endif /* #ifdef CONFIG_RCU_BOOST */
388 378
389 /* 379 /*
380 * If this was the last task on the list, go see if we
381 * need to propagate ->qsmaskinit bit clearing up the
382 * rcu_node tree.
383 */
384 if (!empty && !rcu_preempt_has_tasks(rnp))
385 rcu_cleanup_dead_rnp(rnp);
386
387 /*
390 * If this was the last task on the current list, and if 388 * If this was the last task on the current list, and if
391 * we aren't waiting on any CPUs, report the quiescent state. 389 * we aren't waiting on any CPUs, report the quiescent state.
392 * Note that rcu_report_unblock_qs_rnp() releases rnp->lock, 390 * Note that rcu_report_unblock_qs_rnp() releases rnp->lock,
393 * so we must take a snapshot of the expedited state. 391 * so we must take a snapshot of the expedited state.
394 */ 392 */
395 empty_exp_now = !rcu_preempted_readers_exp(rnp); 393 empty_exp_now = !rcu_preempted_readers_exp(rnp);
396 if (!empty && !rcu_preempt_blocked_readers_cgp(rnp)) { 394 if (!empty_norm && !rcu_preempt_blocked_readers_cgp(rnp)) {
397 trace_rcu_quiescent_state_report(TPS("preempt_rcu"), 395 trace_rcu_quiescent_state_report(TPS("preempt_rcu"),
398 rnp->gpnum, 396 rnp->gpnum,
399 0, rnp->qsmask, 397 0, rnp->qsmask,
@@ -408,10 +406,8 @@ void rcu_read_unlock_special(struct task_struct *t)
408 406
409#ifdef CONFIG_RCU_BOOST 407#ifdef CONFIG_RCU_BOOST
410 /* Unboost if we were boosted. */ 408 /* Unboost if we were boosted. */
411 if (drop_boost_mutex) { 409 if (drop_boost_mutex)
412 rt_mutex_unlock(&rnp->boost_mtx); 410 rt_mutex_unlock(&rnp->boost_mtx);
413 complete(&rnp->boost_completion);
414 }
415#endif /* #ifdef CONFIG_RCU_BOOST */ 411#endif /* #ifdef CONFIG_RCU_BOOST */
416 412
417 /* 413 /*
@@ -519,99 +515,13 @@ static int rcu_print_task_stall(struct rcu_node *rnp)
519static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp) 515static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp)
520{ 516{
521 WARN_ON_ONCE(rcu_preempt_blocked_readers_cgp(rnp)); 517 WARN_ON_ONCE(rcu_preempt_blocked_readers_cgp(rnp));
522 if (!list_empty(&rnp->blkd_tasks)) 518 if (rcu_preempt_has_tasks(rnp))
523 rnp->gp_tasks = rnp->blkd_tasks.next; 519 rnp->gp_tasks = rnp->blkd_tasks.next;
524 WARN_ON_ONCE(rnp->qsmask); 520 WARN_ON_ONCE(rnp->qsmask);
525} 521}
526 522
527#ifdef CONFIG_HOTPLUG_CPU 523#ifdef CONFIG_HOTPLUG_CPU
528 524
529/*
530 * Handle tasklist migration for case in which all CPUs covered by the
531 * specified rcu_node have gone offline. Move them up to the root
532 * rcu_node. The reason for not just moving them to the immediate
533 * parent is to remove the need for rcu_read_unlock_special() to
534 * make more than two attempts to acquire the target rcu_node's lock.
535 * Returns true if there were tasks blocking the current RCU grace
536 * period.
537 *
538 * Returns 1 if there was previously a task blocking the current grace
539 * period on the specified rcu_node structure.
540 *
541 * The caller must hold rnp->lock with irqs disabled.
542 */
543static int rcu_preempt_offline_tasks(struct rcu_state *rsp,
544 struct rcu_node *rnp,
545 struct rcu_data *rdp)
546{
547 struct list_head *lp;
548 struct list_head *lp_root;
549 int retval = 0;
550 struct rcu_node *rnp_root = rcu_get_root(rsp);
551 struct task_struct *t;
552
553 if (rnp == rnp_root) {
554 WARN_ONCE(1, "Last CPU thought to be offlined?");
555 return 0; /* Shouldn't happen: at least one CPU online. */
556 }
557
558 /* If we are on an internal node, complain bitterly. */
559 WARN_ON_ONCE(rnp != rdp->mynode);
560
561 /*
562 * Move tasks up to root rcu_node. Don't try to get fancy for
563 * this corner-case operation -- just put this node's tasks
564 * at the head of the root node's list, and update the root node's
565 * ->gp_tasks and ->exp_tasks pointers to those of this node's,
566 * if non-NULL. This might result in waiting for more tasks than
567 * absolutely necessary, but this is a good performance/complexity
568 * tradeoff.
569 */
570 if (rcu_preempt_blocked_readers_cgp(rnp) && rnp->qsmask == 0)
571 retval |= RCU_OFL_TASKS_NORM_GP;
572 if (rcu_preempted_readers_exp(rnp))
573 retval |= RCU_OFL_TASKS_EXP_GP;
574 lp = &rnp->blkd_tasks;
575 lp_root = &rnp_root->blkd_tasks;
576 while (!list_empty(lp)) {
577 t = list_entry(lp->next, typeof(*t), rcu_node_entry);
578 raw_spin_lock(&rnp_root->lock); /* irqs already disabled */
579 smp_mb__after_unlock_lock();
580 list_del(&t->rcu_node_entry);
581 t->rcu_blocked_node = rnp_root;
582 list_add(&t->rcu_node_entry, lp_root);
583 if (&t->rcu_node_entry == rnp->gp_tasks)
584 rnp_root->gp_tasks = rnp->gp_tasks;
585 if (&t->rcu_node_entry == rnp->exp_tasks)
586 rnp_root->exp_tasks = rnp->exp_tasks;
587#ifdef CONFIG_RCU_BOOST
588 if (&t->rcu_node_entry == rnp->boost_tasks)
589 rnp_root->boost_tasks = rnp->boost_tasks;
590#endif /* #ifdef CONFIG_RCU_BOOST */
591 raw_spin_unlock(&rnp_root->lock); /* irqs still disabled */
592 }
593
594 rnp->gp_tasks = NULL;
595 rnp->exp_tasks = NULL;
596#ifdef CONFIG_RCU_BOOST
597 rnp->boost_tasks = NULL;
598 /*
599 * In case root is being boosted and leaf was not. Make sure
600 * that we boost the tasks blocking the current grace period
601 * in this case.
602 */
603 raw_spin_lock(&rnp_root->lock); /* irqs already disabled */
604 smp_mb__after_unlock_lock();
605 if (rnp_root->boost_tasks != NULL &&
606 rnp_root->boost_tasks != rnp_root->gp_tasks &&
607 rnp_root->boost_tasks != rnp_root->exp_tasks)
608 rnp_root->boost_tasks = rnp_root->gp_tasks;
609 raw_spin_unlock(&rnp_root->lock); /* irqs still disabled */
610#endif /* #ifdef CONFIG_RCU_BOOST */
611
612 return retval;
613}
614
615#endif /* #ifdef CONFIG_HOTPLUG_CPU */ 525#endif /* #ifdef CONFIG_HOTPLUG_CPU */
616 526
617/* 527/*
@@ -771,7 +681,7 @@ sync_rcu_preempt_exp_init(struct rcu_state *rsp, struct rcu_node *rnp)
771 681
772 raw_spin_lock_irqsave(&rnp->lock, flags); 682 raw_spin_lock_irqsave(&rnp->lock, flags);
773 smp_mb__after_unlock_lock(); 683 smp_mb__after_unlock_lock();
774 if (list_empty(&rnp->blkd_tasks)) { 684 if (!rcu_preempt_has_tasks(rnp)) {
775 raw_spin_unlock_irqrestore(&rnp->lock, flags); 685 raw_spin_unlock_irqrestore(&rnp->lock, flags);
776 } else { 686 } else {
777 rnp->exp_tasks = rnp->blkd_tasks.next; 687 rnp->exp_tasks = rnp->blkd_tasks.next;
@@ -933,15 +843,6 @@ static void __init rcu_bootup_announce(void)
933} 843}
934 844
935/* 845/*
936 * Return the number of RCU batches processed thus far for debug & stats.
937 */
938long rcu_batches_completed(void)
939{
940 return rcu_batches_completed_sched();
941}
942EXPORT_SYMBOL_GPL(rcu_batches_completed);
943
944/*
945 * Because preemptible RCU does not exist, we never have to check for 846 * Because preemptible RCU does not exist, we never have to check for
946 * CPUs being in quiescent states. 847 * CPUs being in quiescent states.
947 */ 848 */
@@ -960,11 +861,12 @@ static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp)
960 861
961#ifdef CONFIG_HOTPLUG_CPU 862#ifdef CONFIG_HOTPLUG_CPU
962 863
963/* Because preemptible RCU does not exist, no quieting of tasks. */ 864/*
964static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp, unsigned long flags) 865 * Because there is no preemptible RCU, there can be no readers blocked.
965 __releases(rnp->lock) 866 */
867static bool rcu_preempt_has_tasks(struct rcu_node *rnp)
966{ 868{
967 raw_spin_unlock_irqrestore(&rnp->lock, flags); 869 return false;
968} 870}
969 871
970#endif /* #ifdef CONFIG_HOTPLUG_CPU */ 872#endif /* #ifdef CONFIG_HOTPLUG_CPU */
@@ -996,23 +898,6 @@ static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp)
996 WARN_ON_ONCE(rnp->qsmask); 898 WARN_ON_ONCE(rnp->qsmask);
997} 899}
998 900
999#ifdef CONFIG_HOTPLUG_CPU
1000
1001/*
1002 * Because preemptible RCU does not exist, it never needs to migrate
1003 * tasks that were blocked within RCU read-side critical sections, and
1004 * such non-existent tasks cannot possibly have been blocking the current
1005 * grace period.
1006 */
1007static int rcu_preempt_offline_tasks(struct rcu_state *rsp,
1008 struct rcu_node *rnp,
1009 struct rcu_data *rdp)
1010{
1011 return 0;
1012}
1013
1014#endif /* #ifdef CONFIG_HOTPLUG_CPU */
1015
1016/* 901/*
1017 * Because preemptible RCU does not exist, it never has any callbacks 902 * Because preemptible RCU does not exist, it never has any callbacks
1018 * to check. 903 * to check.
@@ -1031,20 +916,6 @@ void synchronize_rcu_expedited(void)
1031} 916}
1032EXPORT_SYMBOL_GPL(synchronize_rcu_expedited); 917EXPORT_SYMBOL_GPL(synchronize_rcu_expedited);
1033 918
1034#ifdef CONFIG_HOTPLUG_CPU
1035
1036/*
1037 * Because preemptible RCU does not exist, there is never any need to
1038 * report on tasks preempted in RCU read-side critical sections during
1039 * expedited RCU grace periods.
1040 */
1041static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp,
1042 bool wake)
1043{
1044}
1045
1046#endif /* #ifdef CONFIG_HOTPLUG_CPU */
1047
1048/* 919/*
1049 * Because preemptible RCU does not exist, rcu_barrier() is just 920 * Because preemptible RCU does not exist, rcu_barrier() is just
1050 * another name for rcu_barrier_sched(). 921 * another name for rcu_barrier_sched().
@@ -1080,7 +951,7 @@ void exit_rcu(void)
1080 951
1081static void rcu_initiate_boost_trace(struct rcu_node *rnp) 952static void rcu_initiate_boost_trace(struct rcu_node *rnp)
1082{ 953{
1083 if (list_empty(&rnp->blkd_tasks)) 954 if (!rcu_preempt_has_tasks(rnp))
1084 rnp->n_balk_blkd_tasks++; 955 rnp->n_balk_blkd_tasks++;
1085 else if (rnp->exp_tasks == NULL && rnp->gp_tasks == NULL) 956 else if (rnp->exp_tasks == NULL && rnp->gp_tasks == NULL)
1086 rnp->n_balk_exp_gp_tasks++; 957 rnp->n_balk_exp_gp_tasks++;
@@ -1127,7 +998,8 @@ static int rcu_boost(struct rcu_node *rnp)
1127 struct task_struct *t; 998 struct task_struct *t;
1128 struct list_head *tb; 999 struct list_head *tb;
1129 1000
1130 if (rnp->exp_tasks == NULL && rnp->boost_tasks == NULL) 1001 if (ACCESS_ONCE(rnp->exp_tasks) == NULL &&
1002 ACCESS_ONCE(rnp->boost_tasks) == NULL)
1131 return 0; /* Nothing left to boost. */ 1003 return 0; /* Nothing left to boost. */
1132 1004
1133 raw_spin_lock_irqsave(&rnp->lock, flags); 1005 raw_spin_lock_irqsave(&rnp->lock, flags);
@@ -1175,15 +1047,11 @@ static int rcu_boost(struct rcu_node *rnp)
1175 */ 1047 */
1176 t = container_of(tb, struct task_struct, rcu_node_entry); 1048 t = container_of(tb, struct task_struct, rcu_node_entry);
1177 rt_mutex_init_proxy_locked(&rnp->boost_mtx, t); 1049 rt_mutex_init_proxy_locked(&rnp->boost_mtx, t);
1178 init_completion(&rnp->boost_completion);
1179 raw_spin_unlock_irqrestore(&rnp->lock, flags); 1050 raw_spin_unlock_irqrestore(&rnp->lock, flags);
1180 /* Lock only for side effect: boosts task t's priority. */ 1051 /* Lock only for side effect: boosts task t's priority. */
1181 rt_mutex_lock(&rnp->boost_mtx); 1052 rt_mutex_lock(&rnp->boost_mtx);
1182 rt_mutex_unlock(&rnp->boost_mtx); /* Then keep lockdep happy. */ 1053 rt_mutex_unlock(&rnp->boost_mtx); /* Then keep lockdep happy. */
1183 1054
1184 /* Wait for boostee to be done w/boost_mtx before reinitializing. */
1185 wait_for_completion(&rnp->boost_completion);
1186
1187 return ACCESS_ONCE(rnp->exp_tasks) != NULL || 1055 return ACCESS_ONCE(rnp->exp_tasks) != NULL ||
1188 ACCESS_ONCE(rnp->boost_tasks) != NULL; 1056 ACCESS_ONCE(rnp->boost_tasks) != NULL;
1189} 1057}
@@ -1416,12 +1284,8 @@ static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu)
1416 for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++, mask >>= 1) 1284 for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++, mask >>= 1)
1417 if ((mask & 0x1) && cpu != outgoingcpu) 1285 if ((mask & 0x1) && cpu != outgoingcpu)
1418 cpumask_set_cpu(cpu, cm); 1286 cpumask_set_cpu(cpu, cm);
1419 if (cpumask_weight(cm) == 0) { 1287 if (cpumask_weight(cm) == 0)
1420 cpumask_setall(cm); 1288 cpumask_setall(cm);
1421 for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++)
1422 cpumask_clear_cpu(cpu, cm);
1423 WARN_ON_ONCE(cpumask_weight(cm) == 0);
1424 }
1425 set_cpus_allowed_ptr(t, cm); 1289 set_cpus_allowed_ptr(t, cm);
1426 free_cpumask_var(cm); 1290 free_cpumask_var(cm);
1427} 1291}
@@ -1446,12 +1310,8 @@ static void __init rcu_spawn_boost_kthreads(void)
1446 for_each_possible_cpu(cpu) 1310 for_each_possible_cpu(cpu)
1447 per_cpu(rcu_cpu_has_work, cpu) = 0; 1311 per_cpu(rcu_cpu_has_work, cpu) = 0;
1448 BUG_ON(smpboot_register_percpu_thread(&rcu_cpu_thread_spec)); 1312 BUG_ON(smpboot_register_percpu_thread(&rcu_cpu_thread_spec));
1449 rnp = rcu_get_root(rcu_state_p); 1313 rcu_for_each_leaf_node(rcu_state_p, rnp)
1450 (void)rcu_spawn_one_boost_kthread(rcu_state_p, rnp); 1314 (void)rcu_spawn_one_boost_kthread(rcu_state_p, rnp);
1451 if (NUM_RCU_NODES > 1) {
1452 rcu_for_each_leaf_node(rcu_state_p, rnp)
1453 (void)rcu_spawn_one_boost_kthread(rcu_state_p, rnp);
1454 }
1455} 1315}
1456 1316
1457static void rcu_prepare_kthreads(int cpu) 1317static void rcu_prepare_kthreads(int cpu)
@@ -1605,7 +1465,8 @@ static bool __maybe_unused rcu_try_advance_all_cbs(void)
1605 * completed since we last checked and there are 1465 * completed since we last checked and there are
1606 * callbacks not yet ready to invoke. 1466 * callbacks not yet ready to invoke.
1607 */ 1467 */
1608 if (rdp->completed != rnp->completed && 1468 if ((rdp->completed != rnp->completed ||
1469 unlikely(ACCESS_ONCE(rdp->gpwrap))) &&
1609 rdp->nxttail[RCU_DONE_TAIL] != rdp->nxttail[RCU_NEXT_TAIL]) 1470 rdp->nxttail[RCU_DONE_TAIL] != rdp->nxttail[RCU_NEXT_TAIL])
1610 note_gp_changes(rsp, rdp); 1471 note_gp_changes(rsp, rdp);
1611 1472
@@ -1898,11 +1759,12 @@ static void print_cpu_stall_info(struct rcu_state *rsp, int cpu)
1898 ticks_value = rsp->gpnum - rdp->gpnum; 1759 ticks_value = rsp->gpnum - rdp->gpnum;
1899 } 1760 }
1900 print_cpu_stall_fast_no_hz(fast_no_hz, cpu); 1761 print_cpu_stall_fast_no_hz(fast_no_hz, cpu);
1901 pr_err("\t%d: (%lu %s) idle=%03x/%llx/%d softirq=%u/%u %s\n", 1762 pr_err("\t%d: (%lu %s) idle=%03x/%llx/%d softirq=%u/%u fqs=%ld %s\n",
1902 cpu, ticks_value, ticks_title, 1763 cpu, ticks_value, ticks_title,
1903 atomic_read(&rdtp->dynticks) & 0xfff, 1764 atomic_read(&rdtp->dynticks) & 0xfff,
1904 rdtp->dynticks_nesting, rdtp->dynticks_nmi_nesting, 1765 rdtp->dynticks_nesting, rdtp->dynticks_nmi_nesting,
1905 rdp->softirq_snap, kstat_softirqs_cpu(RCU_SOFTIRQ, cpu), 1766 rdp->softirq_snap, kstat_softirqs_cpu(RCU_SOFTIRQ, cpu),
1767 ACCESS_ONCE(rsp->n_force_qs) - rsp->n_force_qs_gpstart,
1906 fast_no_hz); 1768 fast_no_hz);
1907} 1769}
1908 1770
@@ -2056,9 +1918,26 @@ static void wake_nocb_leader(struct rcu_data *rdp, bool force)
2056static bool rcu_nocb_cpu_needs_barrier(struct rcu_state *rsp, int cpu) 1918static bool rcu_nocb_cpu_needs_barrier(struct rcu_state *rsp, int cpu)
2057{ 1919{
2058 struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); 1920 struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu);
1921 unsigned long ret;
1922#ifdef CONFIG_PROVE_RCU
2059 struct rcu_head *rhp; 1923 struct rcu_head *rhp;
1924#endif /* #ifdef CONFIG_PROVE_RCU */
2060 1925
2061 /* No-CBs CPUs might have callbacks on any of three lists. */ 1926 /*
1927 * Check count of all no-CBs callbacks awaiting invocation.
1928 * There needs to be a barrier before this function is called,
1929 * but associated with a prior determination that no more
1930 * callbacks would be posted. In the worst case, the first
1931 * barrier in _rcu_barrier() suffices (but the caller cannot
1932 * necessarily rely on this, not a substitute for the caller
1933 * getting the concurrency design right!). There must also be
1934 * a barrier between the following load an posting of a callback
1935 * (if a callback is in fact needed). This is associated with an
1936 * atomic_inc() in the caller.
1937 */
1938 ret = atomic_long_read(&rdp->nocb_q_count);
1939
1940#ifdef CONFIG_PROVE_RCU
2062 rhp = ACCESS_ONCE(rdp->nocb_head); 1941 rhp = ACCESS_ONCE(rdp->nocb_head);
2063 if (!rhp) 1942 if (!rhp)
2064 rhp = ACCESS_ONCE(rdp->nocb_gp_head); 1943 rhp = ACCESS_ONCE(rdp->nocb_gp_head);
@@ -2072,8 +1951,9 @@ static bool rcu_nocb_cpu_needs_barrier(struct rcu_state *rsp, int cpu)
2072 cpu, rhp->func); 1951 cpu, rhp->func);
2073 WARN_ON_ONCE(1); 1952 WARN_ON_ONCE(1);
2074 } 1953 }
1954#endif /* #ifdef CONFIG_PROVE_RCU */
2075 1955
2076 return !!rhp; 1956 return !!ret;
2077} 1957}
2078 1958
2079/* 1959/*
@@ -2095,9 +1975,10 @@ static void __call_rcu_nocb_enqueue(struct rcu_data *rdp,
2095 struct task_struct *t; 1975 struct task_struct *t;
2096 1976
2097 /* Enqueue the callback on the nocb list and update counts. */ 1977 /* Enqueue the callback on the nocb list and update counts. */
1978 atomic_long_add(rhcount, &rdp->nocb_q_count);
1979 /* rcu_barrier() relies on ->nocb_q_count add before xchg. */
2098 old_rhpp = xchg(&rdp->nocb_tail, rhtp); 1980 old_rhpp = xchg(&rdp->nocb_tail, rhtp);
2099 ACCESS_ONCE(*old_rhpp) = rhp; 1981 ACCESS_ONCE(*old_rhpp) = rhp;
2100 atomic_long_add(rhcount, &rdp->nocb_q_count);
2101 atomic_long_add(rhcount_lazy, &rdp->nocb_q_count_lazy); 1982 atomic_long_add(rhcount_lazy, &rdp->nocb_q_count_lazy);
2102 smp_mb__after_atomic(); /* Store *old_rhpp before _wake test. */ 1983 smp_mb__after_atomic(); /* Store *old_rhpp before _wake test. */
2103 1984
@@ -2288,9 +2169,6 @@ wait_again:
2288 /* Move callbacks to wait-for-GP list, which is empty. */ 2169 /* Move callbacks to wait-for-GP list, which is empty. */
2289 ACCESS_ONCE(rdp->nocb_head) = NULL; 2170 ACCESS_ONCE(rdp->nocb_head) = NULL;
2290 rdp->nocb_gp_tail = xchg(&rdp->nocb_tail, &rdp->nocb_head); 2171 rdp->nocb_gp_tail = xchg(&rdp->nocb_tail, &rdp->nocb_head);
2291 rdp->nocb_gp_count = atomic_long_xchg(&rdp->nocb_q_count, 0);
2292 rdp->nocb_gp_count_lazy =
2293 atomic_long_xchg(&rdp->nocb_q_count_lazy, 0);
2294 gotcbs = true; 2172 gotcbs = true;
2295 } 2173 }
2296 2174
@@ -2338,9 +2216,6 @@ wait_again:
2338 /* Append callbacks to follower's "done" list. */ 2216 /* Append callbacks to follower's "done" list. */
2339 tail = xchg(&rdp->nocb_follower_tail, rdp->nocb_gp_tail); 2217 tail = xchg(&rdp->nocb_follower_tail, rdp->nocb_gp_tail);
2340 *tail = rdp->nocb_gp_head; 2218 *tail = rdp->nocb_gp_head;
2341 atomic_long_add(rdp->nocb_gp_count, &rdp->nocb_follower_count);
2342 atomic_long_add(rdp->nocb_gp_count_lazy,
2343 &rdp->nocb_follower_count_lazy);
2344 smp_mb__after_atomic(); /* Store *tail before wakeup. */ 2219 smp_mb__after_atomic(); /* Store *tail before wakeup. */
2345 if (rdp != my_rdp && tail == &rdp->nocb_follower_head) { 2220 if (rdp != my_rdp && tail == &rdp->nocb_follower_head) {
2346 /* 2221 /*
@@ -2415,13 +2290,11 @@ static int rcu_nocb_kthread(void *arg)
2415 trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, "WokeNonEmpty"); 2290 trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, "WokeNonEmpty");
2416 ACCESS_ONCE(rdp->nocb_follower_head) = NULL; 2291 ACCESS_ONCE(rdp->nocb_follower_head) = NULL;
2417 tail = xchg(&rdp->nocb_follower_tail, &rdp->nocb_follower_head); 2292 tail = xchg(&rdp->nocb_follower_tail, &rdp->nocb_follower_head);
2418 c = atomic_long_xchg(&rdp->nocb_follower_count, 0);
2419 cl = atomic_long_xchg(&rdp->nocb_follower_count_lazy, 0);
2420 rdp->nocb_p_count += c;
2421 rdp->nocb_p_count_lazy += cl;
2422 2293
2423 /* Each pass through the following loop invokes a callback. */ 2294 /* Each pass through the following loop invokes a callback. */
2424 trace_rcu_batch_start(rdp->rsp->name, cl, c, -1); 2295 trace_rcu_batch_start(rdp->rsp->name,
2296 atomic_long_read(&rdp->nocb_q_count_lazy),
2297 atomic_long_read(&rdp->nocb_q_count), -1);
2425 c = cl = 0; 2298 c = cl = 0;
2426 while (list) { 2299 while (list) {
2427 next = list->next; 2300 next = list->next;
@@ -2443,9 +2316,9 @@ static int rcu_nocb_kthread(void *arg)
2443 list = next; 2316 list = next;
2444 } 2317 }
2445 trace_rcu_batch_end(rdp->rsp->name, c, !!list, 0, 0, 1); 2318 trace_rcu_batch_end(rdp->rsp->name, c, !!list, 0, 0, 1);
2446 ACCESS_ONCE(rdp->nocb_p_count) = rdp->nocb_p_count - c; 2319 smp_mb__before_atomic(); /* _add after CB invocation. */
2447 ACCESS_ONCE(rdp->nocb_p_count_lazy) = 2320 atomic_long_add(-c, &rdp->nocb_q_count);
2448 rdp->nocb_p_count_lazy - cl; 2321 atomic_long_add(-cl, &rdp->nocb_q_count_lazy);
2449 rdp->n_nocbs_invoked += c; 2322 rdp->n_nocbs_invoked += c;
2450 } 2323 }
2451 return 0; 2324 return 0;
diff --git a/kernel/rcu/tree_trace.c b/kernel/rcu/tree_trace.c
index 5cdc62e1beeb..fbb6240509ea 100644
--- a/kernel/rcu/tree_trace.c
+++ b/kernel/rcu/tree_trace.c
@@ -46,6 +46,8 @@
46#define RCU_TREE_NONCORE 46#define RCU_TREE_NONCORE
47#include "tree.h" 47#include "tree.h"
48 48
49DECLARE_PER_CPU_SHARED_ALIGNED(unsigned long, rcu_qs_ctr);
50
49static int r_open(struct inode *inode, struct file *file, 51static int r_open(struct inode *inode, struct file *file,
50 const struct seq_operations *op) 52 const struct seq_operations *op)
51{ 53{
@@ -115,11 +117,13 @@ static void print_one_rcu_data(struct seq_file *m, struct rcu_data *rdp)
115 117
116 if (!rdp->beenonline) 118 if (!rdp->beenonline)
117 return; 119 return;
118 seq_printf(m, "%3d%cc=%ld g=%ld pq=%d qp=%d", 120 seq_printf(m, "%3d%cc=%ld g=%ld pq=%d/%d qp=%d",
119 rdp->cpu, 121 rdp->cpu,
120 cpu_is_offline(rdp->cpu) ? '!' : ' ', 122 cpu_is_offline(rdp->cpu) ? '!' : ' ',
121 ulong2long(rdp->completed), ulong2long(rdp->gpnum), 123 ulong2long(rdp->completed), ulong2long(rdp->gpnum),
122 rdp->passed_quiesce, rdp->qs_pending); 124 rdp->passed_quiesce,
125 rdp->rcu_qs_ctr_snap == per_cpu(rcu_qs_ctr, rdp->cpu),
126 rdp->qs_pending);
123 seq_printf(m, " dt=%d/%llx/%d df=%lu", 127 seq_printf(m, " dt=%d/%llx/%d df=%lu",
124 atomic_read(&rdp->dynticks->dynticks), 128 atomic_read(&rdp->dynticks->dynticks),
125 rdp->dynticks->dynticks_nesting, 129 rdp->dynticks->dynticks_nesting,
diff --git a/kernel/resource.c b/kernel/resource.c
index 0bcebffc4e77..19f2357dfda3 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -22,6 +22,7 @@
22#include <linux/device.h> 22#include <linux/device.h>
23#include <linux/pfn.h> 23#include <linux/pfn.h>
24#include <linux/mm.h> 24#include <linux/mm.h>
25#include <linux/resource_ext.h>
25#include <asm/io.h> 26#include <asm/io.h>
26 27
27 28
@@ -1529,6 +1530,30 @@ int iomem_is_exclusive(u64 addr)
1529 return err; 1530 return err;
1530} 1531}
1531 1532
1533struct resource_entry *resource_list_create_entry(struct resource *res,
1534 size_t extra_size)
1535{
1536 struct resource_entry *entry;
1537
1538 entry = kzalloc(sizeof(*entry) + extra_size, GFP_KERNEL);
1539 if (entry) {
1540 INIT_LIST_HEAD(&entry->node);
1541 entry->res = res ? res : &entry->__res;
1542 }
1543
1544 return entry;
1545}
1546EXPORT_SYMBOL(resource_list_create_entry);
1547
1548void resource_list_free(struct list_head *head)
1549{
1550 struct resource_entry *entry, *tmp;
1551
1552 list_for_each_entry_safe(entry, tmp, head, node)
1553 resource_list_destroy_entry(entry);
1554}
1555EXPORT_SYMBOL(resource_list_free);
1556
1532static int __init strict_iomem(char *str) 1557static int __init strict_iomem(char *str)
1533{ 1558{
1534 if (strstr(str, "relaxed")) 1559 if (strstr(str, "relaxed"))
diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile
index ab32b7b0db5c..46be87024875 100644
--- a/kernel/sched/Makefile
+++ b/kernel/sched/Makefile
@@ -1,5 +1,5 @@
1ifdef CONFIG_FUNCTION_TRACER 1ifdef CONFIG_FUNCTION_TRACER
2CFLAGS_REMOVE_clock.o = -pg 2CFLAGS_REMOVE_clock.o = $(CC_FLAGS_FTRACE)
3endif 3endif
4 4
5ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y) 5ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y)
diff --git a/kernel/sched/clock.c b/kernel/sched/clock.c
index c27e4f8f4879..c0a205101c23 100644
--- a/kernel/sched/clock.c
+++ b/kernel/sched/clock.c
@@ -420,3 +420,16 @@ u64 local_clock(void)
420 420
421EXPORT_SYMBOL_GPL(cpu_clock); 421EXPORT_SYMBOL_GPL(cpu_clock);
422EXPORT_SYMBOL_GPL(local_clock); 422EXPORT_SYMBOL_GPL(local_clock);
423
424/*
425 * Running clock - returns the time that has elapsed while a guest has been
426 * running.
427 * On a guest this value should be local_clock minus the time the guest was
428 * suspended by the hypervisor (for any reason).
429 * On bare metal this function should return the same as local_clock.
430 * Architectures and sub-architectures can override this.
431 */
432u64 __weak running_clock(void)
433{
434 return local_clock();
435}
diff --git a/kernel/sched/completion.c b/kernel/sched/completion.c
index 607f852b4d04..7052d3fd4e7b 100644
--- a/kernel/sched/completion.c
+++ b/kernel/sched/completion.c
@@ -268,6 +268,15 @@ bool try_wait_for_completion(struct completion *x)
268 unsigned long flags; 268 unsigned long flags;
269 int ret = 1; 269 int ret = 1;
270 270
271 /*
272 * Since x->done will need to be locked only
273 * in the non-blocking case, we check x->done
274 * first without taking the lock so we can
275 * return early in the blocking case.
276 */
277 if (!ACCESS_ONCE(x->done))
278 return 0;
279
271 spin_lock_irqsave(&x->wait.lock, flags); 280 spin_lock_irqsave(&x->wait.lock, flags);
272 if (!x->done) 281 if (!x->done)
273 ret = 0; 282 ret = 0;
@@ -288,13 +297,6 @@ EXPORT_SYMBOL(try_wait_for_completion);
288 */ 297 */
289bool completion_done(struct completion *x) 298bool completion_done(struct completion *x)
290{ 299{
291 unsigned long flags; 300 return !!ACCESS_ONCE(x->done);
292 int ret = 1;
293
294 spin_lock_irqsave(&x->wait.lock, flags);
295 if (!x->done)
296 ret = 0;
297 spin_unlock_irqrestore(&x->wait.lock, flags);
298 return ret;
299} 301}
300EXPORT_SYMBOL(completion_done); 302EXPORT_SYMBOL(completion_done);
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index c0accc00566e..1f37fe7f77a4 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -119,7 +119,9 @@ void update_rq_clock(struct rq *rq)
119{ 119{
120 s64 delta; 120 s64 delta;
121 121
122 if (rq->skip_clock_update > 0) 122 lockdep_assert_held(&rq->lock);
123
124 if (rq->clock_skip_update & RQCF_ACT_SKIP)
123 return; 125 return;
124 126
125 delta = sched_clock_cpu(cpu_of(rq)) - rq->clock; 127 delta = sched_clock_cpu(cpu_of(rq)) - rq->clock;
@@ -490,6 +492,11 @@ static __init void init_hrtick(void)
490 */ 492 */
491void hrtick_start(struct rq *rq, u64 delay) 493void hrtick_start(struct rq *rq, u64 delay)
492{ 494{
495 /*
496 * Don't schedule slices shorter than 10000ns, that just
497 * doesn't make sense. Rely on vruntime for fairness.
498 */
499 delay = max_t(u64, delay, 10000LL);
493 __hrtimer_start_range_ns(&rq->hrtick_timer, ns_to_ktime(delay), 0, 500 __hrtimer_start_range_ns(&rq->hrtick_timer, ns_to_ktime(delay), 0,
494 HRTIMER_MODE_REL_PINNED, 0); 501 HRTIMER_MODE_REL_PINNED, 0);
495} 502}
@@ -1046,7 +1053,7 @@ void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
1046 * this case, we can save a useless back to back clock update. 1053 * this case, we can save a useless back to back clock update.
1047 */ 1054 */
1048 if (task_on_rq_queued(rq->curr) && test_tsk_need_resched(rq->curr)) 1055 if (task_on_rq_queued(rq->curr) && test_tsk_need_resched(rq->curr))
1049 rq->skip_clock_update = 1; 1056 rq_clock_skip_update(rq, true);
1050} 1057}
1051 1058
1052#ifdef CONFIG_SMP 1059#ifdef CONFIG_SMP
@@ -1082,7 +1089,7 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
1082 if (p->sched_class->migrate_task_rq) 1089 if (p->sched_class->migrate_task_rq)
1083 p->sched_class->migrate_task_rq(p, new_cpu); 1090 p->sched_class->migrate_task_rq(p, new_cpu);
1084 p->se.nr_migrations++; 1091 p->se.nr_migrations++;
1085 perf_sw_event(PERF_COUNT_SW_CPU_MIGRATIONS, 1, NULL, 0); 1092 perf_sw_event_sched(PERF_COUNT_SW_CPU_MIGRATIONS, 1, 0);
1086 } 1093 }
1087 1094
1088 __set_task_cpu(p, new_cpu); 1095 __set_task_cpu(p, new_cpu);
@@ -1814,6 +1821,10 @@ void __dl_clear_params(struct task_struct *p)
1814 dl_se->dl_period = 0; 1821 dl_se->dl_period = 0;
1815 dl_se->flags = 0; 1822 dl_se->flags = 0;
1816 dl_se->dl_bw = 0; 1823 dl_se->dl_bw = 0;
1824
1825 dl_se->dl_throttled = 0;
1826 dl_se->dl_new = 1;
1827 dl_se->dl_yielded = 0;
1817} 1828}
1818 1829
1819/* 1830/*
@@ -1832,6 +1843,9 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
1832 p->se.prev_sum_exec_runtime = 0; 1843 p->se.prev_sum_exec_runtime = 0;
1833 p->se.nr_migrations = 0; 1844 p->se.nr_migrations = 0;
1834 p->se.vruntime = 0; 1845 p->se.vruntime = 0;
1846#ifdef CONFIG_SMP
1847 p->se.avg.decay_count = 0;
1848#endif
1835 INIT_LIST_HEAD(&p->se.group_node); 1849 INIT_LIST_HEAD(&p->se.group_node);
1836 1850
1837#ifdef CONFIG_SCHEDSTATS 1851#ifdef CONFIG_SCHEDSTATS
@@ -1839,7 +1853,7 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
1839#endif 1853#endif
1840 1854
1841 RB_CLEAR_NODE(&p->dl.rb_node); 1855 RB_CLEAR_NODE(&p->dl.rb_node);
1842 hrtimer_init(&p->dl.dl_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); 1856 init_dl_task_timer(&p->dl);
1843 __dl_clear_params(p); 1857 __dl_clear_params(p);
1844 1858
1845 INIT_LIST_HEAD(&p->rt.run_list); 1859 INIT_LIST_HEAD(&p->rt.run_list);
@@ -2049,6 +2063,9 @@ static inline int dl_bw_cpus(int i)
2049 * allocated bandwidth to reflect the new situation. 2063 * allocated bandwidth to reflect the new situation.
2050 * 2064 *
2051 * This function is called while holding p's rq->lock. 2065 * This function is called while holding p's rq->lock.
2066 *
2067 * XXX we should delay bw change until the task's 0-lag point, see
2068 * __setparam_dl().
2052 */ 2069 */
2053static int dl_overflow(struct task_struct *p, int policy, 2070static int dl_overflow(struct task_struct *p, int policy,
2054 const struct sched_attr *attr) 2071 const struct sched_attr *attr)
@@ -2748,6 +2765,10 @@ again:
2748 * - explicit schedule() call 2765 * - explicit schedule() call
2749 * - return from syscall or exception to user-space 2766 * - return from syscall or exception to user-space
2750 * - return from interrupt-handler to user-space 2767 * - return from interrupt-handler to user-space
2768 *
2769 * WARNING: all callers must re-check need_resched() afterward and reschedule
2770 * accordingly in case an event triggered the need for rescheduling (such as
2771 * an interrupt waking up a task) while preemption was disabled in __schedule().
2751 */ 2772 */
2752static void __sched __schedule(void) 2773static void __sched __schedule(void)
2753{ 2774{
@@ -2756,7 +2777,6 @@ static void __sched __schedule(void)
2756 struct rq *rq; 2777 struct rq *rq;
2757 int cpu; 2778 int cpu;
2758 2779
2759need_resched:
2760 preempt_disable(); 2780 preempt_disable();
2761 cpu = smp_processor_id(); 2781 cpu = smp_processor_id();
2762 rq = cpu_rq(cpu); 2782 rq = cpu_rq(cpu);
@@ -2776,6 +2796,8 @@ need_resched:
2776 smp_mb__before_spinlock(); 2796 smp_mb__before_spinlock();
2777 raw_spin_lock_irq(&rq->lock); 2797 raw_spin_lock_irq(&rq->lock);
2778 2798
2799 rq->clock_skip_update <<= 1; /* promote REQ to ACT */
2800
2779 switch_count = &prev->nivcsw; 2801 switch_count = &prev->nivcsw;
2780 if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) { 2802 if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) {
2781 if (unlikely(signal_pending_state(prev->state, prev))) { 2803 if (unlikely(signal_pending_state(prev->state, prev))) {
@@ -2800,13 +2822,13 @@ need_resched:
2800 switch_count = &prev->nvcsw; 2822 switch_count = &prev->nvcsw;
2801 } 2823 }
2802 2824
2803 if (task_on_rq_queued(prev) || rq->skip_clock_update < 0) 2825 if (task_on_rq_queued(prev))
2804 update_rq_clock(rq); 2826 update_rq_clock(rq);
2805 2827
2806 next = pick_next_task(rq, prev); 2828 next = pick_next_task(rq, prev);
2807 clear_tsk_need_resched(prev); 2829 clear_tsk_need_resched(prev);
2808 clear_preempt_need_resched(); 2830 clear_preempt_need_resched();
2809 rq->skip_clock_update = 0; 2831 rq->clock_skip_update = 0;
2810 2832
2811 if (likely(prev != next)) { 2833 if (likely(prev != next)) {
2812 rq->nr_switches++; 2834 rq->nr_switches++;
@@ -2821,8 +2843,6 @@ need_resched:
2821 post_schedule(rq); 2843 post_schedule(rq);
2822 2844
2823 sched_preempt_enable_no_resched(); 2845 sched_preempt_enable_no_resched();
2824 if (need_resched())
2825 goto need_resched;
2826} 2846}
2827 2847
2828static inline void sched_submit_work(struct task_struct *tsk) 2848static inline void sched_submit_work(struct task_struct *tsk)
@@ -2842,7 +2862,9 @@ asmlinkage __visible void __sched schedule(void)
2842 struct task_struct *tsk = current; 2862 struct task_struct *tsk = current;
2843 2863
2844 sched_submit_work(tsk); 2864 sched_submit_work(tsk);
2845 __schedule(); 2865 do {
2866 __schedule();
2867 } while (need_resched());
2846} 2868}
2847EXPORT_SYMBOL(schedule); 2869EXPORT_SYMBOL(schedule);
2848 2870
@@ -2877,6 +2899,21 @@ void __sched schedule_preempt_disabled(void)
2877 preempt_disable(); 2899 preempt_disable();
2878} 2900}
2879 2901
2902static void preempt_schedule_common(void)
2903{
2904 do {
2905 __preempt_count_add(PREEMPT_ACTIVE);
2906 __schedule();
2907 __preempt_count_sub(PREEMPT_ACTIVE);
2908
2909 /*
2910 * Check again in case we missed a preemption opportunity
2911 * between schedule and now.
2912 */
2913 barrier();
2914 } while (need_resched());
2915}
2916
2880#ifdef CONFIG_PREEMPT 2917#ifdef CONFIG_PREEMPT
2881/* 2918/*
2882 * this is the entry point to schedule() from in-kernel preemption 2919 * this is the entry point to schedule() from in-kernel preemption
@@ -2892,17 +2929,7 @@ asmlinkage __visible void __sched notrace preempt_schedule(void)
2892 if (likely(!preemptible())) 2929 if (likely(!preemptible()))
2893 return; 2930 return;
2894 2931
2895 do { 2932 preempt_schedule_common();
2896 __preempt_count_add(PREEMPT_ACTIVE);
2897 __schedule();
2898 __preempt_count_sub(PREEMPT_ACTIVE);
2899
2900 /*
2901 * Check again in case we missed a preemption opportunity
2902 * between schedule and now.
2903 */
2904 barrier();
2905 } while (need_resched());
2906} 2933}
2907NOKPROBE_SYMBOL(preempt_schedule); 2934NOKPROBE_SYMBOL(preempt_schedule);
2908EXPORT_SYMBOL(preempt_schedule); 2935EXPORT_SYMBOL(preempt_schedule);
@@ -3251,15 +3278,31 @@ __setparam_dl(struct task_struct *p, const struct sched_attr *attr)
3251{ 3278{
3252 struct sched_dl_entity *dl_se = &p->dl; 3279 struct sched_dl_entity *dl_se = &p->dl;
3253 3280
3254 init_dl_task_timer(dl_se);
3255 dl_se->dl_runtime = attr->sched_runtime; 3281 dl_se->dl_runtime = attr->sched_runtime;
3256 dl_se->dl_deadline = attr->sched_deadline; 3282 dl_se->dl_deadline = attr->sched_deadline;
3257 dl_se->dl_period = attr->sched_period ?: dl_se->dl_deadline; 3283 dl_se->dl_period = attr->sched_period ?: dl_se->dl_deadline;
3258 dl_se->flags = attr->sched_flags; 3284 dl_se->flags = attr->sched_flags;
3259 dl_se->dl_bw = to_ratio(dl_se->dl_period, dl_se->dl_runtime); 3285 dl_se->dl_bw = to_ratio(dl_se->dl_period, dl_se->dl_runtime);
3260 dl_se->dl_throttled = 0; 3286
3261 dl_se->dl_new = 1; 3287 /*
3262 dl_se->dl_yielded = 0; 3288 * Changing the parameters of a task is 'tricky' and we're not doing
3289 * the correct thing -- also see task_dead_dl() and switched_from_dl().
3290 *
3291 * What we SHOULD do is delay the bandwidth release until the 0-lag
3292 * point. This would include retaining the task_struct until that time
3293 * and change dl_overflow() to not immediately decrement the current
3294 * amount.
3295 *
3296 * Instead we retain the current runtime/deadline and let the new
3297 * parameters take effect after the current reservation period lapses.
3298 * This is safe (albeit pessimistic) because the 0-lag point is always
3299 * before the current scheduling deadline.
3300 *
3301 * We can still have temporary overloads because we do not delay the
3302 * change in bandwidth until that time; so admission control is
3303 * not on the safe side. It does however guarantee tasks will never
3304 * consume more than promised.
3305 */
3263} 3306}
3264 3307
3265/* 3308/*
@@ -3382,6 +3425,20 @@ static bool check_same_owner(struct task_struct *p)
3382 return match; 3425 return match;
3383} 3426}
3384 3427
3428static bool dl_param_changed(struct task_struct *p,
3429 const struct sched_attr *attr)
3430{
3431 struct sched_dl_entity *dl_se = &p->dl;
3432
3433 if (dl_se->dl_runtime != attr->sched_runtime ||
3434 dl_se->dl_deadline != attr->sched_deadline ||
3435 dl_se->dl_period != attr->sched_period ||
3436 dl_se->flags != attr->sched_flags)
3437 return true;
3438
3439 return false;
3440}
3441
3385static int __sched_setscheduler(struct task_struct *p, 3442static int __sched_setscheduler(struct task_struct *p,
3386 const struct sched_attr *attr, 3443 const struct sched_attr *attr,
3387 bool user) 3444 bool user)
@@ -3510,7 +3567,7 @@ recheck:
3510 goto change; 3567 goto change;
3511 if (rt_policy(policy) && attr->sched_priority != p->rt_priority) 3568 if (rt_policy(policy) && attr->sched_priority != p->rt_priority)
3512 goto change; 3569 goto change;
3513 if (dl_policy(policy)) 3570 if (dl_policy(policy) && dl_param_changed(p, attr))
3514 goto change; 3571 goto change;
3515 3572
3516 p->sched_reset_on_fork = reset_on_fork; 3573 p->sched_reset_on_fork = reset_on_fork;
@@ -4202,17 +4259,10 @@ SYSCALL_DEFINE0(sched_yield)
4202 return 0; 4259 return 0;
4203} 4260}
4204 4261
4205static void __cond_resched(void)
4206{
4207 __preempt_count_add(PREEMPT_ACTIVE);
4208 __schedule();
4209 __preempt_count_sub(PREEMPT_ACTIVE);
4210}
4211
4212int __sched _cond_resched(void) 4262int __sched _cond_resched(void)
4213{ 4263{
4214 if (should_resched()) { 4264 if (should_resched()) {
4215 __cond_resched(); 4265 preempt_schedule_common();
4216 return 1; 4266 return 1;
4217 } 4267 }
4218 return 0; 4268 return 0;
@@ -4237,7 +4287,7 @@ int __cond_resched_lock(spinlock_t *lock)
4237 if (spin_needbreak(lock) || resched) { 4287 if (spin_needbreak(lock) || resched) {
4238 spin_unlock(lock); 4288 spin_unlock(lock);
4239 if (resched) 4289 if (resched)
4240 __cond_resched(); 4290 preempt_schedule_common();
4241 else 4291 else
4242 cpu_relax(); 4292 cpu_relax();
4243 ret = 1; 4293 ret = 1;
@@ -4253,7 +4303,7 @@ int __sched __cond_resched_softirq(void)
4253 4303
4254 if (should_resched()) { 4304 if (should_resched()) {
4255 local_bh_enable(); 4305 local_bh_enable();
4256 __cond_resched(); 4306 preempt_schedule_common();
4257 local_bh_disable(); 4307 local_bh_disable();
4258 return 1; 4308 return 1;
4259 } 4309 }
@@ -4508,9 +4558,10 @@ void sched_show_task(struct task_struct *p)
4508{ 4558{
4509 unsigned long free = 0; 4559 unsigned long free = 0;
4510 int ppid; 4560 int ppid;
4511 unsigned state; 4561 unsigned long state = p->state;
4512 4562
4513 state = p->state ? __ffs(p->state) + 1 : 0; 4563 if (state)
4564 state = __ffs(state) + 1;
4514 printk(KERN_INFO "%-15.15s %c", p->comm, 4565 printk(KERN_INFO "%-15.15s %c", p->comm,
4515 state < sizeof(stat_nam) - 1 ? stat_nam[state] : '?'); 4566 state < sizeof(stat_nam) - 1 ? stat_nam[state] : '?');
4516#if BITS_PER_LONG == 32 4567#if BITS_PER_LONG == 32
@@ -4642,6 +4693,9 @@ int cpuset_cpumask_can_shrink(const struct cpumask *cur,
4642 struct dl_bw *cur_dl_b; 4693 struct dl_bw *cur_dl_b;
4643 unsigned long flags; 4694 unsigned long flags;
4644 4695
4696 if (!cpumask_weight(cur))
4697 return ret;
4698
4645 rcu_read_lock_sched(); 4699 rcu_read_lock_sched();
4646 cur_dl_b = dl_bw_of(cpumask_any(cur)); 4700 cur_dl_b = dl_bw_of(cpumask_any(cur));
4647 trial_cpus = cpumask_weight(trial); 4701 trial_cpus = cpumask_weight(trial);
@@ -4740,7 +4794,7 @@ static struct rq *move_queued_task(struct task_struct *p, int new_cpu)
4740 4794
4741void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask) 4795void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
4742{ 4796{
4743 if (p->sched_class && p->sched_class->set_cpus_allowed) 4797 if (p->sched_class->set_cpus_allowed)
4744 p->sched_class->set_cpus_allowed(p, new_mask); 4798 p->sched_class->set_cpus_allowed(p, new_mask);
4745 4799
4746 cpumask_copy(&p->cpus_allowed, new_mask); 4800 cpumask_copy(&p->cpus_allowed, new_mask);
@@ -7250,6 +7304,11 @@ void __init sched_init(void)
7250 enter_lazy_tlb(&init_mm, current); 7304 enter_lazy_tlb(&init_mm, current);
7251 7305
7252 /* 7306 /*
7307 * During early bootup we pretend to be a normal task:
7308 */
7309 current->sched_class = &fair_sched_class;
7310
7311 /*
7253 * Make us the idle thread. Technically, schedule() should not be 7312 * Make us the idle thread. Technically, schedule() should not be
7254 * called from this thread, however somewhere below it might be, 7313 * called from this thread, however somewhere below it might be,
7255 * but because we are the idle thread, we just pick up running again 7314 * but because we are the idle thread, we just pick up running again
@@ -7259,11 +7318,6 @@ void __init sched_init(void)
7259 7318
7260 calc_load_update = jiffies + LOAD_FREQ; 7319 calc_load_update = jiffies + LOAD_FREQ;
7261 7320
7262 /*
7263 * During early bootup we pretend to be a normal task:
7264 */
7265 current->sched_class = &fair_sched_class;
7266
7267#ifdef CONFIG_SMP 7321#ifdef CONFIG_SMP
7268 zalloc_cpumask_var(&sched_domains_tmpmask, GFP_NOWAIT); 7322 zalloc_cpumask_var(&sched_domains_tmpmask, GFP_NOWAIT);
7269 /* May be allocated at isolcpus cmdline parse time */ 7323 /* May be allocated at isolcpus cmdline parse time */
@@ -7292,13 +7346,12 @@ void __might_sleep(const char *file, int line, int preempt_offset)
7292 * since we will exit with TASK_RUNNING make sure we enter with it, 7346 * since we will exit with TASK_RUNNING make sure we enter with it,
7293 * otherwise we will destroy state. 7347 * otherwise we will destroy state.
7294 */ 7348 */
7295 if (WARN_ONCE(current->state != TASK_RUNNING, 7349 WARN_ONCE(current->state != TASK_RUNNING && current->task_state_change,
7296 "do not call blocking ops when !TASK_RUNNING; " 7350 "do not call blocking ops when !TASK_RUNNING; "
7297 "state=%lx set at [<%p>] %pS\n", 7351 "state=%lx set at [<%p>] %pS\n",
7298 current->state, 7352 current->state,
7299 (void *)current->task_state_change, 7353 (void *)current->task_state_change,
7300 (void *)current->task_state_change)) 7354 (void *)current->task_state_change);
7301 __set_current_state(TASK_RUNNING);
7302 7355
7303 ___might_sleep(file, line, preempt_offset); 7356 ___might_sleep(file, line, preempt_offset);
7304} 7357}
@@ -7325,6 +7378,9 @@ void ___might_sleep(const char *file, int line, int preempt_offset)
7325 in_atomic(), irqs_disabled(), 7378 in_atomic(), irqs_disabled(),
7326 current->pid, current->comm); 7379 current->pid, current->comm);
7327 7380
7381 if (task_stack_end_corrupted(current))
7382 printk(KERN_EMERG "Thread overran stack, or stack corrupted\n");
7383
7328 debug_show_held_locks(current); 7384 debug_show_held_locks(current);
7329 if (irqs_disabled()) 7385 if (irqs_disabled())
7330 print_irqtrace_events(current); 7386 print_irqtrace_events(current);
diff --git a/kernel/sched/cpudeadline.c b/kernel/sched/cpudeadline.c
index 539ca3ce071b..c6acb07466bb 100644
--- a/kernel/sched/cpudeadline.c
+++ b/kernel/sched/cpudeadline.c
@@ -107,7 +107,8 @@ int cpudl_find(struct cpudl *cp, struct task_struct *p,
107 int best_cpu = -1; 107 int best_cpu = -1;
108 const struct sched_dl_entity *dl_se = &p->dl; 108 const struct sched_dl_entity *dl_se = &p->dl;
109 109
110 if (later_mask && cpumask_and(later_mask, later_mask, cp->free_cpus)) { 110 if (later_mask &&
111 cpumask_and(later_mask, cp->free_cpus, &p->cpus_allowed)) {
111 best_cpu = cpumask_any(later_mask); 112 best_cpu = cpumask_any(later_mask);
112 goto out; 113 goto out;
113 } else if (cpumask_test_cpu(cpudl_maximum(cp), &p->cpus_allowed) && 114 } else if (cpumask_test_cpu(cpudl_maximum(cp), &p->cpus_allowed) &&
@@ -186,6 +187,26 @@ out:
186} 187}
187 188
188/* 189/*
190 * cpudl_set_freecpu - Set the cpudl.free_cpus
191 * @cp: the cpudl max-heap context
192 * @cpu: rd attached cpu
193 */
194void cpudl_set_freecpu(struct cpudl *cp, int cpu)
195{
196 cpumask_set_cpu(cpu, cp->free_cpus);
197}
198
199/*
200 * cpudl_clear_freecpu - Clear the cpudl.free_cpus
201 * @cp: the cpudl max-heap context
202 * @cpu: rd attached cpu
203 */
204void cpudl_clear_freecpu(struct cpudl *cp, int cpu)
205{
206 cpumask_clear_cpu(cpu, cp->free_cpus);
207}
208
209/*
189 * cpudl_init - initialize the cpudl structure 210 * cpudl_init - initialize the cpudl structure
190 * @cp: the cpudl max-heap context 211 * @cp: the cpudl max-heap context
191 */ 212 */
@@ -203,7 +224,7 @@ int cpudl_init(struct cpudl *cp)
203 if (!cp->elements) 224 if (!cp->elements)
204 return -ENOMEM; 225 return -ENOMEM;
205 226
206 if (!alloc_cpumask_var(&cp->free_cpus, GFP_KERNEL)) { 227 if (!zalloc_cpumask_var(&cp->free_cpus, GFP_KERNEL)) {
207 kfree(cp->elements); 228 kfree(cp->elements);
208 return -ENOMEM; 229 return -ENOMEM;
209 } 230 }
@@ -211,8 +232,6 @@ int cpudl_init(struct cpudl *cp)
211 for_each_possible_cpu(i) 232 for_each_possible_cpu(i)
212 cp->elements[i].idx = IDX_INVALID; 233 cp->elements[i].idx = IDX_INVALID;
213 234
214 cpumask_setall(cp->free_cpus);
215
216 return 0; 235 return 0;
217} 236}
218 237
diff --git a/kernel/sched/cpudeadline.h b/kernel/sched/cpudeadline.h
index 020039bd1326..1a0a6ef2fbe1 100644
--- a/kernel/sched/cpudeadline.h
+++ b/kernel/sched/cpudeadline.h
@@ -24,6 +24,8 @@ int cpudl_find(struct cpudl *cp, struct task_struct *p,
24 struct cpumask *later_mask); 24 struct cpumask *later_mask);
25void cpudl_set(struct cpudl *cp, int cpu, u64 dl, int is_valid); 25void cpudl_set(struct cpudl *cp, int cpu, u64 dl, int is_valid);
26int cpudl_init(struct cpudl *cp); 26int cpudl_init(struct cpudl *cp);
27void cpudl_set_freecpu(struct cpudl *cp, int cpu);
28void cpudl_clear_freecpu(struct cpudl *cp, int cpu);
27void cpudl_cleanup(struct cpudl *cp); 29void cpudl_cleanup(struct cpudl *cp);
28#endif /* CONFIG_SMP */ 30#endif /* CONFIG_SMP */
29 31
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index b52092f2636d..a027799ae130 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -350,6 +350,11 @@ static void replenish_dl_entity(struct sched_dl_entity *dl_se,
350 dl_se->deadline = rq_clock(rq) + pi_se->dl_deadline; 350 dl_se->deadline = rq_clock(rq) + pi_se->dl_deadline;
351 dl_se->runtime = pi_se->dl_runtime; 351 dl_se->runtime = pi_se->dl_runtime;
352 } 352 }
353
354 if (dl_se->dl_yielded)
355 dl_se->dl_yielded = 0;
356 if (dl_se->dl_throttled)
357 dl_se->dl_throttled = 0;
353} 358}
354 359
355/* 360/*
@@ -536,23 +541,19 @@ again:
536 541
537 sched_clock_tick(); 542 sched_clock_tick();
538 update_rq_clock(rq); 543 update_rq_clock(rq);
539 dl_se->dl_throttled = 0; 544 enqueue_task_dl(rq, p, ENQUEUE_REPLENISH);
540 dl_se->dl_yielded = 0; 545 if (dl_task(rq->curr))
541 if (task_on_rq_queued(p)) { 546 check_preempt_curr_dl(rq, p, 0);
542 enqueue_task_dl(rq, p, ENQUEUE_REPLENISH); 547 else
543 if (dl_task(rq->curr)) 548 resched_curr(rq);
544 check_preempt_curr_dl(rq, p, 0);
545 else
546 resched_curr(rq);
547#ifdef CONFIG_SMP 549#ifdef CONFIG_SMP
548 /* 550 /*
549 * Queueing this task back might have overloaded rq, 551 * Queueing this task back might have overloaded rq,
550 * check if we need to kick someone away. 552 * check if we need to kick someone away.
551 */ 553 */
552 if (has_pushable_dl_tasks(rq)) 554 if (has_pushable_dl_tasks(rq))
553 push_dl_task(rq); 555 push_dl_task(rq);
554#endif 556#endif
555 }
556unlock: 557unlock:
557 raw_spin_unlock(&rq->lock); 558 raw_spin_unlock(&rq->lock);
558 559
@@ -613,10 +614,9 @@ static void update_curr_dl(struct rq *rq)
613 614
614 dl_se->runtime -= dl_se->dl_yielded ? 0 : delta_exec; 615 dl_se->runtime -= dl_se->dl_yielded ? 0 : delta_exec;
615 if (dl_runtime_exceeded(rq, dl_se)) { 616 if (dl_runtime_exceeded(rq, dl_se)) {
617 dl_se->dl_throttled = 1;
616 __dequeue_task_dl(rq, curr, 0); 618 __dequeue_task_dl(rq, curr, 0);
617 if (likely(start_dl_timer(dl_se, curr->dl.dl_boosted))) 619 if (unlikely(!start_dl_timer(dl_se, curr->dl.dl_boosted)))
618 dl_se->dl_throttled = 1;
619 else
620 enqueue_task_dl(rq, curr, ENQUEUE_REPLENISH); 620 enqueue_task_dl(rq, curr, ENQUEUE_REPLENISH);
621 621
622 if (!is_leftmost(curr, &rq->dl)) 622 if (!is_leftmost(curr, &rq->dl))
@@ -853,7 +853,7 @@ static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags)
853 * its rq, the bandwidth timer callback (which clearly has not 853 * its rq, the bandwidth timer callback (which clearly has not
854 * run yet) will take care of this. 854 * run yet) will take care of this.
855 */ 855 */
856 if (p->dl.dl_throttled) 856 if (p->dl.dl_throttled && !(flags & ENQUEUE_REPLENISH))
857 return; 857 return;
858 858
859 enqueue_dl_entity(&p->dl, pi_se, flags); 859 enqueue_dl_entity(&p->dl, pi_se, flags);
@@ -1073,7 +1073,13 @@ static void task_tick_dl(struct rq *rq, struct task_struct *p, int queued)
1073{ 1073{
1074 update_curr_dl(rq); 1074 update_curr_dl(rq);
1075 1075
1076 if (hrtick_enabled(rq) && queued && p->dl.runtime > 0) 1076 /*
1077 * Even when we have runtime, update_curr_dl() might have resulted in us
1078 * not being the leftmost task anymore. In that case NEED_RESCHED will
1079 * be set and schedule() will start a new hrtick for the next task.
1080 */
1081 if (hrtick_enabled(rq) && queued && p->dl.runtime > 0 &&
1082 is_leftmost(p, &rq->dl))
1077 start_hrtick_dl(rq, p); 1083 start_hrtick_dl(rq, p);
1078} 1084}
1079 1085
@@ -1094,6 +1100,7 @@ static void task_dead_dl(struct task_struct *p)
1094 * Since we are TASK_DEAD we won't slip out of the domain! 1100 * Since we are TASK_DEAD we won't slip out of the domain!
1095 */ 1101 */
1096 raw_spin_lock_irq(&dl_b->lock); 1102 raw_spin_lock_irq(&dl_b->lock);
1103 /* XXX we should retain the bw until 0-lag */
1097 dl_b->total_bw -= p->dl.dl_bw; 1104 dl_b->total_bw -= p->dl.dl_bw;
1098 raw_spin_unlock_irq(&dl_b->lock); 1105 raw_spin_unlock_irq(&dl_b->lock);
1099 1106
@@ -1165,9 +1172,6 @@ static int find_later_rq(struct task_struct *task)
1165 * We have to consider system topology and task affinity 1172 * We have to consider system topology and task affinity
1166 * first, then we can look for a suitable cpu. 1173 * first, then we can look for a suitable cpu.
1167 */ 1174 */
1168 cpumask_copy(later_mask, task_rq(task)->rd->span);
1169 cpumask_and(later_mask, later_mask, cpu_active_mask);
1170 cpumask_and(later_mask, later_mask, &task->cpus_allowed);
1171 best_cpu = cpudl_find(&task_rq(task)->rd->cpudl, 1175 best_cpu = cpudl_find(&task_rq(task)->rd->cpudl,
1172 task, later_mask); 1176 task, later_mask);
1173 if (best_cpu == -1) 1177 if (best_cpu == -1)
@@ -1562,6 +1566,7 @@ static void rq_online_dl(struct rq *rq)
1562 if (rq->dl.overloaded) 1566 if (rq->dl.overloaded)
1563 dl_set_overload(rq); 1567 dl_set_overload(rq);
1564 1568
1569 cpudl_set_freecpu(&rq->rd->cpudl, rq->cpu);
1565 if (rq->dl.dl_nr_running > 0) 1570 if (rq->dl.dl_nr_running > 0)
1566 cpudl_set(&rq->rd->cpudl, rq->cpu, rq->dl.earliest_dl.curr, 1); 1571 cpudl_set(&rq->rd->cpudl, rq->cpu, rq->dl.earliest_dl.curr, 1);
1567} 1572}
@@ -1573,6 +1578,7 @@ static void rq_offline_dl(struct rq *rq)
1573 dl_clear_overload(rq); 1578 dl_clear_overload(rq);
1574 1579
1575 cpudl_set(&rq->rd->cpudl, rq->cpu, 0, 0); 1580 cpudl_set(&rq->rd->cpudl, rq->cpu, 0, 0);
1581 cpudl_clear_freecpu(&rq->rd->cpudl, rq->cpu);
1576} 1582}
1577 1583
1578void init_sched_dl_class(void) 1584void init_sched_dl_class(void)
@@ -1614,8 +1620,8 @@ static void cancel_dl_timer(struct rq *rq, struct task_struct *p)
1614 1620
1615static void switched_from_dl(struct rq *rq, struct task_struct *p) 1621static void switched_from_dl(struct rq *rq, struct task_struct *p)
1616{ 1622{
1623 /* XXX we should retain the bw until 0-lag */
1617 cancel_dl_timer(rq, p); 1624 cancel_dl_timer(rq, p);
1618
1619 __dl_clear_params(p); 1625 __dl_clear_params(p);
1620 1626
1621 /* 1627 /*
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 92cc52001e74..8baaf858d25c 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -305,6 +305,7 @@ do { \
305 PN(next_balance); 305 PN(next_balance);
306 SEQ_printf(m, " .%-30s: %ld\n", "curr->pid", (long)(task_pid_nr(rq->curr))); 306 SEQ_printf(m, " .%-30s: %ld\n", "curr->pid", (long)(task_pid_nr(rq->curr)));
307 PN(clock); 307 PN(clock);
308 PN(clock_task);
308 P(cpu_load[0]); 309 P(cpu_load[0]);
309 P(cpu_load[1]); 310 P(cpu_load[1]);
310 P(cpu_load[2]); 311 P(cpu_load[2]);
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 40667cbf371b..7ce18f3c097a 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -676,7 +676,6 @@ void init_task_runnable_average(struct task_struct *p)
676{ 676{
677 u32 slice; 677 u32 slice;
678 678
679 p->se.avg.decay_count = 0;
680 slice = sched_slice(task_cfs_rq(p), &p->se) >> 10; 679 slice = sched_slice(task_cfs_rq(p), &p->se) >> 10;
681 p->se.avg.runnable_avg_sum = slice; 680 p->se.avg.runnable_avg_sum = slice;
682 p->se.avg.runnable_avg_period = slice; 681 p->se.avg.runnable_avg_period = slice;
@@ -1730,7 +1729,7 @@ static int preferred_group_nid(struct task_struct *p, int nid)
1730 nodes = node_online_map; 1729 nodes = node_online_map;
1731 for (dist = sched_max_numa_distance; dist > LOCAL_DISTANCE; dist--) { 1730 for (dist = sched_max_numa_distance; dist > LOCAL_DISTANCE; dist--) {
1732 unsigned long max_faults = 0; 1731 unsigned long max_faults = 0;
1733 nodemask_t max_group; 1732 nodemask_t max_group = NODE_MASK_NONE;
1734 int a, b; 1733 int a, b;
1735 1734
1736 /* Are there nodes at this distance from each other? */ 1735 /* Are there nodes at this distance from each other? */
@@ -2574,11 +2573,11 @@ static inline u64 __synchronize_entity_decay(struct sched_entity *se)
2574 u64 decays = atomic64_read(&cfs_rq->decay_counter); 2573 u64 decays = atomic64_read(&cfs_rq->decay_counter);
2575 2574
2576 decays -= se->avg.decay_count; 2575 decays -= se->avg.decay_count;
2576 se->avg.decay_count = 0;
2577 if (!decays) 2577 if (!decays)
2578 return 0; 2578 return 0;
2579 2579
2580 se->avg.load_avg_contrib = decay_load(se->avg.load_avg_contrib, decays); 2580 se->avg.load_avg_contrib = decay_load(se->avg.load_avg_contrib, decays);
2581 se->avg.decay_count = 0;
2582 2581
2583 return decays; 2582 return decays;
2584} 2583}
@@ -5157,7 +5156,7 @@ static void yield_task_fair(struct rq *rq)
5157 * so we don't do microscopic update in schedule() 5156 * so we don't do microscopic update in schedule()
5158 * and double the fastpath cost. 5157 * and double the fastpath cost.
5159 */ 5158 */
5160 rq->skip_clock_update = 1; 5159 rq_clock_skip_update(rq, true);
5161 } 5160 }
5162 5161
5163 set_skip_buddy(se); 5162 set_skip_buddy(se);
@@ -5949,8 +5948,8 @@ static unsigned long scale_rt_capacity(int cpu)
5949 */ 5948 */
5950 age_stamp = ACCESS_ONCE(rq->age_stamp); 5949 age_stamp = ACCESS_ONCE(rq->age_stamp);
5951 avg = ACCESS_ONCE(rq->rt_avg); 5950 avg = ACCESS_ONCE(rq->rt_avg);
5951 delta = __rq_clock_broken(rq) - age_stamp;
5952 5952
5953 delta = rq_clock(rq) - age_stamp;
5954 if (unlikely(delta < 0)) 5953 if (unlikely(delta < 0))
5955 delta = 0; 5954 delta = 0;
5956 5955
diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c
index c47fce75e666..aaf1c1d5cf5d 100644
--- a/kernel/sched/idle.c
+++ b/kernel/sched/idle.c
@@ -47,7 +47,8 @@ static inline int cpu_idle_poll(void)
47 rcu_idle_enter(); 47 rcu_idle_enter();
48 trace_cpu_idle_rcuidle(0, smp_processor_id()); 48 trace_cpu_idle_rcuidle(0, smp_processor_id());
49 local_irq_enable(); 49 local_irq_enable();
50 while (!tif_need_resched()) 50 while (!tif_need_resched() &&
51 (cpu_idle_force_poll || tick_check_broadcast_expired()))
51 cpu_relax(); 52 cpu_relax();
52 trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, smp_processor_id()); 53 trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, smp_processor_id());
53 rcu_idle_exit(); 54 rcu_idle_exit();
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index ee15f5a0d1c1..f4d4b077eba0 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -831,11 +831,14 @@ static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun)
831 enqueue = 1; 831 enqueue = 1;
832 832
833 /* 833 /*
834 * Force a clock update if the CPU was idle, 834 * When we're idle and a woken (rt) task is
835 * lest wakeup -> unthrottle time accumulate. 835 * throttled check_preempt_curr() will set
836 * skip_update and the time between the wakeup
837 * and this unthrottle will get accounted as
838 * 'runtime'.
836 */ 839 */
837 if (rt_rq->rt_nr_running && rq->curr == rq->idle) 840 if (rt_rq->rt_nr_running && rq->curr == rq->idle)
838 rq->skip_clock_update = -1; 841 rq_clock_skip_update(rq, false);
839 } 842 }
840 if (rt_rq->rt_time || rt_rq->rt_nr_running) 843 if (rt_rq->rt_time || rt_rq->rt_nr_running)
841 idle = 0; 844 idle = 0;
@@ -1337,7 +1340,12 @@ select_task_rq_rt(struct task_struct *p, int cpu, int sd_flag, int flags)
1337 curr->prio <= p->prio)) { 1340 curr->prio <= p->prio)) {
1338 int target = find_lowest_rq(p); 1341 int target = find_lowest_rq(p);
1339 1342
1340 if (target != -1) 1343 /*
1344 * Don't bother moving it if the destination CPU is
1345 * not running a lower priority task.
1346 */
1347 if (target != -1 &&
1348 p->prio < cpu_rq(target)->rt.highest_prio.curr)
1341 cpu = target; 1349 cpu = target;
1342 } 1350 }
1343 rcu_read_unlock(); 1351 rcu_read_unlock();
@@ -1614,6 +1622,16 @@ static struct rq *find_lock_lowest_rq(struct task_struct *task, struct rq *rq)
1614 1622
1615 lowest_rq = cpu_rq(cpu); 1623 lowest_rq = cpu_rq(cpu);
1616 1624
1625 if (lowest_rq->rt.highest_prio.curr <= task->prio) {
1626 /*
1627 * Target rq has tasks of equal or higher priority,
1628 * retrying does not release any lock and is unlikely
1629 * to yield a different result.
1630 */
1631 lowest_rq = NULL;
1632 break;
1633 }
1634
1617 /* if the prio of this runqueue changed, try again */ 1635 /* if the prio of this runqueue changed, try again */
1618 if (double_lock_balance(rq, lowest_rq)) { 1636 if (double_lock_balance(rq, lowest_rq)) {
1619 /* 1637 /*
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 9a2a45c970e7..0870db23d79c 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -558,8 +558,6 @@ struct rq {
558#ifdef CONFIG_NO_HZ_FULL 558#ifdef CONFIG_NO_HZ_FULL
559 unsigned long last_sched_tick; 559 unsigned long last_sched_tick;
560#endif 560#endif
561 int skip_clock_update;
562
563 /* capture load from *all* tasks on this cpu: */ 561 /* capture load from *all* tasks on this cpu: */
564 struct load_weight load; 562 struct load_weight load;
565 unsigned long nr_load_updates; 563 unsigned long nr_load_updates;
@@ -588,6 +586,7 @@ struct rq {
588 unsigned long next_balance; 586 unsigned long next_balance;
589 struct mm_struct *prev_mm; 587 struct mm_struct *prev_mm;
590 588
589 unsigned int clock_skip_update;
591 u64 clock; 590 u64 clock;
592 u64 clock_task; 591 u64 clock_task;
593 592
@@ -687,16 +686,35 @@ DECLARE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
687#define cpu_curr(cpu) (cpu_rq(cpu)->curr) 686#define cpu_curr(cpu) (cpu_rq(cpu)->curr)
688#define raw_rq() raw_cpu_ptr(&runqueues) 687#define raw_rq() raw_cpu_ptr(&runqueues)
689 688
689static inline u64 __rq_clock_broken(struct rq *rq)
690{
691 return ACCESS_ONCE(rq->clock);
692}
693
690static inline u64 rq_clock(struct rq *rq) 694static inline u64 rq_clock(struct rq *rq)
691{ 695{
696 lockdep_assert_held(&rq->lock);
692 return rq->clock; 697 return rq->clock;
693} 698}
694 699
695static inline u64 rq_clock_task(struct rq *rq) 700static inline u64 rq_clock_task(struct rq *rq)
696{ 701{
702 lockdep_assert_held(&rq->lock);
697 return rq->clock_task; 703 return rq->clock_task;
698} 704}
699 705
706#define RQCF_REQ_SKIP 0x01
707#define RQCF_ACT_SKIP 0x02
708
709static inline void rq_clock_skip_update(struct rq *rq, bool skip)
710{
711 lockdep_assert_held(&rq->lock);
712 if (skip)
713 rq->clock_skip_update |= RQCF_REQ_SKIP;
714 else
715 rq->clock_skip_update &= ~RQCF_REQ_SKIP;
716}
717
700#ifdef CONFIG_NUMA 718#ifdef CONFIG_NUMA
701enum numa_topology_type { 719enum numa_topology_type {
702 NUMA_DIRECT, 720 NUMA_DIRECT,
diff --git a/kernel/signal.c b/kernel/signal.c
index 16a305295256..33a52759cc0e 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -2501,7 +2501,7 @@ EXPORT_SYMBOL(unblock_all_signals);
2501 */ 2501 */
2502SYSCALL_DEFINE0(restart_syscall) 2502SYSCALL_DEFINE0(restart_syscall)
2503{ 2503{
2504 struct restart_block *restart = &current_thread_info()->restart_block; 2504 struct restart_block *restart = &current->restart_block;
2505 return restart->fn(restart); 2505 return restart->fn(restart);
2506} 2506}
2507 2507
diff --git a/kernel/smpboot.c b/kernel/smpboot.c
index f032fb5284e3..40190f28db35 100644
--- a/kernel/smpboot.c
+++ b/kernel/smpboot.c
@@ -280,6 +280,7 @@ int smpboot_register_percpu_thread(struct smp_hotplug_thread *plug_thread)
280 unsigned int cpu; 280 unsigned int cpu;
281 int ret = 0; 281 int ret = 0;
282 282
283 get_online_cpus();
283 mutex_lock(&smpboot_threads_lock); 284 mutex_lock(&smpboot_threads_lock);
284 for_each_online_cpu(cpu) { 285 for_each_online_cpu(cpu) {
285 ret = __smpboot_create_thread(plug_thread, cpu); 286 ret = __smpboot_create_thread(plug_thread, cpu);
@@ -292,6 +293,7 @@ int smpboot_register_percpu_thread(struct smp_hotplug_thread *plug_thread)
292 list_add(&plug_thread->list, &hotplug_threads); 293 list_add(&plug_thread->list, &hotplug_threads);
293out: 294out:
294 mutex_unlock(&smpboot_threads_lock); 295 mutex_unlock(&smpboot_threads_lock);
296 put_online_cpus();
295 return ret; 297 return ret;
296} 298}
297EXPORT_SYMBOL_GPL(smpboot_register_percpu_thread); 299EXPORT_SYMBOL_GPL(smpboot_register_percpu_thread);
diff --git a/kernel/softirq.c b/kernel/softirq.c
index 501baa9ac1be..479e4436f787 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -114,8 +114,12 @@ void __local_bh_disable_ip(unsigned long ip, unsigned int cnt)
114 trace_softirqs_off(ip); 114 trace_softirqs_off(ip);
115 raw_local_irq_restore(flags); 115 raw_local_irq_restore(flags);
116 116
117 if (preempt_count() == cnt) 117 if (preempt_count() == cnt) {
118#ifdef CONFIG_DEBUG_PREEMPT
119 current->preempt_disable_ip = get_parent_ip(CALLER_ADDR1);
120#endif
118 trace_preempt_off(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1)); 121 trace_preempt_off(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1));
122 }
119} 123}
120EXPORT_SYMBOL(__local_bh_disable_ip); 124EXPORT_SYMBOL(__local_bh_disable_ip);
121#endif /* CONFIG_TRACE_IRQFLAGS */ 125#endif /* CONFIG_TRACE_IRQFLAGS */
@@ -656,9 +660,8 @@ static void run_ksoftirqd(unsigned int cpu)
656 * in the task stack here. 660 * in the task stack here.
657 */ 661 */
658 __do_softirq(); 662 __do_softirq();
659 rcu_note_context_switch();
660 local_irq_enable(); 663 local_irq_enable();
661 cond_resched(); 664 cond_resched_rcu_qs();
662 return; 665 return;
663 } 666 }
664 local_irq_enable(); 667 local_irq_enable();
diff --git a/kernel/sys.c b/kernel/sys.c
index a8c9f5a7dda6..ea9c88109894 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -2210,9 +2210,13 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
2210 up_write(&me->mm->mmap_sem); 2210 up_write(&me->mm->mmap_sem);
2211 break; 2211 break;
2212 case PR_MPX_ENABLE_MANAGEMENT: 2212 case PR_MPX_ENABLE_MANAGEMENT:
2213 if (arg2 || arg3 || arg4 || arg5)
2214 return -EINVAL;
2213 error = MPX_ENABLE_MANAGEMENT(me); 2215 error = MPX_ENABLE_MANAGEMENT(me);
2214 break; 2216 break;
2215 case PR_MPX_DISABLE_MANAGEMENT: 2217 case PR_MPX_DISABLE_MANAGEMENT:
2218 if (arg2 || arg3 || arg4 || arg5)
2219 return -EINVAL;
2216 error = MPX_DISABLE_MANAGEMENT(me); 2220 error = MPX_DISABLE_MANAGEMENT(me);
2217 break; 2221 break;
2218 default: 2222 default:
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 137c7f69b264..88ea2d6e0031 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -1248,7 +1248,6 @@ static struct ctl_table vm_table[] = {
1248 .maxlen = sizeof(unsigned long), 1248 .maxlen = sizeof(unsigned long),
1249 .mode = 0644, 1249 .mode = 0644,
1250 .proc_handler = hugetlb_sysctl_handler, 1250 .proc_handler = hugetlb_sysctl_handler,
1251 .extra1 = &zero,
1252 }, 1251 },
1253#ifdef CONFIG_NUMA 1252#ifdef CONFIG_NUMA
1254 { 1253 {
@@ -1257,7 +1256,6 @@ static struct ctl_table vm_table[] = {
1257 .maxlen = sizeof(unsigned long), 1256 .maxlen = sizeof(unsigned long),
1258 .mode = 0644, 1257 .mode = 0644,
1259 .proc_handler = &hugetlb_mempolicy_sysctl_handler, 1258 .proc_handler = &hugetlb_mempolicy_sysctl_handler,
1260 .extra1 = &zero,
1261 }, 1259 },
1262#endif 1260#endif
1263 { 1261 {
@@ -1280,7 +1278,6 @@ static struct ctl_table vm_table[] = {
1280 .maxlen = sizeof(unsigned long), 1278 .maxlen = sizeof(unsigned long),
1281 .mode = 0644, 1279 .mode = 0644,
1282 .proc_handler = hugetlb_overcommit_handler, 1280 .proc_handler = hugetlb_overcommit_handler,
1283 .extra1 = &zero,
1284 }, 1281 },
1285#endif 1282#endif
1286 { 1283 {
diff --git a/kernel/taskstats.c b/kernel/taskstats.c
index 670fff88a961..21f82c29c914 100644
--- a/kernel/taskstats.c
+++ b/kernel/taskstats.c
@@ -111,13 +111,8 @@ static int send_reply(struct sk_buff *skb, struct genl_info *info)
111{ 111{
112 struct genlmsghdr *genlhdr = nlmsg_data(nlmsg_hdr(skb)); 112 struct genlmsghdr *genlhdr = nlmsg_data(nlmsg_hdr(skb));
113 void *reply = genlmsg_data(genlhdr); 113 void *reply = genlmsg_data(genlhdr);
114 int rc;
115 114
116 rc = genlmsg_end(skb, reply); 115 genlmsg_end(skb, reply);
117 if (rc < 0) {
118 nlmsg_free(skb);
119 return rc;
120 }
121 116
122 return genlmsg_reply(skb, info); 117 return genlmsg_reply(skb, info);
123} 118}
@@ -134,11 +129,7 @@ static void send_cpu_listeners(struct sk_buff *skb,
134 void *reply = genlmsg_data(genlhdr); 129 void *reply = genlmsg_data(genlhdr);
135 int rc, delcount = 0; 130 int rc, delcount = 0;
136 131
137 rc = genlmsg_end(skb, reply); 132 genlmsg_end(skb, reply);
138 if (rc < 0) {
139 nlmsg_free(skb);
140 return;
141 }
142 133
143 rc = 0; 134 rc = 0;
144 down_read(&listeners->sem); 135 down_read(&listeners->sem);
diff --git a/kernel/time/Makefile b/kernel/time/Makefile
index f622cf28628a..c09c07817d7a 100644
--- a/kernel/time/Makefile
+++ b/kernel/time/Makefile
@@ -1,6 +1,6 @@
1obj-y += time.o timer.o hrtimer.o itimer.o posix-timers.o posix-cpu-timers.o 1obj-y += time.o timer.o hrtimer.o itimer.o posix-timers.o posix-cpu-timers.o
2obj-y += timekeeping.o ntp.o clocksource.o jiffies.o timer_list.o 2obj-y += timekeeping.o ntp.o clocksource.o jiffies.o timer_list.o
3obj-y += timeconv.o posix-clock.o alarmtimer.o 3obj-y += timeconv.o timecounter.o posix-clock.o alarmtimer.o
4 4
5obj-$(CONFIG_GENERIC_CLOCKEVENTS_BUILD) += clockevents.o 5obj-$(CONFIG_GENERIC_CLOCKEVENTS_BUILD) += clockevents.o
6obj-$(CONFIG_GENERIC_CLOCKEVENTS) += tick-common.o 6obj-$(CONFIG_GENERIC_CLOCKEVENTS) += tick-common.o
diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c
index a7077d3ae52f..1b001ed1edb9 100644
--- a/kernel/time/alarmtimer.c
+++ b/kernel/time/alarmtimer.c
@@ -788,7 +788,7 @@ static int alarm_timer_nsleep(const clockid_t which_clock, int flags,
788 goto out; 788 goto out;
789 } 789 }
790 790
791 restart = &current_thread_info()->restart_block; 791 restart = &current->restart_block;
792 restart->fn = alarm_timer_nsleep_restart; 792 restart->fn = alarm_timer_nsleep_restart;
793 restart->nanosleep.clockid = type; 793 restart->nanosleep.clockid = type;
794 restart->nanosleep.expires = exp.tv64; 794 restart->nanosleep.expires = exp.tv64;
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index b79f39bda7e1..4892352f0e49 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -34,82 +34,6 @@
34#include "tick-internal.h" 34#include "tick-internal.h"
35#include "timekeeping_internal.h" 35#include "timekeeping_internal.h"
36 36
37void timecounter_init(struct timecounter *tc,
38 const struct cyclecounter *cc,
39 u64 start_tstamp)
40{
41 tc->cc = cc;
42 tc->cycle_last = cc->read(cc);
43 tc->nsec = start_tstamp;
44}
45EXPORT_SYMBOL_GPL(timecounter_init);
46
47/**
48 * timecounter_read_delta - get nanoseconds since last call of this function
49 * @tc: Pointer to time counter
50 *
51 * When the underlying cycle counter runs over, this will be handled
52 * correctly as long as it does not run over more than once between
53 * calls.
54 *
55 * The first call to this function for a new time counter initializes
56 * the time tracking and returns an undefined result.
57 */
58static u64 timecounter_read_delta(struct timecounter *tc)
59{
60 cycle_t cycle_now, cycle_delta;
61 u64 ns_offset;
62
63 /* read cycle counter: */
64 cycle_now = tc->cc->read(tc->cc);
65
66 /* calculate the delta since the last timecounter_read_delta(): */
67 cycle_delta = (cycle_now - tc->cycle_last) & tc->cc->mask;
68
69 /* convert to nanoseconds: */
70 ns_offset = cyclecounter_cyc2ns(tc->cc, cycle_delta);
71
72 /* update time stamp of timecounter_read_delta() call: */
73 tc->cycle_last = cycle_now;
74
75 return ns_offset;
76}
77
78u64 timecounter_read(struct timecounter *tc)
79{
80 u64 nsec;
81
82 /* increment time by nanoseconds since last call */
83 nsec = timecounter_read_delta(tc);
84 nsec += tc->nsec;
85 tc->nsec = nsec;
86
87 return nsec;
88}
89EXPORT_SYMBOL_GPL(timecounter_read);
90
91u64 timecounter_cyc2time(struct timecounter *tc,
92 cycle_t cycle_tstamp)
93{
94 u64 cycle_delta = (cycle_tstamp - tc->cycle_last) & tc->cc->mask;
95 u64 nsec;
96
97 /*
98 * Instead of always treating cycle_tstamp as more recent
99 * than tc->cycle_last, detect when it is too far in the
100 * future and treat it as old time stamp instead.
101 */
102 if (cycle_delta > tc->cc->mask / 2) {
103 cycle_delta = (tc->cycle_last - cycle_tstamp) & tc->cc->mask;
104 nsec = tc->nsec - cyclecounter_cyc2ns(tc->cc, cycle_delta);
105 } else {
106 nsec = cyclecounter_cyc2ns(tc->cc, cycle_delta) + tc->nsec;
107 }
108
109 return nsec;
110}
111EXPORT_SYMBOL_GPL(timecounter_cyc2time);
112
113/** 37/**
114 * clocks_calc_mult_shift - calculate mult/shift factors for scaled math of clocks 38 * clocks_calc_mult_shift - calculate mult/shift factors for scaled math of clocks
115 * @mult: pointer to mult variable 39 * @mult: pointer to mult variable
diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c
index 37e50aadd471..bee0c1f78091 100644
--- a/kernel/time/hrtimer.c
+++ b/kernel/time/hrtimer.c
@@ -122,7 +122,7 @@ static void hrtimer_get_softirq_time(struct hrtimer_cpu_base *base)
122 mono = ktime_get_update_offsets_tick(&off_real, &off_boot, &off_tai); 122 mono = ktime_get_update_offsets_tick(&off_real, &off_boot, &off_tai);
123 boot = ktime_add(mono, off_boot); 123 boot = ktime_add(mono, off_boot);
124 xtim = ktime_add(mono, off_real); 124 xtim = ktime_add(mono, off_real);
125 tai = ktime_add(xtim, off_tai); 125 tai = ktime_add(mono, off_tai);
126 126
127 base->clock_base[HRTIMER_BASE_REALTIME].softirq_time = xtim; 127 base->clock_base[HRTIMER_BASE_REALTIME].softirq_time = xtim;
128 base->clock_base[HRTIMER_BASE_MONOTONIC].softirq_time = mono; 128 base->clock_base[HRTIMER_BASE_MONOTONIC].softirq_time = mono;
@@ -266,7 +266,7 @@ lock_hrtimer_base(const struct hrtimer *timer, unsigned long *flags)
266/* 266/*
267 * Divide a ktime value by a nanosecond value 267 * Divide a ktime value by a nanosecond value
268 */ 268 */
269u64 ktime_divns(const ktime_t kt, s64 div) 269u64 __ktime_divns(const ktime_t kt, s64 div)
270{ 270{
271 u64 dclc; 271 u64 dclc;
272 int sft = 0; 272 int sft = 0;
@@ -282,7 +282,7 @@ u64 ktime_divns(const ktime_t kt, s64 div)
282 282
283 return dclc; 283 return dclc;
284} 284}
285EXPORT_SYMBOL_GPL(ktime_divns); 285EXPORT_SYMBOL_GPL(__ktime_divns);
286#endif /* BITS_PER_LONG >= 64 */ 286#endif /* BITS_PER_LONG >= 64 */
287 287
288/* 288/*
@@ -440,6 +440,37 @@ static inline void debug_deactivate(struct hrtimer *timer)
440 trace_hrtimer_cancel(timer); 440 trace_hrtimer_cancel(timer);
441} 441}
442 442
443#if defined(CONFIG_NO_HZ_COMMON) || defined(CONFIG_HIGH_RES_TIMERS)
444static ktime_t __hrtimer_get_next_event(struct hrtimer_cpu_base *cpu_base)
445{
446 struct hrtimer_clock_base *base = cpu_base->clock_base;
447 ktime_t expires, expires_next = { .tv64 = KTIME_MAX };
448 int i;
449
450 for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++, base++) {
451 struct timerqueue_node *next;
452 struct hrtimer *timer;
453
454 next = timerqueue_getnext(&base->active);
455 if (!next)
456 continue;
457
458 timer = container_of(next, struct hrtimer, node);
459 expires = ktime_sub(hrtimer_get_expires(timer), base->offset);
460 if (expires.tv64 < expires_next.tv64)
461 expires_next = expires;
462 }
463 /*
464 * clock_was_set() might have changed base->offset of any of
465 * the clock bases so the result might be negative. Fix it up
466 * to prevent a false positive in clockevents_program_event().
467 */
468 if (expires_next.tv64 < 0)
469 expires_next.tv64 = 0;
470 return expires_next;
471}
472#endif
473
443/* High resolution timer related functions */ 474/* High resolution timer related functions */
444#ifdef CONFIG_HIGH_RES_TIMERS 475#ifdef CONFIG_HIGH_RES_TIMERS
445 476
@@ -488,32 +519,7 @@ static inline int hrtimer_hres_active(void)
488static void 519static void
489hrtimer_force_reprogram(struct hrtimer_cpu_base *cpu_base, int skip_equal) 520hrtimer_force_reprogram(struct hrtimer_cpu_base *cpu_base, int skip_equal)
490{ 521{
491 int i; 522 ktime_t expires_next = __hrtimer_get_next_event(cpu_base);
492 struct hrtimer_clock_base *base = cpu_base->clock_base;
493 ktime_t expires, expires_next;
494
495 expires_next.tv64 = KTIME_MAX;
496
497 for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++, base++) {
498 struct hrtimer *timer;
499 struct timerqueue_node *next;
500
501 next = timerqueue_getnext(&base->active);
502 if (!next)
503 continue;
504 timer = container_of(next, struct hrtimer, node);
505
506 expires = ktime_sub(hrtimer_get_expires(timer), base->offset);
507 /*
508 * clock_was_set() has changed base->offset so the
509 * result might be negative. Fix it up to prevent a
510 * false positive in clockevents_program_event()
511 */
512 if (expires.tv64 < 0)
513 expires.tv64 = 0;
514 if (expires.tv64 < expires_next.tv64)
515 expires_next = expires;
516 }
517 523
518 if (skip_equal && expires_next.tv64 == cpu_base->expires_next.tv64) 524 if (skip_equal && expires_next.tv64 == cpu_base->expires_next.tv64)
519 return; 525 return;
@@ -587,6 +593,15 @@ static int hrtimer_reprogram(struct hrtimer *timer,
587 return 0; 593 return 0;
588 594
589 /* 595 /*
596 * When the target cpu of the timer is currently executing
597 * hrtimer_interrupt(), then we do not touch the clock event
598 * device. hrtimer_interrupt() will reevaluate all clock bases
599 * before reprogramming the device.
600 */
601 if (cpu_base->in_hrtirq)
602 return 0;
603
604 /*
590 * If a hang was detected in the last timer interrupt then we 605 * If a hang was detected in the last timer interrupt then we
591 * do not schedule a timer which is earlier than the expiry 606 * do not schedule a timer which is earlier than the expiry
592 * which we enforced in the hang detection. We want the system 607 * which we enforced in the hang detection. We want the system
@@ -1104,29 +1119,14 @@ EXPORT_SYMBOL_GPL(hrtimer_get_remaining);
1104ktime_t hrtimer_get_next_event(void) 1119ktime_t hrtimer_get_next_event(void)
1105{ 1120{
1106 struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases); 1121 struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases);
1107 struct hrtimer_clock_base *base = cpu_base->clock_base; 1122 ktime_t mindelta = { .tv64 = KTIME_MAX };
1108 ktime_t delta, mindelta = { .tv64 = KTIME_MAX };
1109 unsigned long flags; 1123 unsigned long flags;
1110 int i;
1111 1124
1112 raw_spin_lock_irqsave(&cpu_base->lock, flags); 1125 raw_spin_lock_irqsave(&cpu_base->lock, flags);
1113 1126
1114 if (!hrtimer_hres_active()) { 1127 if (!hrtimer_hres_active())
1115 for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++, base++) { 1128 mindelta = ktime_sub(__hrtimer_get_next_event(cpu_base),
1116 struct hrtimer *timer; 1129 ktime_get());
1117 struct timerqueue_node *next;
1118
1119 next = timerqueue_getnext(&base->active);
1120 if (!next)
1121 continue;
1122
1123 timer = container_of(next, struct hrtimer, node);
1124 delta.tv64 = hrtimer_get_expires_tv64(timer);
1125 delta = ktime_sub(delta, base->get_time());
1126 if (delta.tv64 < mindelta.tv64)
1127 mindelta.tv64 = delta.tv64;
1128 }
1129 }
1130 1130
1131 raw_spin_unlock_irqrestore(&cpu_base->lock, flags); 1131 raw_spin_unlock_irqrestore(&cpu_base->lock, flags);
1132 1132
@@ -1253,7 +1253,7 @@ void hrtimer_interrupt(struct clock_event_device *dev)
1253 raw_spin_lock(&cpu_base->lock); 1253 raw_spin_lock(&cpu_base->lock);
1254 entry_time = now = hrtimer_update_base(cpu_base); 1254 entry_time = now = hrtimer_update_base(cpu_base);
1255retry: 1255retry:
1256 expires_next.tv64 = KTIME_MAX; 1256 cpu_base->in_hrtirq = 1;
1257 /* 1257 /*
1258 * We set expires_next to KTIME_MAX here with cpu_base->lock 1258 * We set expires_next to KTIME_MAX here with cpu_base->lock
1259 * held to prevent that a timer is enqueued in our queue via 1259 * held to prevent that a timer is enqueued in our queue via
@@ -1291,28 +1291,20 @@ retry:
1291 * are right-of a not yet expired timer, because that 1291 * are right-of a not yet expired timer, because that
1292 * timer will have to trigger a wakeup anyway. 1292 * timer will have to trigger a wakeup anyway.
1293 */ 1293 */
1294 1294 if (basenow.tv64 < hrtimer_get_softexpires_tv64(timer))
1295 if (basenow.tv64 < hrtimer_get_softexpires_tv64(timer)) {
1296 ktime_t expires;
1297
1298 expires = ktime_sub(hrtimer_get_expires(timer),
1299 base->offset);
1300 if (expires.tv64 < 0)
1301 expires.tv64 = KTIME_MAX;
1302 if (expires.tv64 < expires_next.tv64)
1303 expires_next = expires;
1304 break; 1295 break;
1305 }
1306 1296
1307 __run_hrtimer(timer, &basenow); 1297 __run_hrtimer(timer, &basenow);
1308 } 1298 }
1309 } 1299 }
1310 1300 /* Reevaluate the clock bases for the next expiry */
1301 expires_next = __hrtimer_get_next_event(cpu_base);
1311 /* 1302 /*
1312 * Store the new expiry value so the migration code can verify 1303 * Store the new expiry value so the migration code can verify
1313 * against it. 1304 * against it.
1314 */ 1305 */
1315 cpu_base->expires_next = expires_next; 1306 cpu_base->expires_next = expires_next;
1307 cpu_base->in_hrtirq = 0;
1316 raw_spin_unlock(&cpu_base->lock); 1308 raw_spin_unlock(&cpu_base->lock);
1317 1309
1318 /* Reprogramming necessary ? */ 1310 /* Reprogramming necessary ? */
@@ -1591,7 +1583,7 @@ long hrtimer_nanosleep(struct timespec *rqtp, struct timespec __user *rmtp,
1591 goto out; 1583 goto out;
1592 } 1584 }
1593 1585
1594 restart = &current_thread_info()->restart_block; 1586 restart = &current->restart_block;
1595 restart->fn = hrtimer_nanosleep_restart; 1587 restart->fn = hrtimer_nanosleep_restart;
1596 restart->nanosleep.clockid = t.timer.base->clockid; 1588 restart->nanosleep.clockid = t.timer.base->clockid;
1597 restart->nanosleep.rmtp = rmtp; 1589 restart->nanosleep.rmtp = rmtp;
diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c
index 87a346fd6d61..4b585e0fdd22 100644
--- a/kernel/time/ntp.c
+++ b/kernel/time/ntp.c
@@ -488,13 +488,13 @@ static void sync_cmos_clock(struct work_struct *work)
488 488
489 getnstimeofday64(&now); 489 getnstimeofday64(&now);
490 if (abs(now.tv_nsec - (NSEC_PER_SEC / 2)) <= tick_nsec * 5) { 490 if (abs(now.tv_nsec - (NSEC_PER_SEC / 2)) <= tick_nsec * 5) {
491 struct timespec adjust = timespec64_to_timespec(now); 491 struct timespec64 adjust = now;
492 492
493 fail = -ENODEV; 493 fail = -ENODEV;
494 if (persistent_clock_is_local) 494 if (persistent_clock_is_local)
495 adjust.tv_sec -= (sys_tz.tz_minuteswest * 60); 495 adjust.tv_sec -= (sys_tz.tz_minuteswest * 60);
496#ifdef CONFIG_GENERIC_CMOS_UPDATE 496#ifdef CONFIG_GENERIC_CMOS_UPDATE
497 fail = update_persistent_clock(adjust); 497 fail = update_persistent_clock(timespec64_to_timespec(adjust));
498#endif 498#endif
499#ifdef CONFIG_RTC_SYSTOHC 499#ifdef CONFIG_RTC_SYSTOHC
500 if (fail == -ENODEV) 500 if (fail == -ENODEV)
@@ -633,6 +633,13 @@ int ntp_validate_timex(struct timex *txc)
633 if ((txc->modes & ADJ_SETOFFSET) && (!capable(CAP_SYS_TIME))) 633 if ((txc->modes & ADJ_SETOFFSET) && (!capable(CAP_SYS_TIME)))
634 return -EPERM; 634 return -EPERM;
635 635
636 if (txc->modes & ADJ_FREQUENCY) {
637 if (LONG_MIN / PPM_SCALE > txc->freq)
638 return -EINVAL;
639 if (LONG_MAX / PPM_SCALE < txc->freq)
640 return -EINVAL;
641 }
642
636 return 0; 643 return 0;
637} 644}
638 645
diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c
index a16b67859e2a..0075da74abf0 100644
--- a/kernel/time/posix-cpu-timers.c
+++ b/kernel/time/posix-cpu-timers.c
@@ -1334,8 +1334,7 @@ static long posix_cpu_nsleep_restart(struct restart_block *restart_block);
1334static int posix_cpu_nsleep(const clockid_t which_clock, int flags, 1334static int posix_cpu_nsleep(const clockid_t which_clock, int flags,
1335 struct timespec *rqtp, struct timespec __user *rmtp) 1335 struct timespec *rqtp, struct timespec __user *rmtp)
1336{ 1336{
1337 struct restart_block *restart_block = 1337 struct restart_block *restart_block = &current->restart_block;
1338 &current_thread_info()->restart_block;
1339 struct itimerspec it; 1338 struct itimerspec it;
1340 int error; 1339 int error;
1341 1340
diff --git a/kernel/time/time.c b/kernel/time/time.c
index 6390517e77d4..2c85b7724af4 100644
--- a/kernel/time/time.c
+++ b/kernel/time/time.c
@@ -196,6 +196,10 @@ SYSCALL_DEFINE2(settimeofday, struct timeval __user *, tv,
196 if (tv) { 196 if (tv) {
197 if (copy_from_user(&user_tv, tv, sizeof(*tv))) 197 if (copy_from_user(&user_tv, tv, sizeof(*tv)))
198 return -EFAULT; 198 return -EFAULT;
199
200 if (!timeval_valid(&user_tv))
201 return -EINVAL;
202
199 new_ts.tv_sec = user_tv.tv_sec; 203 new_ts.tv_sec = user_tv.tv_sec;
200 new_ts.tv_nsec = user_tv.tv_usec * NSEC_PER_USEC; 204 new_ts.tv_nsec = user_tv.tv_usec * NSEC_PER_USEC;
201 } 205 }
diff --git a/kernel/time/timecounter.c b/kernel/time/timecounter.c
new file mode 100644
index 000000000000..4687b3104bae
--- /dev/null
+++ b/kernel/time/timecounter.c
@@ -0,0 +1,112 @@
1/*
2 * linux/kernel/time/timecounter.c
3 *
4 * based on code that migrated away from
5 * linux/kernel/time/clocksource.c
6 *
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation; either version 2 of the License, or
10 * (at your option) any later version.
11 *
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
16 */
17
18#include <linux/export.h>
19#include <linux/timecounter.h>
20
21void timecounter_init(struct timecounter *tc,
22 const struct cyclecounter *cc,
23 u64 start_tstamp)
24{
25 tc->cc = cc;
26 tc->cycle_last = cc->read(cc);
27 tc->nsec = start_tstamp;
28 tc->mask = (1ULL << cc->shift) - 1;
29 tc->frac = 0;
30}
31EXPORT_SYMBOL_GPL(timecounter_init);
32
33/**
34 * timecounter_read_delta - get nanoseconds since last call of this function
35 * @tc: Pointer to time counter
36 *
37 * When the underlying cycle counter runs over, this will be handled
38 * correctly as long as it does not run over more than once between
39 * calls.
40 *
41 * The first call to this function for a new time counter initializes
42 * the time tracking and returns an undefined result.
43 */
44static u64 timecounter_read_delta(struct timecounter *tc)
45{
46 cycle_t cycle_now, cycle_delta;
47 u64 ns_offset;
48
49 /* read cycle counter: */
50 cycle_now = tc->cc->read(tc->cc);
51
52 /* calculate the delta since the last timecounter_read_delta(): */
53 cycle_delta = (cycle_now - tc->cycle_last) & tc->cc->mask;
54
55 /* convert to nanoseconds: */
56 ns_offset = cyclecounter_cyc2ns(tc->cc, cycle_delta,
57 tc->mask, &tc->frac);
58
59 /* update time stamp of timecounter_read_delta() call: */
60 tc->cycle_last = cycle_now;
61
62 return ns_offset;
63}
64
65u64 timecounter_read(struct timecounter *tc)
66{
67 u64 nsec;
68
69 /* increment time by nanoseconds since last call */
70 nsec = timecounter_read_delta(tc);
71 nsec += tc->nsec;
72 tc->nsec = nsec;
73
74 return nsec;
75}
76EXPORT_SYMBOL_GPL(timecounter_read);
77
78/*
79 * This is like cyclecounter_cyc2ns(), but it is used for computing a
80 * time previous to the time stored in the cycle counter.
81 */
82static u64 cc_cyc2ns_backwards(const struct cyclecounter *cc,
83 cycle_t cycles, u64 mask, u64 frac)
84{
85 u64 ns = (u64) cycles;
86
87 ns = ((ns * cc->mult) - frac) >> cc->shift;
88
89 return ns;
90}
91
92u64 timecounter_cyc2time(struct timecounter *tc,
93 cycle_t cycle_tstamp)
94{
95 u64 delta = (cycle_tstamp - tc->cycle_last) & tc->cc->mask;
96 u64 nsec = tc->nsec, frac = tc->frac;
97
98 /*
99 * Instead of always treating cycle_tstamp as more recent
100 * than tc->cycle_last, detect when it is too far in the
101 * future and treat it as old time stamp instead.
102 */
103 if (delta > tc->cc->mask / 2) {
104 delta = (tc->cycle_last - cycle_tstamp) & tc->cc->mask;
105 nsec -= cc_cyc2ns_backwards(tc->cc, delta, tc->mask, frac);
106 } else {
107 nsec += cyclecounter_cyc2ns(tc->cc, delta, tc->mask, &frac);
108 }
109
110 return nsec;
111}
112EXPORT_SYMBOL_GPL(timecounter_cyc2time);
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index 6a931852082f..b124af259800 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -1659,24 +1659,24 @@ out:
1659} 1659}
1660 1660
1661/** 1661/**
1662 * getboottime - Return the real time of system boot. 1662 * getboottime64 - Return the real time of system boot.
1663 * @ts: pointer to the timespec to be set 1663 * @ts: pointer to the timespec64 to be set
1664 * 1664 *
1665 * Returns the wall-time of boot in a timespec. 1665 * Returns the wall-time of boot in a timespec64.
1666 * 1666 *
1667 * This is based on the wall_to_monotonic offset and the total suspend 1667 * This is based on the wall_to_monotonic offset and the total suspend
1668 * time. Calls to settimeofday will affect the value returned (which 1668 * time. Calls to settimeofday will affect the value returned (which
1669 * basically means that however wrong your real time clock is at boot time, 1669 * basically means that however wrong your real time clock is at boot time,
1670 * you get the right time here). 1670 * you get the right time here).
1671 */ 1671 */
1672void getboottime(struct timespec *ts) 1672void getboottime64(struct timespec64 *ts)
1673{ 1673{
1674 struct timekeeper *tk = &tk_core.timekeeper; 1674 struct timekeeper *tk = &tk_core.timekeeper;
1675 ktime_t t = ktime_sub(tk->offs_real, tk->offs_boot); 1675 ktime_t t = ktime_sub(tk->offs_real, tk->offs_boot);
1676 1676
1677 *ts = ktime_to_timespec(t); 1677 *ts = ktime_to_timespec64(t);
1678} 1678}
1679EXPORT_SYMBOL_GPL(getboottime); 1679EXPORT_SYMBOL_GPL(getboottime64);
1680 1680
1681unsigned long get_seconds(void) 1681unsigned long get_seconds(void)
1682{ 1682{
diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile
index 979ccde26720..98f26588255e 100644
--- a/kernel/trace/Makefile
+++ b/kernel/trace/Makefile
@@ -3,11 +3,11 @@
3 3
4ifdef CONFIG_FUNCTION_TRACER 4ifdef CONFIG_FUNCTION_TRACER
5ORIG_CFLAGS := $(KBUILD_CFLAGS) 5ORIG_CFLAGS := $(KBUILD_CFLAGS)
6KBUILD_CFLAGS = $(subst -pg,,$(ORIG_CFLAGS)) 6KBUILD_CFLAGS = $(subst $(CC_FLAGS_FTRACE),,$(ORIG_CFLAGS))
7 7
8ifdef CONFIG_FTRACE_SELFTEST 8ifdef CONFIG_FTRACE_SELFTEST
9# selftest needs instrumentation 9# selftest needs instrumentation
10CFLAGS_trace_selftest_dynamic.o = -pg 10CFLAGS_trace_selftest_dynamic.o = $(CC_FLAGS_FTRACE)
11obj-y += trace_selftest_dynamic.o 11obj-y += trace_selftest_dynamic.o
12endif 12endif
13endif 13endif
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 224e768bdc73..45e5cb143d17 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -5456,7 +5456,7 @@ static __init int ftrace_init_debugfs(void)
5456 struct dentry *d_tracer; 5456 struct dentry *d_tracer;
5457 5457
5458 d_tracer = tracing_init_dentry(); 5458 d_tracer = tracing_init_dentry();
5459 if (!d_tracer) 5459 if (IS_ERR(d_tracer))
5460 return 0; 5460 return 0;
5461 5461
5462 ftrace_init_dyn_debugfs(d_tracer); 5462 ftrace_init_dyn_debugfs(d_tracer);
diff --git a/kernel/trace/power-traces.c b/kernel/trace/power-traces.c
index 1c71382b283d..eb4220a132ec 100644
--- a/kernel/trace/power-traces.c
+++ b/kernel/trace/power-traces.c
@@ -13,5 +13,6 @@
13#define CREATE_TRACE_POINTS 13#define CREATE_TRACE_POINTS
14#include <trace/events/power.h> 14#include <trace/events/power.h>
15 15
16EXPORT_TRACEPOINT_SYMBOL_GPL(suspend_resume);
16EXPORT_TRACEPOINT_SYMBOL_GPL(cpu_idle); 17EXPORT_TRACEPOINT_SYMBOL_GPL(cpu_idle);
17 18
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 7a4104cb95cb..5040d44fe5a3 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -9,7 +9,6 @@
9#include <linux/trace_seq.h> 9#include <linux/trace_seq.h>
10#include <linux/spinlock.h> 10#include <linux/spinlock.h>
11#include <linux/irq_work.h> 11#include <linux/irq_work.h>
12#include <linux/debugfs.h>
13#include <linux/uaccess.h> 12#include <linux/uaccess.h>
14#include <linux/hardirq.h> 13#include <linux/hardirq.h>
15#include <linux/kthread.h> /* for self test */ 14#include <linux/kthread.h> /* for self test */
@@ -23,7 +22,6 @@
23#include <linux/hash.h> 22#include <linux/hash.h>
24#include <linux/list.h> 23#include <linux/list.h>
25#include <linux/cpu.h> 24#include <linux/cpu.h>
26#include <linux/fs.h>
27 25
28#include <asm/local.h> 26#include <asm/local.h>
29 27
@@ -447,7 +445,10 @@ int ring_buffer_print_page_header(struct trace_seq *s)
447struct rb_irq_work { 445struct rb_irq_work {
448 struct irq_work work; 446 struct irq_work work;
449 wait_queue_head_t waiters; 447 wait_queue_head_t waiters;
448 wait_queue_head_t full_waiters;
450 bool waiters_pending; 449 bool waiters_pending;
450 bool full_waiters_pending;
451 bool wakeup_full;
451}; 452};
452 453
453/* 454/*
@@ -529,6 +530,10 @@ static void rb_wake_up_waiters(struct irq_work *work)
529 struct rb_irq_work *rbwork = container_of(work, struct rb_irq_work, work); 530 struct rb_irq_work *rbwork = container_of(work, struct rb_irq_work, work);
530 531
531 wake_up_all(&rbwork->waiters); 532 wake_up_all(&rbwork->waiters);
533 if (rbwork->wakeup_full) {
534 rbwork->wakeup_full = false;
535 wake_up_all(&rbwork->full_waiters);
536 }
532} 537}
533 538
534/** 539/**
@@ -553,9 +558,11 @@ int ring_buffer_wait(struct ring_buffer *buffer, int cpu, bool full)
553 * data in any cpu buffer, or a specific buffer, put the 558 * data in any cpu buffer, or a specific buffer, put the
554 * caller on the appropriate wait queue. 559 * caller on the appropriate wait queue.
555 */ 560 */
556 if (cpu == RING_BUFFER_ALL_CPUS) 561 if (cpu == RING_BUFFER_ALL_CPUS) {
557 work = &buffer->irq_work; 562 work = &buffer->irq_work;
558 else { 563 /* Full only makes sense on per cpu reads */
564 full = false;
565 } else {
559 if (!cpumask_test_cpu(cpu, buffer->cpumask)) 566 if (!cpumask_test_cpu(cpu, buffer->cpumask))
560 return -ENODEV; 567 return -ENODEV;
561 cpu_buffer = buffer->buffers[cpu]; 568 cpu_buffer = buffer->buffers[cpu];
@@ -564,7 +571,10 @@ int ring_buffer_wait(struct ring_buffer *buffer, int cpu, bool full)
564 571
565 572
566 while (true) { 573 while (true) {
567 prepare_to_wait(&work->waiters, &wait, TASK_INTERRUPTIBLE); 574 if (full)
575 prepare_to_wait(&work->full_waiters, &wait, TASK_INTERRUPTIBLE);
576 else
577 prepare_to_wait(&work->waiters, &wait, TASK_INTERRUPTIBLE);
568 578
569 /* 579 /*
570 * The events can happen in critical sections where 580 * The events can happen in critical sections where
@@ -586,7 +596,10 @@ int ring_buffer_wait(struct ring_buffer *buffer, int cpu, bool full)
586 * that is necessary is that the wake up happens after 596 * that is necessary is that the wake up happens after
587 * a task has been queued. It's OK for spurious wake ups. 597 * a task has been queued. It's OK for spurious wake ups.
588 */ 598 */
589 work->waiters_pending = true; 599 if (full)
600 work->full_waiters_pending = true;
601 else
602 work->waiters_pending = true;
590 603
591 if (signal_pending(current)) { 604 if (signal_pending(current)) {
592 ret = -EINTR; 605 ret = -EINTR;
@@ -615,7 +628,10 @@ int ring_buffer_wait(struct ring_buffer *buffer, int cpu, bool full)
615 schedule(); 628 schedule();
616 } 629 }
617 630
618 finish_wait(&work->waiters, &wait); 631 if (full)
632 finish_wait(&work->full_waiters, &wait);
633 else
634 finish_wait(&work->waiters, &wait);
619 635
620 return ret; 636 return ret;
621} 637}
@@ -1230,6 +1246,7 @@ rb_allocate_cpu_buffer(struct ring_buffer *buffer, int nr_pages, int cpu)
1230 init_completion(&cpu_buffer->update_done); 1246 init_completion(&cpu_buffer->update_done);
1231 init_irq_work(&cpu_buffer->irq_work.work, rb_wake_up_waiters); 1247 init_irq_work(&cpu_buffer->irq_work.work, rb_wake_up_waiters);
1232 init_waitqueue_head(&cpu_buffer->irq_work.waiters); 1248 init_waitqueue_head(&cpu_buffer->irq_work.waiters);
1249 init_waitqueue_head(&cpu_buffer->irq_work.full_waiters);
1233 1250
1234 bpage = kzalloc_node(ALIGN(sizeof(*bpage), cache_line_size()), 1251 bpage = kzalloc_node(ALIGN(sizeof(*bpage), cache_line_size()),
1235 GFP_KERNEL, cpu_to_node(cpu)); 1252 GFP_KERNEL, cpu_to_node(cpu));
@@ -2801,6 +2818,8 @@ static void rb_commit(struct ring_buffer_per_cpu *cpu_buffer,
2801static __always_inline void 2818static __always_inline void
2802rb_wakeups(struct ring_buffer *buffer, struct ring_buffer_per_cpu *cpu_buffer) 2819rb_wakeups(struct ring_buffer *buffer, struct ring_buffer_per_cpu *cpu_buffer)
2803{ 2820{
2821 bool pagebusy;
2822
2804 if (buffer->irq_work.waiters_pending) { 2823 if (buffer->irq_work.waiters_pending) {
2805 buffer->irq_work.waiters_pending = false; 2824 buffer->irq_work.waiters_pending = false;
2806 /* irq_work_queue() supplies it's own memory barriers */ 2825 /* irq_work_queue() supplies it's own memory barriers */
@@ -2812,6 +2831,15 @@ rb_wakeups(struct ring_buffer *buffer, struct ring_buffer_per_cpu *cpu_buffer)
2812 /* irq_work_queue() supplies it's own memory barriers */ 2831 /* irq_work_queue() supplies it's own memory barriers */
2813 irq_work_queue(&cpu_buffer->irq_work.work); 2832 irq_work_queue(&cpu_buffer->irq_work.work);
2814 } 2833 }
2834
2835 pagebusy = cpu_buffer->reader_page == cpu_buffer->commit_page;
2836
2837 if (!pagebusy && cpu_buffer->irq_work.full_waiters_pending) {
2838 cpu_buffer->irq_work.wakeup_full = true;
2839 cpu_buffer->irq_work.full_waiters_pending = false;
2840 /* irq_work_queue() supplies it's own memory barriers */
2841 irq_work_queue(&cpu_buffer->irq_work.work);
2842 }
2815} 2843}
2816 2844
2817/** 2845/**
diff --git a/kernel/trace/ring_buffer_benchmark.c b/kernel/trace/ring_buffer_benchmark.c
index 3f9e328c30b5..13d945c0d03f 100644
--- a/kernel/trace/ring_buffer_benchmark.c
+++ b/kernel/trace/ring_buffer_benchmark.c
@@ -7,7 +7,7 @@
7#include <linux/completion.h> 7#include <linux/completion.h>
8#include <linux/kthread.h> 8#include <linux/kthread.h>
9#include <linux/module.h> 9#include <linux/module.h>
10#include <linux/time.h> 10#include <linux/ktime.h>
11#include <asm/local.h> 11#include <asm/local.h>
12 12
13struct rb_page { 13struct rb_page {
@@ -17,7 +17,7 @@ struct rb_page {
17}; 17};
18 18
19/* run time and sleep time in seconds */ 19/* run time and sleep time in seconds */
20#define RUN_TIME 10 20#define RUN_TIME 10ULL
21#define SLEEP_TIME 10 21#define SLEEP_TIME 10
22 22
23/* number of events for writer to wake up the reader */ 23/* number of events for writer to wake up the reader */
@@ -212,8 +212,7 @@ static void ring_buffer_consumer(void)
212 212
213static void ring_buffer_producer(void) 213static void ring_buffer_producer(void)
214{ 214{
215 struct timeval start_tv; 215 ktime_t start_time, end_time, timeout;
216 struct timeval end_tv;
217 unsigned long long time; 216 unsigned long long time;
218 unsigned long long entries; 217 unsigned long long entries;
219 unsigned long long overruns; 218 unsigned long long overruns;
@@ -227,7 +226,8 @@ static void ring_buffer_producer(void)
227 * make the system stall) 226 * make the system stall)
228 */ 227 */
229 trace_printk("Starting ring buffer hammer\n"); 228 trace_printk("Starting ring buffer hammer\n");
230 do_gettimeofday(&start_tv); 229 start_time = ktime_get();
230 timeout = ktime_add_ns(start_time, RUN_TIME * NSEC_PER_SEC);
231 do { 231 do {
232 struct ring_buffer_event *event; 232 struct ring_buffer_event *event;
233 int *entry; 233 int *entry;
@@ -244,7 +244,7 @@ static void ring_buffer_producer(void)
244 ring_buffer_unlock_commit(buffer, event); 244 ring_buffer_unlock_commit(buffer, event);
245 } 245 }
246 } 246 }
247 do_gettimeofday(&end_tv); 247 end_time = ktime_get();
248 248
249 cnt++; 249 cnt++;
250 if (consumer && !(cnt % wakeup_interval)) 250 if (consumer && !(cnt % wakeup_interval))
@@ -264,7 +264,7 @@ static void ring_buffer_producer(void)
264 cond_resched(); 264 cond_resched();
265#endif 265#endif
266 266
267 } while (end_tv.tv_sec < (start_tv.tv_sec + RUN_TIME) && !kill_test); 267 } while (ktime_before(end_time, timeout) && !kill_test);
268 trace_printk("End ring buffer hammer\n"); 268 trace_printk("End ring buffer hammer\n");
269 269
270 if (consumer) { 270 if (consumer) {
@@ -280,9 +280,7 @@ static void ring_buffer_producer(void)
280 wait_for_completion(&read_done); 280 wait_for_completion(&read_done);
281 } 281 }
282 282
283 time = end_tv.tv_sec - start_tv.tv_sec; 283 time = ktime_us_delta(end_time, start_time);
284 time *= USEC_PER_SEC;
285 time += (long long)((long)end_tv.tv_usec - (long)start_tv.tv_usec);
286 284
287 entries = ring_buffer_entries(buffer); 285 entries = ring_buffer_entries(buffer);
288 overruns = ring_buffer_overruns(buffer); 286 overruns = ring_buffer_overruns(buffer);
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 4a9079b9f082..77b8dc528006 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -2036,7 +2036,8 @@ void trace_printk_init_buffers(void)
2036 2036
2037 /* trace_printk() is for debug use only. Don't use it in production. */ 2037 /* trace_printk() is for debug use only. Don't use it in production. */
2038 2038
2039 pr_warning("\n**********************************************************\n"); 2039 pr_warning("\n");
2040 pr_warning("**********************************************************\n");
2040 pr_warning("** NOTICE NOTICE NOTICE NOTICE NOTICE NOTICE NOTICE **\n"); 2041 pr_warning("** NOTICE NOTICE NOTICE NOTICE NOTICE NOTICE NOTICE **\n");
2041 pr_warning("** **\n"); 2042 pr_warning("** **\n");
2042 pr_warning("** trace_printk() being used. Allocating extra memory. **\n"); 2043 pr_warning("** trace_printk() being used. Allocating extra memory. **\n");
@@ -4140,6 +4141,12 @@ static int tracing_set_tracer(struct trace_array *tr, const char *buf)
4140 goto out; 4141 goto out;
4141 } 4142 }
4142 4143
4144 /* If trace pipe files are being read, we can't change the tracer */
4145 if (tr->current_trace->ref) {
4146 ret = -EBUSY;
4147 goto out;
4148 }
4149
4143 trace_branch_disable(); 4150 trace_branch_disable();
4144 4151
4145 tr->current_trace->enabled--; 4152 tr->current_trace->enabled--;
@@ -4326,17 +4333,7 @@ static int tracing_open_pipe(struct inode *inode, struct file *filp)
4326 } 4333 }
4327 4334
4328 trace_seq_init(&iter->seq); 4335 trace_seq_init(&iter->seq);
4329 4336 iter->trace = tr->current_trace;
4330 /*
4331 * We make a copy of the current tracer to avoid concurrent
4332 * changes on it while we are reading.
4333 */
4334 iter->trace = kmalloc(sizeof(*iter->trace), GFP_KERNEL);
4335 if (!iter->trace) {
4336 ret = -ENOMEM;
4337 goto fail;
4338 }
4339 *iter->trace = *tr->current_trace;
4340 4337
4341 if (!alloc_cpumask_var(&iter->started, GFP_KERNEL)) { 4338 if (!alloc_cpumask_var(&iter->started, GFP_KERNEL)) {
4342 ret = -ENOMEM; 4339 ret = -ENOMEM;
@@ -4363,6 +4360,8 @@ static int tracing_open_pipe(struct inode *inode, struct file *filp)
4363 iter->trace->pipe_open(iter); 4360 iter->trace->pipe_open(iter);
4364 4361
4365 nonseekable_open(inode, filp); 4362 nonseekable_open(inode, filp);
4363
4364 tr->current_trace->ref++;
4366out: 4365out:
4367 mutex_unlock(&trace_types_lock); 4366 mutex_unlock(&trace_types_lock);
4368 return ret; 4367 return ret;
@@ -4382,6 +4381,8 @@ static int tracing_release_pipe(struct inode *inode, struct file *file)
4382 4381
4383 mutex_lock(&trace_types_lock); 4382 mutex_lock(&trace_types_lock);
4384 4383
4384 tr->current_trace->ref--;
4385
4385 if (iter->trace->pipe_close) 4386 if (iter->trace->pipe_close)
4386 iter->trace->pipe_close(iter); 4387 iter->trace->pipe_close(iter);
4387 4388
@@ -4389,7 +4390,6 @@ static int tracing_release_pipe(struct inode *inode, struct file *file)
4389 4390
4390 free_cpumask_var(iter->started); 4391 free_cpumask_var(iter->started);
4391 mutex_destroy(&iter->mutex); 4392 mutex_destroy(&iter->mutex);
4392 kfree(iter->trace);
4393 kfree(iter); 4393 kfree(iter);
4394 4394
4395 trace_array_put(tr); 4395 trace_array_put(tr);
@@ -4422,7 +4422,7 @@ tracing_poll_pipe(struct file *filp, poll_table *poll_table)
4422 return trace_poll(iter, filp, poll_table); 4422 return trace_poll(iter, filp, poll_table);
4423} 4423}
4424 4424
4425/* Must be called with trace_types_lock mutex held. */ 4425/* Must be called with iter->mutex held. */
4426static int tracing_wait_pipe(struct file *filp) 4426static int tracing_wait_pipe(struct file *filp)
4427{ 4427{
4428 struct trace_iterator *iter = filp->private_data; 4428 struct trace_iterator *iter = filp->private_data;
@@ -4467,7 +4467,6 @@ tracing_read_pipe(struct file *filp, char __user *ubuf,
4467 size_t cnt, loff_t *ppos) 4467 size_t cnt, loff_t *ppos)
4468{ 4468{
4469 struct trace_iterator *iter = filp->private_data; 4469 struct trace_iterator *iter = filp->private_data;
4470 struct trace_array *tr = iter->tr;
4471 ssize_t sret; 4470 ssize_t sret;
4472 4471
4473 /* return any leftover data */ 4472 /* return any leftover data */
@@ -4477,12 +4476,6 @@ tracing_read_pipe(struct file *filp, char __user *ubuf,
4477 4476
4478 trace_seq_init(&iter->seq); 4477 trace_seq_init(&iter->seq);
4479 4478
4480 /* copy the tracer to avoid using a global lock all around */
4481 mutex_lock(&trace_types_lock);
4482 if (unlikely(iter->trace->name != tr->current_trace->name))
4483 *iter->trace = *tr->current_trace;
4484 mutex_unlock(&trace_types_lock);
4485
4486 /* 4479 /*
4487 * Avoid more than one consumer on a single file descriptor 4480 * Avoid more than one consumer on a single file descriptor
4488 * This is just a matter of traces coherency, the ring buffer itself 4481 * This is just a matter of traces coherency, the ring buffer itself
@@ -4642,7 +4635,6 @@ static ssize_t tracing_splice_read_pipe(struct file *filp,
4642 .ops = &tracing_pipe_buf_ops, 4635 .ops = &tracing_pipe_buf_ops,
4643 .spd_release = tracing_spd_release_pipe, 4636 .spd_release = tracing_spd_release_pipe,
4644 }; 4637 };
4645 struct trace_array *tr = iter->tr;
4646 ssize_t ret; 4638 ssize_t ret;
4647 size_t rem; 4639 size_t rem;
4648 unsigned int i; 4640 unsigned int i;
@@ -4650,12 +4642,6 @@ static ssize_t tracing_splice_read_pipe(struct file *filp,
4650 if (splice_grow_spd(pipe, &spd)) 4642 if (splice_grow_spd(pipe, &spd))
4651 return -ENOMEM; 4643 return -ENOMEM;
4652 4644
4653 /* copy the tracer to avoid using a global lock all around */
4654 mutex_lock(&trace_types_lock);
4655 if (unlikely(iter->trace->name != tr->current_trace->name))
4656 *iter->trace = *tr->current_trace;
4657 mutex_unlock(&trace_types_lock);
4658
4659 mutex_lock(&iter->mutex); 4645 mutex_lock(&iter->mutex);
4660 4646
4661 if (iter->trace->splice_read) { 4647 if (iter->trace->splice_read) {
@@ -4942,7 +4928,7 @@ tracing_mark_write(struct file *filp, const char __user *ubuf,
4942 *fpos += written; 4928 *fpos += written;
4943 4929
4944 out_unlock: 4930 out_unlock:
4945 for (i = 0; i < nr_pages; i++){ 4931 for (i = nr_pages - 1; i >= 0; i--) {
4946 kunmap_atomic(map_page[i]); 4932 kunmap_atomic(map_page[i]);
4947 put_page(pages[i]); 4933 put_page(pages[i]);
4948 } 4934 }
@@ -5331,6 +5317,8 @@ static int tracing_buffers_open(struct inode *inode, struct file *filp)
5331 5317
5332 filp->private_data = info; 5318 filp->private_data = info;
5333 5319
5320 tr->current_trace->ref++;
5321
5334 mutex_unlock(&trace_types_lock); 5322 mutex_unlock(&trace_types_lock);
5335 5323
5336 ret = nonseekable_open(inode, filp); 5324 ret = nonseekable_open(inode, filp);
@@ -5361,21 +5349,16 @@ tracing_buffers_read(struct file *filp, char __user *ubuf,
5361 if (!count) 5349 if (!count)
5362 return 0; 5350 return 0;
5363 5351
5364 mutex_lock(&trace_types_lock);
5365
5366#ifdef CONFIG_TRACER_MAX_TRACE 5352#ifdef CONFIG_TRACER_MAX_TRACE
5367 if (iter->snapshot && iter->tr->current_trace->use_max_tr) { 5353 if (iter->snapshot && iter->tr->current_trace->use_max_tr)
5368 size = -EBUSY; 5354 return -EBUSY;
5369 goto out_unlock;
5370 }
5371#endif 5355#endif
5372 5356
5373 if (!info->spare) 5357 if (!info->spare)
5374 info->spare = ring_buffer_alloc_read_page(iter->trace_buffer->buffer, 5358 info->spare = ring_buffer_alloc_read_page(iter->trace_buffer->buffer,
5375 iter->cpu_file); 5359 iter->cpu_file);
5376 size = -ENOMEM;
5377 if (!info->spare) 5360 if (!info->spare)
5378 goto out_unlock; 5361 return -ENOMEM;
5379 5362
5380 /* Do we have previous read data to read? */ 5363 /* Do we have previous read data to read? */
5381 if (info->read < PAGE_SIZE) 5364 if (info->read < PAGE_SIZE)
@@ -5391,21 +5374,16 @@ tracing_buffers_read(struct file *filp, char __user *ubuf,
5391 5374
5392 if (ret < 0) { 5375 if (ret < 0) {
5393 if (trace_empty(iter)) { 5376 if (trace_empty(iter)) {
5394 if ((filp->f_flags & O_NONBLOCK)) { 5377 if ((filp->f_flags & O_NONBLOCK))
5395 size = -EAGAIN; 5378 return -EAGAIN;
5396 goto out_unlock; 5379
5397 }
5398 mutex_unlock(&trace_types_lock);
5399 ret = wait_on_pipe(iter, false); 5380 ret = wait_on_pipe(iter, false);
5400 mutex_lock(&trace_types_lock); 5381 if (ret)
5401 if (ret) { 5382 return ret;
5402 size = ret; 5383
5403 goto out_unlock;
5404 }
5405 goto again; 5384 goto again;
5406 } 5385 }
5407 size = 0; 5386 return 0;
5408 goto out_unlock;
5409 } 5387 }
5410 5388
5411 info->read = 0; 5389 info->read = 0;
@@ -5415,18 +5393,14 @@ tracing_buffers_read(struct file *filp, char __user *ubuf,
5415 size = count; 5393 size = count;
5416 5394
5417 ret = copy_to_user(ubuf, info->spare + info->read, size); 5395 ret = copy_to_user(ubuf, info->spare + info->read, size);
5418 if (ret == size) { 5396 if (ret == size)
5419 size = -EFAULT; 5397 return -EFAULT;
5420 goto out_unlock; 5398
5421 }
5422 size -= ret; 5399 size -= ret;
5423 5400
5424 *ppos += size; 5401 *ppos += size;
5425 info->read += size; 5402 info->read += size;
5426 5403
5427 out_unlock:
5428 mutex_unlock(&trace_types_lock);
5429
5430 return size; 5404 return size;
5431} 5405}
5432 5406
@@ -5437,6 +5411,8 @@ static int tracing_buffers_release(struct inode *inode, struct file *file)
5437 5411
5438 mutex_lock(&trace_types_lock); 5412 mutex_lock(&trace_types_lock);
5439 5413
5414 iter->tr->current_trace->ref--;
5415
5440 __trace_array_put(iter->tr); 5416 __trace_array_put(iter->tr);
5441 5417
5442 if (info->spare) 5418 if (info->spare)
@@ -5522,30 +5498,20 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
5522 int entries, size, i; 5498 int entries, size, i;
5523 ssize_t ret = 0; 5499 ssize_t ret = 0;
5524 5500
5525 mutex_lock(&trace_types_lock);
5526
5527#ifdef CONFIG_TRACER_MAX_TRACE 5501#ifdef CONFIG_TRACER_MAX_TRACE
5528 if (iter->snapshot && iter->tr->current_trace->use_max_tr) { 5502 if (iter->snapshot && iter->tr->current_trace->use_max_tr)
5529 ret = -EBUSY; 5503 return -EBUSY;
5530 goto out;
5531 }
5532#endif 5504#endif
5533 5505
5534 if (splice_grow_spd(pipe, &spd)) { 5506 if (splice_grow_spd(pipe, &spd))
5535 ret = -ENOMEM; 5507 return -ENOMEM;
5536 goto out;
5537 }
5538 5508
5539 if (*ppos & (PAGE_SIZE - 1)) { 5509 if (*ppos & (PAGE_SIZE - 1))
5540 ret = -EINVAL; 5510 return -EINVAL;
5541 goto out;
5542 }
5543 5511
5544 if (len & (PAGE_SIZE - 1)) { 5512 if (len & (PAGE_SIZE - 1)) {
5545 if (len < PAGE_SIZE) { 5513 if (len < PAGE_SIZE)
5546 ret = -EINVAL; 5514 return -EINVAL;
5547 goto out;
5548 }
5549 len &= PAGE_MASK; 5515 len &= PAGE_MASK;
5550 } 5516 }
5551 5517
@@ -5606,25 +5572,20 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
5606 /* did we read anything? */ 5572 /* did we read anything? */
5607 if (!spd.nr_pages) { 5573 if (!spd.nr_pages) {
5608 if (ret) 5574 if (ret)
5609 goto out; 5575 return ret;
5576
5577 if ((file->f_flags & O_NONBLOCK) || (flags & SPLICE_F_NONBLOCK))
5578 return -EAGAIN;
5610 5579
5611 if ((file->f_flags & O_NONBLOCK) || (flags & SPLICE_F_NONBLOCK)) {
5612 ret = -EAGAIN;
5613 goto out;
5614 }
5615 mutex_unlock(&trace_types_lock);
5616 ret = wait_on_pipe(iter, true); 5580 ret = wait_on_pipe(iter, true);
5617 mutex_lock(&trace_types_lock);
5618 if (ret) 5581 if (ret)
5619 goto out; 5582 return ret;
5620 5583
5621 goto again; 5584 goto again;
5622 } 5585 }
5623 5586
5624 ret = splice_to_pipe(pipe, &spd); 5587 ret = splice_to_pipe(pipe, &spd);
5625 splice_shrink_spd(&spd); 5588 splice_shrink_spd(&spd);
5626out:
5627 mutex_unlock(&trace_types_lock);
5628 5589
5629 return ret; 5590 return ret;
5630} 5591}
@@ -5854,28 +5815,11 @@ static __init int register_snapshot_cmd(void)
5854static inline __init int register_snapshot_cmd(void) { return 0; } 5815static inline __init int register_snapshot_cmd(void) { return 0; }
5855#endif /* defined(CONFIG_TRACER_SNAPSHOT) && defined(CONFIG_DYNAMIC_FTRACE) */ 5816#endif /* defined(CONFIG_TRACER_SNAPSHOT) && defined(CONFIG_DYNAMIC_FTRACE) */
5856 5817
5857struct dentry *tracing_init_dentry_tr(struct trace_array *tr) 5818static struct dentry *tracing_get_dentry(struct trace_array *tr)
5858{ 5819{
5859 if (tr->dir)
5860 return tr->dir;
5861
5862 if (!debugfs_initialized())
5863 return NULL;
5864
5865 if (tr->flags & TRACE_ARRAY_FL_GLOBAL)
5866 tr->dir = debugfs_create_dir("tracing", NULL);
5867
5868 if (!tr->dir)
5869 pr_warn_once("Could not create debugfs directory 'tracing'\n");
5870
5871 return tr->dir; 5820 return tr->dir;
5872} 5821}
5873 5822
5874struct dentry *tracing_init_dentry(void)
5875{
5876 return tracing_init_dentry_tr(&global_trace);
5877}
5878
5879static struct dentry *tracing_dentry_percpu(struct trace_array *tr, int cpu) 5823static struct dentry *tracing_dentry_percpu(struct trace_array *tr, int cpu)
5880{ 5824{
5881 struct dentry *d_tracer; 5825 struct dentry *d_tracer;
@@ -5883,8 +5827,8 @@ static struct dentry *tracing_dentry_percpu(struct trace_array *tr, int cpu)
5883 if (tr->percpu_dir) 5827 if (tr->percpu_dir)
5884 return tr->percpu_dir; 5828 return tr->percpu_dir;
5885 5829
5886 d_tracer = tracing_init_dentry_tr(tr); 5830 d_tracer = tracing_get_dentry(tr);
5887 if (!d_tracer) 5831 if (IS_ERR(d_tracer))
5888 return NULL; 5832 return NULL;
5889 5833
5890 tr->percpu_dir = debugfs_create_dir("per_cpu", d_tracer); 5834 tr->percpu_dir = debugfs_create_dir("per_cpu", d_tracer);
@@ -6086,8 +6030,8 @@ static struct dentry *trace_options_init_dentry(struct trace_array *tr)
6086 if (tr->options) 6030 if (tr->options)
6087 return tr->options; 6031 return tr->options;
6088 6032
6089 d_tracer = tracing_init_dentry_tr(tr); 6033 d_tracer = tracing_get_dentry(tr);
6090 if (!d_tracer) 6034 if (IS_ERR(d_tracer))
6091 return NULL; 6035 return NULL;
6092 6036
6093 tr->options = debugfs_create_dir("options", d_tracer); 6037 tr->options = debugfs_create_dir("options", d_tracer);
@@ -6416,7 +6360,7 @@ static int instance_delete(const char *name)
6416 goto out_unlock; 6360 goto out_unlock;
6417 6361
6418 ret = -EBUSY; 6362 ret = -EBUSY;
6419 if (tr->ref) 6363 if (tr->ref || (tr->current_trace && tr->current_trace->ref))
6420 goto out_unlock; 6364 goto out_unlock;
6421 6365
6422 list_del(&tr->list); 6366 list_del(&tr->list);
@@ -6571,6 +6515,33 @@ init_tracer_debugfs(struct trace_array *tr, struct dentry *d_tracer)
6571 6515
6572} 6516}
6573 6517
6518/**
6519 * tracing_init_dentry - initialize top level trace array
6520 *
6521 * This is called when creating files or directories in the tracing
6522 * directory. It is called via fs_initcall() by any of the boot up code
6523 * and expects to return the dentry of the top level tracing directory.
6524 */
6525struct dentry *tracing_init_dentry(void)
6526{
6527 struct trace_array *tr = &global_trace;
6528
6529 if (tr->dir)
6530 return tr->dir;
6531
6532 if (WARN_ON(!debugfs_initialized()))
6533 return ERR_PTR(-ENODEV);
6534
6535 tr->dir = debugfs_create_dir("tracing", NULL);
6536
6537 if (!tr->dir) {
6538 pr_warn_once("Could not create debugfs directory 'tracing'\n");
6539 return ERR_PTR(-ENOMEM);
6540 }
6541
6542 return tr->dir;
6543}
6544
6574static __init int tracer_init_debugfs(void) 6545static __init int tracer_init_debugfs(void)
6575{ 6546{
6576 struct dentry *d_tracer; 6547 struct dentry *d_tracer;
@@ -6578,7 +6549,7 @@ static __init int tracer_init_debugfs(void)
6578 trace_access_lock_init(); 6549 trace_access_lock_init();
6579 6550
6580 d_tracer = tracing_init_dentry(); 6551 d_tracer = tracing_init_dentry();
6581 if (!d_tracer) 6552 if (IS_ERR(d_tracer))
6582 return 0; 6553 return 0;
6583 6554
6584 init_tracer_debugfs(&global_trace, d_tracer); 6555 init_tracer_debugfs(&global_trace, d_tracer);
@@ -6811,7 +6782,6 @@ __init static int tracer_alloc_buffers(void)
6811 int ring_buf_size; 6782 int ring_buf_size;
6812 int ret = -ENOMEM; 6783 int ret = -ENOMEM;
6813 6784
6814
6815 if (!alloc_cpumask_var(&tracing_buffer_mask, GFP_KERNEL)) 6785 if (!alloc_cpumask_var(&tracing_buffer_mask, GFP_KERNEL))
6816 goto out; 6786 goto out;
6817 6787
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 8de48bac1ce2..dd8205a35760 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -388,6 +388,7 @@ struct tracer {
388 struct tracer *next; 388 struct tracer *next;
389 struct tracer_flags *flags; 389 struct tracer_flags *flags;
390 int enabled; 390 int enabled;
391 int ref;
391 bool print_max; 392 bool print_max;
392 bool allow_instances; 393 bool allow_instances;
393#ifdef CONFIG_TRACER_MAX_TRACE 394#ifdef CONFIG_TRACER_MAX_TRACE
@@ -541,7 +542,6 @@ struct dentry *trace_create_file(const char *name,
541 void *data, 542 void *data,
542 const struct file_operations *fops); 543 const struct file_operations *fops);
543 544
544struct dentry *tracing_init_dentry_tr(struct trace_array *tr);
545struct dentry *tracing_init_dentry(void); 545struct dentry *tracing_init_dentry(void);
546 546
547struct ring_buffer_event; 547struct ring_buffer_event;
diff --git a/kernel/trace/trace_branch.c b/kernel/trace/trace_branch.c
index 7d6e2afde669..57cbf1efdd44 100644
--- a/kernel/trace/trace_branch.c
+++ b/kernel/trace/trace_branch.c
@@ -7,7 +7,6 @@
7#include <linux/seq_file.h> 7#include <linux/seq_file.h>
8#include <linux/spinlock.h> 8#include <linux/spinlock.h>
9#include <linux/irqflags.h> 9#include <linux/irqflags.h>
10#include <linux/debugfs.h>
11#include <linux/uaccess.h> 10#include <linux/uaccess.h>
12#include <linux/module.h> 11#include <linux/module.h>
13#include <linux/ftrace.h> 12#include <linux/ftrace.h>
diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c
index 4b9c114ee9de..6fa484de2ba1 100644
--- a/kernel/trace/trace_event_perf.c
+++ b/kernel/trace/trace_event_perf.c
@@ -261,7 +261,7 @@ void perf_trace_del(struct perf_event *p_event, int flags)
261} 261}
262 262
263void *perf_trace_buf_prepare(int size, unsigned short type, 263void *perf_trace_buf_prepare(int size, unsigned short type,
264 struct pt_regs *regs, int *rctxp) 264 struct pt_regs **regs, int *rctxp)
265{ 265{
266 struct trace_entry *entry; 266 struct trace_entry *entry;
267 unsigned long flags; 267 unsigned long flags;
@@ -280,6 +280,8 @@ void *perf_trace_buf_prepare(int size, unsigned short type,
280 if (*rctxp < 0) 280 if (*rctxp < 0)
281 return NULL; 281 return NULL;
282 282
283 if (regs)
284 *regs = this_cpu_ptr(&__perf_regs[*rctxp]);
283 raw_data = this_cpu_ptr(perf_trace_buf[*rctxp]); 285 raw_data = this_cpu_ptr(perf_trace_buf[*rctxp]);
284 286
285 /* zero the dead bytes from align to not leak stack to user */ 287 /* zero the dead bytes from align to not leak stack to user */
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index b03a0ea77b99..db54dda10ccc 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -2531,7 +2531,7 @@ static __init int event_trace_init(void)
2531 return -ENODEV; 2531 return -ENODEV;
2532 2532
2533 d_tracer = tracing_init_dentry(); 2533 d_tracer = tracing_init_dentry();
2534 if (!d_tracer) 2534 if (IS_ERR(d_tracer))
2535 return 0; 2535 return 0;
2536 2536
2537 entry = debugfs_create_file("available_events", 0444, d_tracer, 2537 entry = debugfs_create_file("available_events", 0444, d_tracer,
diff --git a/kernel/trace/trace_export.c b/kernel/trace/trace_export.c
index d4ddde28a81a..12e2b99be862 100644
--- a/kernel/trace/trace_export.c
+++ b/kernel/trace/trace_export.c
@@ -6,12 +6,10 @@
6#include <linux/stringify.h> 6#include <linux/stringify.h>
7#include <linux/kallsyms.h> 7#include <linux/kallsyms.h>
8#include <linux/seq_file.h> 8#include <linux/seq_file.h>
9#include <linux/debugfs.h>
10#include <linux/uaccess.h> 9#include <linux/uaccess.h>
11#include <linux/ftrace.h> 10#include <linux/ftrace.h>
12#include <linux/module.h> 11#include <linux/module.h>
13#include <linux/init.h> 12#include <linux/init.h>
14#include <linux/fs.h>
15 13
16#include "trace_output.h" 14#include "trace_output.h"
17 15
diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c
index ba476009e5de..2d25ad1526bb 100644
--- a/kernel/trace/trace_functions_graph.c
+++ b/kernel/trace/trace_functions_graph.c
@@ -1437,7 +1437,7 @@ static __init int init_graph_debugfs(void)
1437 struct dentry *d_tracer; 1437 struct dentry *d_tracer;
1438 1438
1439 d_tracer = tracing_init_dentry(); 1439 d_tracer = tracing_init_dentry();
1440 if (!d_tracer) 1440 if (IS_ERR(d_tracer))
1441 return 0; 1441 return 0;
1442 1442
1443 trace_create_file("max_graph_depth", 0644, d_tracer, 1443 trace_create_file("max_graph_depth", 0644, d_tracer,
diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c
index 9bb104f748d0..8523ea345f2b 100644
--- a/kernel/trace/trace_irqsoff.c
+++ b/kernel/trace/trace_irqsoff.c
@@ -10,11 +10,9 @@
10 * Copyright (C) 2004 Nadia Yvette Chambers 10 * Copyright (C) 2004 Nadia Yvette Chambers
11 */ 11 */
12#include <linux/kallsyms.h> 12#include <linux/kallsyms.h>
13#include <linux/debugfs.h>
14#include <linux/uaccess.h> 13#include <linux/uaccess.h>
15#include <linux/module.h> 14#include <linux/module.h>
16#include <linux/ftrace.h> 15#include <linux/ftrace.h>
17#include <linux/fs.h>
18 16
19#include "trace.h" 17#include "trace.h"
20 18
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index 5edb518be345..d73f565b4e06 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -1148,7 +1148,7 @@ kprobe_perf_func(struct trace_kprobe *tk, struct pt_regs *regs)
1148 size = ALIGN(__size + sizeof(u32), sizeof(u64)); 1148 size = ALIGN(__size + sizeof(u32), sizeof(u64));
1149 size -= sizeof(u32); 1149 size -= sizeof(u32);
1150 1150
1151 entry = perf_trace_buf_prepare(size, call->event.type, regs, &rctx); 1151 entry = perf_trace_buf_prepare(size, call->event.type, NULL, &rctx);
1152 if (!entry) 1152 if (!entry)
1153 return; 1153 return;
1154 1154
@@ -1179,7 +1179,7 @@ kretprobe_perf_func(struct trace_kprobe *tk, struct kretprobe_instance *ri,
1179 size = ALIGN(__size + sizeof(u32), sizeof(u64)); 1179 size = ALIGN(__size + sizeof(u32), sizeof(u64));
1180 size -= sizeof(u32); 1180 size -= sizeof(u32);
1181 1181
1182 entry = perf_trace_buf_prepare(size, call->event.type, regs, &rctx); 1182 entry = perf_trace_buf_prepare(size, call->event.type, NULL, &rctx);
1183 if (!entry) 1183 if (!entry)
1184 return; 1184 return;
1185 1185
@@ -1320,7 +1320,7 @@ static __init int init_kprobe_trace(void)
1320 return -EINVAL; 1320 return -EINVAL;
1321 1321
1322 d_tracer = tracing_init_dentry(); 1322 d_tracer = tracing_init_dentry();
1323 if (!d_tracer) 1323 if (IS_ERR(d_tracer))
1324 return 0; 1324 return 0;
1325 1325
1326 entry = debugfs_create_file("kprobe_events", 0644, d_tracer, 1326 entry = debugfs_create_file("kprobe_events", 0644, d_tracer,
diff --git a/kernel/trace/trace_nop.c b/kernel/trace/trace_nop.c
index fcf0a9e48916..8bb2071474dd 100644
--- a/kernel/trace/trace_nop.c
+++ b/kernel/trace/trace_nop.c
@@ -6,8 +6,6 @@
6 */ 6 */
7 7
8#include <linux/module.h> 8#include <linux/module.h>
9#include <linux/fs.h>
10#include <linux/debugfs.h>
11#include <linux/ftrace.h> 9#include <linux/ftrace.h>
12 10
13#include "trace.h" 11#include "trace.h"
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index b77b9a697619..692bf7184c8c 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -177,6 +177,50 @@ ftrace_print_hex_seq(struct trace_seq *p, const unsigned char *buf, int buf_len)
177} 177}
178EXPORT_SYMBOL(ftrace_print_hex_seq); 178EXPORT_SYMBOL(ftrace_print_hex_seq);
179 179
180const char *
181ftrace_print_array_seq(struct trace_seq *p, const void *buf, int buf_len,
182 size_t el_size)
183{
184 const char *ret = trace_seq_buffer_ptr(p);
185 const char *prefix = "";
186 void *ptr = (void *)buf;
187
188 trace_seq_putc(p, '{');
189
190 while (ptr < buf + buf_len) {
191 switch (el_size) {
192 case 1:
193 trace_seq_printf(p, "%s0x%x", prefix,
194 *(u8 *)ptr);
195 break;
196 case 2:
197 trace_seq_printf(p, "%s0x%x", prefix,
198 *(u16 *)ptr);
199 break;
200 case 4:
201 trace_seq_printf(p, "%s0x%x", prefix,
202 *(u32 *)ptr);
203 break;
204 case 8:
205 trace_seq_printf(p, "%s0x%llx", prefix,
206 *(u64 *)ptr);
207 break;
208 default:
209 trace_seq_printf(p, "BAD SIZE:%zu 0x%x", el_size,
210 *(u8 *)ptr);
211 el_size = 1;
212 }
213 prefix = ",";
214 ptr += el_size;
215 }
216
217 trace_seq_putc(p, '}');
218 trace_seq_putc(p, 0);
219
220 return ret;
221}
222EXPORT_SYMBOL(ftrace_print_array_seq);
223
180int ftrace_raw_output_prep(struct trace_iterator *iter, 224int ftrace_raw_output_prep(struct trace_iterator *iter,
181 struct trace_event *trace_event) 225 struct trace_event *trace_event)
182{ 226{
diff --git a/kernel/trace/trace_printk.c b/kernel/trace/trace_printk.c
index c4e70b6bd7fa..36c1455b7567 100644
--- a/kernel/trace/trace_printk.c
+++ b/kernel/trace/trace_printk.c
@@ -5,7 +5,6 @@
5 * 5 *
6 */ 6 */
7#include <linux/seq_file.h> 7#include <linux/seq_file.h>
8#include <linux/debugfs.h>
9#include <linux/uaccess.h> 8#include <linux/uaccess.h>
10#include <linux/kernel.h> 9#include <linux/kernel.h>
11#include <linux/ftrace.h> 10#include <linux/ftrace.h>
@@ -15,7 +14,6 @@
15#include <linux/ctype.h> 14#include <linux/ctype.h>
16#include <linux/list.h> 15#include <linux/list.h>
17#include <linux/slab.h> 16#include <linux/slab.h>
18#include <linux/fs.h>
19 17
20#include "trace.h" 18#include "trace.h"
21 19
@@ -349,7 +347,7 @@ static __init int init_trace_printk_function_export(void)
349 struct dentry *d_tracer; 347 struct dentry *d_tracer;
350 348
351 d_tracer = tracing_init_dentry(); 349 d_tracer = tracing_init_dentry();
352 if (!d_tracer) 350 if (IS_ERR(d_tracer))
353 return 0; 351 return 0;
354 352
355 trace_create_file("printk_formats", 0444, d_tracer, 353 trace_create_file("printk_formats", 0444, d_tracer,
diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c
index 2e293beb186e..419ca37e72c9 100644
--- a/kernel/trace/trace_sched_switch.c
+++ b/kernel/trace/trace_sched_switch.c
@@ -5,8 +5,6 @@
5 * 5 *
6 */ 6 */
7#include <linux/module.h> 7#include <linux/module.h>
8#include <linux/fs.h>
9#include <linux/debugfs.h>
10#include <linux/kallsyms.h> 8#include <linux/kallsyms.h>
11#include <linux/uaccess.h> 9#include <linux/uaccess.h>
12#include <linux/ftrace.h> 10#include <linux/ftrace.h>
diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c
index 8fb84b362816..d6e1003724e9 100644
--- a/kernel/trace/trace_sched_wakeup.c
+++ b/kernel/trace/trace_sched_wakeup.c
@@ -10,8 +10,6 @@
10 * Copyright (C) 2004 Nadia Yvette Chambers 10 * Copyright (C) 2004 Nadia Yvette Chambers
11 */ 11 */
12#include <linux/module.h> 12#include <linux/module.h>
13#include <linux/fs.h>
14#include <linux/debugfs.h>
15#include <linux/kallsyms.h> 13#include <linux/kallsyms.h>
16#include <linux/uaccess.h> 14#include <linux/uaccess.h>
17#include <linux/ftrace.h> 15#include <linux/ftrace.h>
diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c
index 16eddb308c33..c3e4fcfddd45 100644
--- a/kernel/trace/trace_stack.c
+++ b/kernel/trace/trace_stack.c
@@ -7,12 +7,10 @@
7#include <linux/seq_file.h> 7#include <linux/seq_file.h>
8#include <linux/spinlock.h> 8#include <linux/spinlock.h>
9#include <linux/uaccess.h> 9#include <linux/uaccess.h>
10#include <linux/debugfs.h>
11#include <linux/ftrace.h> 10#include <linux/ftrace.h>
12#include <linux/module.h> 11#include <linux/module.h>
13#include <linux/sysctl.h> 12#include <linux/sysctl.h>
14#include <linux/init.h> 13#include <linux/init.h>
15#include <linux/fs.h>
16 14
17#include <asm/setup.h> 15#include <asm/setup.h>
18 16
@@ -462,7 +460,7 @@ static __init int stack_trace_init(void)
462 struct dentry *d_tracer; 460 struct dentry *d_tracer;
463 461
464 d_tracer = tracing_init_dentry(); 462 d_tracer = tracing_init_dentry();
465 if (!d_tracer) 463 if (IS_ERR(d_tracer))
466 return 0; 464 return 0;
467 465
468 trace_create_file("stack_max_size", 0644, d_tracer, 466 trace_create_file("stack_max_size", 0644, d_tracer,
diff --git a/kernel/trace/trace_stat.c b/kernel/trace/trace_stat.c
index 7af67360b330..75e19e86c954 100644
--- a/kernel/trace/trace_stat.c
+++ b/kernel/trace/trace_stat.c
@@ -276,7 +276,7 @@ static int tracing_stat_init(void)
276 struct dentry *d_tracing; 276 struct dentry *d_tracing;
277 277
278 d_tracing = tracing_init_dentry(); 278 d_tracing = tracing_init_dentry();
279 if (!d_tracing) 279 if (IS_ERR(d_tracing))
280 return 0; 280 return 0;
281 281
282 stat_dir = debugfs_create_dir("trace_stat", d_tracing); 282 stat_dir = debugfs_create_dir("trace_stat", d_tracing);
diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c
index c6ee36fcbf90..f97f6e3a676c 100644
--- a/kernel/trace/trace_syscalls.c
+++ b/kernel/trace/trace_syscalls.c
@@ -574,7 +574,7 @@ static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id)
574 size -= sizeof(u32); 574 size -= sizeof(u32);
575 575
576 rec = (struct syscall_trace_enter *)perf_trace_buf_prepare(size, 576 rec = (struct syscall_trace_enter *)perf_trace_buf_prepare(size,
577 sys_data->enter_event->event.type, regs, &rctx); 577 sys_data->enter_event->event.type, NULL, &rctx);
578 if (!rec) 578 if (!rec)
579 return; 579 return;
580 580
@@ -647,7 +647,7 @@ static void perf_syscall_exit(void *ignore, struct pt_regs *regs, long ret)
647 size -= sizeof(u32); 647 size -= sizeof(u32);
648 648
649 rec = (struct syscall_trace_exit *)perf_trace_buf_prepare(size, 649 rec = (struct syscall_trace_exit *)perf_trace_buf_prepare(size,
650 sys_data->exit_event->event.type, regs, &rctx); 650 sys_data->exit_event->event.type, NULL, &rctx);
651 if (!rec) 651 if (!rec)
652 return; 652 return;
653 653
diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c
index 8520acc34b18..7dc1c8abecd6 100644
--- a/kernel/trace/trace_uprobe.c
+++ b/kernel/trace/trace_uprobe.c
@@ -1111,7 +1111,7 @@ static void __uprobe_perf_func(struct trace_uprobe *tu,
1111 if (hlist_empty(head)) 1111 if (hlist_empty(head))
1112 goto out; 1112 goto out;
1113 1113
1114 entry = perf_trace_buf_prepare(size, call->event.type, regs, &rctx); 1114 entry = perf_trace_buf_prepare(size, call->event.type, NULL, &rctx);
1115 if (!entry) 1115 if (!entry)
1116 goto out; 1116 goto out;
1117 1117
@@ -1321,7 +1321,7 @@ static __init int init_uprobe_trace(void)
1321 struct dentry *d_tracer; 1321 struct dentry *d_tracer;
1322 1322
1323 d_tracer = tracing_init_dentry(); 1323 d_tracer = tracing_init_dentry();
1324 if (!d_tracer) 1324 if (IS_ERR(d_tracer))
1325 return 0; 1325 return 0;
1326 1326
1327 trace_create_file("uprobe_events", 0644, d_tracer, 1327 trace_create_file("uprobe_events", 0644, d_tracer,
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index 70bf11815f84..3174bf8e3538 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -154,7 +154,7 @@ static int get_softlockup_thresh(void)
154 */ 154 */
155static unsigned long get_timestamp(void) 155static unsigned long get_timestamp(void)
156{ 156{
157 return local_clock() >> 30LL; /* 2^30 ~= 10^9 */ 157 return running_clock() >> 30LL; /* 2^30 ~= 10^9 */
158} 158}
159 159
160static void set_sample_period(void) 160static void set_sample_period(void)