Diffstat (limited to 'kernel')
-rw-r--r--  kernel/Makefile | 3
-rw-r--r--  kernel/audit.c | 13
-rw-r--r--  kernel/audit_tree.c | 1
-rw-r--r--  kernel/auditsc.c | 7
-rw-r--r--  kernel/bpf/Makefile | 6
-rw-r--r--  kernel/bpf/arraymap.c | 156
-rw-r--r--  kernel/bpf/core.c | 9
-rw-r--r--  kernel/bpf/hashtab.c | 367
-rw-r--r--  kernel/bpf/helpers.c | 89
-rw-r--r--  kernel/bpf/syscall.c | 6
-rw-r--r--  kernel/bpf/test_stub.c | 56
-rw-r--r--  kernel/bpf/verifier.c | 174
-rw-r--r--  kernel/context_tracking.c | 40
-rw-r--r--  kernel/cpu.c | 21
-rw-r--r--  kernel/cpuset.c | 23
-rw-r--r--  kernel/events/core.c | 85
-rw-r--r--  kernel/events/hw_breakpoint.c | 7
-rw-r--r--  kernel/events/uprobes.c | 1
-rw-r--r--  kernel/exit.c | 248
-rw-r--r--  kernel/extable.c | 7
-rw-r--r--  kernel/fork.c | 5
-rw-r--r--  kernel/futex.c | 36
-rw-r--r--  kernel/gcov/Kconfig | 2
-rw-r--r--  kernel/irq/Kconfig | 15
-rw-r--r--  kernel/irq/Makefile | 1
-rw-r--r--  kernel/irq/chip.c | 130
-rw-r--r--  kernel/irq/generic-chip.c | 36
-rw-r--r--  kernel/irq/irqdomain.c | 567
-rw-r--r--  kernel/irq/manage.c | 2
-rw-r--r--  kernel/irq/msi.c | 330
-rw-r--r--  kernel/kmod.c | 111
-rw-r--r--  kernel/kprobes.c | 2
-rw-r--r--  kernel/locking/mutex.c | 8
-rw-r--r--  kernel/module.c | 30
-rw-r--r--  kernel/panic.c | 14
-rw-r--r--  kernel/pid.c | 2
-rw-r--r--  kernel/pid_namespace.c | 28
-rw-r--r--  kernel/power/Kconfig | 7
-rw-r--r--  kernel/power/hibernate.c | 22
-rw-r--r--  kernel/power/power.h | 3
-rw-r--r--  kernel/power/snapshot.c | 9
-rw-r--r--  kernel/power/suspend.c | 4
-rw-r--r--  kernel/power/swap.c | 43
-rw-r--r--  kernel/printk/printk.c | 90
-rw-r--r--  kernel/ptrace.c | 23
-rw-r--r--  kernel/rcu/Makefile | 2
-rw-r--r--  kernel/rcu/rcu.h | 2
-rw-r--r--  kernel/rcu/rcutorture.c | 1
-rw-r--r--  kernel/rcu/tiny.c | 6
-rw-r--r--  kernel/rcu/tree.c | 112
-rw-r--r--  kernel/rcu/tree.h | 23
-rw-r--r--  kernel/rcu/tree_plugin.h | 144
-rw-r--r--  kernel/rcu/update.c | 89
-rw-r--r--  kernel/res_counter.c | 211
-rw-r--r--  kernel/sched/completion.c | 5
-rw-r--r--  kernel/sched/core.c | 365
-rw-r--r--  kernel/sched/cpudeadline.h | 3
-rw-r--r--  kernel/sched/cpupri.h | 3
-rw-r--r--  kernel/sched/deadline.c | 142
-rw-r--r--  kernel/sched/debug.c | 11
-rw-r--r--  kernel/sched/fair.c | 389
-rw-r--r--  kernel/sched/idle_task.c | 5
-rw-r--r--  kernel/sched/rt.c | 19
-rw-r--r--  kernel/sched/sched.h | 45
-rw-r--r--  kernel/sched/stop_task.c | 5
-rw-r--r--  kernel/sched/wait.c | 66
-rw-r--r--  kernel/signal.c | 46
-rw-r--r--  kernel/smpboot.c | 15
-rw-r--r--  kernel/softirq.c | 2
-rw-r--r--  kernel/sys.c | 12
-rw-r--r--  kernel/sysctl.c | 12
-rw-r--r--  kernel/sysctl_binary.c | 1
-rw-r--r--  kernel/taskstats.c | 2
-rw-r--r--  kernel/time/Makefile | 2
-rw-r--r--  kernel/time/clockevents.c | 2
-rw-r--r--  kernel/time/posix-cpu-timers.c | 2
-rw-r--r--  kernel/time/posix-timers.c | 1
-rw-r--r--  kernel/time/test_udelay.c (renamed from kernel/time/udelay_test.c) | 0
-rw-r--r--  kernel/time/tick-sched.c | 2
-rw-r--r--  kernel/time/time.c | 20
-rw-r--r--  kernel/time/timekeeping.c | 127
-rw-r--r--  kernel/time/timer.c | 3
-rw-r--r--  kernel/trace/blktrace.c | 148
-rw-r--r--  kernel/trace/ftrace.c | 384
-rw-r--r--  kernel/trace/ring_buffer.c | 154
-rw-r--r--  kernel/trace/trace.c | 261
-rw-r--r--  kernel/trace/trace.h | 17
-rw-r--r--  kernel/trace/trace_branch.c | 47
-rw-r--r--  kernel/trace/trace_events.c | 15
-rw-r--r--  kernel/trace/trace_events_filter.c | 29
-rw-r--r--  kernel/trace/trace_events_trigger.c | 6
-rw-r--r--  kernel/trace/trace_functions.c | 119
-rw-r--r--  kernel/trace/trace_functions_graph.c | 423
-rw-r--r--  kernel/trace/trace_kdb.c | 21
-rw-r--r--  kernel/trace/trace_kprobe.c | 46
-rw-r--r--  kernel/trace/trace_mmiotrace.c | 52
-rw-r--r--  kernel/trace/trace_output.c | 446
-rw-r--r--  kernel/trace/trace_output.h | 16
-rw-r--r--  kernel/trace/trace_printk.c | 2
-rw-r--r--  kernel/trace/trace_probe.c | 10
-rw-r--r--  kernel/trace/trace_sched_switch.c | 144
-rw-r--r--  kernel/trace/trace_sched_wakeup.c | 56
-rw-r--r--  kernel/trace/trace_seq.c | 253
-rw-r--r--  kernel/trace/trace_syscalls.c | 55
-rw-r--r--  kernel/trace/trace_uprobe.c | 28
105 files changed, 4913 insertions, 2555 deletions
diff --git a/kernel/Makefile b/kernel/Makefile
index dc5c77544fd6..a59481a3fa6c 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -57,7 +57,6 @@ obj-$(CONFIG_UTS_NS) += utsname.o
57obj-$(CONFIG_USER_NS) += user_namespace.o 57obj-$(CONFIG_USER_NS) += user_namespace.o
58obj-$(CONFIG_PID_NS) += pid_namespace.o 58obj-$(CONFIG_PID_NS) += pid_namespace.o
59obj-$(CONFIG_IKCONFIG) += configs.o 59obj-$(CONFIG_IKCONFIG) += configs.o
60obj-$(CONFIG_RESOURCE_COUNTERS) += res_counter.o
61obj-$(CONFIG_SMP) += stop_machine.o 60obj-$(CONFIG_SMP) += stop_machine.o
62obj-$(CONFIG_KPROBES_SANITY_TEST) += test_kprobes.o 61obj-$(CONFIG_KPROBES_SANITY_TEST) += test_kprobes.o
63obj-$(CONFIG_AUDIT) += audit.o auditfilter.o 62obj-$(CONFIG_AUDIT) += audit.o auditfilter.o
@@ -86,7 +85,7 @@ obj-$(CONFIG_RING_BUFFER) += trace/
86obj-$(CONFIG_TRACEPOINTS) += trace/ 85obj-$(CONFIG_TRACEPOINTS) += trace/
87obj-$(CONFIG_IRQ_WORK) += irq_work.o 86obj-$(CONFIG_IRQ_WORK) += irq_work.o
88obj-$(CONFIG_CPU_PM) += cpu_pm.o 87obj-$(CONFIG_CPU_PM) += cpu_pm.o
89obj-$(CONFIG_NET) += bpf/ 88obj-$(CONFIG_BPF) += bpf/
90 89
91obj-$(CONFIG_PERF_EVENTS) += events/ 90obj-$(CONFIG_PERF_EVENTS) += events/
92 91
diff --git a/kernel/audit.c b/kernel/audit.c
index 80983df92cd4..1f37f15117e5 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -499,7 +499,6 @@ static int kauditd_thread(void *dummy)
499 set_freezable(); 499 set_freezable();
500 while (!kthread_should_stop()) { 500 while (!kthread_should_stop()) {
501 struct sk_buff *skb; 501 struct sk_buff *skb;
502 DECLARE_WAITQUEUE(wait, current);
503 502
504 flush_hold_queue(); 503 flush_hold_queue();
505 504
@@ -514,16 +513,8 @@ static int kauditd_thread(void *dummy)
514 audit_printk_skb(skb); 513 audit_printk_skb(skb);
515 continue; 514 continue;
516 } 515 }
517 set_current_state(TASK_INTERRUPTIBLE);
518 add_wait_queue(&kauditd_wait, &wait);
519 516
520 if (!skb_queue_len(&audit_skb_queue)) { 517 wait_event_freezable(kauditd_wait, skb_queue_len(&audit_skb_queue));
521 try_to_freeze();
522 schedule();
523 }
524
525 __set_current_state(TASK_RUNNING);
526 remove_wait_queue(&kauditd_wait, &wait);
527 } 518 }
528 return 0; 519 return 0;
529} 520}
@@ -739,7 +730,7 @@ static void audit_log_feature_change(int which, u32 old_feature, u32 new_feature
739 730
740 ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_FEATURE_CHANGE); 731 ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_FEATURE_CHANGE);
741 audit_log_task_info(ab, current); 732 audit_log_task_info(ab, current);
742 audit_log_format(ab, "feature=%s old=%u new=%u old_lock=%u new_lock=%u res=%d", 733 audit_log_format(ab, " feature=%s old=%u new=%u old_lock=%u new_lock=%u res=%d",
743 audit_feature_names[which], !!old_feature, !!new_feature, 734 audit_feature_names[which], !!old_feature, !!new_feature,
744 !!old_lock, !!new_lock, res); 735 !!old_lock, !!new_lock, res);
745 audit_log_end(ab); 736 audit_log_end(ab);
diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c
index e242e3a9864a..80f29e015570 100644
--- a/kernel/audit_tree.c
+++ b/kernel/audit_tree.c
@@ -154,6 +154,7 @@ static struct audit_chunk *alloc_chunk(int count)
154 chunk->owners[i].index = i; 154 chunk->owners[i].index = i;
155 } 155 }
156 fsnotify_init_mark(&chunk->mark, audit_tree_destroy_watch); 156 fsnotify_init_mark(&chunk->mark, audit_tree_destroy_watch);
157 chunk->mark.mask = FS_IN_IGNORED;
157 return chunk; 158 return chunk;
158} 159}
159 160
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index e420a0c41b5f..c75522a83678 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -1897,6 +1897,11 @@ out:
1897 audit_copy_inode(n, dentry, inode); 1897 audit_copy_inode(n, dentry, inode);
1898} 1898}
1899 1899
1900void __audit_file(const struct file *file)
1901{
1902 __audit_inode(NULL, file->f_path.dentry, 0);
1903}
1904
1900/** 1905/**
1901 * __audit_inode_child - collect inode info for created/removed objects 1906 * __audit_inode_child - collect inode info for created/removed objects
1902 * @parent: inode of dentry parent 1907 * @parent: inode of dentry parent
@@ -2373,7 +2378,7 @@ int __audit_log_bprm_fcaps(struct linux_binprm *bprm,
2373 ax->d.next = context->aux; 2378 ax->d.next = context->aux;
2374 context->aux = (void *)ax; 2379 context->aux = (void *)ax;
2375 2380
2376 dentry = dget(bprm->file->f_dentry); 2381 dentry = dget(bprm->file->f_path.dentry);
2377 get_vfs_caps_from_disk(dentry, &vcaps); 2382 get_vfs_caps_from_disk(dentry, &vcaps);
2378 dput(dentry); 2383 dput(dentry);
2379 2384
diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile
index 45427239f375..a5ae60f0b0a2 100644
--- a/kernel/bpf/Makefile
+++ b/kernel/bpf/Makefile
@@ -1,5 +1,5 @@
1obj-y := core.o syscall.o verifier.o 1obj-y := core.o
2 2obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o hashtab.o arraymap.o helpers.o
3ifdef CONFIG_TEST_BPF 3ifdef CONFIG_TEST_BPF
4obj-y += test_stub.o 4obj-$(CONFIG_BPF_SYSCALL) += test_stub.o
5endif 5endif
diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c
new file mode 100644
index 000000000000..9eb4d8a7cd87
--- /dev/null
+++ b/kernel/bpf/arraymap.c
@@ -0,0 +1,156 @@
1/* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com
2 *
3 * This program is free software; you can redistribute it and/or
4 * modify it under the terms of version 2 of the GNU General Public
5 * License as published by the Free Software Foundation.
6 *
7 * This program is distributed in the hope that it will be useful, but
8 * WITHOUT ANY WARRANTY; without even the implied warranty of
9 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
10 * General Public License for more details.
11 */
12#include <linux/bpf.h>
13#include <linux/err.h>
14#include <linux/vmalloc.h>
15#include <linux/slab.h>
16#include <linux/mm.h>
17
18struct bpf_array {
19 struct bpf_map map;
20 u32 elem_size;
21 char value[0] __aligned(8);
22};
23
24/* Called from syscall */
25static struct bpf_map *array_map_alloc(union bpf_attr *attr)
26{
27 struct bpf_array *array;
28 u32 elem_size, array_size;
29
30 /* check sanity of attributes */
31 if (attr->max_entries == 0 || attr->key_size != 4 ||
32 attr->value_size == 0)
33 return ERR_PTR(-EINVAL);
34
35 elem_size = round_up(attr->value_size, 8);
36
37 /* check round_up into zero and u32 overflow */
38 if (elem_size == 0 ||
39 attr->max_entries > (U32_MAX - sizeof(*array)) / elem_size)
40 return ERR_PTR(-ENOMEM);
41
42 array_size = sizeof(*array) + attr->max_entries * elem_size;
43
44 /* allocate all map elements and zero-initialize them */
45 array = kzalloc(array_size, GFP_USER | __GFP_NOWARN);
46 if (!array) {
47 array = vzalloc(array_size);
48 if (!array)
49 return ERR_PTR(-ENOMEM);
50 }
51
52 /* copy mandatory map attributes */
53 array->map.key_size = attr->key_size;
54 array->map.value_size = attr->value_size;
55 array->map.max_entries = attr->max_entries;
56
57 array->elem_size = elem_size;
58
59 return &array->map;
60}
61
62/* Called from syscall or from eBPF program */
63static void *array_map_lookup_elem(struct bpf_map *map, void *key)
64{
65 struct bpf_array *array = container_of(map, struct bpf_array, map);
66 u32 index = *(u32 *)key;
67
68 if (index >= array->map.max_entries)
69 return NULL;
70
71 return array->value + array->elem_size * index;
72}
73
74/* Called from syscall */
75static int array_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
76{
77 struct bpf_array *array = container_of(map, struct bpf_array, map);
78 u32 index = *(u32 *)key;
79 u32 *next = (u32 *)next_key;
80
81 if (index >= array->map.max_entries) {
82 *next = 0;
83 return 0;
84 }
85
86 if (index == array->map.max_entries - 1)
87 return -ENOENT;
88
89 *next = index + 1;
90 return 0;
91}
92
93/* Called from syscall or from eBPF program */
94static int array_map_update_elem(struct bpf_map *map, void *key, void *value,
95 u64 map_flags)
96{
97 struct bpf_array *array = container_of(map, struct bpf_array, map);
98 u32 index = *(u32 *)key;
99
100 if (map_flags > BPF_EXIST)
101 /* unknown flags */
102 return -EINVAL;
103
104 if (index >= array->map.max_entries)
105 /* all elements were pre-allocated, cannot insert a new one */
106 return -E2BIG;
107
108 if (map_flags == BPF_NOEXIST)
109 /* all elements already exist */
110 return -EEXIST;
111
112 memcpy(array->value + array->elem_size * index, value, array->elem_size);
113 return 0;
114}
115
116/* Called from syscall or from eBPF program */
117static int array_map_delete_elem(struct bpf_map *map, void *key)
118{
119 return -EINVAL;
120}
121
122/* Called when map->refcnt goes to zero, either from workqueue or from syscall */
123static void array_map_free(struct bpf_map *map)
124{
125 struct bpf_array *array = container_of(map, struct bpf_array, map);
126
127 /* at this point bpf_prog->aux->refcnt == 0 and this map->refcnt == 0,
128 * so the programs (can be more than one that used this map) were
129 * disconnected from events. Wait for outstanding programs to complete
130 * and free the array
131 */
132 synchronize_rcu();
133
134 kvfree(array);
135}
136
137static struct bpf_map_ops array_ops = {
138 .map_alloc = array_map_alloc,
139 .map_free = array_map_free,
140 .map_get_next_key = array_map_get_next_key,
141 .map_lookup_elem = array_map_lookup_elem,
142 .map_update_elem = array_map_update_elem,
143 .map_delete_elem = array_map_delete_elem,
144};
145
146static struct bpf_map_type_list tl = {
147 .ops = &array_ops,
148 .type = BPF_MAP_TYPE_ARRAY,
149};
150
151static int __init register_array_map(void)
152{
153 bpf_register_map_type(&tl);
154 return 0;
155}
156late_initcall(register_array_map);
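
[Editor's note, not part of the commit: a minimal user-space sketch of how the new BPF_MAP_TYPE_ARRAY added above can be exercised through the raw bpf(2) syscall. It assumes a uapi <linux/bpf.h> and __NR_bpf that match this kernel series; error handling is trimmed for brevity.]

#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/bpf.h>

static int sys_bpf(int cmd, union bpf_attr *attr, unsigned int size)
{
	return syscall(__NR_bpf, cmd, attr, size);
}

int main(void)
{
	union bpf_attr attr;
	unsigned int key = 3;                  /* array maps require key_size == 4 */
	unsigned long long val = 42, out = 0;
	int map_fd;

	memset(&attr, 0, sizeof(attr));
	attr.map_type = BPF_MAP_TYPE_ARRAY;
	attr.key_size = sizeof(key);
	attr.value_size = sizeof(val);
	attr.max_entries = 8;                  /* all slots pre-allocated and zeroed */
	map_fd = sys_bpf(BPF_MAP_CREATE, &attr, sizeof(attr));
	if (map_fd < 0)
		return 1;

	/* slots always exist, so BPF_NOEXIST would fail with EEXIST; use BPF_ANY */
	memset(&attr, 0, sizeof(attr));
	attr.map_fd = map_fd;
	attr.key = (unsigned long) &key;
	attr.value = (unsigned long) &val;
	attr.flags = BPF_ANY;
	if (sys_bpf(BPF_MAP_UPDATE_ELEM, &attr, sizeof(attr)))
		return 1;

	memset(&attr, 0, sizeof(attr));
	attr.map_fd = map_fd;
	attr.key = (unsigned long) &key;
	attr.value = (unsigned long) &out;
	if (sys_bpf(BPF_MAP_LOOKUP_ELEM, &attr, sizeof(attr)))
		return 1;

	printf("index %u = %llu\n", key, out);
	return 0;
}
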
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index f0c30c59b317..d6594e457a25 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -655,3 +655,12 @@ void bpf_prog_free(struct bpf_prog *fp)
655 schedule_work(&aux->work); 655 schedule_work(&aux->work);
656} 656}
657EXPORT_SYMBOL_GPL(bpf_prog_free); 657EXPORT_SYMBOL_GPL(bpf_prog_free);
658
659/* To execute LD_ABS/LD_IND instructions __bpf_prog_run() may call
660 * skb_copy_bits(), so provide a weak definition of it for NET-less config.
661 */
662int __weak skb_copy_bits(const struct sk_buff *skb, int offset, void *to,
663 int len)
664{
665 return -EFAULT;
666}
diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
new file mode 100644
index 000000000000..b3ba43674310
--- /dev/null
+++ b/kernel/bpf/hashtab.c
@@ -0,0 +1,367 @@
1/* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com
2 *
3 * This program is free software; you can redistribute it and/or
4 * modify it under the terms of version 2 of the GNU General Public
5 * License as published by the Free Software Foundation.
6 *
7 * This program is distributed in the hope that it will be useful, but
8 * WITHOUT ANY WARRANTY; without even the implied warranty of
9 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
10 * General Public License for more details.
11 */
12#include <linux/bpf.h>
13#include <linux/jhash.h>
14#include <linux/filter.h>
15#include <linux/vmalloc.h>
16
17struct bpf_htab {
18 struct bpf_map map;
19 struct hlist_head *buckets;
20 spinlock_t lock;
21 u32 count; /* number of elements in this hashtable */
22 u32 n_buckets; /* number of hash buckets */
23 u32 elem_size; /* size of each element in bytes */
24};
25
26/* each htab element is struct htab_elem + key + value */
27struct htab_elem {
28 struct hlist_node hash_node;
29 struct rcu_head rcu;
30 u32 hash;
31 char key[0] __aligned(8);
32};
33
34/* Called from syscall */
35static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
36{
37 struct bpf_htab *htab;
38 int err, i;
39
40 htab = kzalloc(sizeof(*htab), GFP_USER);
41 if (!htab)
42 return ERR_PTR(-ENOMEM);
43
44 /* mandatory map attributes */
45 htab->map.key_size = attr->key_size;
46 htab->map.value_size = attr->value_size;
47 htab->map.max_entries = attr->max_entries;
48
49 /* check sanity of attributes.
50 * value_size == 0 may be allowed in the future to use map as a set
51 */
52 err = -EINVAL;
53 if (htab->map.max_entries == 0 || htab->map.key_size == 0 ||
54 htab->map.value_size == 0)
55 goto free_htab;
56
57 /* hash table size must be power of 2 */
58 htab->n_buckets = roundup_pow_of_two(htab->map.max_entries);
59
60 err = -E2BIG;
61 if (htab->map.key_size > MAX_BPF_STACK)
62 /* eBPF programs initialize keys on stack, so they cannot be
63 * larger than max stack size
64 */
65 goto free_htab;
66
67 err = -ENOMEM;
68 /* prevent zero size kmalloc and check for u32 overflow */
69 if (htab->n_buckets == 0 ||
70 htab->n_buckets > U32_MAX / sizeof(struct hlist_head))
71 goto free_htab;
72
73 htab->buckets = kmalloc_array(htab->n_buckets, sizeof(struct hlist_head),
74 GFP_USER | __GFP_NOWARN);
75
76 if (!htab->buckets) {
77 htab->buckets = vmalloc(htab->n_buckets * sizeof(struct hlist_head));
78 if (!htab->buckets)
79 goto free_htab;
80 }
81
82 for (i = 0; i < htab->n_buckets; i++)
83 INIT_HLIST_HEAD(&htab->buckets[i]);
84
85 spin_lock_init(&htab->lock);
86 htab->count = 0;
87
88 htab->elem_size = sizeof(struct htab_elem) +
89 round_up(htab->map.key_size, 8) +
90 htab->map.value_size;
91 return &htab->map;
92
93free_htab:
94 kfree(htab);
95 return ERR_PTR(err);
96}
97
98static inline u32 htab_map_hash(const void *key, u32 key_len)
99{
100 return jhash(key, key_len, 0);
101}
102
103static inline struct hlist_head *select_bucket(struct bpf_htab *htab, u32 hash)
104{
105 return &htab->buckets[hash & (htab->n_buckets - 1)];
106}
107
108static struct htab_elem *lookup_elem_raw(struct hlist_head *head, u32 hash,
109 void *key, u32 key_size)
110{
111 struct htab_elem *l;
112
113 hlist_for_each_entry_rcu(l, head, hash_node)
114 if (l->hash == hash && !memcmp(&l->key, key, key_size))
115 return l;
116
117 return NULL;
118}
119
120/* Called from syscall or from eBPF program */
121static void *htab_map_lookup_elem(struct bpf_map *map, void *key)
122{
123 struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
124 struct hlist_head *head;
125 struct htab_elem *l;
126 u32 hash, key_size;
127
128 /* Must be called with rcu_read_lock. */
129 WARN_ON_ONCE(!rcu_read_lock_held());
130
131 key_size = map->key_size;
132
133 hash = htab_map_hash(key, key_size);
134
135 head = select_bucket(htab, hash);
136
137 l = lookup_elem_raw(head, hash, key, key_size);
138
139 if (l)
140 return l->key + round_up(map->key_size, 8);
141
142 return NULL;
143}
144
145/* Called from syscall */
146static int htab_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
147{
148 struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
149 struct hlist_head *head;
150 struct htab_elem *l, *next_l;
151 u32 hash, key_size;
152 int i;
153
154 WARN_ON_ONCE(!rcu_read_lock_held());
155
156 key_size = map->key_size;
157
158 hash = htab_map_hash(key, key_size);
159
160 head = select_bucket(htab, hash);
161
162 /* lookup the key */
163 l = lookup_elem_raw(head, hash, key, key_size);
164
165 if (!l) {
166 i = 0;
167 goto find_first_elem;
168 }
169
170 /* key was found, get next key in the same bucket */
171 next_l = hlist_entry_safe(rcu_dereference_raw(hlist_next_rcu(&l->hash_node)),
172 struct htab_elem, hash_node);
173
174 if (next_l) {
175 /* if next elem in this hash list is non-zero, just return it */
176 memcpy(next_key, next_l->key, key_size);
177 return 0;
178 }
179
180 /* no more elements in this hash list, go to the next bucket */
181 i = hash & (htab->n_buckets - 1);
182 i++;
183
184find_first_elem:
185 /* iterate over buckets */
186 for (; i < htab->n_buckets; i++) {
187 head = select_bucket(htab, i);
188
189 /* pick first element in the bucket */
190 next_l = hlist_entry_safe(rcu_dereference_raw(hlist_first_rcu(head)),
191 struct htab_elem, hash_node);
192 if (next_l) {
193 /* if it's not empty, just return it */
194 memcpy(next_key, next_l->key, key_size);
195 return 0;
196 }
197 }
198
199 /* itereated over all buckets and all elements */
200 return -ENOENT;
201}
202
203/* Called from syscall or from eBPF program */
204static int htab_map_update_elem(struct bpf_map *map, void *key, void *value,
205 u64 map_flags)
206{
207 struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
208 struct htab_elem *l_new, *l_old;
209 struct hlist_head *head;
210 unsigned long flags;
211 u32 key_size;
212 int ret;
213
214 if (map_flags > BPF_EXIST)
215 /* unknown flags */
216 return -EINVAL;
217
218 WARN_ON_ONCE(!rcu_read_lock_held());
219
220 /* allocate new element outside of lock */
221 l_new = kmalloc(htab->elem_size, GFP_ATOMIC);
222 if (!l_new)
223 return -ENOMEM;
224
225 key_size = map->key_size;
226
227 memcpy(l_new->key, key, key_size);
228 memcpy(l_new->key + round_up(key_size, 8), value, map->value_size);
229
230 l_new->hash = htab_map_hash(l_new->key, key_size);
231
232 /* bpf_map_update_elem() can be called in_irq() */
233 spin_lock_irqsave(&htab->lock, flags);
234
235 head = select_bucket(htab, l_new->hash);
236
237 l_old = lookup_elem_raw(head, l_new->hash, key, key_size);
238
239 if (!l_old && unlikely(htab->count >= map->max_entries)) {
240 /* if elem with this 'key' doesn't exist and we've reached
241 * max_entries limit, fail insertion of new elem
242 */
243 ret = -E2BIG;
244 goto err;
245 }
246
247 if (l_old && map_flags == BPF_NOEXIST) {
248 /* elem already exists */
249 ret = -EEXIST;
250 goto err;
251 }
252
253 if (!l_old && map_flags == BPF_EXIST) {
254 /* elem doesn't exist, cannot update it */
255 ret = -ENOENT;
256 goto err;
257 }
258
259 /* add new element to the head of the list, so that concurrent
260 * search will find it before old elem
261 */
262 hlist_add_head_rcu(&l_new->hash_node, head);
263 if (l_old) {
264 hlist_del_rcu(&l_old->hash_node);
265 kfree_rcu(l_old, rcu);
266 } else {
267 htab->count++;
268 }
269 spin_unlock_irqrestore(&htab->lock, flags);
270
271 return 0;
272err:
273 spin_unlock_irqrestore(&htab->lock, flags);
274 kfree(l_new);
275 return ret;
276}
277
278/* Called from syscall or from eBPF program */
279static int htab_map_delete_elem(struct bpf_map *map, void *key)
280{
281 struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
282 struct hlist_head *head;
283 struct htab_elem *l;
284 unsigned long flags;
285 u32 hash, key_size;
286 int ret = -ENOENT;
287
288 WARN_ON_ONCE(!rcu_read_lock_held());
289
290 key_size = map->key_size;
291
292 hash = htab_map_hash(key, key_size);
293
294 spin_lock_irqsave(&htab->lock, flags);
295
296 head = select_bucket(htab, hash);
297
298 l = lookup_elem_raw(head, hash, key, key_size);
299
300 if (l) {
301 hlist_del_rcu(&l->hash_node);
302 htab->count--;
303 kfree_rcu(l, rcu);
304 ret = 0;
305 }
306
307 spin_unlock_irqrestore(&htab->lock, flags);
308 return ret;
309}
310
311static void delete_all_elements(struct bpf_htab *htab)
312{
313 int i;
314
315 for (i = 0; i < htab->n_buckets; i++) {
316 struct hlist_head *head = select_bucket(htab, i);
317 struct hlist_node *n;
318 struct htab_elem *l;
319
320 hlist_for_each_entry_safe(l, n, head, hash_node) {
321 hlist_del_rcu(&l->hash_node);
322 htab->count--;
323 kfree(l);
324 }
325 }
326}
327
328/* Called when map->refcnt goes to zero, either from workqueue or from syscall */
329static void htab_map_free(struct bpf_map *map)
330{
331 struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
332
333 /* at this point bpf_prog->aux->refcnt == 0 and this map->refcnt == 0,
334 * so the programs (can be more than one that used this map) were
335 * disconnected from events. Wait for outstanding critical sections in
336 * these programs to complete
337 */
338 synchronize_rcu();
339
340 /* some of kfree_rcu() callbacks for elements of this map may not have
341 * executed. It's ok. Proceed to free residual elements and map itself
342 */
343 delete_all_elements(htab);
344 kvfree(htab->buckets);
345 kfree(htab);
346}
347
348static struct bpf_map_ops htab_ops = {
349 .map_alloc = htab_map_alloc,
350 .map_free = htab_map_free,
351 .map_get_next_key = htab_map_get_next_key,
352 .map_lookup_elem = htab_map_lookup_elem,
353 .map_update_elem = htab_map_update_elem,
354 .map_delete_elem = htab_map_delete_elem,
355};
356
357static struct bpf_map_type_list tl = {
358 .ops = &htab_ops,
359 .type = BPF_MAP_TYPE_HASH,
360};
361
362static int __init register_htab_map(void)
363{
364 bpf_register_map_type(&tl);
365 return 0;
366}
367late_initcall(register_htab_map);
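
[Editor's note, not part of the commit: a hedged sketch of the update-flag semantics that htab_map_update_elem() above implements -- BPF_NOEXIST fails with EEXIST when the key is already present, BPF_EXIST fails with ENOENT when it is absent -- again driven through the raw bpf(2) syscall, with header and syscall availability assumed.]

#include <errno.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/bpf.h>

static int bpf_update(int fd, const void *key, const void *val,
		      unsigned long long flags)
{
	union bpf_attr attr;

	memset(&attr, 0, sizeof(attr));
	attr.map_fd = fd;
	attr.key = (unsigned long) key;
	attr.value = (unsigned long) val;
	attr.flags = flags;
	return syscall(__NR_bpf, BPF_MAP_UPDATE_ELEM, &attr, sizeof(attr));
}

int main(void)
{
	union bpf_attr attr;
	unsigned long long key = 1, val = 100;
	int fd;

	memset(&attr, 0, sizeof(attr));
	attr.map_type = BPF_MAP_TYPE_HASH;
	attr.key_size = sizeof(key);
	attr.value_size = sizeof(val);
	attr.max_entries = 16;
	fd = syscall(__NR_bpf, BPF_MAP_CREATE, &attr, sizeof(attr));
	if (fd < 0)
		return 1;

	/* first insert: key absent, so BPF_NOEXIST succeeds */
	if (bpf_update(fd, &key, &val, BPF_NOEXIST))
		return 1;

	/* inserting the same key again with BPF_NOEXIST is rejected with EEXIST */
	if (bpf_update(fd, &key, &val, BPF_NOEXIST) == 0 || errno != EEXIST)
		return 1;

	/* BPF_EXIST updates a present key in place */
	val = 200;
	if (bpf_update(fd, &key, &val, BPF_EXIST))
		return 1;

	printf("hash map flag semantics behaved as expected\n");
	return 0;
}
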
diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
new file mode 100644
index 000000000000..9e3414d85459
--- /dev/null
+++ b/kernel/bpf/helpers.c
@@ -0,0 +1,89 @@
1/* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com
2 *
3 * This program is free software; you can redistribute it and/or
4 * modify it under the terms of version 2 of the GNU General Public
5 * License as published by the Free Software Foundation.
6 *
7 * This program is distributed in the hope that it will be useful, but
8 * WITHOUT ANY WARRANTY; without even the implied warranty of
9 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
10 * General Public License for more details.
11 */
12#include <linux/bpf.h>
13#include <linux/rcupdate.h>
14
15/* If kernel subsystem is allowing eBPF programs to call this function,
16 * inside its own verifier_ops->get_func_proto() callback it should return
17 * bpf_map_lookup_elem_proto, so that verifier can properly check the arguments
18 *
19 * Different map implementations will rely on rcu in map methods
20 * lookup/update/delete, therefore eBPF programs must run under rcu lock
21 * if program is allowed to access maps, so check rcu_read_lock_held in
22 * all three functions.
23 */
24static u64 bpf_map_lookup_elem(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
25{
26 /* verifier checked that R1 contains a valid pointer to bpf_map
27 * and R2 points to a program stack and map->key_size bytes were
28 * initialized
29 */
30 struct bpf_map *map = (struct bpf_map *) (unsigned long) r1;
31 void *key = (void *) (unsigned long) r2;
32 void *value;
33
34 WARN_ON_ONCE(!rcu_read_lock_held());
35
36 value = map->ops->map_lookup_elem(map, key);
37
38 /* lookup() returns either pointer to element value or NULL
39 * which is the meaning of PTR_TO_MAP_VALUE_OR_NULL type
40 */
41 return (unsigned long) value;
42}
43
44struct bpf_func_proto bpf_map_lookup_elem_proto = {
45 .func = bpf_map_lookup_elem,
46 .gpl_only = false,
47 .ret_type = RET_PTR_TO_MAP_VALUE_OR_NULL,
48 .arg1_type = ARG_CONST_MAP_PTR,
49 .arg2_type = ARG_PTR_TO_MAP_KEY,
50};
51
52static u64 bpf_map_update_elem(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
53{
54 struct bpf_map *map = (struct bpf_map *) (unsigned long) r1;
55 void *key = (void *) (unsigned long) r2;
56 void *value = (void *) (unsigned long) r3;
57
58 WARN_ON_ONCE(!rcu_read_lock_held());
59
60 return map->ops->map_update_elem(map, key, value, r4);
61}
62
63struct bpf_func_proto bpf_map_update_elem_proto = {
64 .func = bpf_map_update_elem,
65 .gpl_only = false,
66 .ret_type = RET_INTEGER,
67 .arg1_type = ARG_CONST_MAP_PTR,
68 .arg2_type = ARG_PTR_TO_MAP_KEY,
69 .arg3_type = ARG_PTR_TO_MAP_VALUE,
70 .arg4_type = ARG_ANYTHING,
71};
72
73static u64 bpf_map_delete_elem(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
74{
75 struct bpf_map *map = (struct bpf_map *) (unsigned long) r1;
76 void *key = (void *) (unsigned long) r2;
77
78 WARN_ON_ONCE(!rcu_read_lock_held());
79
80 return map->ops->map_delete_elem(map, key);
81}
82
83struct bpf_func_proto bpf_map_delete_elem_proto = {
84 .func = bpf_map_delete_elem,
85 .gpl_only = false,
86 .ret_type = RET_INTEGER,
87 .arg1_type = ARG_CONST_MAP_PTR,
88 .arg2_type = ARG_PTR_TO_MAP_KEY,
89};
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index ba61c8c16032..088ac0b1b106 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -169,7 +169,7 @@ static int map_lookup_elem(union bpf_attr *attr)
169 if (copy_from_user(key, ukey, map->key_size) != 0) 169 if (copy_from_user(key, ukey, map->key_size) != 0)
170 goto free_key; 170 goto free_key;
171 171
172 err = -ESRCH; 172 err = -ENOENT;
173 rcu_read_lock(); 173 rcu_read_lock();
174 value = map->ops->map_lookup_elem(map, key); 174 value = map->ops->map_lookup_elem(map, key);
175 if (!value) 175 if (!value)
@@ -190,7 +190,7 @@ err_put:
190 return err; 190 return err;
191} 191}
192 192
193#define BPF_MAP_UPDATE_ELEM_LAST_FIELD value 193#define BPF_MAP_UPDATE_ELEM_LAST_FIELD flags
194 194
195static int map_update_elem(union bpf_attr *attr) 195static int map_update_elem(union bpf_attr *attr)
196{ 196{
@@ -231,7 +231,7 @@ static int map_update_elem(union bpf_attr *attr)
231 * therefore all map accessors rely on this fact, so do the same here 231 * therefore all map accessors rely on this fact, so do the same here
232 */ 232 */
233 rcu_read_lock(); 233 rcu_read_lock();
234 err = map->ops->map_update_elem(map, key, value); 234 err = map->ops->map_update_elem(map, key, value, attr->flags);
235 rcu_read_unlock(); 235 rcu_read_unlock();
236 236
237free_value: 237free_value:
diff --git a/kernel/bpf/test_stub.c b/kernel/bpf/test_stub.c
index fcaddff4003e..0ceae1e6e8b5 100644
--- a/kernel/bpf/test_stub.c
+++ b/kernel/bpf/test_stub.c
@@ -18,26 +18,18 @@ struct bpf_context {
18 u64 arg2; 18 u64 arg2;
19}; 19};
20 20
21static u64 test_func(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
22{
23 return 0;
24}
25
26static struct bpf_func_proto test_funcs[] = {
27 [BPF_FUNC_unspec] = {
28 .func = test_func,
29 .gpl_only = true,
30 .ret_type = RET_PTR_TO_MAP_VALUE_OR_NULL,
31 .arg1_type = ARG_CONST_MAP_PTR,
32 .arg2_type = ARG_PTR_TO_MAP_KEY,
33 },
34};
35
36static const struct bpf_func_proto *test_func_proto(enum bpf_func_id func_id) 21static const struct bpf_func_proto *test_func_proto(enum bpf_func_id func_id)
37{ 22{
38 if (func_id < 0 || func_id >= ARRAY_SIZE(test_funcs)) 23 switch (func_id) {
24 case BPF_FUNC_map_lookup_elem:
25 return &bpf_map_lookup_elem_proto;
26 case BPF_FUNC_map_update_elem:
27 return &bpf_map_update_elem_proto;
28 case BPF_FUNC_map_delete_elem:
29 return &bpf_map_delete_elem_proto;
30 default:
39 return NULL; 31 return NULL;
40 return &test_funcs[func_id]; 32 }
41} 33}
42 34
43static const struct bpf_context_access { 35static const struct bpf_context_access {
@@ -78,38 +70,8 @@ static struct bpf_prog_type_list tl_prog = {
78 .type = BPF_PROG_TYPE_UNSPEC, 70 .type = BPF_PROG_TYPE_UNSPEC,
79}; 71};
80 72
81static struct bpf_map *test_map_alloc(union bpf_attr *attr)
82{
83 struct bpf_map *map;
84
85 map = kzalloc(sizeof(*map), GFP_USER);
86 if (!map)
87 return ERR_PTR(-ENOMEM);
88
89 map->key_size = attr->key_size;
90 map->value_size = attr->value_size;
91 map->max_entries = attr->max_entries;
92 return map;
93}
94
95static void test_map_free(struct bpf_map *map)
96{
97 kfree(map);
98}
99
100static struct bpf_map_ops test_map_ops = {
101 .map_alloc = test_map_alloc,
102 .map_free = test_map_free,
103};
104
105static struct bpf_map_type_list tl_map = {
106 .ops = &test_map_ops,
107 .type = BPF_MAP_TYPE_UNSPEC,
108};
109
110static int __init register_test_ops(void) 73static int __init register_test_ops(void)
111{ 74{
112 bpf_register_map_type(&tl_map);
113 bpf_register_prog_type(&tl_prog); 75 bpf_register_prog_type(&tl_prog);
114 return 0; 76 return 0;
115} 77}
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 801f5f3b9307..a28e09c7825d 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -153,22 +153,19 @@ struct reg_state {
153 153
154enum bpf_stack_slot_type { 154enum bpf_stack_slot_type {
155 STACK_INVALID, /* nothing was stored in this stack slot */ 155 STACK_INVALID, /* nothing was stored in this stack slot */
156 STACK_SPILL, /* 1st byte of register spilled into stack */ 156 STACK_SPILL, /* register spilled into stack */
157 STACK_SPILL_PART, /* other 7 bytes of register spill */
158 STACK_MISC /* BPF program wrote some data into this slot */ 157 STACK_MISC /* BPF program wrote some data into this slot */
159}; 158};
160 159
161struct bpf_stack_slot { 160#define BPF_REG_SIZE 8 /* size of eBPF register in bytes */
162 enum bpf_stack_slot_type stype;
163 struct reg_state reg_st;
164};
165 161
166/* state of the program: 162/* state of the program:
167 * type of all registers and stack info 163 * type of all registers and stack info
168 */ 164 */
169struct verifier_state { 165struct verifier_state {
170 struct reg_state regs[MAX_BPF_REG]; 166 struct reg_state regs[MAX_BPF_REG];
171 struct bpf_stack_slot stack[MAX_BPF_STACK]; 167 u8 stack_slot_type[MAX_BPF_STACK];
168 struct reg_state spilled_regs[MAX_BPF_STACK / BPF_REG_SIZE];
172}; 169};
173 170
174/* linked list of verifier states used to prune search */ 171/* linked list of verifier states used to prune search */
@@ -259,10 +256,10 @@ static void print_verifier_state(struct verifier_env *env)
259 env->cur_state.regs[i].map_ptr->key_size, 256 env->cur_state.regs[i].map_ptr->key_size,
260 env->cur_state.regs[i].map_ptr->value_size); 257 env->cur_state.regs[i].map_ptr->value_size);
261 } 258 }
262 for (i = 0; i < MAX_BPF_STACK; i++) { 259 for (i = 0; i < MAX_BPF_STACK; i += BPF_REG_SIZE) {
263 if (env->cur_state.stack[i].stype == STACK_SPILL) 260 if (env->cur_state.stack_slot_type[i] == STACK_SPILL)
264 verbose(" fp%d=%s", -MAX_BPF_STACK + i, 261 verbose(" fp%d=%s", -MAX_BPF_STACK + i,
265 reg_type_str[env->cur_state.stack[i].reg_st.type]); 262 reg_type_str[env->cur_state.spilled_regs[i / BPF_REG_SIZE].type]);
266 } 263 }
267 verbose("\n"); 264 verbose("\n");
268} 265}
@@ -539,8 +536,10 @@ static int bpf_size_to_bytes(int bpf_size)
539static int check_stack_write(struct verifier_state *state, int off, int size, 536static int check_stack_write(struct verifier_state *state, int off, int size,
540 int value_regno) 537 int value_regno)
541{ 538{
542 struct bpf_stack_slot *slot;
543 int i; 539 int i;
540 /* caller checked that off % size == 0 and -MAX_BPF_STACK <= off < 0,
541 * so it's aligned access and [off, off + size) are within stack limits
542 */
544 543
545 if (value_regno >= 0 && 544 if (value_regno >= 0 &&
546 (state->regs[value_regno].type == PTR_TO_MAP_VALUE || 545 (state->regs[value_regno].type == PTR_TO_MAP_VALUE ||
@@ -548,30 +547,24 @@ static int check_stack_write(struct verifier_state *state, int off, int size,
548 state->regs[value_regno].type == PTR_TO_CTX)) { 547 state->regs[value_regno].type == PTR_TO_CTX)) {
549 548
550 /* register containing pointer is being spilled into stack */ 549 /* register containing pointer is being spilled into stack */
551 if (size != 8) { 550 if (size != BPF_REG_SIZE) {
552 verbose("invalid size of register spill\n"); 551 verbose("invalid size of register spill\n");
553 return -EACCES; 552 return -EACCES;
554 } 553 }
555 554
556 slot = &state->stack[MAX_BPF_STACK + off];
557 slot->stype = STACK_SPILL;
558 /* save register state */ 555 /* save register state */
559 slot->reg_st = state->regs[value_regno]; 556 state->spilled_regs[(MAX_BPF_STACK + off) / BPF_REG_SIZE] =
560 for (i = 1; i < 8; i++) { 557 state->regs[value_regno];
561 slot = &state->stack[MAX_BPF_STACK + off + i];
562 slot->stype = STACK_SPILL_PART;
563 slot->reg_st.type = UNKNOWN_VALUE;
564 slot->reg_st.map_ptr = NULL;
565 }
566 } else {
567 558
559 for (i = 0; i < BPF_REG_SIZE; i++)
560 state->stack_slot_type[MAX_BPF_STACK + off + i] = STACK_SPILL;
561 } else {
568 /* regular write of data into stack */ 562 /* regular write of data into stack */
569 for (i = 0; i < size; i++) { 563 state->spilled_regs[(MAX_BPF_STACK + off) / BPF_REG_SIZE] =
570 slot = &state->stack[MAX_BPF_STACK + off + i]; 564 (struct reg_state) {};
571 slot->stype = STACK_MISC; 565
572 slot->reg_st.type = UNKNOWN_VALUE; 566 for (i = 0; i < size; i++)
573 slot->reg_st.map_ptr = NULL; 567 state->stack_slot_type[MAX_BPF_STACK + off + i] = STACK_MISC;
574 }
575 } 568 }
576 return 0; 569 return 0;
577} 570}
@@ -579,19 +572,18 @@ static int check_stack_write(struct verifier_state *state, int off, int size,
579static int check_stack_read(struct verifier_state *state, int off, int size, 572static int check_stack_read(struct verifier_state *state, int off, int size,
580 int value_regno) 573 int value_regno)
581{ 574{
575 u8 *slot_type;
582 int i; 576 int i;
583 struct bpf_stack_slot *slot;
584 577
585 slot = &state->stack[MAX_BPF_STACK + off]; 578 slot_type = &state->stack_slot_type[MAX_BPF_STACK + off];
586 579
587 if (slot->stype == STACK_SPILL) { 580 if (slot_type[0] == STACK_SPILL) {
588 if (size != 8) { 581 if (size != BPF_REG_SIZE) {
589 verbose("invalid size of register spill\n"); 582 verbose("invalid size of register spill\n");
590 return -EACCES; 583 return -EACCES;
591 } 584 }
592 for (i = 1; i < 8; i++) { 585 for (i = 1; i < BPF_REG_SIZE; i++) {
593 if (state->stack[MAX_BPF_STACK + off + i].stype != 586 if (slot_type[i] != STACK_SPILL) {
594 STACK_SPILL_PART) {
595 verbose("corrupted spill memory\n"); 587 verbose("corrupted spill memory\n");
596 return -EACCES; 588 return -EACCES;
597 } 589 }
@@ -599,12 +591,12 @@ static int check_stack_read(struct verifier_state *state, int off, int size,
599 591
600 if (value_regno >= 0) 592 if (value_regno >= 0)
601 /* restore register state from stack */ 593 /* restore register state from stack */
602 state->regs[value_regno] = slot->reg_st; 594 state->regs[value_regno] =
595 state->spilled_regs[(MAX_BPF_STACK + off) / BPF_REG_SIZE];
603 return 0; 596 return 0;
604 } else { 597 } else {
605 for (i = 0; i < size; i++) { 598 for (i = 0; i < size; i++) {
606 if (state->stack[MAX_BPF_STACK + off + i].stype != 599 if (slot_type[i] != STACK_MISC) {
607 STACK_MISC) {
608 verbose("invalid read from stack off %d+%d size %d\n", 600 verbose("invalid read from stack off %d+%d size %d\n",
609 off, i, size); 601 off, i, size);
610 return -EACCES; 602 return -EACCES;
@@ -747,7 +739,7 @@ static int check_stack_boundary(struct verifier_env *env,
747 } 739 }
748 740
749 for (i = 0; i < access_size; i++) { 741 for (i = 0; i < access_size; i++) {
750 if (state->stack[MAX_BPF_STACK + off + i].stype != STACK_MISC) { 742 if (state->stack_slot_type[MAX_BPF_STACK + off + i] != STACK_MISC) {
751 verbose("invalid indirect read from stack off %d+%d size %d\n", 743 verbose("invalid indirect read from stack off %d+%d size %d\n",
752 off, i, access_size); 744 off, i, access_size);
753 return -EACCES; 745 return -EACCES;
@@ -1180,6 +1172,70 @@ static int check_ld_imm(struct verifier_env *env, struct bpf_insn *insn)
1180 return 0; 1172 return 0;
1181} 1173}
1182 1174
1175/* verify safety of LD_ABS|LD_IND instructions:
1176 * - they can only appear in the programs where ctx == skb
1177 * - since they are wrappers of function calls, they scratch R1-R5 registers,
1178 * preserve R6-R9, and store return value into R0
1179 *
1180 * Implicit input:
1181 * ctx == skb == R6 == CTX
1182 *
1183 * Explicit input:
1184 * SRC == any register
1185 * IMM == 32-bit immediate
1186 *
1187 * Output:
1188 * R0 - 8/16/32-bit skb data converted to cpu endianness
1189 */
1190static int check_ld_abs(struct verifier_env *env, struct bpf_insn *insn)
1191{
1192 struct reg_state *regs = env->cur_state.regs;
1193 u8 mode = BPF_MODE(insn->code);
1194 struct reg_state *reg;
1195 int i, err;
1196
1197 if (env->prog->aux->prog_type != BPF_PROG_TYPE_SOCKET_FILTER) {
1198 verbose("BPF_LD_ABS|IND instructions are only allowed in socket filters\n");
1199 return -EINVAL;
1200 }
1201
1202 if (insn->dst_reg != BPF_REG_0 || insn->off != 0 ||
1203 (mode == BPF_ABS && insn->src_reg != BPF_REG_0)) {
1204 verbose("BPF_LD_ABS uses reserved fields\n");
1205 return -EINVAL;
1206 }
1207
1208 /* check whether implicit source operand (register R6) is readable */
1209 err = check_reg_arg(regs, BPF_REG_6, SRC_OP);
1210 if (err)
1211 return err;
1212
1213 if (regs[BPF_REG_6].type != PTR_TO_CTX) {
1214 verbose("at the time of BPF_LD_ABS|IND R6 != pointer to skb\n");
1215 return -EINVAL;
1216 }
1217
1218 if (mode == BPF_IND) {
1219 /* check explicit source operand */
1220 err = check_reg_arg(regs, insn->src_reg, SRC_OP);
1221 if (err)
1222 return err;
1223 }
1224
1225 /* reset caller saved regs to unreadable */
1226 for (i = 0; i < CALLER_SAVED_REGS; i++) {
1227 reg = regs + caller_saved[i];
1228 reg->type = NOT_INIT;
1229 reg->imm = 0;
1230 }
1231
1232 /* mark destination R0 register as readable, since it contains
1233 * the value fetched from the packet
1234 */
1235 regs[BPF_REG_0].type = UNKNOWN_VALUE;
1236 return 0;
1237}
1238
1183/* non-recursive DFS pseudo code 1239/* non-recursive DFS pseudo code
1184 * 1 procedure DFS-iterative(G,v): 1240 * 1 procedure DFS-iterative(G,v):
1185 * 2 label v as discovered 1241 * 2 label v as discovered
@@ -1409,19 +1465,41 @@ static bool states_equal(struct verifier_state *old, struct verifier_state *cur)
1409 if (memcmp(&old->regs[i], &cur->regs[i], 1465 if (memcmp(&old->regs[i], &cur->regs[i],
1410 sizeof(old->regs[0])) != 0) { 1466 sizeof(old->regs[0])) != 0) {
1411 if (old->regs[i].type == NOT_INIT || 1467 if (old->regs[i].type == NOT_INIT ||
1412 old->regs[i].type == UNKNOWN_VALUE) 1468 (old->regs[i].type == UNKNOWN_VALUE &&
1469 cur->regs[i].type != NOT_INIT))
1413 continue; 1470 continue;
1414 return false; 1471 return false;
1415 } 1472 }
1416 } 1473 }
1417 1474
1418 for (i = 0; i < MAX_BPF_STACK; i++) { 1475 for (i = 0; i < MAX_BPF_STACK; i++) {
1419 if (memcmp(&old->stack[i], &cur->stack[i], 1476 if (old->stack_slot_type[i] == STACK_INVALID)
1420 sizeof(old->stack[0])) != 0) { 1477 continue;
1421 if (old->stack[i].stype == STACK_INVALID) 1478 if (old->stack_slot_type[i] != cur->stack_slot_type[i])
1422 continue; 1479 /* Ex: old explored (safe) state has STACK_SPILL in
1480 * this stack slot, but current has has STACK_MISC ->
1481 * this verifier states are not equivalent,
1482 * return false to continue verification of this path
1483 */
1423 return false; 1484 return false;
1424 } 1485 if (i % BPF_REG_SIZE)
1486 continue;
1487 if (memcmp(&old->spilled_regs[i / BPF_REG_SIZE],
1488 &cur->spilled_regs[i / BPF_REG_SIZE],
1489 sizeof(old->spilled_regs[0])))
1490 /* when explored and current stack slot types are
1491 * the same, check that stored pointers types
1492 * are the same as well.
1493 * Ex: explored safe path could have stored
1494 * (struct reg_state) {.type = PTR_TO_STACK, .imm = -8}
1495 * but current path has stored:
1496 * (struct reg_state) {.type = PTR_TO_STACK, .imm = -16}
1497 * such verifier states are not equivalent.
1498 * return false to continue verification of this path
1499 */
1500 return false;
1501 else
1502 continue;
1425 } 1503 }
1426 return true; 1504 return true;
1427} 1505}
@@ -1663,8 +1741,10 @@ process_bpf_exit:
1663 u8 mode = BPF_MODE(insn->code); 1741 u8 mode = BPF_MODE(insn->code);
1664 1742
1665 if (mode == BPF_ABS || mode == BPF_IND) { 1743 if (mode == BPF_ABS || mode == BPF_IND) {
1666 verbose("LD_ABS is not supported yet\n"); 1744 err = check_ld_abs(env, insn);
1667 return -EINVAL; 1745 if (err)
1746 return err;
1747
1668 } else if (mode == BPF_IMM) { 1748 } else if (mode == BPF_IMM) {
1669 err = check_ld_imm(env, insn); 1749 err = check_ld_imm(env, insn);
1670 if (err) 1750 if (err)
diff --git a/kernel/context_tracking.c b/kernel/context_tracking.c
index 5664985c46a0..937ecdfdf258 100644
--- a/kernel/context_tracking.c
+++ b/kernel/context_tracking.c
@@ -107,46 +107,6 @@ void context_tracking_user_enter(void)
107} 107}
108NOKPROBE_SYMBOL(context_tracking_user_enter); 108NOKPROBE_SYMBOL(context_tracking_user_enter);
109 109
110#ifdef CONFIG_PREEMPT
111/**
112 * preempt_schedule_context - preempt_schedule called by tracing
113 *
114 * The tracing infrastructure uses preempt_enable_notrace to prevent
115 * recursion and tracing preempt enabling caused by the tracing
116 * infrastructure itself. But as tracing can happen in areas coming
117 * from userspace or just about to enter userspace, a preempt enable
118 * can occur before user_exit() is called. This will cause the scheduler
119 * to be called when the system is still in usermode.
120 *
121 * To prevent this, the preempt_enable_notrace will use this function
122 * instead of preempt_schedule() to exit user context if needed before
123 * calling the scheduler.
124 */
125asmlinkage __visible void __sched notrace preempt_schedule_context(void)
126{
127 enum ctx_state prev_ctx;
128
129 if (likely(!preemptible()))
130 return;
131
132 /*
133 * Need to disable preemption in case user_exit() is traced
134 * and the tracer calls preempt_enable_notrace() causing
135 * an infinite recursion.
136 */
137 preempt_disable_notrace();
138 prev_ctx = exception_enter();
139 preempt_enable_no_resched_notrace();
140
141 preempt_schedule();
142
143 preempt_disable_notrace();
144 exception_exit(prev_ctx);
145 preempt_enable_notrace();
146}
147EXPORT_SYMBOL_GPL(preempt_schedule_context);
148#endif /* CONFIG_PREEMPT */
149
150/** 110/**
151 * context_tracking_user_exit - Inform the context tracking that the CPU is 111 * context_tracking_user_exit - Inform the context tracking that the CPU is
152 * exiting userspace mode and entering the kernel. 112 * exiting userspace mode and entering the kernel.
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 356450f09c1f..5d220234b3ca 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -64,6 +64,8 @@ static struct {
64 * an ongoing cpu hotplug operation. 64 * an ongoing cpu hotplug operation.
65 */ 65 */
66 int refcount; 66 int refcount;
67 /* And allows lockless put_online_cpus(). */
68 atomic_t puts_pending;
67 69
68#ifdef CONFIG_DEBUG_LOCK_ALLOC 70#ifdef CONFIG_DEBUG_LOCK_ALLOC
69 struct lockdep_map dep_map; 71 struct lockdep_map dep_map;
@@ -84,6 +86,16 @@ static struct {
84#define cpuhp_lock_acquire() lock_map_acquire(&cpu_hotplug.dep_map) 86#define cpuhp_lock_acquire() lock_map_acquire(&cpu_hotplug.dep_map)
85#define cpuhp_lock_release() lock_map_release(&cpu_hotplug.dep_map) 87#define cpuhp_lock_release() lock_map_release(&cpu_hotplug.dep_map)
86 88
89static void apply_puts_pending(int max)
90{
91 int delta;
92
93 if (atomic_read(&cpu_hotplug.puts_pending) >= max) {
94 delta = atomic_xchg(&cpu_hotplug.puts_pending, 0);
95 cpu_hotplug.refcount -= delta;
96 }
97}
98
87void get_online_cpus(void) 99void get_online_cpus(void)
88{ 100{
89 might_sleep(); 101 might_sleep();
@@ -91,6 +103,7 @@ void get_online_cpus(void)
91 return; 103 return;
92 cpuhp_lock_acquire_read(); 104 cpuhp_lock_acquire_read();
93 mutex_lock(&cpu_hotplug.lock); 105 mutex_lock(&cpu_hotplug.lock);
106 apply_puts_pending(65536);
94 cpu_hotplug.refcount++; 107 cpu_hotplug.refcount++;
95 mutex_unlock(&cpu_hotplug.lock); 108 mutex_unlock(&cpu_hotplug.lock);
96} 109}
@@ -103,6 +116,7 @@ bool try_get_online_cpus(void)
103 if (!mutex_trylock(&cpu_hotplug.lock)) 116 if (!mutex_trylock(&cpu_hotplug.lock))
104 return false; 117 return false;
105 cpuhp_lock_acquire_tryread(); 118 cpuhp_lock_acquire_tryread();
119 apply_puts_pending(65536);
106 cpu_hotplug.refcount++; 120 cpu_hotplug.refcount++;
107 mutex_unlock(&cpu_hotplug.lock); 121 mutex_unlock(&cpu_hotplug.lock);
108 return true; 122 return true;
@@ -113,7 +127,11 @@ void put_online_cpus(void)
113{ 127{
114 if (cpu_hotplug.active_writer == current) 128 if (cpu_hotplug.active_writer == current)
115 return; 129 return;
116 mutex_lock(&cpu_hotplug.lock); 130 if (!mutex_trylock(&cpu_hotplug.lock)) {
131 atomic_inc(&cpu_hotplug.puts_pending);
132 cpuhp_lock_release();
133 return;
134 }
117 135
118 if (WARN_ON(!cpu_hotplug.refcount)) 136 if (WARN_ON(!cpu_hotplug.refcount))
119 cpu_hotplug.refcount++; /* try to fix things up */ 137 cpu_hotplug.refcount++; /* try to fix things up */
@@ -155,6 +173,7 @@ void cpu_hotplug_begin(void)
155 cpuhp_lock_acquire(); 173 cpuhp_lock_acquire();
156 for (;;) { 174 for (;;) {
157 mutex_lock(&cpu_hotplug.lock); 175 mutex_lock(&cpu_hotplug.lock);
176 apply_puts_pending(1);
158 if (likely(!cpu_hotplug.refcount)) 177 if (likely(!cpu_hotplug.refcount))
159 break; 178 break;
160 __set_current_state(TASK_UNINTERRUPTIBLE); 179 __set_current_state(TASK_UNINTERRUPTIBLE);
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 1f107c74087b..723cfc9d0ad7 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -506,6 +506,16 @@ static int validate_change(struct cpuset *cur, struct cpuset *trial)
506 goto out; 506 goto out;
507 } 507 }
508 508
509 /*
510 * We can't shrink if we won't have enough room for SCHED_DEADLINE
511 * tasks.
512 */
513 ret = -EBUSY;
514 if (is_cpu_exclusive(cur) &&
515 !cpuset_cpumask_can_shrink(cur->cpus_allowed,
516 trial->cpus_allowed))
517 goto out;
518
509 ret = 0; 519 ret = 0;
510out: 520out:
511 rcu_read_unlock(); 521 rcu_read_unlock();
@@ -1429,17 +1439,8 @@ static int cpuset_can_attach(struct cgroup_subsys_state *css,
1429 goto out_unlock; 1439 goto out_unlock;
1430 1440
1431 cgroup_taskset_for_each(task, tset) { 1441 cgroup_taskset_for_each(task, tset) {
1432 /* 1442 ret = task_can_attach(task, cs->cpus_allowed);
1433 * Kthreads which disallow setaffinity shouldn't be moved 1443 if (ret)
1434 * to a new cpuset; we don't want to change their cpu
1435 * affinity and isolating such threads by their set of
1436 * allowed nodes is unnecessary. Thus, cpusets are not
1437 * applicable for such threads. This prevents checking for
1438 * success of set_cpus_allowed_ptr() on all attached tasks
1439 * before cpus_allowed may be changed.
1440 */
1441 ret = -EINVAL;
1442 if (task->flags & PF_NO_SETAFFINITY)
1443 goto out_unlock; 1444 goto out_unlock;
1444 ret = security_task_setscheduler(task); 1445 ret = security_task_setscheduler(task);
1445 if (ret) 1446 if (ret)
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 1425d07018de..113b837470cd 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -614,7 +614,7 @@ static inline int perf_cgroup_connect(int fd, struct perf_event *event,
614 if (!f.file) 614 if (!f.file)
615 return -EBADF; 615 return -EBADF;
616 616
617 css = css_tryget_online_from_dir(f.file->f_dentry, 617 css = css_tryget_online_from_dir(f.file->f_path.dentry,
618 &perf_event_cgrp_subsys); 618 &perf_event_cgrp_subsys);
619 if (IS_ERR(css)) { 619 if (IS_ERR(css)) {
620 ret = PTR_ERR(css); 620 ret = PTR_ERR(css);
@@ -1562,8 +1562,10 @@ static void perf_remove_from_context(struct perf_event *event, bool detach_group
1562 1562
1563 if (!task) { 1563 if (!task) {
1564 /* 1564 /*
1565 * Per cpu events are removed via an smp call and 1565 * Per cpu events are removed via an smp call. The removal can
1566 * the removal is always successful. 1566 * fail if the CPU is currently offline, but in that case we
1567 * already called __perf_remove_from_context from
1568 * perf_event_exit_cpu.
1567 */ 1569 */
1568 cpu_function_call(event->cpu, __perf_remove_from_context, &re); 1570 cpu_function_call(event->cpu, __perf_remove_from_context, &re);
1569 return; 1571 return;
@@ -4458,7 +4460,7 @@ perf_output_sample_regs(struct perf_output_handle *handle,
4458 } 4460 }
4459} 4461}
4460 4462
4461static void perf_sample_regs_user(struct perf_regs_user *regs_user, 4463static void perf_sample_regs_user(struct perf_regs *regs_user,
4462 struct pt_regs *regs) 4464 struct pt_regs *regs)
4463{ 4465{
4464 if (!user_mode(regs)) { 4466 if (!user_mode(regs)) {
@@ -4469,11 +4471,22 @@ static void perf_sample_regs_user(struct perf_regs_user *regs_user,
4469 } 4471 }
4470 4472
4471 if (regs) { 4473 if (regs) {
4472 regs_user->regs = regs;
4473 regs_user->abi = perf_reg_abi(current); 4474 regs_user->abi = perf_reg_abi(current);
4475 regs_user->regs = regs;
4476 } else {
4477 regs_user->abi = PERF_SAMPLE_REGS_ABI_NONE;
4478 regs_user->regs = NULL;
4474 } 4479 }
4475} 4480}
4476 4481
4482static void perf_sample_regs_intr(struct perf_regs *regs_intr,
4483 struct pt_regs *regs)
4484{
4485 regs_intr->regs = regs;
4486 regs_intr->abi = perf_reg_abi(current);
4487}
4488
4489
4477/* 4490/*
4478 * Get remaining task size from user stack pointer. 4491 * Get remaining task size from user stack pointer.
4479 * 4492 *
@@ -4855,6 +4868,23 @@ void perf_output_sample(struct perf_output_handle *handle,
4855 if (sample_type & PERF_SAMPLE_TRANSACTION) 4868 if (sample_type & PERF_SAMPLE_TRANSACTION)
4856 perf_output_put(handle, data->txn); 4869 perf_output_put(handle, data->txn);
4857 4870
4871 if (sample_type & PERF_SAMPLE_REGS_INTR) {
4872 u64 abi = data->regs_intr.abi;
4873 /*
4874 * If there are no regs to dump, notice it through
4875 * first u64 being zero (PERF_SAMPLE_REGS_ABI_NONE).
4876 */
4877 perf_output_put(handle, abi);
4878
4879 if (abi) {
4880 u64 mask = event->attr.sample_regs_intr;
4881
4882 perf_output_sample_regs(handle,
4883 data->regs_intr.regs,
4884 mask);
4885 }
4886 }
4887
4858 if (!event->attr.watermark) { 4888 if (!event->attr.watermark) {
4859 int wakeup_events = event->attr.wakeup_events; 4889 int wakeup_events = event->attr.wakeup_events;
4860 4890
@@ -4920,12 +4950,13 @@ void perf_prepare_sample(struct perf_event_header *header,
4920 header->size += size; 4950 header->size += size;
4921 } 4951 }
4922 4952
4953 if (sample_type & (PERF_SAMPLE_REGS_USER | PERF_SAMPLE_STACK_USER))
4954 perf_sample_regs_user(&data->regs_user, regs);
4955
4923 if (sample_type & PERF_SAMPLE_REGS_USER) { 4956 if (sample_type & PERF_SAMPLE_REGS_USER) {
4924 /* regs dump ABI info */ 4957 /* regs dump ABI info */
4925 int size = sizeof(u64); 4958 int size = sizeof(u64);
4926 4959
4927 perf_sample_regs_user(&data->regs_user, regs);
4928
4929 if (data->regs_user.regs) { 4960 if (data->regs_user.regs) {
4930 u64 mask = event->attr.sample_regs_user; 4961 u64 mask = event->attr.sample_regs_user;
4931 size += hweight64(mask) * sizeof(u64); 4962 size += hweight64(mask) * sizeof(u64);
@@ -4941,15 +4972,11 @@ void perf_prepare_sample(struct perf_event_header *header,
4941 * in case new sample type is added, because we could eat 4972 * in case new sample type is added, because we could eat
4942 * up the rest of the sample size. 4973 * up the rest of the sample size.
4943 */ 4974 */
4944 struct perf_regs_user *uregs = &data->regs_user;
4945 u16 stack_size = event->attr.sample_stack_user; 4975 u16 stack_size = event->attr.sample_stack_user;
4946 u16 size = sizeof(u64); 4976 u16 size = sizeof(u64);
4947 4977
4948 if (!uregs->abi)
4949 perf_sample_regs_user(uregs, regs);
4950
4951 stack_size = perf_sample_ustack_size(stack_size, header->size, 4978 stack_size = perf_sample_ustack_size(stack_size, header->size,
4952 uregs->regs); 4979 data->regs_user.regs);
4953 4980
4954 /* 4981 /*
4955 * If there is something to dump, add space for the dump 4982 * If there is something to dump, add space for the dump
@@ -4962,6 +4989,21 @@ void perf_prepare_sample(struct perf_event_header *header,
4962 data->stack_user_size = stack_size; 4989 data->stack_user_size = stack_size;
4963 header->size += size; 4990 header->size += size;
4964 } 4991 }
4992
4993 if (sample_type & PERF_SAMPLE_REGS_INTR) {
4994 /* regs dump ABI info */
4995 int size = sizeof(u64);
4996
4997 perf_sample_regs_intr(&data->regs_intr, regs);
4998
4999 if (data->regs_intr.regs) {
5000 u64 mask = event->attr.sample_regs_intr;
5001
5002 size += hweight64(mask) * sizeof(u64);
5003 }
5004
5005 header->size += size;
5006 }
4965} 5007}
4966 5008
4967static void perf_event_output(struct perf_event *event, 5009static void perf_event_output(struct perf_event *event,
@@ -6071,11 +6113,6 @@ static int perf_swevent_init(struct perf_event *event)
6071 return 0; 6113 return 0;
6072} 6114}
6073 6115
6074static int perf_swevent_event_idx(struct perf_event *event)
6075{
6076 return 0;
6077}
6078
6079static struct pmu perf_swevent = { 6116static struct pmu perf_swevent = {
6080 .task_ctx_nr = perf_sw_context, 6117 .task_ctx_nr = perf_sw_context,
6081 6118
@@ -6085,8 +6122,6 @@ static struct pmu perf_swevent = {
6085 .start = perf_swevent_start, 6122 .start = perf_swevent_start,
6086 .stop = perf_swevent_stop, 6123 .stop = perf_swevent_stop,
6087 .read = perf_swevent_read, 6124 .read = perf_swevent_read,
6088
6089 .event_idx = perf_swevent_event_idx,
6090}; 6125};
6091 6126
6092#ifdef CONFIG_EVENT_TRACING 6127#ifdef CONFIG_EVENT_TRACING
@@ -6204,8 +6239,6 @@ static struct pmu perf_tracepoint = {
6204 .start = perf_swevent_start, 6239 .start = perf_swevent_start,
6205 .stop = perf_swevent_stop, 6240 .stop = perf_swevent_stop,
6206 .read = perf_swevent_read, 6241 .read = perf_swevent_read,
6207
6208 .event_idx = perf_swevent_event_idx,
6209}; 6242};
6210 6243
6211static inline void perf_tp_register(void) 6244static inline void perf_tp_register(void)
@@ -6431,8 +6464,6 @@ static struct pmu perf_cpu_clock = {
6431 .start = cpu_clock_event_start, 6464 .start = cpu_clock_event_start,
6432 .stop = cpu_clock_event_stop, 6465 .stop = cpu_clock_event_stop,
6433 .read = cpu_clock_event_read, 6466 .read = cpu_clock_event_read,
6434
6435 .event_idx = perf_swevent_event_idx,
6436}; 6467};
6437 6468
6438/* 6469/*
@@ -6511,8 +6542,6 @@ static struct pmu perf_task_clock = {
6511 .start = task_clock_event_start, 6542 .start = task_clock_event_start,
6512 .stop = task_clock_event_stop, 6543 .stop = task_clock_event_stop,
6513 .read = task_clock_event_read, 6544 .read = task_clock_event_read,
6514
6515 .event_idx = perf_swevent_event_idx,
6516}; 6545};
6517 6546
6518static void perf_pmu_nop_void(struct pmu *pmu) 6547static void perf_pmu_nop_void(struct pmu *pmu)
@@ -6542,7 +6571,7 @@ static void perf_pmu_cancel_txn(struct pmu *pmu)
6542 6571
6543static int perf_event_idx_default(struct perf_event *event) 6572static int perf_event_idx_default(struct perf_event *event)
6544{ 6573{
6545 return event->hw.idx + 1; 6574 return 0;
6546} 6575}
6547 6576
6548/* 6577/*
@@ -7162,6 +7191,8 @@ static int perf_copy_attr(struct perf_event_attr __user *uattr,
7162 ret = -EINVAL; 7191 ret = -EINVAL;
7163 } 7192 }
7164 7193
7194 if (attr->sample_type & PERF_SAMPLE_REGS_INTR)
7195 ret = perf_reg_validate(attr->sample_regs_intr);
7165out: 7196out:
7166 return ret; 7197 return ret;
7167 7198
@@ -8130,7 +8161,7 @@ static void perf_pmu_rotate_stop(struct pmu *pmu)
8130 8161
8131static void __perf_event_exit_context(void *__info) 8162static void __perf_event_exit_context(void *__info)
8132{ 8163{
8133 struct remove_event re = { .detach_group = false }; 8164 struct remove_event re = { .detach_group = true };
8134 struct perf_event_context *ctx = __info; 8165 struct perf_event_context *ctx = __info;
8135 8166
8136 perf_pmu_rotate_stop(ctx->pmu); 8167 perf_pmu_rotate_stop(ctx->pmu);
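For reference, the PERF_SAMPLE_REGS_INTR sizing added above mirrors the PERF_SAMPLE_REGS_USER path: one u64 for the ABI word plus one u64 per register bit set in attr.sample_regs_intr. A minimal user-space sketch of that arithmetic (the mask value and the popcount helper are illustrative; the kernel uses hweight64()):

#include <stdint.h>
#include <stdio.h>

/* Portable popcount standing in for the kernel's hweight64(). */
static unsigned int popcount64(uint64_t v)
{
	unsigned int n = 0;

	for (; v; v &= v - 1)
		n++;
	return n;
}

int main(void)
{
	uint64_t sample_regs_intr = 0x7;	/* example mask: three registers */
	size_t size = sizeof(uint64_t);		/* ABI word */

	size += popcount64(sample_regs_intr) * sizeof(uint64_t);
	printf("PERF_SAMPLE_REGS_INTR record body: %zu bytes\n", size);
	return 0;
}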
diff --git a/kernel/events/hw_breakpoint.c b/kernel/events/hw_breakpoint.c
index 1559fb0b9296..9803a6600d49 100644
--- a/kernel/events/hw_breakpoint.c
+++ b/kernel/events/hw_breakpoint.c
@@ -605,11 +605,6 @@ static void hw_breakpoint_stop(struct perf_event *bp, int flags)
605 bp->hw.state = PERF_HES_STOPPED; 605 bp->hw.state = PERF_HES_STOPPED;
606} 606}
607 607
608static int hw_breakpoint_event_idx(struct perf_event *bp)
609{
610 return 0;
611}
612
613static struct pmu perf_breakpoint = { 608static struct pmu perf_breakpoint = {
614 .task_ctx_nr = perf_sw_context, /* could eventually get its own */ 609 .task_ctx_nr = perf_sw_context, /* could eventually get its own */
615 610
@@ -619,8 +614,6 @@ static struct pmu perf_breakpoint = {
619 .start = hw_breakpoint_start, 614 .start = hw_breakpoint_start,
620 .stop = hw_breakpoint_stop, 615 .stop = hw_breakpoint_stop,
621 .read = hw_breakpoint_pmu_read, 616 .read = hw_breakpoint_pmu_read,
622
623 .event_idx = hw_breakpoint_event_idx,
624}; 617};
625 618
626int __init init_hw_breakpoint(void) 619int __init init_hw_breakpoint(void)
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index 1d0af8a2c646..ed8f2cde34c5 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -1640,7 +1640,6 @@ bool uprobe_deny_signal(void)
1640 if (__fatal_signal_pending(t) || arch_uprobe_xol_was_trapped(t)) { 1640 if (__fatal_signal_pending(t) || arch_uprobe_xol_was_trapped(t)) {
1641 utask->state = UTASK_SSTEP_TRAPPED; 1641 utask->state = UTASK_SSTEP_TRAPPED;
1642 set_tsk_thread_flag(t, TIF_UPROBE); 1642 set_tsk_thread_flag(t, TIF_UPROBE);
1643 set_tsk_thread_flag(t, TIF_NOTIFY_RESUME);
1644 } 1643 }
1645 } 1644 }
1646 1645
diff --git a/kernel/exit.c b/kernel/exit.c
index 5d30019ff953..8714e5ded8b4 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -118,13 +118,10 @@ static void __exit_signal(struct task_struct *tsk)
118 } 118 }
119 119
120 /* 120 /*
121 * Accumulate here the counters for all threads but the group leader 121 * Accumulate here the counters for all threads as they die. We could
122 * as they die, so they can be added into the process-wide totals 122 * skip the group leader because it is the last user of signal_struct,
123 * when those are taken. The group leader stays around as a zombie as 123 * but we want to avoid the race with thread_group_cputime() which can
124 * long as there are other threads. When it gets reaped, the exit.c 124 * see the empty ->thread_head list.
125 * code will add its counts into these totals. We won't ever get here
126 * for the group leader, since it will have been the last reference on
127 * the signal_struct.
128 */ 125 */
129 task_cputime(tsk, &utime, &stime); 126 task_cputime(tsk, &utime, &stime);
130 write_seqlock(&sig->stats_lock); 127 write_seqlock(&sig->stats_lock);
@@ -462,6 +459,44 @@ static void exit_mm(struct task_struct *tsk)
462 clear_thread_flag(TIF_MEMDIE); 459 clear_thread_flag(TIF_MEMDIE);
463} 460}
464 461
462static struct task_struct *find_alive_thread(struct task_struct *p)
463{
464 struct task_struct *t;
465
466 for_each_thread(p, t) {
467 if (!(t->flags & PF_EXITING))
468 return t;
469 }
470 return NULL;
471}
472
473static struct task_struct *find_child_reaper(struct task_struct *father)
474 __releases(&tasklist_lock)
475 __acquires(&tasklist_lock)
476{
477 struct pid_namespace *pid_ns = task_active_pid_ns(father);
478 struct task_struct *reaper = pid_ns->child_reaper;
479
480 if (likely(reaper != father))
481 return reaper;
482
483 reaper = find_alive_thread(father);
484 if (reaper) {
485 pid_ns->child_reaper = reaper;
486 return reaper;
487 }
488
489 write_unlock_irq(&tasklist_lock);
490 if (unlikely(pid_ns == &init_pid_ns)) {
491 panic("Attempted to kill init! exitcode=0x%08x\n",
492 father->signal->group_exit_code ?: father->exit_code);
493 }
494 zap_pid_ns_processes(pid_ns);
495 write_lock_irq(&tasklist_lock);
496
497 return father;
498}
499
465/* 500/*
466 * When we die, we re-parent all our children, and try to: 501 * When we die, we re-parent all our children, and try to:
467 * 1. give them to another thread in our thread group, if such a member exists 502 * 1. give them to another thread in our thread group, if such a member exists
@@ -469,58 +504,36 @@ static void exit_mm(struct task_struct *tsk)
469 * child_subreaper for its children (like a service manager) 504 * child_subreaper for its children (like a service manager)
470 * 3. give it to the init process (PID 1) in our pid namespace 505 * 3. give it to the init process (PID 1) in our pid namespace
471 */ 506 */
472static struct task_struct *find_new_reaper(struct task_struct *father) 507static struct task_struct *find_new_reaper(struct task_struct *father,
473 __releases(&tasklist_lock) 508 struct task_struct *child_reaper)
474 __acquires(&tasklist_lock)
475{ 509{
476 struct pid_namespace *pid_ns = task_active_pid_ns(father); 510 struct task_struct *thread, *reaper;
477 struct task_struct *thread;
478 511
479 thread = father; 512 thread = find_alive_thread(father);
480 while_each_thread(father, thread) { 513 if (thread)
481 if (thread->flags & PF_EXITING)
482 continue;
483 if (unlikely(pid_ns->child_reaper == father))
484 pid_ns->child_reaper = thread;
485 return thread; 514 return thread;
486 }
487
488 if (unlikely(pid_ns->child_reaper == father)) {
489 write_unlock_irq(&tasklist_lock);
490 if (unlikely(pid_ns == &init_pid_ns)) {
491 panic("Attempted to kill init! exitcode=0x%08x\n",
492 father->signal->group_exit_code ?:
493 father->exit_code);
494 }
495
496 zap_pid_ns_processes(pid_ns);
497 write_lock_irq(&tasklist_lock);
498 } else if (father->signal->has_child_subreaper) {
499 struct task_struct *reaper;
500 515
516 if (father->signal->has_child_subreaper) {
501 /* 517 /*
502 * Find the first ancestor marked as child_subreaper. 518 * Find the first ->is_child_subreaper ancestor in our pid_ns.
503 * Note that the code below checks same_thread_group(reaper, 519 * We start from father to ensure we can not look into another
504 * pid_ns->child_reaper). This is what we need to DTRT in a 520 * namespace, this is safe because all its threads are dead.
505 * PID namespace. However we still need the check above, see
506 * http://marc.info/?l=linux-kernel&m=131385460420380
507 */ 521 */
508 for (reaper = father->real_parent; 522 for (reaper = father;
509 reaper != &init_task; 523 !same_thread_group(reaper, child_reaper);
510 reaper = reaper->real_parent) { 524 reaper = reaper->real_parent) {
511 if (same_thread_group(reaper, pid_ns->child_reaper)) 525 /* call_usermodehelper() descendants need this check */
526 if (reaper == &init_task)
512 break; 527 break;
513 if (!reaper->signal->is_child_subreaper) 528 if (!reaper->signal->is_child_subreaper)
514 continue; 529 continue;
515 thread = reaper; 530 thread = find_alive_thread(reaper);
516 do { 531 if (thread)
517 if (!(thread->flags & PF_EXITING)) 532 return thread;
518 return reaper;
519 } while_each_thread(reaper, thread);
520 } 533 }
521 } 534 }
522 535
523 return pid_ns->child_reaper; 536 return child_reaper;
524} 537}
525 538
526/* 539/*
@@ -529,15 +542,7 @@ static struct task_struct *find_new_reaper(struct task_struct *father)
529static void reparent_leader(struct task_struct *father, struct task_struct *p, 542static void reparent_leader(struct task_struct *father, struct task_struct *p,
530 struct list_head *dead) 543 struct list_head *dead)
531{ 544{
532 list_move_tail(&p->sibling, &p->real_parent->children); 545 if (unlikely(p->exit_state == EXIT_DEAD))
533
534 if (p->exit_state == EXIT_DEAD)
535 return;
536 /*
537 * If this is a threaded reparent there is no need to
538 * notify anyone anything has happened.
539 */
540 if (same_thread_group(p->real_parent, father))
541 return; 546 return;
542 547
543 /* We don't want people slaying init. */ 548 /* We don't want people slaying init. */
@@ -548,49 +553,53 @@ static void reparent_leader(struct task_struct *father, struct task_struct *p,
548 p->exit_state == EXIT_ZOMBIE && thread_group_empty(p)) { 553 p->exit_state == EXIT_ZOMBIE && thread_group_empty(p)) {
549 if (do_notify_parent(p, p->exit_signal)) { 554 if (do_notify_parent(p, p->exit_signal)) {
550 p->exit_state = EXIT_DEAD; 555 p->exit_state = EXIT_DEAD;
551 list_move_tail(&p->sibling, dead); 556 list_add(&p->ptrace_entry, dead);
552 } 557 }
553 } 558 }
554 559
555 kill_orphaned_pgrp(p, father); 560 kill_orphaned_pgrp(p, father);
556} 561}
557 562
558static void forget_original_parent(struct task_struct *father) 563/*
564 * This does two things:
565 *
566 * A. Make init inherit all the child processes
567 * B. Check to see if any process groups have become orphaned
568 * as a result of our exiting, and if they have any stopped
569 * jobs, send them a SIGHUP and then a SIGCONT. (POSIX 3.2.2.2)
570 */
571static void forget_original_parent(struct task_struct *father,
572 struct list_head *dead)
559{ 573{
560 struct task_struct *p, *n, *reaper; 574 struct task_struct *p, *t, *reaper;
561 LIST_HEAD(dead_children);
562 575
563 write_lock_irq(&tasklist_lock); 576 if (unlikely(!list_empty(&father->ptraced)))
564 /* 577 exit_ptrace(father, dead);
565 * Note that exit_ptrace() and find_new_reaper() might
566 * drop tasklist_lock and reacquire it.
567 */
568 exit_ptrace(father);
569 reaper = find_new_reaper(father);
570 578
571 list_for_each_entry_safe(p, n, &father->children, sibling) { 579 /* Can drop and reacquire tasklist_lock */
572 struct task_struct *t = p; 580 reaper = find_child_reaper(father);
581 if (list_empty(&father->children))
582 return;
573 583
574 do { 584 reaper = find_new_reaper(father, reaper);
585 list_for_each_entry(p, &father->children, sibling) {
586 for_each_thread(p, t) {
575 t->real_parent = reaper; 587 t->real_parent = reaper;
576 if (t->parent == father) { 588 BUG_ON((!t->ptrace) != (t->parent == father));
577 BUG_ON(t->ptrace); 589 if (likely(!t->ptrace))
578 t->parent = t->real_parent; 590 t->parent = t->real_parent;
579 }
580 if (t->pdeath_signal) 591 if (t->pdeath_signal)
581 group_send_sig_info(t->pdeath_signal, 592 group_send_sig_info(t->pdeath_signal,
582 SEND_SIG_NOINFO, t); 593 SEND_SIG_NOINFO, t);
583 } while_each_thread(p, t); 594 }
584 reparent_leader(father, p, &dead_children); 595 /*
585 } 596 * If this is a threaded reparent there is no need to
586 write_unlock_irq(&tasklist_lock); 597 * notify anyone anything has happened.
587 598 */
588 BUG_ON(!list_empty(&father->children)); 599 if (!same_thread_group(reaper, father))
589 600 reparent_leader(father, p, dead);
590 list_for_each_entry_safe(p, n, &dead_children, sibling) {
591 list_del_init(&p->sibling);
592 release_task(p);
593 } 601 }
602 list_splice_tail_init(&father->children, &reaper->children);
594} 603}
595 604
596/* 605/*
@@ -600,18 +609,12 @@ static void forget_original_parent(struct task_struct *father)
600static void exit_notify(struct task_struct *tsk, int group_dead) 609static void exit_notify(struct task_struct *tsk, int group_dead)
601{ 610{
602 bool autoreap; 611 bool autoreap;
603 612 struct task_struct *p, *n;
604 /* 613 LIST_HEAD(dead);
605 * This does two things:
606 *
607 * A. Make init inherit all the child processes
608 * B. Check to see if any process groups have become orphaned
609 * as a result of our exiting, and if they have any stopped
610 * jobs, send them a SIGHUP and then a SIGCONT. (POSIX 3.2.2.2)
611 */
612 forget_original_parent(tsk);
613 614
614 write_lock_irq(&tasklist_lock); 615 write_lock_irq(&tasklist_lock);
616 forget_original_parent(tsk, &dead);
617
615 if (group_dead) 618 if (group_dead)
616 kill_orphaned_pgrp(tsk->group_leader, NULL); 619 kill_orphaned_pgrp(tsk->group_leader, NULL);
617 620
@@ -629,15 +632,18 @@ static void exit_notify(struct task_struct *tsk, int group_dead)
629 } 632 }
630 633
631 tsk->exit_state = autoreap ? EXIT_DEAD : EXIT_ZOMBIE; 634 tsk->exit_state = autoreap ? EXIT_DEAD : EXIT_ZOMBIE;
635 if (tsk->exit_state == EXIT_DEAD)
636 list_add(&tsk->ptrace_entry, &dead);
632 637
633 /* mt-exec, de_thread() is waiting for group leader */ 638 /* mt-exec, de_thread() is waiting for group leader */
634 if (unlikely(tsk->signal->notify_count < 0)) 639 if (unlikely(tsk->signal->notify_count < 0))
635 wake_up_process(tsk->signal->group_exit_task); 640 wake_up_process(tsk->signal->group_exit_task);
636 write_unlock_irq(&tasklist_lock); 641 write_unlock_irq(&tasklist_lock);
637 642
638 /* If the process is dead, release it - nobody will wait for it */ 643 list_for_each_entry_safe(p, n, &dead, ptrace_entry) {
639 if (autoreap) 644 list_del_init(&p->ptrace_entry);
640 release_task(tsk); 645 release_task(p);
646 }
641} 647}
642 648
643#ifdef CONFIG_DEBUG_STACK_USAGE 649#ifdef CONFIG_DEBUG_STACK_USAGE
@@ -982,8 +988,7 @@ static int wait_noreap_copyout(struct wait_opts *wo, struct task_struct *p,
982 */ 988 */
983static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p) 989static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p)
984{ 990{
985 unsigned long state; 991 int state, retval, status;
986 int retval, status, traced;
987 pid_t pid = task_pid_vnr(p); 992 pid_t pid = task_pid_vnr(p);
988 uid_t uid = from_kuid_munged(current_user_ns(), task_uid(p)); 993 uid_t uid = from_kuid_munged(current_user_ns(), task_uid(p));
989 struct siginfo __user *infop; 994 struct siginfo __user *infop;
@@ -997,6 +1002,8 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p)
997 1002
998 get_task_struct(p); 1003 get_task_struct(p);
999 read_unlock(&tasklist_lock); 1004 read_unlock(&tasklist_lock);
1005 sched_annotate_sleep();
1006
1000 if ((exit_code & 0x7f) == 0) { 1007 if ((exit_code & 0x7f) == 0) {
1001 why = CLD_EXITED; 1008 why = CLD_EXITED;
1002 status = exit_code >> 8; 1009 status = exit_code >> 8;
@@ -1006,21 +1013,25 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p)
1006 } 1013 }
1007 return wait_noreap_copyout(wo, p, pid, uid, why, status); 1014 return wait_noreap_copyout(wo, p, pid, uid, why, status);
1008 } 1015 }
1009
1010 traced = ptrace_reparented(p);
1011 /* 1016 /*
1012 * Move the task's state to DEAD/TRACE, only one thread can do this. 1017 * Move the task's state to DEAD/TRACE, only one thread can do this.
1013 */ 1018 */
1014 state = traced && thread_group_leader(p) ? EXIT_TRACE : EXIT_DEAD; 1019 state = (ptrace_reparented(p) && thread_group_leader(p)) ?
1020 EXIT_TRACE : EXIT_DEAD;
1015 if (cmpxchg(&p->exit_state, EXIT_ZOMBIE, state) != EXIT_ZOMBIE) 1021 if (cmpxchg(&p->exit_state, EXIT_ZOMBIE, state) != EXIT_ZOMBIE)
1016 return 0; 1022 return 0;
1017 /* 1023 /*
1018 * It can be ptraced but not reparented, check 1024 * We own this thread, nobody else can reap it.
1019 * thread_group_leader() to filter out sub-threads.
1020 */ 1025 */
1021 if (likely(!traced) && thread_group_leader(p)) { 1026 read_unlock(&tasklist_lock);
1022 struct signal_struct *psig; 1027 sched_annotate_sleep();
1023 struct signal_struct *sig; 1028
1029 /*
1030 * Check thread_group_leader() to exclude the traced sub-threads.
1031 */
1032 if (state == EXIT_DEAD && thread_group_leader(p)) {
1033 struct signal_struct *sig = p->signal;
1034 struct signal_struct *psig = current->signal;
1024 unsigned long maxrss; 1035 unsigned long maxrss;
1025 cputime_t tgutime, tgstime; 1036 cputime_t tgutime, tgstime;
1026 1037
@@ -1032,21 +1043,20 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p)
1032 * accumulate in the parent's signal_struct c* fields. 1043 * accumulate in the parent's signal_struct c* fields.
1033 * 1044 *
1034 * We don't bother to take a lock here to protect these 1045 * We don't bother to take a lock here to protect these
1035 * p->signal fields, because they are only touched by 1046 * p->signal fields because the whole thread group is dead
1036 * __exit_signal, which runs with tasklist_lock 1047 * and nobody can change them.
1037 * write-locked anyway, and so is excluded here. We do 1048 *
1038 * need to protect the access to parent->signal fields, 1049 * psig->stats_lock also protects us from our sub-threads
1039 * as other threads in the parent group can be right 1050 * which can reap other children at the same time. Until
1040 * here reaping other children at the same time. 1051 * we change k_getrusage()-like users to rely on this lock
1052 * we have to take ->siglock as well.
1041 * 1053 *
1042 * We use thread_group_cputime_adjusted() to get times for 1054 * We use thread_group_cputime_adjusted() to get times for
1043 * the thread group, which consolidates times for all threads 1055 * the thread group, which consolidates times for all threads
1044 * in the group including the group leader. 1056 * in the group including the group leader.
1045 */ 1057 */
1046 thread_group_cputime_adjusted(p, &tgutime, &tgstime); 1058 thread_group_cputime_adjusted(p, &tgutime, &tgstime);
1047 spin_lock_irq(&p->real_parent->sighand->siglock); 1059 spin_lock_irq(&current->sighand->siglock);
1048 psig = p->real_parent->signal;
1049 sig = p->signal;
1050 write_seqlock(&psig->stats_lock); 1060 write_seqlock(&psig->stats_lock);
1051 psig->cutime += tgutime + sig->cutime; 1061 psig->cutime += tgutime + sig->cutime;
1052 psig->cstime += tgstime + sig->cstime; 1062 psig->cstime += tgstime + sig->cstime;
@@ -1071,15 +1081,9 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p)
1071 task_io_accounting_add(&psig->ioac, &p->ioac); 1081 task_io_accounting_add(&psig->ioac, &p->ioac);
1072 task_io_accounting_add(&psig->ioac, &sig->ioac); 1082 task_io_accounting_add(&psig->ioac, &sig->ioac);
1073 write_sequnlock(&psig->stats_lock); 1083 write_sequnlock(&psig->stats_lock);
1074 spin_unlock_irq(&p->real_parent->sighand->siglock); 1084 spin_unlock_irq(&current->sighand->siglock);
1075 } 1085 }
1076 1086
1077 /*
1078 * Now we are sure this task is interesting, and no other
1079 * thread can reap it because we its state == DEAD/TRACE.
1080 */
1081 read_unlock(&tasklist_lock);
1082
1083 retval = wo->wo_rusage 1087 retval = wo->wo_rusage
1084 ? getrusage(p, RUSAGE_BOTH, wo->wo_rusage) : 0; 1088 ? getrusage(p, RUSAGE_BOTH, wo->wo_rusage) : 0;
1085 status = (p->signal->flags & SIGNAL_GROUP_EXIT) 1089 status = (p->signal->flags & SIGNAL_GROUP_EXIT)
@@ -1210,6 +1214,7 @@ unlock_sig:
1210 pid = task_pid_vnr(p); 1214 pid = task_pid_vnr(p);
1211 why = ptrace ? CLD_TRAPPED : CLD_STOPPED; 1215 why = ptrace ? CLD_TRAPPED : CLD_STOPPED;
1212 read_unlock(&tasklist_lock); 1216 read_unlock(&tasklist_lock);
1217 sched_annotate_sleep();
1213 1218
1214 if (unlikely(wo->wo_flags & WNOWAIT)) 1219 if (unlikely(wo->wo_flags & WNOWAIT))
1215 return wait_noreap_copyout(wo, p, pid, uid, why, exit_code); 1220 return wait_noreap_copyout(wo, p, pid, uid, why, exit_code);
@@ -1272,6 +1277,7 @@ static int wait_task_continued(struct wait_opts *wo, struct task_struct *p)
1272 pid = task_pid_vnr(p); 1277 pid = task_pid_vnr(p);
1273 get_task_struct(p); 1278 get_task_struct(p);
1274 read_unlock(&tasklist_lock); 1279 read_unlock(&tasklist_lock);
1280 sched_annotate_sleep();
1275 1281
1276 if (!wo->wo_info) { 1282 if (!wo->wo_info) {
1277 retval = wo->wo_rusage 1283 retval = wo->wo_rusage
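The wait_task_zombie() rework above hinges on the EXIT_ZOMBIE to EXIT_DEAD/EXIT_TRACE transition being a single cmpxchg(), so exactly one waiter claims the task and every other caller returns 0. A stand-alone sketch of that claim pattern in C11 atomics (types, names and enum values here are illustrative, not the kernel's):

#include <stdatomic.h>
#include <stdbool.h>

enum exit_state { EXIT_ZOMBIE = 1, EXIT_DEAD, EXIT_TRACE };	/* illustrative values */

struct task { _Atomic int exit_state; };

/*
 * Succeeds for exactly one caller; all others see a non-ZOMBIE state and
 * back off, mirroring cmpxchg(&p->exit_state, EXIT_ZOMBIE, state).
 */
static bool claim_zombie(struct task *p, int new_state)
{
	int expected = EXIT_ZOMBIE;

	return atomic_compare_exchange_strong(&p->exit_state, &expected,
					      new_state);
}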
diff --git a/kernel/extable.c b/kernel/extable.c
index d8a6446adbcb..c98f926277a8 100644
--- a/kernel/extable.c
+++ b/kernel/extable.c
@@ -18,6 +18,7 @@
18#include <linux/ftrace.h> 18#include <linux/ftrace.h>
19#include <linux/memory.h> 19#include <linux/memory.h>
20#include <linux/module.h> 20#include <linux/module.h>
21#include <linux/ftrace.h>
21#include <linux/mutex.h> 22#include <linux/mutex.h>
22#include <linux/init.h> 23#include <linux/init.h>
23 24
@@ -102,6 +103,8 @@ int __kernel_text_address(unsigned long addr)
102 return 1; 103 return 1;
103 if (is_module_text_address(addr)) 104 if (is_module_text_address(addr))
104 return 1; 105 return 1;
106 if (is_ftrace_trampoline(addr))
107 return 1;
105 /* 108 /*
106 * There might be init symbols in saved stacktraces. 109 * There might be init symbols in saved stacktraces.
107 * Give those symbols a chance to be printed in 110 * Give those symbols a chance to be printed in
@@ -119,7 +122,9 @@ int kernel_text_address(unsigned long addr)
119{ 122{
120 if (core_kernel_text(addr)) 123 if (core_kernel_text(addr))
121 return 1; 124 return 1;
122 return is_module_text_address(addr); 125 if (is_module_text_address(addr))
126 return 1;
127 return is_ftrace_trampoline(addr);
123} 128}
124 129
125/* 130/*
diff --git a/kernel/fork.c b/kernel/fork.c
index 9b7d746d6d62..9ca84189cfc2 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1022,11 +1022,14 @@ void __cleanup_sighand(struct sighand_struct *sighand)
1022{ 1022{
1023 if (atomic_dec_and_test(&sighand->count)) { 1023 if (atomic_dec_and_test(&sighand->count)) {
1024 signalfd_cleanup(sighand); 1024 signalfd_cleanup(sighand);
1025 /*
1026 * sighand_cachep is SLAB_DESTROY_BY_RCU so we can free it
1027 * without an RCU grace period, see __lock_task_sighand().
1028 */
1025 kmem_cache_free(sighand_cachep, sighand); 1029 kmem_cache_free(sighand_cachep, sighand);
1026 } 1030 }
1027} 1031}
1028 1032
1029
1030/* 1033/*
1031 * Initialize POSIX timer handling for a thread group. 1034 * Initialize POSIX timer handling for a thread group.
1032 */ 1035 */
diff --git a/kernel/futex.c b/kernel/futex.c
index f3a3a071283c..63678b573d61 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -143,9 +143,8 @@
143 * 143 *
144 * Where (A) orders the waiters increment and the futex value read through 144 * Where (A) orders the waiters increment and the futex value read through
145 * atomic operations (see hb_waiters_inc) and where (B) orders the write 145 * atomic operations (see hb_waiters_inc) and where (B) orders the write
146 * to futex and the waiters read -- this is done by the barriers in 146 * to futex and the waiters read -- this is done by the barriers for both
147 * get_futex_key_refs(), through either ihold or atomic_inc, depending on the 147 * shared and private futexes in get_futex_key_refs().
148 * futex type.
149 * 148 *
150 * This yields the following case (where X:=waiters, Y:=futex): 149 * This yields the following case (where X:=waiters, Y:=futex):
151 * 150 *
@@ -344,13 +343,20 @@ static void get_futex_key_refs(union futex_key *key)
344 futex_get_mm(key); /* implies MB (B) */ 343 futex_get_mm(key); /* implies MB (B) */
345 break; 344 break;
346 default: 345 default:
346 /*
 347 * Private futexes do not hold a reference on an inode or
 348 * mm, therefore the only purpose of calling get_futex_key_refs()
 349 * is the barrier needed for the lockless waiter check.
350 */
347 smp_mb(); /* explicit MB (B) */ 351 smp_mb(); /* explicit MB (B) */
348 } 352 }
349} 353}
350 354
351/* 355/*
352 * Drop a reference to the resource addressed by a key. 356 * Drop a reference to the resource addressed by a key.
353 * The hash bucket spinlock must not be held. 357 * The hash bucket spinlock must not be held. This is
358 * a no-op for private futexes, see comment in the get
359 * counterpart.
354 */ 360 */
355static void drop_futex_key_refs(union futex_key *key) 361static void drop_futex_key_refs(union futex_key *key)
356{ 362{
@@ -641,8 +647,14 @@ static struct futex_pi_state * alloc_pi_state(void)
641 return pi_state; 647 return pi_state;
642} 648}
643 649
650/*
651 * Must be called with the hb lock held.
652 */
644static void free_pi_state(struct futex_pi_state *pi_state) 653static void free_pi_state(struct futex_pi_state *pi_state)
645{ 654{
655 if (!pi_state)
656 return;
657
646 if (!atomic_dec_and_test(&pi_state->refcount)) 658 if (!atomic_dec_and_test(&pi_state->refcount))
647 return; 659 return;
648 660
@@ -1521,15 +1533,6 @@ static int futex_requeue(u32 __user *uaddr1, unsigned int flags,
1521 } 1533 }
1522 1534
1523retry: 1535retry:
1524 if (pi_state != NULL) {
1525 /*
1526 * We will have to lookup the pi_state again, so free this one
1527 * to keep the accounting correct.
1528 */
1529 free_pi_state(pi_state);
1530 pi_state = NULL;
1531 }
1532
1533 ret = get_futex_key(uaddr1, flags & FLAGS_SHARED, &key1, VERIFY_READ); 1536 ret = get_futex_key(uaddr1, flags & FLAGS_SHARED, &key1, VERIFY_READ);
1534 if (unlikely(ret != 0)) 1537 if (unlikely(ret != 0))
1535 goto out; 1538 goto out;
@@ -1619,6 +1622,8 @@ retry_private:
1619 case 0: 1622 case 0:
1620 break; 1623 break;
1621 case -EFAULT: 1624 case -EFAULT:
1625 free_pi_state(pi_state);
1626 pi_state = NULL;
1622 double_unlock_hb(hb1, hb2); 1627 double_unlock_hb(hb1, hb2);
1623 hb_waiters_dec(hb2); 1628 hb_waiters_dec(hb2);
1624 put_futex_key(&key2); 1629 put_futex_key(&key2);
@@ -1634,6 +1639,8 @@ retry_private:
1634 * exit to complete. 1639 * exit to complete.
1635 * - The user space value changed. 1640 * - The user space value changed.
1636 */ 1641 */
1642 free_pi_state(pi_state);
1643 pi_state = NULL;
1637 double_unlock_hb(hb1, hb2); 1644 double_unlock_hb(hb1, hb2);
1638 hb_waiters_dec(hb2); 1645 hb_waiters_dec(hb2);
1639 put_futex_key(&key2); 1646 put_futex_key(&key2);
@@ -1710,6 +1717,7 @@ retry_private:
1710 } 1717 }
1711 1718
1712out_unlock: 1719out_unlock:
1720 free_pi_state(pi_state);
1713 double_unlock_hb(hb1, hb2); 1721 double_unlock_hb(hb1, hb2);
1714 hb_waiters_dec(hb2); 1722 hb_waiters_dec(hb2);
1715 1723
@@ -1727,8 +1735,6 @@ out_put_keys:
1727out_put_key1: 1735out_put_key1:
1728 put_futex_key(&key1); 1736 put_futex_key(&key1);
1729out: 1737out:
1730 if (pi_state != NULL)
1731 free_pi_state(pi_state);
1732 return ret ? ret : task_count; 1738 return ret ? ret : task_count;
1733} 1739}
1734 1740
diff --git a/kernel/gcov/Kconfig b/kernel/gcov/Kconfig
index cf66c5c8458e..3b7408759bdf 100644
--- a/kernel/gcov/Kconfig
+++ b/kernel/gcov/Kconfig
@@ -35,7 +35,7 @@ config GCOV_KERNEL
35config GCOV_PROFILE_ALL 35config GCOV_PROFILE_ALL
36 bool "Profile entire Kernel" 36 bool "Profile entire Kernel"
37 depends on GCOV_KERNEL 37 depends on GCOV_KERNEL
38 depends on SUPERH || S390 || X86 || PPC || MICROBLAZE || ARM 38 depends on SUPERH || S390 || X86 || PPC || MICROBLAZE || ARM || ARM64
39 default n 39 default n
40 ---help--- 40 ---help---
41 This option activates profiling for the entire kernel. 41 This option activates profiling for the entire kernel.
diff --git a/kernel/irq/Kconfig b/kernel/irq/Kconfig
index 225086b2652e..9a76e3beda54 100644
--- a/kernel/irq/Kconfig
+++ b/kernel/irq/Kconfig
@@ -55,6 +55,21 @@ config GENERIC_IRQ_CHIP
55config IRQ_DOMAIN 55config IRQ_DOMAIN
56 bool 56 bool
57 57
58# Support for hierarchical irq domains
59config IRQ_DOMAIN_HIERARCHY
60 bool
61 select IRQ_DOMAIN
62
63# Generic MSI interrupt support
64config GENERIC_MSI_IRQ
65 bool
66
67# Generic MSI hierarchical interrupt domain support
68config GENERIC_MSI_IRQ_DOMAIN
69 bool
70 select IRQ_DOMAIN_HIERARCHY
71 select GENERIC_MSI_IRQ
72
58config HANDLE_DOMAIN_IRQ 73config HANDLE_DOMAIN_IRQ
59 bool 74 bool
60 75
diff --git a/kernel/irq/Makefile b/kernel/irq/Makefile
index fff17381f0af..d12123526e2b 100644
--- a/kernel/irq/Makefile
+++ b/kernel/irq/Makefile
@@ -6,3 +6,4 @@ obj-$(CONFIG_IRQ_DOMAIN) += irqdomain.o
6obj-$(CONFIG_PROC_FS) += proc.o 6obj-$(CONFIG_PROC_FS) += proc.o
7obj-$(CONFIG_GENERIC_PENDING_IRQ) += migration.o 7obj-$(CONFIG_GENERIC_PENDING_IRQ) += migration.o
8obj-$(CONFIG_PM_SLEEP) += pm.o 8obj-$(CONFIG_PM_SLEEP) += pm.o
9obj-$(CONFIG_GENERIC_MSI_IRQ) += msi.o
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index e5202f00cabc..6f1c7a566b95 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -15,6 +15,7 @@
15#include <linux/module.h> 15#include <linux/module.h>
16#include <linux/interrupt.h> 16#include <linux/interrupt.h>
17#include <linux/kernel_stat.h> 17#include <linux/kernel_stat.h>
18#include <linux/irqdomain.h>
18 19
19#include <trace/events/irq.h> 20#include <trace/events/irq.h>
20 21
@@ -178,6 +179,7 @@ int irq_startup(struct irq_desc *desc, bool resend)
178 irq_state_clr_disabled(desc); 179 irq_state_clr_disabled(desc);
179 desc->depth = 0; 180 desc->depth = 0;
180 181
182 irq_domain_activate_irq(&desc->irq_data);
181 if (desc->irq_data.chip->irq_startup) { 183 if (desc->irq_data.chip->irq_startup) {
182 ret = desc->irq_data.chip->irq_startup(&desc->irq_data); 184 ret = desc->irq_data.chip->irq_startup(&desc->irq_data);
183 irq_state_clr_masked(desc); 185 irq_state_clr_masked(desc);
@@ -199,6 +201,7 @@ void irq_shutdown(struct irq_desc *desc)
199 desc->irq_data.chip->irq_disable(&desc->irq_data); 201 desc->irq_data.chip->irq_disable(&desc->irq_data);
200 else 202 else
201 desc->irq_data.chip->irq_mask(&desc->irq_data); 203 desc->irq_data.chip->irq_mask(&desc->irq_data);
204 irq_domain_deactivate_irq(&desc->irq_data);
202 irq_state_set_masked(desc); 205 irq_state_set_masked(desc);
203} 206}
204 207
@@ -728,7 +731,30 @@ __irq_set_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained,
728 if (!handle) { 731 if (!handle) {
729 handle = handle_bad_irq; 732 handle = handle_bad_irq;
730 } else { 733 } else {
731 if (WARN_ON(desc->irq_data.chip == &no_irq_chip)) 734 struct irq_data *irq_data = &desc->irq_data;
735#ifdef CONFIG_IRQ_DOMAIN_HIERARCHY
736 /*
737 * With hierarchical domains we might run into a
738 * situation where the outermost chip is not yet set
739 * up, but the inner chips are there. Instead of
740 * bailing we install the handler, but obviously we
741 * cannot enable/startup the interrupt at this point.
742 */
743 while (irq_data) {
744 if (irq_data->chip != &no_irq_chip)
745 break;
746 /*
747 * Bail out if the outer chip is not set up
 748 * and the interrupt is supposed to be started
749 * right away.
750 */
751 if (WARN_ON(is_chained))
752 goto out;
753 /* Try the parent */
754 irq_data = irq_data->parent_data;
755 }
756#endif
757 if (WARN_ON(!irq_data || irq_data->chip == &no_irq_chip))
732 goto out; 758 goto out;
733 } 759 }
734 760
@@ -847,3 +873,105 @@ void irq_cpu_offline(void)
847 raw_spin_unlock_irqrestore(&desc->lock, flags); 873 raw_spin_unlock_irqrestore(&desc->lock, flags);
848 } 874 }
849} 875}
876
877#ifdef CONFIG_IRQ_DOMAIN_HIERARCHY
878/**
879 * irq_chip_ack_parent - Acknowledge the parent interrupt
880 * @data: Pointer to interrupt specific data
881 */
882void irq_chip_ack_parent(struct irq_data *data)
883{
884 data = data->parent_data;
885 data->chip->irq_ack(data);
886}
887
888/**
889 * irq_chip_mask_parent - Mask the parent interrupt
890 * @data: Pointer to interrupt specific data
891 */
892void irq_chip_mask_parent(struct irq_data *data)
893{
894 data = data->parent_data;
895 data->chip->irq_mask(data);
896}
897
898/**
899 * irq_chip_unmask_parent - Unmask the parent interrupt
900 * @data: Pointer to interrupt specific data
901 */
902void irq_chip_unmask_parent(struct irq_data *data)
903{
904 data = data->parent_data;
905 data->chip->irq_unmask(data);
906}
907
908/**
909 * irq_chip_eoi_parent - Invoke EOI on the parent interrupt
910 * @data: Pointer to interrupt specific data
911 */
912void irq_chip_eoi_parent(struct irq_data *data)
913{
914 data = data->parent_data;
915 data->chip->irq_eoi(data);
916}
917
918/**
919 * irq_chip_set_affinity_parent - Set affinity on the parent interrupt
920 * @data: Pointer to interrupt specific data
921 * @dest: The affinity mask to set
922 * @force: Flag to enforce setting (disable online checks)
923 *
 924 * Conditional, as the underlying parent chip might not implement it.
925 */
926int irq_chip_set_affinity_parent(struct irq_data *data,
927 const struct cpumask *dest, bool force)
928{
929 data = data->parent_data;
930 if (data->chip->irq_set_affinity)
931 return data->chip->irq_set_affinity(data, dest, force);
932
933 return -ENOSYS;
934}
935
936/**
937 * irq_chip_retrigger_hierarchy - Retrigger an interrupt in hardware
938 * @data: Pointer to interrupt specific data
939 *
940 * Iterate through the domain hierarchy of the interrupt and check
941 * whether a hw retrigger function exists. If yes, invoke it.
942 */
943int irq_chip_retrigger_hierarchy(struct irq_data *data)
944{
945 for (data = data->parent_data; data; data = data->parent_data)
946 if (data->chip && data->chip->irq_retrigger)
947 return data->chip->irq_retrigger(data);
948
949 return -ENOSYS;
950}
951#endif
952
953/**
 954 * irq_chip_compose_msi_msg - Compose an MSI message for an irq chip
955 * @data: Pointer to interrupt specific data
956 * @msg: Pointer to the MSI message
957 *
958 * For hierarchical domains we find the first chip in the hierarchy
 959 * which implements the irq_compose_msi_msg callback. For non-
 960 * hierarchical domains we use the top level chip.
961 */
962int irq_chip_compose_msi_msg(struct irq_data *data, struct msi_msg *msg)
963{
964 struct irq_data *pos = NULL;
965
966#ifdef CONFIG_IRQ_DOMAIN_HIERARCHY
967 for (; data; data = data->parent_data)
968#endif
969 if (data->chip && data->chip->irq_compose_msi_msg)
970 pos = data;
971 if (!pos)
972 return -ENOSYS;
973
974 pos->chip->irq_compose_msi_msg(pos, msg);
975
976 return 0;
977}
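The irq_chip_*_parent helpers above are meant to let a stacked irqchip forward the operations it does not own to the chip below it in the hierarchy. A hypothetical chip wired up that way might look like the following sketch (the chip itself is invented for illustration; the callbacks are the ones added in this hunk):

#include <linux/irq.h>

/* Hypothetical stacked chip: every operation it does not implement itself
 * is forwarded to the parent chip via the helpers added above. */
static struct irq_chip my_stacked_chip = {
	.name			= "my-stacked",
	.irq_ack		= irq_chip_ack_parent,
	.irq_mask		= irq_chip_mask_parent,
	.irq_unmask		= irq_chip_unmask_parent,
	.irq_eoi		= irq_chip_eoi_parent,
	.irq_set_affinity	= irq_chip_set_affinity_parent,
	.irq_retrigger		= irq_chip_retrigger_hierarchy,
};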
diff --git a/kernel/irq/generic-chip.c b/kernel/irq/generic-chip.c
index cf80e7b0ddab..61024e8abdef 100644
--- a/kernel/irq/generic-chip.c
+++ b/kernel/irq/generic-chip.c
@@ -39,7 +39,7 @@ void irq_gc_mask_disable_reg(struct irq_data *d)
39 u32 mask = d->mask; 39 u32 mask = d->mask;
40 40
41 irq_gc_lock(gc); 41 irq_gc_lock(gc);
42 irq_reg_writel(mask, gc->reg_base + ct->regs.disable); 42 irq_reg_writel(gc, mask, ct->regs.disable);
43 *ct->mask_cache &= ~mask; 43 *ct->mask_cache &= ~mask;
44 irq_gc_unlock(gc); 44 irq_gc_unlock(gc);
45} 45}
@@ -59,7 +59,7 @@ void irq_gc_mask_set_bit(struct irq_data *d)
59 59
60 irq_gc_lock(gc); 60 irq_gc_lock(gc);
61 *ct->mask_cache |= mask; 61 *ct->mask_cache |= mask;
62 irq_reg_writel(*ct->mask_cache, gc->reg_base + ct->regs.mask); 62 irq_reg_writel(gc, *ct->mask_cache, ct->regs.mask);
63 irq_gc_unlock(gc); 63 irq_gc_unlock(gc);
64} 64}
65EXPORT_SYMBOL_GPL(irq_gc_mask_set_bit); 65EXPORT_SYMBOL_GPL(irq_gc_mask_set_bit);
@@ -79,7 +79,7 @@ void irq_gc_mask_clr_bit(struct irq_data *d)
79 79
80 irq_gc_lock(gc); 80 irq_gc_lock(gc);
81 *ct->mask_cache &= ~mask; 81 *ct->mask_cache &= ~mask;
82 irq_reg_writel(*ct->mask_cache, gc->reg_base + ct->regs.mask); 82 irq_reg_writel(gc, *ct->mask_cache, ct->regs.mask);
83 irq_gc_unlock(gc); 83 irq_gc_unlock(gc);
84} 84}
85EXPORT_SYMBOL_GPL(irq_gc_mask_clr_bit); 85EXPORT_SYMBOL_GPL(irq_gc_mask_clr_bit);
@@ -98,7 +98,7 @@ void irq_gc_unmask_enable_reg(struct irq_data *d)
98 u32 mask = d->mask; 98 u32 mask = d->mask;
99 99
100 irq_gc_lock(gc); 100 irq_gc_lock(gc);
101 irq_reg_writel(mask, gc->reg_base + ct->regs.enable); 101 irq_reg_writel(gc, mask, ct->regs.enable);
102 *ct->mask_cache |= mask; 102 *ct->mask_cache |= mask;
103 irq_gc_unlock(gc); 103 irq_gc_unlock(gc);
104} 104}
@@ -114,7 +114,7 @@ void irq_gc_ack_set_bit(struct irq_data *d)
114 u32 mask = d->mask; 114 u32 mask = d->mask;
115 115
116 irq_gc_lock(gc); 116 irq_gc_lock(gc);
117 irq_reg_writel(mask, gc->reg_base + ct->regs.ack); 117 irq_reg_writel(gc, mask, ct->regs.ack);
118 irq_gc_unlock(gc); 118 irq_gc_unlock(gc);
119} 119}
120EXPORT_SYMBOL_GPL(irq_gc_ack_set_bit); 120EXPORT_SYMBOL_GPL(irq_gc_ack_set_bit);
@@ -130,7 +130,7 @@ void irq_gc_ack_clr_bit(struct irq_data *d)
130 u32 mask = ~d->mask; 130 u32 mask = ~d->mask;
131 131
132 irq_gc_lock(gc); 132 irq_gc_lock(gc);
133 irq_reg_writel(mask, gc->reg_base + ct->regs.ack); 133 irq_reg_writel(gc, mask, ct->regs.ack);
134 irq_gc_unlock(gc); 134 irq_gc_unlock(gc);
135} 135}
136 136
@@ -145,8 +145,8 @@ void irq_gc_mask_disable_reg_and_ack(struct irq_data *d)
145 u32 mask = d->mask; 145 u32 mask = d->mask;
146 146
147 irq_gc_lock(gc); 147 irq_gc_lock(gc);
148 irq_reg_writel(mask, gc->reg_base + ct->regs.mask); 148 irq_reg_writel(gc, mask, ct->regs.mask);
149 irq_reg_writel(mask, gc->reg_base + ct->regs.ack); 149 irq_reg_writel(gc, mask, ct->regs.ack);
150 irq_gc_unlock(gc); 150 irq_gc_unlock(gc);
151} 151}
152 152
@@ -161,7 +161,7 @@ void irq_gc_eoi(struct irq_data *d)
161 u32 mask = d->mask; 161 u32 mask = d->mask;
162 162
163 irq_gc_lock(gc); 163 irq_gc_lock(gc);
164 irq_reg_writel(mask, gc->reg_base + ct->regs.eoi); 164 irq_reg_writel(gc, mask, ct->regs.eoi);
165 irq_gc_unlock(gc); 165 irq_gc_unlock(gc);
166} 166}
167 167
@@ -191,6 +191,16 @@ int irq_gc_set_wake(struct irq_data *d, unsigned int on)
191 return 0; 191 return 0;
192} 192}
193 193
194static u32 irq_readl_be(void __iomem *addr)
195{
196 return ioread32be(addr);
197}
198
199static void irq_writel_be(u32 val, void __iomem *addr)
200{
201 iowrite32be(val, addr);
202}
203
194static void 204static void
195irq_init_generic_chip(struct irq_chip_generic *gc, const char *name, 205irq_init_generic_chip(struct irq_chip_generic *gc, const char *name,
196 int num_ct, unsigned int irq_base, 206 int num_ct, unsigned int irq_base,
@@ -245,7 +255,7 @@ irq_gc_init_mask_cache(struct irq_chip_generic *gc, enum irq_gc_flags flags)
245 } 255 }
246 ct[i].mask_cache = mskptr; 256 ct[i].mask_cache = mskptr;
247 if (flags & IRQ_GC_INIT_MASK_CACHE) 257 if (flags & IRQ_GC_INIT_MASK_CACHE)
248 *mskptr = irq_reg_readl(gc->reg_base + mskreg); 258 *mskptr = irq_reg_readl(gc, mskreg);
249 } 259 }
250} 260}
251 261
@@ -300,7 +310,13 @@ int irq_alloc_domain_generic_chips(struct irq_domain *d, int irqs_per_chip,
300 dgc->gc[i] = gc = tmp; 310 dgc->gc[i] = gc = tmp;
301 irq_init_generic_chip(gc, name, num_ct, i * irqs_per_chip, 311 irq_init_generic_chip(gc, name, num_ct, i * irqs_per_chip,
302 NULL, handler); 312 NULL, handler);
313
303 gc->domain = d; 314 gc->domain = d;
315 if (gcflags & IRQ_GC_BE_IO) {
316 gc->reg_readl = &irq_readl_be;
317 gc->reg_writel = &irq_writel_be;
318 }
319
304 raw_spin_lock_irqsave(&gc_lock, flags); 320 raw_spin_lock_irqsave(&gc_lock, flags);
305 list_add_tail(&gc->list, &gc_list); 321 list_add_tail(&gc->list, &gc_list);
306 raw_spin_unlock_irqrestore(&gc_lock, flags); 322 raw_spin_unlock_irqrestore(&gc_lock, flags);
diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c
index 6534ff6ce02e..7fac311057b8 100644
--- a/kernel/irq/irqdomain.c
+++ b/kernel/irq/irqdomain.c
@@ -23,6 +23,10 @@ static DEFINE_MUTEX(irq_domain_mutex);
23static DEFINE_MUTEX(revmap_trees_mutex); 23static DEFINE_MUTEX(revmap_trees_mutex);
24static struct irq_domain *irq_default_domain; 24static struct irq_domain *irq_default_domain;
25 25
26static int irq_domain_alloc_descs(int virq, unsigned int nr_irqs,
27 irq_hw_number_t hwirq, int node);
28static void irq_domain_check_hierarchy(struct irq_domain *domain);
29
26/** 30/**
27 * __irq_domain_add() - Allocate a new irq_domain data structure 31 * __irq_domain_add() - Allocate a new irq_domain data structure
28 * @of_node: optional device-tree node of the interrupt controller 32 * @of_node: optional device-tree node of the interrupt controller
@@ -30,7 +34,7 @@ static struct irq_domain *irq_default_domain;
30 * @hwirq_max: Maximum number of interrupts supported by controller 34 * @hwirq_max: Maximum number of interrupts supported by controller
31 * @direct_max: Maximum value of direct maps; Use ~0 for no limit; 0 for no 35 * @direct_max: Maximum value of direct maps; Use ~0 for no limit; 0 for no
32 * direct mapping 36 * direct mapping
33 * @ops: map/unmap domain callbacks 37 * @ops: domain callbacks
34 * @host_data: Controller private data pointer 38 * @host_data: Controller private data pointer
35 * 39 *
36 * Allocates and initializes an irq_domain structure. 40 * Allocates and initializes an irq_domain structure.
@@ -56,6 +60,7 @@ struct irq_domain *__irq_domain_add(struct device_node *of_node, int size,
56 domain->hwirq_max = hwirq_max; 60 domain->hwirq_max = hwirq_max;
57 domain->revmap_size = size; 61 domain->revmap_size = size;
58 domain->revmap_direct_max_irq = direct_max; 62 domain->revmap_direct_max_irq = direct_max;
63 irq_domain_check_hierarchy(domain);
59 64
60 mutex_lock(&irq_domain_mutex); 65 mutex_lock(&irq_domain_mutex);
61 list_add(&domain->link, &irq_domain_list); 66 list_add(&domain->link, &irq_domain_list);
@@ -109,7 +114,7 @@ EXPORT_SYMBOL_GPL(irq_domain_remove);
109 * @first_irq: first number of irq block assigned to the domain, 114 * @first_irq: first number of irq block assigned to the domain,
110 * pass zero to assign irqs on-the-fly. If first_irq is non-zero, then 115 * pass zero to assign irqs on-the-fly. If first_irq is non-zero, then
111 * pre-map all of the irqs in the domain to virqs starting at first_irq. 116 * pre-map all of the irqs in the domain to virqs starting at first_irq.
112 * @ops: map/unmap domain callbacks 117 * @ops: domain callbacks
113 * @host_data: Controller private data pointer 118 * @host_data: Controller private data pointer
114 * 119 *
115 * Allocates an irq_domain, and optionally if first_irq is positive then also 120 * Allocates an irq_domain, and optionally if first_irq is positive then also
@@ -174,10 +179,8 @@ struct irq_domain *irq_domain_add_legacy(struct device_node *of_node,
174 179
175 domain = __irq_domain_add(of_node, first_hwirq + size, 180 domain = __irq_domain_add(of_node, first_hwirq + size,
176 first_hwirq + size, 0, ops, host_data); 181 first_hwirq + size, 0, ops, host_data);
177 if (!domain) 182 if (domain)
178 return NULL; 183 irq_domain_associate_many(domain, first_irq, first_hwirq, size);
179
180 irq_domain_associate_many(domain, first_irq, first_hwirq, size);
181 184
182 return domain; 185 return domain;
183} 186}
@@ -388,7 +391,6 @@ EXPORT_SYMBOL_GPL(irq_create_direct_mapping);
388unsigned int irq_create_mapping(struct irq_domain *domain, 391unsigned int irq_create_mapping(struct irq_domain *domain,
389 irq_hw_number_t hwirq) 392 irq_hw_number_t hwirq)
390{ 393{
391 unsigned int hint;
392 int virq; 394 int virq;
393 395
394 pr_debug("irq_create_mapping(0x%p, 0x%lx)\n", domain, hwirq); 396 pr_debug("irq_create_mapping(0x%p, 0x%lx)\n", domain, hwirq);
@@ -410,12 +412,8 @@ unsigned int irq_create_mapping(struct irq_domain *domain,
410 } 412 }
411 413
412 /* Allocate a virtual interrupt number */ 414 /* Allocate a virtual interrupt number */
413 hint = hwirq % nr_irqs; 415 virq = irq_domain_alloc_descs(-1, 1, hwirq,
414 if (hint == 0) 416 of_node_to_nid(domain->of_node));
415 hint++;
416 virq = irq_alloc_desc_from(hint, of_node_to_nid(domain->of_node));
417 if (virq <= 0)
418 virq = irq_alloc_desc_from(1, of_node_to_nid(domain->of_node));
419 if (virq <= 0) { 417 if (virq <= 0) {
420 pr_debug("-> virq allocation failed\n"); 418 pr_debug("-> virq allocation failed\n");
421 return 0; 419 return 0;
@@ -471,7 +469,7 @@ unsigned int irq_create_of_mapping(struct of_phandle_args *irq_data)
471 struct irq_domain *domain; 469 struct irq_domain *domain;
472 irq_hw_number_t hwirq; 470 irq_hw_number_t hwirq;
473 unsigned int type = IRQ_TYPE_NONE; 471 unsigned int type = IRQ_TYPE_NONE;
474 unsigned int virq; 472 int virq;
475 473
476 domain = irq_data->np ? irq_find_host(irq_data->np) : irq_default_domain; 474 domain = irq_data->np ? irq_find_host(irq_data->np) : irq_default_domain;
477 if (!domain) { 475 if (!domain) {
@@ -489,10 +487,24 @@ unsigned int irq_create_of_mapping(struct of_phandle_args *irq_data)
489 return 0; 487 return 0;
490 } 488 }
491 489
492 /* Create mapping */ 490 if (irq_domain_is_hierarchy(domain)) {
493 virq = irq_create_mapping(domain, hwirq); 491 /*
494 if (!virq) 492 * If we've already configured this interrupt,
495 return virq; 493 * don't do it again, or hell will break loose.
494 */
495 virq = irq_find_mapping(domain, hwirq);
496 if (virq)
497 return virq;
498
499 virq = irq_domain_alloc_irqs(domain, 1, NUMA_NO_NODE, irq_data);
500 if (virq <= 0)
501 return 0;
502 } else {
503 /* Create mapping */
504 virq = irq_create_mapping(domain, hwirq);
505 if (!virq)
506 return virq;
507 }
496 508
497 /* Set type if specified and different than the current one */ 509 /* Set type if specified and different than the current one */
498 if (type != IRQ_TYPE_NONE && 510 if (type != IRQ_TYPE_NONE &&
@@ -540,8 +552,8 @@ unsigned int irq_find_mapping(struct irq_domain *domain,
540 return 0; 552 return 0;
541 553
542 if (hwirq < domain->revmap_direct_max_irq) { 554 if (hwirq < domain->revmap_direct_max_irq) {
543 data = irq_get_irq_data(hwirq); 555 data = irq_domain_get_irq_data(domain, hwirq);
544 if (data && (data->domain == domain) && (data->hwirq == hwirq)) 556 if (data && data->hwirq == hwirq)
545 return hwirq; 557 return hwirq;
546 } 558 }
547 559
@@ -709,3 +721,518 @@ const struct irq_domain_ops irq_domain_simple_ops = {
709 .xlate = irq_domain_xlate_onetwocell, 721 .xlate = irq_domain_xlate_onetwocell,
710}; 722};
711EXPORT_SYMBOL_GPL(irq_domain_simple_ops); 723EXPORT_SYMBOL_GPL(irq_domain_simple_ops);
724
725static int irq_domain_alloc_descs(int virq, unsigned int cnt,
726 irq_hw_number_t hwirq, int node)
727{
728 unsigned int hint;
729
730 if (virq >= 0) {
731 virq = irq_alloc_descs(virq, virq, cnt, node);
732 } else {
733 hint = hwirq % nr_irqs;
734 if (hint == 0)
735 hint++;
736 virq = irq_alloc_descs_from(hint, cnt, node);
737 if (virq <= 0 && hint > 1)
738 virq = irq_alloc_descs_from(1, cnt, node);
739 }
740
741 return virq;
742}
743
744#ifdef CONFIG_IRQ_DOMAIN_HIERARCHY
745/**
 746 * irq_domain_add_hierarchy - Add an irqdomain into the hierarchy
747 * @parent: Parent irq domain to associate with the new domain
748 * @flags: Irq domain flags associated to the domain
749 * @size: Size of the domain. See below
750 * @node: Optional device-tree node of the interrupt controller
751 * @ops: Pointer to the interrupt domain callbacks
752 * @host_data: Controller private data pointer
753 *
754 * If @size is 0 a tree domain is created, otherwise a linear domain.
755 *
756 * If successful the parent is associated to the new domain and the
757 * domain flags are set.
758 * Returns pointer to IRQ domain, or NULL on failure.
759 */
760struct irq_domain *irq_domain_add_hierarchy(struct irq_domain *parent,
761 unsigned int flags,
762 unsigned int size,
763 struct device_node *node,
764 const struct irq_domain_ops *ops,
765 void *host_data)
766{
767 struct irq_domain *domain;
768
769 if (size)
770 domain = irq_domain_add_linear(node, size, ops, host_data);
771 else
772 domain = irq_domain_add_tree(node, ops, host_data);
773 if (domain) {
774 domain->parent = parent;
775 domain->flags |= flags;
776 }
777
778 return domain;
779}
780
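To show how the pieces added in this file compose, here is a hypothetical stacked-domain setup: the ops allocate from the parent domain first, attach a chip per interrupt, and free through the common helper, while the domain itself is created with irq_domain_add_hierarchy(). Every my_* name is illustrative and the hwirq derivation is elided:

#include <linux/irq.h>
#include <linux/irqdomain.h>

static struct irq_chip my_chip = {
	.name		= "my-chip",
	.irq_mask	= irq_chip_mask_parent,
	.irq_unmask	= irq_chip_unmask_parent,
	.irq_eoi	= irq_chip_eoi_parent,
};

static int my_domain_alloc(struct irq_domain *domain, unsigned int virq,
			   unsigned int nr_irqs, void *arg)
{
	irq_hw_number_t hwirq = 0;	/* normally derived from @arg */
	unsigned int i;
	int ret;

	/* Let the parent domain(s) reserve their part of the hierarchy first. */
	ret = irq_domain_alloc_irqs_parent(domain, virq, nr_irqs, arg);
	if (ret < 0)
		return ret;

	for (i = 0; i < nr_irqs; i++)
		irq_domain_set_hwirq_and_chip(domain, virq + i, hwirq + i,
					      &my_chip, NULL);
	return 0;
}

static const struct irq_domain_ops my_domain_ops = {
	.alloc	= my_domain_alloc,
	.free	= irq_domain_free_irqs_common,
};

static struct irq_domain *my_domain_create(struct irq_domain *parent,
					   struct device_node *node)
{
	/* size == 0 selects a tree domain; flags 0 adds nothing special. */
	return irq_domain_add_hierarchy(parent, 0, 0, node,
					&my_domain_ops, NULL);
}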
781static void irq_domain_insert_irq(int virq)
782{
783 struct irq_data *data;
784
785 for (data = irq_get_irq_data(virq); data; data = data->parent_data) {
786 struct irq_domain *domain = data->domain;
787 irq_hw_number_t hwirq = data->hwirq;
788
789 if (hwirq < domain->revmap_size) {
790 domain->linear_revmap[hwirq] = virq;
791 } else {
792 mutex_lock(&revmap_trees_mutex);
793 radix_tree_insert(&domain->revmap_tree, hwirq, data);
794 mutex_unlock(&revmap_trees_mutex);
795 }
796
797 /* If not already assigned, give the domain the chip's name */
798 if (!domain->name && data->chip)
799 domain->name = data->chip->name;
800 }
801
802 irq_clear_status_flags(virq, IRQ_NOREQUEST);
803}
804
805static void irq_domain_remove_irq(int virq)
806{
807 struct irq_data *data;
808
809 irq_set_status_flags(virq, IRQ_NOREQUEST);
810 irq_set_chip_and_handler(virq, NULL, NULL);
811 synchronize_irq(virq);
812 smp_mb();
813
814 for (data = irq_get_irq_data(virq); data; data = data->parent_data) {
815 struct irq_domain *domain = data->domain;
816 irq_hw_number_t hwirq = data->hwirq;
817
818 if (hwirq < domain->revmap_size) {
819 domain->linear_revmap[hwirq] = 0;
820 } else {
821 mutex_lock(&revmap_trees_mutex);
822 radix_tree_delete(&domain->revmap_tree, hwirq);
823 mutex_unlock(&revmap_trees_mutex);
824 }
825 }
826}
827
828static struct irq_data *irq_domain_insert_irq_data(struct irq_domain *domain,
829 struct irq_data *child)
830{
831 struct irq_data *irq_data;
832
833 irq_data = kzalloc_node(sizeof(*irq_data), GFP_KERNEL, child->node);
834 if (irq_data) {
835 child->parent_data = irq_data;
836 irq_data->irq = child->irq;
837 irq_data->node = child->node;
838 irq_data->domain = domain;
839 }
840
841 return irq_data;
842}
843
844static void irq_domain_free_irq_data(unsigned int virq, unsigned int nr_irqs)
845{
846 struct irq_data *irq_data, *tmp;
847 int i;
848
849 for (i = 0; i < nr_irqs; i++) {
850 irq_data = irq_get_irq_data(virq + i);
851 tmp = irq_data->parent_data;
852 irq_data->parent_data = NULL;
853 irq_data->domain = NULL;
854
855 while (tmp) {
856 irq_data = tmp;
857 tmp = tmp->parent_data;
858 kfree(irq_data);
859 }
860 }
861}
862
863static int irq_domain_alloc_irq_data(struct irq_domain *domain,
864 unsigned int virq, unsigned int nr_irqs)
865{
866 struct irq_data *irq_data;
867 struct irq_domain *parent;
868 int i;
869
870 /* The outermost irq_data is embedded in struct irq_desc */
871 for (i = 0; i < nr_irqs; i++) {
872 irq_data = irq_get_irq_data(virq + i);
873 irq_data->domain = domain;
874
875 for (parent = domain->parent; parent; parent = parent->parent) {
876 irq_data = irq_domain_insert_irq_data(parent, irq_data);
877 if (!irq_data) {
878 irq_domain_free_irq_data(virq, i + 1);
879 return -ENOMEM;
880 }
881 }
882 }
883
884 return 0;
885}
886
887/**
888 * irq_domain_get_irq_data - Get irq_data associated with @virq and @domain
889 * @domain: domain to match
890 * @virq: IRQ number to get irq_data
891 */
892struct irq_data *irq_domain_get_irq_data(struct irq_domain *domain,
893 unsigned int virq)
894{
895 struct irq_data *irq_data;
896
897 for (irq_data = irq_get_irq_data(virq); irq_data;
898 irq_data = irq_data->parent_data)
899 if (irq_data->domain == domain)
900 return irq_data;
901
902 return NULL;
903}
904
905/**
906 * irq_domain_set_hwirq_and_chip - Set hwirq and irqchip of @virq at @domain
907 * @domain: Interrupt domain to match
908 * @virq: IRQ number
909 * @hwirq: The hwirq number
910 * @chip: The associated interrupt chip
911 * @chip_data: The associated chip data
912 */
913int irq_domain_set_hwirq_and_chip(struct irq_domain *domain, unsigned int virq,
914 irq_hw_number_t hwirq, struct irq_chip *chip,
915 void *chip_data)
916{
917 struct irq_data *irq_data = irq_domain_get_irq_data(domain, virq);
918
919 if (!irq_data)
920 return -ENOENT;
921
922 irq_data->hwirq = hwirq;
923 irq_data->chip = chip ? chip : &no_irq_chip;
924 irq_data->chip_data = chip_data;
925
926 return 0;
927}
928
929/**
930 * irq_domain_set_info - Set the complete data for a @virq in @domain
931 * @domain: Interrupt domain to match
932 * @virq: IRQ number
933 * @hwirq: The hardware interrupt number
934 * @chip: The associated interrupt chip
935 * @chip_data: The associated interrupt chip data
936 * @handler: The interrupt flow handler
937 * @handler_data: The interrupt flow handler data
938 * @handler_name: The interrupt handler name
939 */
940void irq_domain_set_info(struct irq_domain *domain, unsigned int virq,
941 irq_hw_number_t hwirq, struct irq_chip *chip,
942 void *chip_data, irq_flow_handler_t handler,
943 void *handler_data, const char *handler_name)
944{
945 irq_domain_set_hwirq_and_chip(domain, virq, hwirq, chip, chip_data);
946 __irq_set_handler(virq, handler, 0, handler_name);
947 irq_set_handler_data(virq, handler_data);
948}
949
950/**
951 * irq_domain_reset_irq_data - Clear hwirq, chip and chip_data in @irq_data
952 * @irq_data: The pointer to irq_data
953 */
954void irq_domain_reset_irq_data(struct irq_data *irq_data)
955{
956 irq_data->hwirq = 0;
957 irq_data->chip = &no_irq_chip;
958 irq_data->chip_data = NULL;
959}
960
961/**
962 * irq_domain_free_irqs_common - Clear irq_data and free the parent
963 * @domain: Interrupt domain to match
964 * @virq: IRQ number to start with
965 * @nr_irqs: The number of irqs to free
966 */
967void irq_domain_free_irqs_common(struct irq_domain *domain, unsigned int virq,
968 unsigned int nr_irqs)
969{
970 struct irq_data *irq_data;
971 int i;
972
973 for (i = 0; i < nr_irqs; i++) {
974 irq_data = irq_domain_get_irq_data(domain, virq + i);
975 if (irq_data)
976 irq_domain_reset_irq_data(irq_data);
977 }
978 irq_domain_free_irqs_parent(domain, virq, nr_irqs);
979}
980
981/**
982 * irq_domain_free_irqs_top - Clear handler and handler data, clear irqdata and free parent
983 * @domain: Interrupt domain to match
984 * @virq: IRQ number to start with
985 * @nr_irqs: The number of irqs to free
986 */
987void irq_domain_free_irqs_top(struct irq_domain *domain, unsigned int virq,
988 unsigned int nr_irqs)
989{
990 int i;
991
992 for (i = 0; i < nr_irqs; i++) {
993 irq_set_handler_data(virq + i, NULL);
994 irq_set_handler(virq + i, NULL);
995 }
996 irq_domain_free_irqs_common(domain, virq, nr_irqs);
997}
998
999static bool irq_domain_is_auto_recursive(struct irq_domain *domain)
1000{
1001 return domain->flags & IRQ_DOMAIN_FLAG_AUTO_RECURSIVE;
1002}
1003
1004static void irq_domain_free_irqs_recursive(struct irq_domain *domain,
1005 unsigned int irq_base,
1006 unsigned int nr_irqs)
1007{
1008 domain->ops->free(domain, irq_base, nr_irqs);
1009 if (irq_domain_is_auto_recursive(domain)) {
1010 BUG_ON(!domain->parent);
1011 irq_domain_free_irqs_recursive(domain->parent, irq_base,
1012 nr_irqs);
1013 }
1014}
1015
1016static int irq_domain_alloc_irqs_recursive(struct irq_domain *domain,
1017 unsigned int irq_base,
1018 unsigned int nr_irqs, void *arg)
1019{
1020 int ret = 0;
1021 struct irq_domain *parent = domain->parent;
1022 bool recursive = irq_domain_is_auto_recursive(domain);
1023
1024 BUG_ON(recursive && !parent);
1025 if (recursive)
1026 ret = irq_domain_alloc_irqs_recursive(parent, irq_base,
1027 nr_irqs, arg);
1028 if (ret >= 0)
1029 ret = domain->ops->alloc(domain, irq_base, nr_irqs, arg);
1030 if (ret < 0 && recursive)
1031 irq_domain_free_irqs_recursive(parent, irq_base, nr_irqs);
1032
1033 return ret;
1034}
1035
1036/**
1037 * __irq_domain_alloc_irqs - Allocate IRQs from domain
1038 * @domain: domain to allocate from
 1039 * @irq_base: allocate the specified IRQ number if irq_base >= 0
1040 * @nr_irqs: number of IRQs to allocate
1041 * @node: NUMA node id for memory allocation
1042 * @arg: domain specific argument
1043 * @realloc: IRQ descriptors have already been allocated if true
1044 *
 1045 * Allocate IRQ numbers and initialize all data structures to support
1046 * hierarchy IRQ domains.
1047 * Parameter @realloc is mainly to support legacy IRQs.
1048 * Returns error code or allocated IRQ number
1049 *
 1050 * The whole process to set up an IRQ has been split into two steps.
1051 * The first step, __irq_domain_alloc_irqs(), is to allocate IRQ
1052 * descriptor and required hardware resources. The second step,
 1053 * irq_domain_activate_irq(), is to program the hardware with preallocated
 1054 * resources. In this way, it's easier to roll back when failing to
1055 * allocate resources.
1056 */
1057int __irq_domain_alloc_irqs(struct irq_domain *domain, int irq_base,
1058 unsigned int nr_irqs, int node, void *arg,
1059 bool realloc)
1060{
1061 int i, ret, virq;
1062
1063 if (domain == NULL) {
1064 domain = irq_default_domain;
1065 if (WARN(!domain, "domain is NULL; cannot allocate IRQ\n"))
1066 return -EINVAL;
1067 }
1068
1069 if (!domain->ops->alloc) {
1070 pr_debug("domain->ops->alloc() is NULL\n");
1071 return -ENOSYS;
1072 }
1073
1074 if (realloc && irq_base >= 0) {
1075 virq = irq_base;
1076 } else {
1077 virq = irq_domain_alloc_descs(irq_base, nr_irqs, 0, node);
1078 if (virq < 0) {
1079 pr_debug("cannot allocate IRQ(base %d, count %d)\n",
1080 irq_base, nr_irqs);
1081 return virq;
1082 }
1083 }
1084
1085 if (irq_domain_alloc_irq_data(domain, virq, nr_irqs)) {
1086 pr_debug("cannot allocate memory for IRQ%d\n", virq);
1087 ret = -ENOMEM;
1088 goto out_free_desc;
1089 }
1090
1091 mutex_lock(&irq_domain_mutex);
1092 ret = irq_domain_alloc_irqs_recursive(domain, virq, nr_irqs, arg);
1093 if (ret < 0) {
1094 mutex_unlock(&irq_domain_mutex);
1095 goto out_free_irq_data;
1096 }
1097 for (i = 0; i < nr_irqs; i++)
1098 irq_domain_insert_irq(virq + i);
1099 mutex_unlock(&irq_domain_mutex);
1100
1101 return virq;
1102
1103out_free_irq_data:
1104 irq_domain_free_irq_data(virq, nr_irqs);
1105out_free_desc:
1106 irq_free_descs(virq, nr_irqs);
1107 return ret;
1108}
1109
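For illustration, a minimal sketch of how a caller might drive the two-step interface above; my_setup_irqs and the allocation argument passed in my_hw_data are hypothetical, and a real driver would normally let the IRQ core activate the interrupt when it is started up.

static int my_setup_irqs(struct irq_domain *my_domain, void *my_hw_data)
{
	int virq;

	/* Step one: allocate descriptors and hardware resources. */
	virq = __irq_domain_alloc_irqs(my_domain, -1, 2, NUMA_NO_NODE,
				       my_hw_data, false);
	if (virq < 0)
		return virq;

	/* Step two: program the hardware along the domain hierarchy. */
	irq_domain_activate_irq(irq_get_irq_data(virq));
	return virq;
}
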
1110/**
1111 * irq_domain_free_irqs - Free IRQ number and associated data structures
1112 * @virq: base IRQ number
1113 * @nr_irqs: number of IRQs to free
1114 */
1115void irq_domain_free_irqs(unsigned int virq, unsigned int nr_irqs)
1116{
1117 struct irq_data *data = irq_get_irq_data(virq);
1118 int i;
1119
1120 if (WARN(!data || !data->domain || !data->domain->ops->free,
1121 "NULL pointer, cannot free irq\n"))
1122 return;
1123
1124 mutex_lock(&irq_domain_mutex);
1125 for (i = 0; i < nr_irqs; i++)
1126 irq_domain_remove_irq(virq + i);
1127 irq_domain_free_irqs_recursive(data->domain, virq, nr_irqs);
1128 mutex_unlock(&irq_domain_mutex);
1129
1130 irq_domain_free_irq_data(virq, nr_irqs);
1131 irq_free_descs(virq, nr_irqs);
1132}
1133
1134/**
1135 * irq_domain_alloc_irqs_parent - Allocate interrupts from parent domain
1136 * @irq_base: Base IRQ number
1137 * @nr_irqs: Number of IRQs to allocate
1138 * @arg: Allocation data (arch/domain specific)
1139 *
1140 * Check whether the domain has been set up as auto-recursive. If not,
1141 * allocate through the parent domain.
1142 */
1143int irq_domain_alloc_irqs_parent(struct irq_domain *domain,
1144 unsigned int irq_base, unsigned int nr_irqs,
1145 void *arg)
1146{
1147 /* irq_domain_alloc_irqs_recursive() has called parent's alloc() */
1148 if (irq_domain_is_auto_recursive(domain))
1149 return 0;
1150
1151 domain = domain->parent;
1152 if (domain)
1153 return irq_domain_alloc_irqs_recursive(domain, irq_base,
1154 nr_irqs, arg);
1155 return -ENOSYS;
1156}
1157
1158/**
1159 * irq_domain_free_irqs_parent - Free interrupts from parent domain
1160 * @irq_base: Base IRQ number
1161 * @nr_irqs: Number of IRQs to free
1162 *
1163 * Check whether the domain has been set up as auto-recursive. If not,
1164 * free through the parent domain.
1165 */
1166void irq_domain_free_irqs_parent(struct irq_domain *domain,
1167 unsigned int irq_base, unsigned int nr_irqs)
1168{
1169 /* irq_domain_free_irqs_recursive() will call parent's free */
1170 if (!irq_domain_is_auto_recursive(domain) && domain->parent)
1171 irq_domain_free_irqs_recursive(domain->parent, irq_base,
1172 nr_irqs);
1173}
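A minimal sketch of the calling pattern these two helpers expect, mirroring what msi_domain_alloc() below does: a stacked domain's .alloc callback lets its parent allocate first, then installs its own per-interrupt data. my_chip, my_child_domain_alloc and the hwirq computation are hypothetical placeholders.

static struct irq_chip my_chip = {
	.name = "my-child",
	/* ... driver callbacks, e.g. irq_mask/irq_unmask ... */
};

static int my_child_domain_alloc(struct irq_domain *domain, unsigned int virq,
				 unsigned int nr_irqs, void *arg)
{
	int i, ret;

	/* Let the parent domain(s) allocate their resources first. */
	ret = irq_domain_alloc_irqs_parent(domain, virq, nr_irqs, arg);
	if (ret < 0)
		return ret;

	/* Then install this level's hwirq, chip and chip data. */
	for (i = 0; i < nr_irqs; i++)
		irq_domain_set_hwirq_and_chip(domain, virq + i,
					      i /* hwirq, normally from arg */,
					      &my_chip, NULL);
	return 0;
}
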
1174
1175/**
1176 * irq_domain_activate_irq - Call domain_ops->activate recursively to activate
1177 * interrupt
1178 * @irq_data: outermost irq_data associated with interrupt
1179 *
1180 * This is the second step of IRQ setup: it calls domain_ops->activate to
1181 * program the interrupt controllers so the interrupt can actually be delivered.
1182 */
1183void irq_domain_activate_irq(struct irq_data *irq_data)
1184{
1185 if (irq_data && irq_data->domain) {
1186 struct irq_domain *domain = irq_data->domain;
1187
1188 if (irq_data->parent_data)
1189 irq_domain_activate_irq(irq_data->parent_data);
1190 if (domain->ops->activate)
1191 domain->ops->activate(domain, irq_data);
1192 }
1193}
1194
1195/**
1196 * irq_domain_deactivate_irq - Call domain_ops->deactivate recursively to
1197 * deactivate interrupt
1198 * @irq_data: outermost irq_data associated with interrupt
1199 *
1200 * It calls domain_ops->deactivate to program interrupt controllers to disable
1201 * interrupt delivery.
1202 */
1203void irq_domain_deactivate_irq(struct irq_data *irq_data)
1204{
1205 if (irq_data && irq_data->domain) {
1206 struct irq_domain *domain = irq_data->domain;
1207
1208 if (domain->ops->deactivate)
1209 domain->ops->deactivate(domain, irq_data);
1210 if (irq_data->parent_data)
1211 irq_domain_deactivate_irq(irq_data->parent_data);
1212 }
1213}
1214
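A sketch of the kind of ops->activate callback the two functions above drive; struct my_route and my_hw_write_route stand in for whatever per-interrupt state a real driver preallocates in its .alloc callback.

struct my_route { u32 reg_val; };			/* hypothetical */

static void my_hw_write_route(struct my_route *rt)	/* hypothetical */
{
	/* write rt->reg_val to the controller's routing register */
}

static void my_domain_activate(struct irq_domain *domain,
			       struct irq_data *irq_data)
{
	struct my_route *rt = irq_data->chip_data;	/* set up in .alloc */

	/* Only touch the hardware now; .alloc already reserved rt. */
	my_hw_write_route(rt);
}
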
1215static void irq_domain_check_hierarchy(struct irq_domain *domain)
1216{
1217 /* Hierarchical irq_domains must implement the alloc() callback */
1218 if (domain->ops->alloc)
1219 domain->flags |= IRQ_DOMAIN_FLAG_HIERARCHY;
1220}
1221#else /* CONFIG_IRQ_DOMAIN_HIERARCHY */
1222/**
1223 * irq_domain_get_irq_data - Get irq_data associated with @virq and @domain
1224 * @domain: domain to match
1225 * @virq: IRQ number to get irq_data
1226 */
1227struct irq_data *irq_domain_get_irq_data(struct irq_domain *domain,
1228 unsigned int virq)
1229{
1230 struct irq_data *irq_data = irq_get_irq_data(virq);
1231
1232 return (irq_data && irq_data->domain == domain) ? irq_data : NULL;
1233}
1234
1235static void irq_domain_check_hierarchy(struct irq_domain *domain)
1236{
1237}
1238#endif /* CONFIG_IRQ_DOMAIN_HIERARCHY */
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 0a9104b4608b..80692373abd6 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -183,6 +183,7 @@ int irq_do_set_affinity(struct irq_data *data, const struct cpumask *mask,
183 ret = chip->irq_set_affinity(data, mask, force); 183 ret = chip->irq_set_affinity(data, mask, force);
184 switch (ret) { 184 switch (ret) {
185 case IRQ_SET_MASK_OK: 185 case IRQ_SET_MASK_OK:
186 case IRQ_SET_MASK_OK_DONE:
186 cpumask_copy(data->affinity, mask); 187 cpumask_copy(data->affinity, mask);
187 case IRQ_SET_MASK_OK_NOCOPY: 188 case IRQ_SET_MASK_OK_NOCOPY:
188 irq_set_thread_affinity(desc); 189 irq_set_thread_affinity(desc);
@@ -600,6 +601,7 @@ int __irq_set_trigger(struct irq_desc *desc, unsigned int irq,
600 601
601 switch (ret) { 602 switch (ret) {
602 case IRQ_SET_MASK_OK: 603 case IRQ_SET_MASK_OK:
604 case IRQ_SET_MASK_OK_DONE:
603 irqd_clear(&desc->irq_data, IRQD_TRIGGER_MASK); 605 irqd_clear(&desc->irq_data, IRQD_TRIGGER_MASK);
604 irqd_set(&desc->irq_data, flags); 606 irqd_set(&desc->irq_data, flags);
605 607
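As a sketch of the new return value's intent: a parent chip in a stacked hierarchy can return IRQ_SET_MASK_OK_DONE to signal that it fully handled the affinity change, so callers further out (such as msi_domain_set_affinity() below) skip their own updates. my_parent_set_affinity is hypothetical.

static int my_parent_set_affinity(struct irq_data *data,
				  const struct cpumask *mask, bool force)
{
	/* Program the hardware for the new target CPUs here. */

	/* Core treats this like IRQ_SET_MASK_OK; stacked callers stop here. */
	return IRQ_SET_MASK_OK_DONE;
}
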
diff --git a/kernel/irq/msi.c b/kernel/irq/msi.c
new file mode 100644
index 000000000000..3e18163f336f
--- /dev/null
+++ b/kernel/irq/msi.c
@@ -0,0 +1,330 @@
1/*
2 * linux/kernel/irq/msi.c
3 *
4 * Copyright (C) 2014 Intel Corp.
5 * Author: Jiang Liu <jiang.liu@linux.intel.com>
6 *
7 * This file is licensed under GPLv2.
8 *
9 * This file contains common code to support Message Signaled Interrupts for
10 * PCI-compatible and non-PCI-compatible devices.
11 */
12#include <linux/types.h>
13#include <linux/device.h>
14#include <linux/irq.h>
15#include <linux/irqdomain.h>
16#include <linux/msi.h>
17
18/* Temporary solution for building, will be removed later */
19#include <linux/pci.h>
20
21void __get_cached_msi_msg(struct msi_desc *entry, struct msi_msg *msg)
22{
23 *msg = entry->msg;
24}
25
26void get_cached_msi_msg(unsigned int irq, struct msi_msg *msg)
27{
28 struct msi_desc *entry = irq_get_msi_desc(irq);
29
30 __get_cached_msi_msg(entry, msg);
31}
32EXPORT_SYMBOL_GPL(get_cached_msi_msg);
33
34#ifdef CONFIG_GENERIC_MSI_IRQ_DOMAIN
35static inline void irq_chip_write_msi_msg(struct irq_data *data,
36 struct msi_msg *msg)
37{
38 data->chip->irq_write_msi_msg(data, msg);
39}
40
41/**
42 * msi_domain_set_affinity - Generic affinity setter function for MSI domains
43 * @irq_data: The irq data associated with the interrupt
44 * @mask: The affinity mask to set
45 * @force: Flag to enforce setting (disable online checks)
46 *
47 * Intended to be used by MSI interrupt controllers which are
48 * implemented with hierarchical domains.
49 */
50int msi_domain_set_affinity(struct irq_data *irq_data,
51 const struct cpumask *mask, bool force)
52{
53 struct irq_data *parent = irq_data->parent_data;
54 struct msi_msg msg;
55 int ret;
56
57 ret = parent->chip->irq_set_affinity(parent, mask, force);
58 if (ret >= 0 && ret != IRQ_SET_MASK_OK_DONE) {
59 BUG_ON(irq_chip_compose_msi_msg(irq_data, &msg));
60 irq_chip_write_msi_msg(irq_data, &msg);
61 }
62
63 return ret;
64}
65
66static void msi_domain_activate(struct irq_domain *domain,
67 struct irq_data *irq_data)
68{
69 struct msi_msg msg;
70
71 BUG_ON(irq_chip_compose_msi_msg(irq_data, &msg));
72 irq_chip_write_msi_msg(irq_data, &msg);
73}
74
75static void msi_domain_deactivate(struct irq_domain *domain,
76 struct irq_data *irq_data)
77{
78 struct msi_msg msg;
79
80 memset(&msg, 0, sizeof(msg));
81 irq_chip_write_msi_msg(irq_data, &msg);
82}
83
84static int msi_domain_alloc(struct irq_domain *domain, unsigned int virq,
85 unsigned int nr_irqs, void *arg)
86{
87 struct msi_domain_info *info = domain->host_data;
88 struct msi_domain_ops *ops = info->ops;
89 irq_hw_number_t hwirq = ops->get_hwirq(info, arg);
90 int i, ret;
91
92 if (irq_find_mapping(domain, hwirq) > 0)
93 return -EEXIST;
94
95 ret = irq_domain_alloc_irqs_parent(domain, virq, nr_irqs, arg);
96 if (ret < 0)
97 return ret;
98
99 for (i = 0; i < nr_irqs; i++) {
100 ret = ops->msi_init(domain, info, virq + i, hwirq + i, arg);
101 if (ret < 0) {
102 if (ops->msi_free) {
103 for (i--; i >= 0; i--)
104 ops->msi_free(domain, info, virq + i);
105 }
106 irq_domain_free_irqs_top(domain, virq, nr_irqs);
107 return ret;
108 }
109 }
110
111 return 0;
112}
113
114static void msi_domain_free(struct irq_domain *domain, unsigned int virq,
115 unsigned int nr_irqs)
116{
117 struct msi_domain_info *info = domain->host_data;
118 int i;
119
120 if (info->ops->msi_free) {
121 for (i = 0; i < nr_irqs; i++)
122 info->ops->msi_free(domain, info, virq + i);
123 }
124 irq_domain_free_irqs_top(domain, virq, nr_irqs);
125}
126
127static struct irq_domain_ops msi_domain_ops = {
128 .alloc = msi_domain_alloc,
129 .free = msi_domain_free,
130 .activate = msi_domain_activate,
131 .deactivate = msi_domain_deactivate,
132};
133
134#ifdef GENERIC_MSI_DOMAIN_OPS
135static irq_hw_number_t msi_domain_ops_get_hwirq(struct msi_domain_info *info,
136 msi_alloc_info_t *arg)
137{
138 return arg->hwirq;
139}
140
141static int msi_domain_ops_prepare(struct irq_domain *domain, struct device *dev,
142 int nvec, msi_alloc_info_t *arg)
143{
144 memset(arg, 0, sizeof(*arg));
145 return 0;
146}
147
148static void msi_domain_ops_set_desc(msi_alloc_info_t *arg,
149 struct msi_desc *desc)
150{
151 arg->desc = desc;
152}
153#else
154#define msi_domain_ops_get_hwirq NULL
155#define msi_domain_ops_prepare NULL
156#define msi_domain_ops_set_desc NULL
157#endif /* !GENERIC_MSI_DOMAIN_OPS */
158
159static int msi_domain_ops_init(struct irq_domain *domain,
160 struct msi_domain_info *info,
161 unsigned int virq, irq_hw_number_t hwirq,
162 msi_alloc_info_t *arg)
163{
164 irq_domain_set_hwirq_and_chip(domain, virq, hwirq, info->chip,
165 info->chip_data);
166 if (info->handler && info->handler_name) {
167 __irq_set_handler(virq, info->handler, 0, info->handler_name);
168 if (info->handler_data)
169 irq_set_handler_data(virq, info->handler_data);
170 }
171 return 0;
172}
173
174static int msi_domain_ops_check(struct irq_domain *domain,
175 struct msi_domain_info *info,
176 struct device *dev)
177{
178 return 0;
179}
180
181static struct msi_domain_ops msi_domain_ops_default = {
182 .get_hwirq = msi_domain_ops_get_hwirq,
183 .msi_init = msi_domain_ops_init,
184 .msi_check = msi_domain_ops_check,
185 .msi_prepare = msi_domain_ops_prepare,
186 .set_desc = msi_domain_ops_set_desc,
187};
188
189static void msi_domain_update_dom_ops(struct msi_domain_info *info)
190{
191 struct msi_domain_ops *ops = info->ops;
192
193 if (ops == NULL) {
194 info->ops = &msi_domain_ops_default;
195 return;
196 }
197
198 if (ops->get_hwirq == NULL)
199 ops->get_hwirq = msi_domain_ops_default.get_hwirq;
200 if (ops->msi_init == NULL)
201 ops->msi_init = msi_domain_ops_default.msi_init;
202 if (ops->msi_check == NULL)
203 ops->msi_check = msi_domain_ops_default.msi_check;
204 if (ops->msi_prepare == NULL)
205 ops->msi_prepare = msi_domain_ops_default.msi_prepare;
206 if (ops->set_desc == NULL)
207 ops->set_desc = msi_domain_ops_default.set_desc;
208}
209
210static void msi_domain_update_chip_ops(struct msi_domain_info *info)
211{
212 struct irq_chip *chip = info->chip;
213
214 BUG_ON(!chip);
215 if (!chip->irq_mask)
216 chip->irq_mask = pci_msi_mask_irq;
217 if (!chip->irq_unmask)
218 chip->irq_unmask = pci_msi_unmask_irq;
219 if (!chip->irq_set_affinity)
220 chip->irq_set_affinity = msi_domain_set_affinity;
221}
222
223/**
224 * msi_create_irq_domain - Create a MSI interrupt domain
225 * @node: Optional device-tree node of the interrupt controller
226 * @info: MSI domain info
227 * @parent: Parent irq domain
228 */
229struct irq_domain *msi_create_irq_domain(struct device_node *node,
230 struct msi_domain_info *info,
231 struct irq_domain *parent)
232{
233 if (info->flags & MSI_FLAG_USE_DEF_DOM_OPS)
234 msi_domain_update_dom_ops(info);
235 if (info->flags & MSI_FLAG_USE_DEF_CHIP_OPS)
236 msi_domain_update_chip_ops(info);
237
238 return irq_domain_add_hierarchy(parent, 0, 0, node, &msi_domain_ops,
239 info);
240}
241
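A sketch of how an interrupt controller driver might instantiate an MSI domain on top of its parent domain, relying on the default domain and chip ops filled in above; my_msi_chip, my_compose_msi_msg, my_write_msi_msg and the wrapper function are hypothetical.

static struct irq_chip my_msi_chip = {
	.name			= "my-msi",
	.irq_compose_msi_msg	= my_compose_msi_msg,	/* hypothetical callback */
	.irq_write_msi_msg	= my_write_msi_msg,	/* hypothetical callback */
};

static struct msi_domain_info my_msi_info = {
	.flags	= MSI_FLAG_USE_DEF_DOM_OPS | MSI_FLAG_USE_DEF_CHIP_OPS,
	.chip	= &my_msi_chip,
};

static struct irq_domain *my_create_msi_domain(struct device_node *node,
					       struct irq_domain *parent)
{
	/* The defaults above fill in the missing domain and chip callbacks. */
	return msi_create_irq_domain(node, &my_msi_info, parent);
}
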
242/**
243 * msi_domain_alloc_irqs - Allocate interrupts from a MSI interrupt domain
244 * @domain: The domain to allocate from
245 * @dev: Pointer to device struct of the device for which the interrupts
246 * are allocated
247 * @nvec: The number of interrupts to allocate
248 *
249 * Returns 0 on success or an error code.
250 */
251int msi_domain_alloc_irqs(struct irq_domain *domain, struct device *dev,
252 int nvec)
253{
254 struct msi_domain_info *info = domain->host_data;
255 struct msi_domain_ops *ops = info->ops;
256 msi_alloc_info_t arg;
257 struct msi_desc *desc;
258 int i, ret, virq = -1;
259
260 ret = ops->msi_check(domain, info, dev);
261 if (ret == 0)
262 ret = ops->msi_prepare(domain, dev, nvec, &arg);
263 if (ret)
264 return ret;
265
266 for_each_msi_entry(desc, dev) {
267 ops->set_desc(&arg, desc);
268 if (info->flags & MSI_FLAG_IDENTITY_MAP)
269 virq = (int)ops->get_hwirq(info, &arg);
270 else
271 virq = -1;
272
273 virq = __irq_domain_alloc_irqs(domain, virq, desc->nvec_used,
274 dev_to_node(dev), &arg, false);
275 if (virq < 0) {
276 ret = -ENOSPC;
277 if (ops->handle_error)
278 ret = ops->handle_error(domain, desc, ret);
279 if (ops->msi_finish)
280 ops->msi_finish(&arg, ret);
281 return ret;
282 }
283
284 for (i = 0; i < desc->nvec_used; i++)
285 irq_set_msi_desc_off(virq, i, desc);
286 }
287
288 if (ops->msi_finish)
289 ops->msi_finish(&arg, 0);
290
291 for_each_msi_entry(desc, dev) {
292 if (desc->nvec_used == 1)
293 dev_dbg(dev, "irq %d for MSI\n", virq);
294 else
295 dev_dbg(dev, "irq [%d-%d] for MSI\n",
296 virq, virq + desc->nvec_used - 1);
297 }
298
299 return 0;
300}
301
302/**
303 * msi_domain_free_irqs - Free interrupts from a MSI interrupt @domain associated with @dev
304 * @domain: The domain managing the interrupts
305 * @dev: Pointer to device struct of the device for which the interrupts
306 * are freed
307 */
308void msi_domain_free_irqs(struct irq_domain *domain, struct device *dev)
309{
310 struct msi_desc *desc;
311
312 for_each_msi_entry(desc, dev) {
313 irq_domain_free_irqs(desc->irq, desc->nvec_used);
314 desc->irq = 0;
315 }
316}
317
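The device-side pairing of the two calls above, as a sketch; it assumes the bus code has already populated the device's MSI descriptor list, and my_enable_device_msi is hypothetical.

static int my_enable_device_msi(struct irq_domain *msi_domain,
				struct device *dev, int nvec)
{
	int ret;

	ret = msi_domain_alloc_irqs(msi_domain, dev, nvec);
	if (ret)
		return ret;

	/* ... request_irq() on each msi_desc's ->irq, use the device ... */

	msi_domain_free_irqs(msi_domain, dev);
	return 0;
}
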
318/**
319 * msi_get_domain_info - Get the MSI interrupt domain info for @domain
320 * @domain: The interrupt domain to retrieve data from
321 *
322 * Returns the pointer to the msi_domain_info stored in
323 * @domain->host_data.
324 */
325struct msi_domain_info *msi_get_domain_info(struct irq_domain *domain)
326{
327 return (struct msi_domain_info *)domain->host_data;
328}
329
330#endif /* CONFIG_GENERIC_MSI_IRQ_DOMAIN */
diff --git a/kernel/kmod.c b/kernel/kmod.c
index 8637e041a247..2777f40a9c7b 100644
--- a/kernel/kmod.c
+++ b/kernel/kmod.c
@@ -47,13 +47,6 @@ extern int max_threads;
47 47
48static struct workqueue_struct *khelper_wq; 48static struct workqueue_struct *khelper_wq;
49 49
50/*
51 * kmod_thread_locker is used for deadlock avoidance. There is no explicit
52 * locking to protect this global - it is private to the singleton khelper
53 * thread and should only ever be modified by that thread.
54 */
55static const struct task_struct *kmod_thread_locker;
56
57#define CAP_BSET (void *)1 50#define CAP_BSET (void *)1
58#define CAP_PI (void *)2 51#define CAP_PI (void *)2
59 52
@@ -196,6 +189,27 @@ int __request_module(bool wait, const char *fmt, ...)
196EXPORT_SYMBOL(__request_module); 189EXPORT_SYMBOL(__request_module);
197#endif /* CONFIG_MODULES */ 190#endif /* CONFIG_MODULES */
198 191
192static void call_usermodehelper_freeinfo(struct subprocess_info *info)
193{
194 if (info->cleanup)
195 (*info->cleanup)(info);
196 kfree(info);
197}
198
199static void umh_complete(struct subprocess_info *sub_info)
200{
201 struct completion *comp = xchg(&sub_info->complete, NULL);
202 /*
203 * See call_usermodehelper_exec(). If xchg() returns NULL
204 * we own sub_info, the UMH_KILLABLE caller has gone away
205 * or the caller used UMH_NO_WAIT.
206 */
207 if (comp)
208 complete(comp);
209 else
210 call_usermodehelper_freeinfo(sub_info);
211}
212
199/* 213/*
200 * This is the task which runs the usermode application 214 * This is the task which runs the usermode application
201 */ 215 */
@@ -221,7 +235,7 @@ static int ____call_usermodehelper(void *data)
221 retval = -ENOMEM; 235 retval = -ENOMEM;
222 new = prepare_kernel_cred(current); 236 new = prepare_kernel_cred(current);
223 if (!new) 237 if (!new)
224 goto fail; 238 goto out;
225 239
226 spin_lock(&umh_sysctl_lock); 240 spin_lock(&umh_sysctl_lock);
227 new->cap_bset = cap_intersect(usermodehelper_bset, new->cap_bset); 241 new->cap_bset = cap_intersect(usermodehelper_bset, new->cap_bset);
@@ -233,7 +247,7 @@ static int ____call_usermodehelper(void *data)
233 retval = sub_info->init(sub_info, new); 247 retval = sub_info->init(sub_info, new);
234 if (retval) { 248 if (retval) {
235 abort_creds(new); 249 abort_creds(new);
236 goto fail; 250 goto out;
237 } 251 }
238 } 252 }
239 253
@@ -242,42 +256,16 @@ static int ____call_usermodehelper(void *data)
242 retval = do_execve(getname_kernel(sub_info->path), 256 retval = do_execve(getname_kernel(sub_info->path),
243 (const char __user *const __user *)sub_info->argv, 257 (const char __user *const __user *)sub_info->argv,
244 (const char __user *const __user *)sub_info->envp); 258 (const char __user *const __user *)sub_info->envp);
259out:
260 sub_info->retval = retval;
261 /* wait_for_helper() will call umh_complete() if UMH_WAIT_PROC. */
262 if (!(sub_info->wait & UMH_WAIT_PROC))
263 umh_complete(sub_info);
245 if (!retval) 264 if (!retval)
246 return 0; 265 return 0;
247
248 /* Exec failed? */
249fail:
250 sub_info->retval = retval;
251 do_exit(0); 266 do_exit(0);
252} 267}
253 268
254static int call_helper(void *data)
255{
256 /* Worker thread started blocking khelper thread. */
257 kmod_thread_locker = current;
258 return ____call_usermodehelper(data);
259}
260
261static void call_usermodehelper_freeinfo(struct subprocess_info *info)
262{
263 if (info->cleanup)
264 (*info->cleanup)(info);
265 kfree(info);
266}
267
268static void umh_complete(struct subprocess_info *sub_info)
269{
270 struct completion *comp = xchg(&sub_info->complete, NULL);
271 /*
272 * See call_usermodehelper_exec(). If xchg() returns NULL
273 * we own sub_info, the UMH_KILLABLE caller has gone away.
274 */
275 if (comp)
276 complete(comp);
277 else
278 call_usermodehelper_freeinfo(sub_info);
279}
280
281/* Keventd can't block, but this (a child) can. */ 269/* Keventd can't block, but this (a child) can. */
282static int wait_for_helper(void *data) 270static int wait_for_helper(void *data)
283{ 271{
@@ -320,34 +308,17 @@ static void __call_usermodehelper(struct work_struct *work)
320{ 308{
321 struct subprocess_info *sub_info = 309 struct subprocess_info *sub_info =
322 container_of(work, struct subprocess_info, work); 310 container_of(work, struct subprocess_info, work);
323 int wait = sub_info->wait & ~UMH_KILLABLE;
324 pid_t pid; 311 pid_t pid;
325 312
326 /* CLONE_VFORK: wait until the usermode helper has execve'd 313 if (sub_info->wait & UMH_WAIT_PROC)
327 * successfully We need the data structures to stay around
328 * until that is done. */
329 if (wait == UMH_WAIT_PROC)
330 pid = kernel_thread(wait_for_helper, sub_info, 314 pid = kernel_thread(wait_for_helper, sub_info,
331 CLONE_FS | CLONE_FILES | SIGCHLD); 315 CLONE_FS | CLONE_FILES | SIGCHLD);
332 else { 316 else
333 pid = kernel_thread(call_helper, sub_info, 317 pid = kernel_thread(____call_usermodehelper, sub_info,
334 CLONE_VFORK | SIGCHLD); 318 SIGCHLD);
335 /* Worker thread stopped blocking khelper thread. */
336 kmod_thread_locker = NULL;
337 }
338
339 switch (wait) {
340 case UMH_NO_WAIT:
341 call_usermodehelper_freeinfo(sub_info);
342 break;
343 319
344 case UMH_WAIT_PROC: 320 if (pid < 0) {
345 if (pid > 0) 321 sub_info->retval = pid;
346 break;
347 /* FALLTHROUGH */
348 case UMH_WAIT_EXEC:
349 if (pid < 0)
350 sub_info->retval = pid;
351 umh_complete(sub_info); 322 umh_complete(sub_info);
352 } 323 }
353} 324}
@@ -578,17 +549,11 @@ int call_usermodehelper_exec(struct subprocess_info *sub_info, int wait)
578 goto out; 549 goto out;
579 } 550 }
580 /* 551 /*
581 * Worker thread must not wait for khelper thread at below 552 * Set the completion pointer only if there is a waiter.
582 * wait_for_completion() if the thread was created with CLONE_VFORK 553 * This makes it possible to use umh_complete to free
583 * flag, for khelper thread is already waiting for the thread at 554 * the data structure in case of UMH_NO_WAIT.
584 * wait_for_completion() in do_fork().
585 */ 555 */
586 if (wait != UMH_NO_WAIT && current == kmod_thread_locker) { 556 sub_info->complete = (wait == UMH_NO_WAIT) ? NULL : &done;
587 retval = -EBUSY;
588 goto out;
589 }
590
591 sub_info->complete = &done;
592 sub_info->wait = wait; 557 sub_info->wait = wait;
593 558
594 queue_work(khelper_wq, &sub_info->work); 559 queue_work(khelper_wq, &sub_info->work);
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index 317eb8ad28dd..06f58309fed2 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -915,7 +915,7 @@ static struct kprobe *alloc_aggr_kprobe(struct kprobe *p)
915#ifdef CONFIG_KPROBES_ON_FTRACE 915#ifdef CONFIG_KPROBES_ON_FTRACE
916static struct ftrace_ops kprobe_ftrace_ops __read_mostly = { 916static struct ftrace_ops kprobe_ftrace_ops __read_mostly = {
917 .func = kprobe_ftrace_handler, 917 .func = kprobe_ftrace_handler,
918 .flags = FTRACE_OPS_FL_SAVE_REGS, 918 .flags = FTRACE_OPS_FL_SAVE_REGS | FTRACE_OPS_FL_IPMODIFY,
919}; 919};
920static int kprobe_ftrace_enabled; 920static int kprobe_ftrace_enabled;
921 921
diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c
index dadbf88c22c4..454195194d4a 100644
--- a/kernel/locking/mutex.c
+++ b/kernel/locking/mutex.c
@@ -378,8 +378,14 @@ done:
378 * reschedule now, before we try-lock the mutex. This avoids getting 378 * reschedule now, before we try-lock the mutex. This avoids getting
379 * scheduled out right after we obtained the mutex. 379 * scheduled out right after we obtained the mutex.
380 */ 380 */
381 if (need_resched()) 381 if (need_resched()) {
382 /*
383 * We _should_ have TASK_RUNNING here, but just in case
384 * we do not, make it so, otherwise we might get stuck.
385 */
386 __set_current_state(TASK_RUNNING);
382 schedule_preempt_disabled(); 387 schedule_preempt_disabled();
388 }
383 389
384 return false; 390 return false;
385} 391}
diff --git a/kernel/module.c b/kernel/module.c
index 88cec1ddb1e3..e52a8739361a 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -3097,6 +3097,32 @@ static int may_init_module(void)
3097} 3097}
3098 3098
3099/* 3099/*
3100 * Can't use wait_event_interruptible() because our condition
3101 * 'finished_loading()' contains a blocking primitive itself (mutex_lock).
3102 */
3103static int wait_finished_loading(struct module *mod)
3104{
3105 DEFINE_WAIT_FUNC(wait, woken_wake_function);
3106 int ret = 0;
3107
3108 add_wait_queue(&module_wq, &wait);
3109 for (;;) {
3110 if (finished_loading(mod->name))
3111 break;
3112
3113 if (signal_pending(current)) {
3114 ret = -ERESTARTSYS;
3115 break;
3116 }
3117
3118 wait_woken(&wait, TASK_INTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT);
3119 }
3120 remove_wait_queue(&module_wq, &wait);
3121
3122 return ret;
3123}
3124
3125/*
3100 * We try to place it in the list now to make sure it's unique before 3126 * We try to place it in the list now to make sure it's unique before
3101 * we dedicate too many resources. In particular, temporary percpu 3127 * we dedicate too many resources. In particular, temporary percpu
3102 * memory exhaustion. 3128 * memory exhaustion.
@@ -3116,8 +3142,8 @@ again:
3116 || old->state == MODULE_STATE_UNFORMED) { 3142 || old->state == MODULE_STATE_UNFORMED) {
3117 /* Wait in case it fails to load. */ 3143 /* Wait in case it fails to load. */
3118 mutex_unlock(&module_mutex); 3144 mutex_unlock(&module_mutex);
3119 err = wait_event_interruptible(module_wq, 3145
3120 finished_loading(mod->name)); 3146 err = wait_finished_loading(mod);
3121 if (err) 3147 if (err)
3122 goto out_unlocked; 3148 goto out_unlocked;
3123 goto again; 3149 goto again;
diff --git a/kernel/panic.c b/kernel/panic.c
index d09dc5c32c67..4d8d6f906dec 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -33,6 +33,7 @@ static int pause_on_oops;
33static int pause_on_oops_flag; 33static int pause_on_oops_flag;
34static DEFINE_SPINLOCK(pause_on_oops_lock); 34static DEFINE_SPINLOCK(pause_on_oops_lock);
35static bool crash_kexec_post_notifiers; 35static bool crash_kexec_post_notifiers;
36int panic_on_warn __read_mostly;
36 37
37int panic_timeout = CONFIG_PANIC_TIMEOUT; 38int panic_timeout = CONFIG_PANIC_TIMEOUT;
38EXPORT_SYMBOL_GPL(panic_timeout); 39EXPORT_SYMBOL_GPL(panic_timeout);
@@ -244,6 +245,7 @@ static const struct tnt tnts[] = {
244 * 'I' - Working around severe firmware bug. 245 * 'I' - Working around severe firmware bug.
245 * 'O' - Out-of-tree module has been loaded. 246 * 'O' - Out-of-tree module has been loaded.
246 * 'E' - Unsigned module has been loaded. 247 * 'E' - Unsigned module has been loaded.
248 * 'L' - A soft lockup has previously occurred.
247 * 249 *
248 * The string is overwritten by the next call to print_tainted(). 250 * The string is overwritten by the next call to print_tainted().
249 */ 251 */
@@ -427,6 +429,17 @@ static void warn_slowpath_common(const char *file, int line, void *caller,
427 if (args) 429 if (args)
428 vprintk(args->fmt, args->args); 430 vprintk(args->fmt, args->args);
429 431
432 if (panic_on_warn) {
433 /*
434 * This thread may hit another WARN() in the panic path.
435 * Resetting this prevents additional WARN() from panicking the
436 * system on this thread. Other threads are blocked by the
437 * panic_mutex in panic().
438 */
439 panic_on_warn = 0;
440 panic("panic_on_warn set ...\n");
441 }
442
430 print_modules(); 443 print_modules();
431 dump_stack(); 444 dump_stack();
432 print_oops_end_marker(); 445 print_oops_end_marker();
@@ -484,6 +497,7 @@ EXPORT_SYMBOL(__stack_chk_fail);
484 497
485core_param(panic, panic_timeout, int, 0644); 498core_param(panic, panic_timeout, int, 0644);
486core_param(pause_on_oops, pause_on_oops, int, 0644); 499core_param(pause_on_oops, pause_on_oops, int, 0644);
500core_param(panic_on_warn, panic_on_warn, int, 0644);
487 501
488static int __init setup_crash_kexec_post_notifiers(char *s) 502static int __init setup_crash_kexec_post_notifiers(char *s)
489{ 503{
diff --git a/kernel/pid.c b/kernel/pid.c
index 9b9a26698144..82430c858d69 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -341,6 +341,8 @@ out:
341 341
342out_unlock: 342out_unlock:
343 spin_unlock_irq(&pidmap_lock); 343 spin_unlock_irq(&pidmap_lock);
344 put_pid_ns(ns);
345
344out_free: 346out_free:
345 while (++i <= ns->level) 347 while (++i <= ns->level)
346 free_pidmap(pid->numbers + i); 348 free_pidmap(pid->numbers + i);
diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c
index db95d8eb761b..bc6d6a89b6e6 100644
--- a/kernel/pid_namespace.c
+++ b/kernel/pid_namespace.c
@@ -190,7 +190,11 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns)
190 /* Don't allow any more processes into the pid namespace */ 190 /* Don't allow any more processes into the pid namespace */
191 disable_pid_allocation(pid_ns); 191 disable_pid_allocation(pid_ns);
192 192
193 /* Ignore SIGCHLD causing any terminated children to autoreap */ 193 /*
194 * Ignore SIGCHLD causing any terminated children to autoreap.
195 * This speeds up the namespace shutdown, plus see the comment
196 * below.
197 */
194 spin_lock_irq(&me->sighand->siglock); 198 spin_lock_irq(&me->sighand->siglock);
195 me->sighand->action[SIGCHLD - 1].sa.sa_handler = SIG_IGN; 199 me->sighand->action[SIGCHLD - 1].sa.sa_handler = SIG_IGN;
196 spin_unlock_irq(&me->sighand->siglock); 200 spin_unlock_irq(&me->sighand->siglock);
@@ -223,15 +227,31 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns)
223 } 227 }
224 read_unlock(&tasklist_lock); 228 read_unlock(&tasklist_lock);
225 229
226 /* Firstly reap the EXIT_ZOMBIE children we may have. */ 230 /*
231 * Reap the EXIT_ZOMBIE children we had before we ignored SIGCHLD.
232 * sys_wait4() will also block until our children traced from the
233 * parent namespace are detached and become EXIT_DEAD.
234 */
227 do { 235 do {
228 clear_thread_flag(TIF_SIGPENDING); 236 clear_thread_flag(TIF_SIGPENDING);
229 rc = sys_wait4(-1, NULL, __WALL, NULL); 237 rc = sys_wait4(-1, NULL, __WALL, NULL);
230 } while (rc != -ECHILD); 238 } while (rc != -ECHILD);
231 239
232 /* 240 /*
233 * sys_wait4() above can't reap the TASK_DEAD children. 241 * sys_wait4() above can't reap the EXIT_DEAD children but we do not
234 * Make sure they all go away, see free_pid(). 242 * really care, we could reparent them to the global init. We could
243 * exit and reap ->child_reaper even if it is not the last thread in
244 * this pid_ns: free_pid(nr_hashed == 0) calls proc_cleanup_work(), and
245 * the pid_ns cannot go away until proc_kill_sb() drops the reference.
246 *
247 * But this ns can also have other tasks injected by setns()+fork().
248 * Again, ignoring the user-visible semantics, we do not really need
249 * to wait until they are all reaped, but they can be reparented to
250 * us and thus we need to ensure that pid->child_reaper stays valid
251 * until they all go away. See free_pid()->wake_up_process().
252 *
253 * We rely on SIGCHLD being ignored; an injected zombie must be
254 * autoreaped if reparented.
235 */ 255 */
236 for (;;) { 256 for (;;) {
237 set_current_state(TASK_UNINTERRUPTIBLE); 257 set_current_state(TASK_UNINTERRUPTIBLE);
diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig
index bbef57f5bdfd..6e7708c2c21f 100644
--- a/kernel/power/Kconfig
+++ b/kernel/power/Kconfig
@@ -94,6 +94,7 @@ config PM_STD_PARTITION
94config PM_SLEEP 94config PM_SLEEP
95 def_bool y 95 def_bool y
96 depends on SUSPEND || HIBERNATE_CALLBACKS 96 depends on SUSPEND || HIBERNATE_CALLBACKS
97 select PM_RUNTIME
97 98
98config PM_SLEEP_SMP 99config PM_SLEEP_SMP
99 def_bool y 100 def_bool y
@@ -131,7 +132,6 @@ config PM_WAKELOCKS_GC
131 132
132config PM_RUNTIME 133config PM_RUNTIME
133 bool "Run-time PM core functionality" 134 bool "Run-time PM core functionality"
134 depends on !IA64_HP_SIM
135 ---help--- 135 ---help---
136 Enable functionality allowing I/O devices to be put into energy-saving 136 Enable functionality allowing I/O devices to be put into energy-saving
137 (low power) states at run time (or autosuspended) after a specified 137 (low power) states at run time (or autosuspended) after a specified
@@ -298,14 +298,9 @@ config PM_GENERIC_DOMAINS_SLEEP
298 def_bool y 298 def_bool y
299 depends on PM_SLEEP && PM_GENERIC_DOMAINS 299 depends on PM_SLEEP && PM_GENERIC_DOMAINS
300 300
301config PM_GENERIC_DOMAINS_RUNTIME
302 def_bool y
303 depends on PM_RUNTIME && PM_GENERIC_DOMAINS
304
305config PM_GENERIC_DOMAINS_OF 301config PM_GENERIC_DOMAINS_OF
306 def_bool y 302 def_bool y
307 depends on PM_GENERIC_DOMAINS && OF 303 depends on PM_GENERIC_DOMAINS && OF
308 304
309config CPU_PM 305config CPU_PM
310 bool 306 bool
311 depends on SUSPEND || CPU_IDLE
diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c
index a9dfa79b6bab..2329daae5255 100644
--- a/kernel/power/hibernate.c
+++ b/kernel/power/hibernate.c
@@ -28,6 +28,7 @@
28#include <linux/syscore_ops.h> 28#include <linux/syscore_ops.h>
29#include <linux/ctype.h> 29#include <linux/ctype.h>
30#include <linux/genhd.h> 30#include <linux/genhd.h>
31#include <linux/ktime.h>
31#include <trace/events/power.h> 32#include <trace/events/power.h>
32 33
33#include "power.h" 34#include "power.h"
@@ -232,20 +233,17 @@ static void platform_recover(int platform_mode)
232 * @nr_pages: Number of memory pages processed between @start and @stop. 233 * @nr_pages: Number of memory pages processed between @start and @stop.
233 * @msg: Additional diagnostic message to print. 234 * @msg: Additional diagnostic message to print.
234 */ 235 */
235void swsusp_show_speed(struct timeval *start, struct timeval *stop, 236void swsusp_show_speed(ktime_t start, ktime_t stop,
236 unsigned nr_pages, char *msg) 237 unsigned nr_pages, char *msg)
237{ 238{
239 ktime_t diff;
238 u64 elapsed_centisecs64; 240 u64 elapsed_centisecs64;
239 unsigned int centisecs; 241 unsigned int centisecs;
240 unsigned int k; 242 unsigned int k;
241 unsigned int kps; 243 unsigned int kps;
242 244
243 elapsed_centisecs64 = timeval_to_ns(stop) - timeval_to_ns(start); 245 diff = ktime_sub(stop, start);
244 /* 246 elapsed_centisecs64 = ktime_divns(diff, 10*NSEC_PER_MSEC);
245 * If "(s64)elapsed_centisecs64 < 0", it will print long elapsed time,
246 * it is obvious enough for what went wrong.
247 */
248 do_div(elapsed_centisecs64, NSEC_PER_SEC / 100);
249 centisecs = elapsed_centisecs64; 247 centisecs = elapsed_centisecs64;
250 if (centisecs == 0) 248 if (centisecs == 0)
251 centisecs = 1; /* avoid div-by-zero */ 249 centisecs = 1; /* avoid div-by-zero */
@@ -502,8 +500,14 @@ int hibernation_restore(int platform_mode)
502 error = dpm_suspend_start(PMSG_QUIESCE); 500 error = dpm_suspend_start(PMSG_QUIESCE);
503 if (!error) { 501 if (!error) {
504 error = resume_target_kernel(platform_mode); 502 error = resume_target_kernel(platform_mode);
505 dpm_resume_end(PMSG_RECOVER); 503 /*
504 * The above should either succeed and jump to the new kernel,
505 * or return with an error. Otherwise things are just
506 * undefined, so let's be paranoid.
507 */
508 BUG_ON(!error);
506 } 509 }
510 dpm_resume_end(PMSG_RECOVER);
507 pm_restore_gfp_mask(); 511 pm_restore_gfp_mask();
508 resume_console(); 512 resume_console();
509 pm_restore_console(); 513 pm_restore_console();
diff --git a/kernel/power/power.h b/kernel/power/power.h
index 2df883a9d3cb..ce9b8328a689 100644
--- a/kernel/power/power.h
+++ b/kernel/power/power.h
@@ -174,8 +174,7 @@ extern int hib_wait_on_bio_chain(struct bio **bio_chain);
174 174
175struct timeval; 175struct timeval;
176/* kernel/power/swsusp.c */ 176/* kernel/power/swsusp.c */
177extern void swsusp_show_speed(struct timeval *, struct timeval *, 177extern void swsusp_show_speed(ktime_t, ktime_t, unsigned int, char *);
178 unsigned int, char *);
179 178
180#ifdef CONFIG_SUSPEND 179#ifdef CONFIG_SUSPEND
181/* kernel/power/suspend.c */ 180/* kernel/power/suspend.c */
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index 791a61892bb5..0c40c16174b4 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -28,6 +28,7 @@
28#include <linux/list.h> 28#include <linux/list.h>
29#include <linux/slab.h> 29#include <linux/slab.h>
30#include <linux/compiler.h> 30#include <linux/compiler.h>
31#include <linux/ktime.h>
31 32
32#include <asm/uaccess.h> 33#include <asm/uaccess.h>
33#include <asm/mmu_context.h> 34#include <asm/mmu_context.h>
@@ -1576,11 +1577,11 @@ int hibernate_preallocate_memory(void)
1576 struct zone *zone; 1577 struct zone *zone;
1577 unsigned long saveable, size, max_size, count, highmem, pages = 0; 1578 unsigned long saveable, size, max_size, count, highmem, pages = 0;
1578 unsigned long alloc, save_highmem, pages_highmem, avail_normal; 1579 unsigned long alloc, save_highmem, pages_highmem, avail_normal;
1579 struct timeval start, stop; 1580 ktime_t start, stop;
1580 int error; 1581 int error;
1581 1582
1582 printk(KERN_INFO "PM: Preallocating image memory... "); 1583 printk(KERN_INFO "PM: Preallocating image memory... ");
1583 do_gettimeofday(&start); 1584 start = ktime_get();
1584 1585
1585 error = memory_bm_create(&orig_bm, GFP_IMAGE, PG_ANY); 1586 error = memory_bm_create(&orig_bm, GFP_IMAGE, PG_ANY);
1586 if (error) 1587 if (error)
@@ -1709,9 +1710,9 @@ int hibernate_preallocate_memory(void)
1709 free_unnecessary_pages(); 1710 free_unnecessary_pages();
1710 1711
1711 out: 1712 out:
1712 do_gettimeofday(&stop); 1713 stop = ktime_get();
1713 printk(KERN_CONT "done (allocated %lu pages)\n", pages); 1714 printk(KERN_CONT "done (allocated %lu pages)\n", pages);
1714 swsusp_show_speed(&start, &stop, pages, "Allocated"); 1715 swsusp_show_speed(start, stop, pages, "Allocated");
1715 1716
1716 return 0; 1717 return 0;
1717 1718
diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c
index 4ca9a33ff620..c347e3ce3a55 100644
--- a/kernel/power/suspend.c
+++ b/kernel/power/suspend.c
@@ -146,7 +146,7 @@ static int platform_suspend_prepare(suspend_state_t state)
146 146
147static int platform_suspend_prepare_late(suspend_state_t state) 147static int platform_suspend_prepare_late(suspend_state_t state)
148{ 148{
149 return state == PM_SUSPEND_FREEZE && freeze_ops->prepare ? 149 return state == PM_SUSPEND_FREEZE && freeze_ops && freeze_ops->prepare ?
150 freeze_ops->prepare() : 0; 150 freeze_ops->prepare() : 0;
151} 151}
152 152
@@ -164,7 +164,7 @@ static void platform_resume_noirq(suspend_state_t state)
164 164
165static void platform_resume_early(suspend_state_t state) 165static void platform_resume_early(suspend_state_t state)
166{ 166{
167 if (state == PM_SUSPEND_FREEZE && freeze_ops->restore) 167 if (state == PM_SUSPEND_FREEZE && freeze_ops && freeze_ops->restore)
168 freeze_ops->restore(); 168 freeze_ops->restore();
169} 169}
170 170
diff --git a/kernel/power/swap.c b/kernel/power/swap.c
index aaa3261dea5d..570aff817543 100644
--- a/kernel/power/swap.c
+++ b/kernel/power/swap.c
@@ -30,6 +30,7 @@
30#include <linux/atomic.h> 30#include <linux/atomic.h>
31#include <linux/kthread.h> 31#include <linux/kthread.h>
32#include <linux/crc32.h> 32#include <linux/crc32.h>
33#include <linux/ktime.h>
33 34
34#include "power.h" 35#include "power.h"
35 36
@@ -445,8 +446,8 @@ static int save_image(struct swap_map_handle *handle,
445 int nr_pages; 446 int nr_pages;
446 int err2; 447 int err2;
447 struct bio *bio; 448 struct bio *bio;
448 struct timeval start; 449 ktime_t start;
449 struct timeval stop; 450 ktime_t stop;
450 451
451 printk(KERN_INFO "PM: Saving image data pages (%u pages)...\n", 452 printk(KERN_INFO "PM: Saving image data pages (%u pages)...\n",
452 nr_to_write); 453 nr_to_write);
@@ -455,7 +456,7 @@ static int save_image(struct swap_map_handle *handle,
455 m = 1; 456 m = 1;
456 nr_pages = 0; 457 nr_pages = 0;
457 bio = NULL; 458 bio = NULL;
458 do_gettimeofday(&start); 459 start = ktime_get();
459 while (1) { 460 while (1) {
460 ret = snapshot_read_next(snapshot); 461 ret = snapshot_read_next(snapshot);
461 if (ret <= 0) 462 if (ret <= 0)
@@ -469,12 +470,12 @@ static int save_image(struct swap_map_handle *handle,
469 nr_pages++; 470 nr_pages++;
470 } 471 }
471 err2 = hib_wait_on_bio_chain(&bio); 472 err2 = hib_wait_on_bio_chain(&bio);
472 do_gettimeofday(&stop); 473 stop = ktime_get();
473 if (!ret) 474 if (!ret)
474 ret = err2; 475 ret = err2;
475 if (!ret) 476 if (!ret)
476 printk(KERN_INFO "PM: Image saving done.\n"); 477 printk(KERN_INFO "PM: Image saving done.\n");
477 swsusp_show_speed(&start, &stop, nr_to_write, "Wrote"); 478 swsusp_show_speed(start, stop, nr_to_write, "Wrote");
478 return ret; 479 return ret;
479} 480}
480 481
@@ -580,8 +581,8 @@ static int save_image_lzo(struct swap_map_handle *handle,
580 int nr_pages; 581 int nr_pages;
581 int err2; 582 int err2;
582 struct bio *bio; 583 struct bio *bio;
583 struct timeval start; 584 ktime_t start;
584 struct timeval stop; 585 ktime_t stop;
585 size_t off; 586 size_t off;
586 unsigned thr, run_threads, nr_threads; 587 unsigned thr, run_threads, nr_threads;
587 unsigned char *page = NULL; 588 unsigned char *page = NULL;
@@ -674,7 +675,7 @@ static int save_image_lzo(struct swap_map_handle *handle,
674 m = 1; 675 m = 1;
675 nr_pages = 0; 676 nr_pages = 0;
676 bio = NULL; 677 bio = NULL;
677 do_gettimeofday(&start); 678 start = ktime_get();
678 for (;;) { 679 for (;;) {
679 for (thr = 0; thr < nr_threads; thr++) { 680 for (thr = 0; thr < nr_threads; thr++) {
680 for (off = 0; off < LZO_UNC_SIZE; off += PAGE_SIZE) { 681 for (off = 0; off < LZO_UNC_SIZE; off += PAGE_SIZE) {
@@ -759,12 +760,12 @@ static int save_image_lzo(struct swap_map_handle *handle,
759 760
760out_finish: 761out_finish:
761 err2 = hib_wait_on_bio_chain(&bio); 762 err2 = hib_wait_on_bio_chain(&bio);
762 do_gettimeofday(&stop); 763 stop = ktime_get();
763 if (!ret) 764 if (!ret)
764 ret = err2; 765 ret = err2;
765 if (!ret) 766 if (!ret)
766 printk(KERN_INFO "PM: Image saving done.\n"); 767 printk(KERN_INFO "PM: Image saving done.\n");
767 swsusp_show_speed(&start, &stop, nr_to_write, "Wrote"); 768 swsusp_show_speed(start, stop, nr_to_write, "Wrote");
768out_clean: 769out_clean:
769 if (crc) { 770 if (crc) {
770 if (crc->thr) 771 if (crc->thr)
@@ -965,8 +966,8 @@ static int load_image(struct swap_map_handle *handle,
965{ 966{
966 unsigned int m; 967 unsigned int m;
967 int ret = 0; 968 int ret = 0;
968 struct timeval start; 969 ktime_t start;
969 struct timeval stop; 970 ktime_t stop;
970 struct bio *bio; 971 struct bio *bio;
971 int err2; 972 int err2;
972 unsigned nr_pages; 973 unsigned nr_pages;
@@ -978,7 +979,7 @@ static int load_image(struct swap_map_handle *handle,
978 m = 1; 979 m = 1;
979 nr_pages = 0; 980 nr_pages = 0;
980 bio = NULL; 981 bio = NULL;
981 do_gettimeofday(&start); 982 start = ktime_get();
982 for ( ; ; ) { 983 for ( ; ; ) {
983 ret = snapshot_write_next(snapshot); 984 ret = snapshot_write_next(snapshot);
984 if (ret <= 0) 985 if (ret <= 0)
@@ -996,7 +997,7 @@ static int load_image(struct swap_map_handle *handle,
996 nr_pages++; 997 nr_pages++;
997 } 998 }
998 err2 = hib_wait_on_bio_chain(&bio); 999 err2 = hib_wait_on_bio_chain(&bio);
999 do_gettimeofday(&stop); 1000 stop = ktime_get();
1000 if (!ret) 1001 if (!ret)
1001 ret = err2; 1002 ret = err2;
1002 if (!ret) { 1003 if (!ret) {
@@ -1005,7 +1006,7 @@ static int load_image(struct swap_map_handle *handle,
1005 if (!snapshot_image_loaded(snapshot)) 1006 if (!snapshot_image_loaded(snapshot))
1006 ret = -ENODATA; 1007 ret = -ENODATA;
1007 } 1008 }
1008 swsusp_show_speed(&start, &stop, nr_to_read, "Read"); 1009 swsusp_show_speed(start, stop, nr_to_read, "Read");
1009 return ret; 1010 return ret;
1010} 1011}
1011 1012
@@ -1067,8 +1068,8 @@ static int load_image_lzo(struct swap_map_handle *handle,
1067 int ret = 0; 1068 int ret = 0;
1068 int eof = 0; 1069 int eof = 0;
1069 struct bio *bio; 1070 struct bio *bio;
1070 struct timeval start; 1071 ktime_t start;
1071 struct timeval stop; 1072 ktime_t stop;
1072 unsigned nr_pages; 1073 unsigned nr_pages;
1073 size_t off; 1074 size_t off;
1074 unsigned i, thr, run_threads, nr_threads; 1075 unsigned i, thr, run_threads, nr_threads;
@@ -1190,7 +1191,7 @@ static int load_image_lzo(struct swap_map_handle *handle,
1190 m = 1; 1191 m = 1;
1191 nr_pages = 0; 1192 nr_pages = 0;
1192 bio = NULL; 1193 bio = NULL;
1193 do_gettimeofday(&start); 1194 start = ktime_get();
1194 1195
1195 ret = snapshot_write_next(snapshot); 1196 ret = snapshot_write_next(snapshot);
1196 if (ret <= 0) 1197 if (ret <= 0)
@@ -1343,7 +1344,7 @@ out_finish:
1343 wait_event(crc->done, atomic_read(&crc->stop)); 1344 wait_event(crc->done, atomic_read(&crc->stop));
1344 atomic_set(&crc->stop, 0); 1345 atomic_set(&crc->stop, 0);
1345 } 1346 }
1346 do_gettimeofday(&stop); 1347 stop = ktime_get();
1347 if (!ret) { 1348 if (!ret) {
1348 printk(KERN_INFO "PM: Image loading done.\n"); 1349 printk(KERN_INFO "PM: Image loading done.\n");
1349 snapshot_write_finalize(snapshot); 1350 snapshot_write_finalize(snapshot);
@@ -1359,7 +1360,7 @@ out_finish:
1359 } 1360 }
1360 } 1361 }
1361 } 1362 }
1362 swsusp_show_speed(&start, &stop, nr_to_read, "Read"); 1363 swsusp_show_speed(start, stop, nr_to_read, "Read");
1363out_clean: 1364out_clean:
1364 for (i = 0; i < ring_size; i++) 1365 for (i = 0; i < ring_size; i++)
1365 free_page((unsigned long)page[i]); 1366 free_page((unsigned long)page[i]);
@@ -1374,7 +1375,7 @@ out_clean:
1374 kthread_stop(data[thr].thr); 1375 kthread_stop(data[thr].thr);
1375 vfree(data); 1376 vfree(data);
1376 } 1377 }
1377 if (page) vfree(page); 1378 vfree(page);
1378 1379
1379 return ret; 1380 return ret;
1380} 1381}
diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index ced2b84b1cb7..f900dc9f6822 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -62,9 +62,6 @@ int console_printk[4] = {
62 CONSOLE_LOGLEVEL_DEFAULT, /* default_console_loglevel */ 62 CONSOLE_LOGLEVEL_DEFAULT, /* default_console_loglevel */
63}; 63};
64 64
65/* Deferred messaged from sched code are marked by this special level */
66#define SCHED_MESSAGE_LOGLEVEL -2
67
68/* 65/*
69 * Low level drivers may need that to know if they can schedule in 66 * Low level drivers may need that to know if they can schedule in
70 * their unblank() callback or not. So let's export it. 67 * their unblank() callback or not. So let's export it.
@@ -480,7 +477,7 @@ static int syslog_action_restricted(int type)
480 type != SYSLOG_ACTION_SIZE_BUFFER; 477 type != SYSLOG_ACTION_SIZE_BUFFER;
481} 478}
482 479
483static int check_syslog_permissions(int type, bool from_file) 480int check_syslog_permissions(int type, bool from_file)
484{ 481{
485 /* 482 /*
486 * If this is from /proc/kmsg and we've already opened it, then we've 483 * If this is from /proc/kmsg and we've already opened it, then we've
@@ -1259,7 +1256,7 @@ static int syslog_print_all(char __user *buf, int size, bool clear)
1259int do_syslog(int type, char __user *buf, int len, bool from_file) 1256int do_syslog(int type, char __user *buf, int len, bool from_file)
1260{ 1257{
1261 bool clear = false; 1258 bool clear = false;
1262 static int saved_console_loglevel = -1; 1259 static int saved_console_loglevel = LOGLEVEL_DEFAULT;
1263 int error; 1260 int error;
1264 1261
1265 error = check_syslog_permissions(type, from_file); 1262 error = check_syslog_permissions(type, from_file);
@@ -1316,15 +1313,15 @@ int do_syslog(int type, char __user *buf, int len, bool from_file)
1316 break; 1313 break;
1317 /* Disable logging to console */ 1314 /* Disable logging to console */
1318 case SYSLOG_ACTION_CONSOLE_OFF: 1315 case SYSLOG_ACTION_CONSOLE_OFF:
1319 if (saved_console_loglevel == -1) 1316 if (saved_console_loglevel == LOGLEVEL_DEFAULT)
1320 saved_console_loglevel = console_loglevel; 1317 saved_console_loglevel = console_loglevel;
1321 console_loglevel = minimum_console_loglevel; 1318 console_loglevel = minimum_console_loglevel;
1322 break; 1319 break;
1323 /* Enable logging to console */ 1320 /* Enable logging to console */
1324 case SYSLOG_ACTION_CONSOLE_ON: 1321 case SYSLOG_ACTION_CONSOLE_ON:
1325 if (saved_console_loglevel != -1) { 1322 if (saved_console_loglevel != LOGLEVEL_DEFAULT) {
1326 console_loglevel = saved_console_loglevel; 1323 console_loglevel = saved_console_loglevel;
1327 saved_console_loglevel = -1; 1324 saved_console_loglevel = LOGLEVEL_DEFAULT;
1328 } 1325 }
1329 break; 1326 break;
1330 /* Set level of messages printed to console */ 1327 /* Set level of messages printed to console */
@@ -1336,7 +1333,7 @@ int do_syslog(int type, char __user *buf, int len, bool from_file)
1336 len = minimum_console_loglevel; 1333 len = minimum_console_loglevel;
1337 console_loglevel = len; 1334 console_loglevel = len;
1338 /* Implicitly re-enable logging to console */ 1335 /* Implicitly re-enable logging to console */
1339 saved_console_loglevel = -1; 1336 saved_console_loglevel = LOGLEVEL_DEFAULT;
1340 error = 0; 1337 error = 0;
1341 break; 1338 break;
1342 /* Number of chars in the log buffer */ 1339 /* Number of chars in the log buffer */
@@ -1627,10 +1624,10 @@ asmlinkage int vprintk_emit(int facility, int level,
1627 int printed_len = 0; 1624 int printed_len = 0;
1628 bool in_sched = false; 1625 bool in_sched = false;
1629 /* cpu currently holding logbuf_lock in this function */ 1626 /* cpu currently holding logbuf_lock in this function */
1630 static volatile unsigned int logbuf_cpu = UINT_MAX; 1627 static unsigned int logbuf_cpu = UINT_MAX;
1631 1628
1632 if (level == SCHED_MESSAGE_LOGLEVEL) { 1629 if (level == LOGLEVEL_SCHED) {
1633 level = -1; 1630 level = LOGLEVEL_DEFAULT;
1634 in_sched = true; 1631 in_sched = true;
1635 } 1632 }
1636 1633
@@ -1695,8 +1692,9 @@ asmlinkage int vprintk_emit(int facility, int level,
1695 const char *end_of_header = printk_skip_level(text); 1692 const char *end_of_header = printk_skip_level(text);
1696 switch (kern_level) { 1693 switch (kern_level) {
1697 case '0' ... '7': 1694 case '0' ... '7':
1698 if (level == -1) 1695 if (level == LOGLEVEL_DEFAULT)
1699 level = kern_level - '0'; 1696 level = kern_level - '0';
1697 /* fallthrough */
1700 case 'd': /* KERN_DEFAULT */ 1698 case 'd': /* KERN_DEFAULT */
1701 lflags |= LOG_PREFIX; 1699 lflags |= LOG_PREFIX;
1702 } 1700 }
@@ -1710,7 +1708,7 @@ asmlinkage int vprintk_emit(int facility, int level,
1710 } 1708 }
1711 } 1709 }
1712 1710
1713 if (level == -1) 1711 if (level == LOGLEVEL_DEFAULT)
1714 level = default_message_loglevel; 1712 level = default_message_loglevel;
1715 1713
1716 if (dict) 1714 if (dict)
@@ -1788,7 +1786,7 @@ EXPORT_SYMBOL(vprintk_emit);
1788 1786
1789asmlinkage int vprintk(const char *fmt, va_list args) 1787asmlinkage int vprintk(const char *fmt, va_list args)
1790{ 1788{
1791 return vprintk_emit(0, -1, NULL, 0, fmt, args); 1789 return vprintk_emit(0, LOGLEVEL_DEFAULT, NULL, 0, fmt, args);
1792} 1790}
1793EXPORT_SYMBOL(vprintk); 1791EXPORT_SYMBOL(vprintk);
1794 1792
@@ -1807,6 +1805,30 @@ asmlinkage int printk_emit(int facility, int level,
1807} 1805}
1808EXPORT_SYMBOL(printk_emit); 1806EXPORT_SYMBOL(printk_emit);
1809 1807
1808int vprintk_default(const char *fmt, va_list args)
1809{
1810 int r;
1811
1812#ifdef CONFIG_KGDB_KDB
1813 if (unlikely(kdb_trap_printk)) {
1814 r = vkdb_printf(fmt, args);
1815 return r;
1816 }
1817#endif
1818 r = vprintk_emit(0, LOGLEVEL_DEFAULT, NULL, 0, fmt, args);
1819
1820 return r;
1821}
1822EXPORT_SYMBOL_GPL(vprintk_default);
1823
1824/*
1825 * This allows printk to be diverted to another function per cpu.
1826 * This is useful for calling printk functions from within NMI
1827 * without worrying about race conditions that can lock up the
1828 * box.
1829 */
1830DEFINE_PER_CPU(printk_func_t, printk_func) = vprintk_default;
1831
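A sketch of how code running where taking logbuf_lock would be unsafe (for example an NMI backtrace handler) might use this hook to divert printk() on the local CPU; my_nmi_vprintk, my_nmi_dump and the buffering they imply are hypothetical.

static int my_nmi_vprintk(const char *fmt, va_list args)
{
	/* Capture the message into a lock-free per-CPU buffer instead. */
	return 0;
}

static void my_nmi_dump(void)
{
	this_cpu_write(printk_func, my_nmi_vprintk);
	/* printk() calls from this CPU are now routed to my_nmi_vprintk(). */
	dump_stack();
	this_cpu_write(printk_func, vprintk_default);
}
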
1810/** 1832/**
1811 * printk - print a kernel message 1833 * printk - print a kernel message
1812 * @fmt: format string 1834 * @fmt: format string
@@ -1830,19 +1852,15 @@ EXPORT_SYMBOL(printk_emit);
1830 */ 1852 */
1831asmlinkage __visible int printk(const char *fmt, ...) 1853asmlinkage __visible int printk(const char *fmt, ...)
1832{ 1854{
1855 printk_func_t vprintk_func;
1833 va_list args; 1856 va_list args;
1834 int r; 1857 int r;
1835 1858
1836#ifdef CONFIG_KGDB_KDB
1837 if (unlikely(kdb_trap_printk)) {
1838 va_start(args, fmt);
1839 r = vkdb_printf(fmt, args);
1840 va_end(args);
1841 return r;
1842 }
1843#endif
1844 va_start(args, fmt); 1859 va_start(args, fmt);
1845 r = vprintk_emit(0, -1, NULL, 0, fmt, args); 1860 preempt_disable();
1861 vprintk_func = this_cpu_read(printk_func);
1862 r = vprintk_func(fmt, args);
1863 preempt_enable();
1846 va_end(args); 1864 va_end(args);
1847 1865
1848 return r; 1866 return r;
@@ -1876,28 +1894,28 @@ static size_t msg_print_text(const struct printk_log *msg, enum log_flags prev,
1876 bool syslog, char *buf, size_t size) { return 0; } 1894 bool syslog, char *buf, size_t size) { return 0; }
1877static size_t cont_print_text(char *text, size_t size) { return 0; } 1895static size_t cont_print_text(char *text, size_t size) { return 0; }
1878 1896
1897/* Still needs to be defined for users */
1898DEFINE_PER_CPU(printk_func_t, printk_func);
1899
1879#endif /* CONFIG_PRINTK */ 1900#endif /* CONFIG_PRINTK */
1880 1901
1881#ifdef CONFIG_EARLY_PRINTK 1902#ifdef CONFIG_EARLY_PRINTK
1882struct console *early_console; 1903struct console *early_console;
1883 1904
1884void early_vprintk(const char *fmt, va_list ap)
1885{
1886 if (early_console) {
1887 char buf[512];
1888 int n = vscnprintf(buf, sizeof(buf), fmt, ap);
1889
1890 early_console->write(early_console, buf, n);
1891 }
1892}
1893
1894asmlinkage __visible void early_printk(const char *fmt, ...) 1905asmlinkage __visible void early_printk(const char *fmt, ...)
1895{ 1906{
1896 va_list ap; 1907 va_list ap;
1908 char buf[512];
1909 int n;
1910
1911 if (!early_console)
1912 return;
1897 1913
1898 va_start(ap, fmt); 1914 va_start(ap, fmt);
1899 early_vprintk(fmt, ap); 1915 n = vscnprintf(buf, sizeof(buf), fmt, ap);
1900 va_end(ap); 1916 va_end(ap);
1917
1918 early_console->write(early_console, buf, n);
1901} 1919}
1902#endif 1920#endif
1903 1921
@@ -2634,7 +2652,7 @@ int printk_deferred(const char *fmt, ...)
2634 2652
2635 preempt_disable(); 2653 preempt_disable();
2636 va_start(args, fmt); 2654 va_start(args, fmt);
2637 r = vprintk_emit(0, SCHED_MESSAGE_LOGLEVEL, NULL, 0, fmt, args); 2655 r = vprintk_emit(0, LOGLEVEL_SCHED, NULL, 0, fmt, args);
2638 va_end(args); 2656 va_end(args);
2639 2657
2640 __this_cpu_or(printk_pending, PRINTK_PENDING_OUTPUT); 2658 __this_cpu_or(printk_pending, PRINTK_PENDING_OUTPUT);
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 54e75226c2c4..1eb9d90c3af9 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -485,36 +485,19 @@ static int ptrace_detach(struct task_struct *child, unsigned int data)
485 485
486/* 486/*
487 * Detach all tasks we were using ptrace on. Called with tasklist held 487 * Detach all tasks we were using ptrace on. Called with tasklist held
488 * for writing, and returns with it held too. But note it can release 488 * for writing.
489 * and reacquire the lock.
490 */ 489 */
491void exit_ptrace(struct task_struct *tracer) 490void exit_ptrace(struct task_struct *tracer, struct list_head *dead)
492 __releases(&tasklist_lock)
493 __acquires(&tasklist_lock)
494{ 491{
495 struct task_struct *p, *n; 492 struct task_struct *p, *n;
496 LIST_HEAD(ptrace_dead);
497
498 if (likely(list_empty(&tracer->ptraced)))
499 return;
500 493
501 list_for_each_entry_safe(p, n, &tracer->ptraced, ptrace_entry) { 494 list_for_each_entry_safe(p, n, &tracer->ptraced, ptrace_entry) {
502 if (unlikely(p->ptrace & PT_EXITKILL)) 495 if (unlikely(p->ptrace & PT_EXITKILL))
503 send_sig_info(SIGKILL, SEND_SIG_FORCED, p); 496 send_sig_info(SIGKILL, SEND_SIG_FORCED, p);
504 497
505 if (__ptrace_detach(tracer, p)) 498 if (__ptrace_detach(tracer, p))
506 list_add(&p->ptrace_entry, &ptrace_dead); 499 list_add(&p->ptrace_entry, dead);
507 }
508
509 write_unlock_irq(&tasklist_lock);
510 BUG_ON(!list_empty(&tracer->ptraced));
511
512 list_for_each_entry_safe(p, n, &ptrace_dead, ptrace_entry) {
513 list_del_init(&p->ptrace_entry);
514 release_task(p);
515 } 500 }
516
517 write_lock_irq(&tasklist_lock);
518} 501}
519 502
520int ptrace_readdata(struct task_struct *tsk, unsigned long src, char __user *dst, int len) 503int ptrace_readdata(struct task_struct *tsk, unsigned long src, char __user *dst, int len)
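With the release moved out of exit_ptrace(), callers are now expected to follow roughly the pattern the removed code used internally: collect detached tracees on a local list under tasklist_lock and release them after dropping it. A sketch, with the surrounding exit-path function being hypothetical here:

static void my_exit_path(struct task_struct *tracer)
{
	struct task_struct *p, *n;
	LIST_HEAD(dead);

	write_lock_irq(&tasklist_lock);
	if (!list_empty(&tracer->ptraced))
		exit_ptrace(tracer, &dead);
	write_unlock_irq(&tasklist_lock);

	list_for_each_entry_safe(p, n, &dead, ptrace_entry) {
		list_del_init(&p->ptrace_entry);
		release_task(p);
	}
}
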
diff --git a/kernel/rcu/Makefile b/kernel/rcu/Makefile
index 807ccfbf69b3..e6fae503d1bc 100644
--- a/kernel/rcu/Makefile
+++ b/kernel/rcu/Makefile
@@ -1,6 +1,6 @@
1obj-y += update.o srcu.o 1obj-y += update.o srcu.o
2obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o 2obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o
3obj-$(CONFIG_TREE_RCU) += tree.o 3obj-$(CONFIG_TREE_RCU) += tree.o
4obj-$(CONFIG_TREE_PREEMPT_RCU) += tree.o 4obj-$(CONFIG_PREEMPT_RCU) += tree.o
5obj-$(CONFIG_TREE_RCU_TRACE) += tree_trace.o 5obj-$(CONFIG_TREE_RCU_TRACE) += tree_trace.o
6obj-$(CONFIG_TINY_RCU) += tiny.o 6obj-$(CONFIG_TINY_RCU) += tiny.o
diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h
index ff1a6de62f17..07bb02eda844 100644
--- a/kernel/rcu/rcu.h
+++ b/kernel/rcu/rcu.h
@@ -135,4 +135,6 @@ int rcu_jiffies_till_stall_check(void);
135 */ 135 */
136#define TPS(x) tracepoint_string(x) 136#define TPS(x) tracepoint_string(x)
137 137
138void rcu_early_boot_tests(void);
139
138#endif /* __LINUX_RCU_H */ 140#endif /* __LINUX_RCU_H */
diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c
index 240fa9094f83..4d559baf06e0 100644
--- a/kernel/rcu/rcutorture.c
+++ b/kernel/rcu/rcutorture.c
@@ -812,6 +812,7 @@ rcu_torture_cbflood(void *arg)
812 cur_ops->cb_barrier(); 812 cur_ops->cb_barrier();
813 stutter_wait("rcu_torture_cbflood"); 813 stutter_wait("rcu_torture_cbflood");
814 } while (!torture_must_stop()); 814 } while (!torture_must_stop());
815 vfree(rhp);
815 torture_kthread_stopping("rcu_torture_cbflood"); 816 torture_kthread_stopping("rcu_torture_cbflood");
816 return 0; 817 return 0;
817} 818}
diff --git a/kernel/rcu/tiny.c b/kernel/rcu/tiny.c
index c0623fc47125..0db5649f8817 100644
--- a/kernel/rcu/tiny.c
+++ b/kernel/rcu/tiny.c
@@ -247,7 +247,7 @@ void rcu_bh_qs(void)
247 * be called from hardirq context. It is normally called from the 247 * be called from hardirq context. It is normally called from the
248 * scheduling-clock interrupt. 248 * scheduling-clock interrupt.
249 */ 249 */
250void rcu_check_callbacks(int cpu, int user) 250void rcu_check_callbacks(int user)
251{ 251{
252 RCU_TRACE(check_cpu_stalls()); 252 RCU_TRACE(check_cpu_stalls());
253 if (user || rcu_is_cpu_rrupt_from_idle()) 253 if (user || rcu_is_cpu_rrupt_from_idle())
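Dropping the cpu argument works because rcu_check_callbacks() is only ever invoked on the CPU whose callbacks it checks, from the scheduling-clock interrupt. The caller side of that contract, trimmed to the one relevant call (update_process_times() is the usual tick-path entry point; everything else in its body is elided here):

        /* Tick-path caller: always runs on the local CPU, so no cpu argument is needed. */
        void update_process_times(int user_tick)
        {
                /* ... time accounting, timer expiry, etc. elided ... */
                rcu_check_callbacks(user_tick);         /* was: rcu_check_callbacks(cpu, user_tick) */
                /* ... */
        }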
@@ -380,7 +380,9 @@ void call_rcu_bh(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
380} 380}
381EXPORT_SYMBOL_GPL(call_rcu_bh); 381EXPORT_SYMBOL_GPL(call_rcu_bh);
382 382
383void rcu_init(void) 383void __init rcu_init(void)
384{ 384{
385 open_softirq(RCU_SOFTIRQ, rcu_process_callbacks); 385 open_softirq(RCU_SOFTIRQ, rcu_process_callbacks);
386
387 rcu_early_boot_tests();
386} 388}
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 133e47223095..7680fc275036 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -105,7 +105,7 @@ struct rcu_state sname##_state = { \
105 .name = RCU_STATE_NAME(sname), \ 105 .name = RCU_STATE_NAME(sname), \
106 .abbr = sabbr, \ 106 .abbr = sabbr, \
107}; \ 107}; \
108DEFINE_PER_CPU(struct rcu_data, sname##_data) 108DEFINE_PER_CPU_SHARED_ALIGNED(struct rcu_data, sname##_data)
109 109
110RCU_STATE_INITIALIZER(rcu_sched, 's', call_rcu_sched); 110RCU_STATE_INITIALIZER(rcu_sched, 's', call_rcu_sched);
111RCU_STATE_INITIALIZER(rcu_bh, 'b', call_rcu_bh); 111RCU_STATE_INITIALIZER(rcu_bh, 'b', call_rcu_bh);
@@ -152,19 +152,6 @@ EXPORT_SYMBOL_GPL(rcu_scheduler_active);
152 */ 152 */
153static int rcu_scheduler_fully_active __read_mostly; 153static int rcu_scheduler_fully_active __read_mostly;
154 154
155#ifdef CONFIG_RCU_BOOST
156
157/*
158 * Control variables for per-CPU and per-rcu_node kthreads. These
159 * handle all flavors of RCU.
160 */
161static DEFINE_PER_CPU(struct task_struct *, rcu_cpu_kthread_task);
162DEFINE_PER_CPU(unsigned int, rcu_cpu_kthread_status);
163DEFINE_PER_CPU(unsigned int, rcu_cpu_kthread_loops);
164DEFINE_PER_CPU(char, rcu_cpu_has_work);
165
166#endif /* #ifdef CONFIG_RCU_BOOST */
167
168static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu); 155static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu);
169static void invoke_rcu_core(void); 156static void invoke_rcu_core(void);
170static void invoke_rcu_callbacks(struct rcu_state *rsp, struct rcu_data *rdp); 157static void invoke_rcu_callbacks(struct rcu_state *rsp, struct rcu_data *rdp);
@@ -286,11 +273,11 @@ static void rcu_momentary_dyntick_idle(void)
286 * and requires special handling for preemptible RCU. 273 * and requires special handling for preemptible RCU.
287 * The caller must have disabled preemption. 274 * The caller must have disabled preemption.
288 */ 275 */
289void rcu_note_context_switch(int cpu) 276void rcu_note_context_switch(void)
290{ 277{
291 trace_rcu_utilization(TPS("Start context switch")); 278 trace_rcu_utilization(TPS("Start context switch"));
292 rcu_sched_qs(); 279 rcu_sched_qs();
293 rcu_preempt_note_context_switch(cpu); 280 rcu_preempt_note_context_switch();
294 if (unlikely(raw_cpu_read(rcu_sched_qs_mask))) 281 if (unlikely(raw_cpu_read(rcu_sched_qs_mask)))
295 rcu_momentary_dyntick_idle(); 282 rcu_momentary_dyntick_idle();
296 trace_rcu_utilization(TPS("End context switch")); 283 trace_rcu_utilization(TPS("End context switch"));
@@ -325,7 +312,7 @@ static void force_qs_rnp(struct rcu_state *rsp,
325 unsigned long *maxj), 312 unsigned long *maxj),
326 bool *isidle, unsigned long *maxj); 313 bool *isidle, unsigned long *maxj);
327static void force_quiescent_state(struct rcu_state *rsp); 314static void force_quiescent_state(struct rcu_state *rsp);
328static int rcu_pending(int cpu); 315static int rcu_pending(void);
329 316
330/* 317/*
331 * Return the number of RCU-sched batches processed thus far for debug & stats. 318 * Return the number of RCU-sched batches processed thus far for debug & stats.
@@ -510,11 +497,11 @@ cpu_needs_another_gp(struct rcu_state *rsp, struct rcu_data *rdp)
510 * we really have entered idle, and must do the appropriate accounting. 497 * we really have entered idle, and must do the appropriate accounting.
511 * The caller must have disabled interrupts. 498 * The caller must have disabled interrupts.
512 */ 499 */
513static void rcu_eqs_enter_common(struct rcu_dynticks *rdtp, long long oldval, 500static void rcu_eqs_enter_common(long long oldval, bool user)
514 bool user)
515{ 501{
516 struct rcu_state *rsp; 502 struct rcu_state *rsp;
517 struct rcu_data *rdp; 503 struct rcu_data *rdp;
504 struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks);
518 505
519 trace_rcu_dyntick(TPS("Start"), oldval, rdtp->dynticks_nesting); 506 trace_rcu_dyntick(TPS("Start"), oldval, rdtp->dynticks_nesting);
520 if (!user && !is_idle_task(current)) { 507 if (!user && !is_idle_task(current)) {
@@ -531,7 +518,7 @@ static void rcu_eqs_enter_common(struct rcu_dynticks *rdtp, long long oldval,
531 rdp = this_cpu_ptr(rsp->rda); 518 rdp = this_cpu_ptr(rsp->rda);
532 do_nocb_deferred_wakeup(rdp); 519 do_nocb_deferred_wakeup(rdp);
533 } 520 }
534 rcu_prepare_for_idle(smp_processor_id()); 521 rcu_prepare_for_idle();
535 /* CPUs seeing atomic_inc() must see prior RCU read-side crit sects */ 522 /* CPUs seeing atomic_inc() must see prior RCU read-side crit sects */
536 smp_mb__before_atomic(); /* See above. */ 523 smp_mb__before_atomic(); /* See above. */
537 atomic_inc(&rdtp->dynticks); 524 atomic_inc(&rdtp->dynticks);
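The shape of this change recurs through the rest of the patch: rather than threading a cpu index or a struct rcu_dynticks pointer down the call chain, each function fetches its own CPU's state with this_cpu_ptr(), which is valid because these paths run with interrupts (or at least preemption) disabled. A generic sketch of the idiom with hypothetical names:

        struct my_state {
                int events;
        };
        static DEFINE_PER_CPU(struct my_state, my_state);

        /* Caller must have interrupts or preemption disabled, as rcu_eqs_enter_common() does. */
        static void note_local_event(void)
        {
                struct my_state *sp = this_cpu_ptr(&my_state);  /* was: per_cpu_ptr(&my_state, cpu) */

                sp->events++;
        }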
@@ -565,7 +552,7 @@ static void rcu_eqs_enter(bool user)
565 WARN_ON_ONCE((oldval & DYNTICK_TASK_NEST_MASK) == 0); 552 WARN_ON_ONCE((oldval & DYNTICK_TASK_NEST_MASK) == 0);
566 if ((oldval & DYNTICK_TASK_NEST_MASK) == DYNTICK_TASK_NEST_VALUE) { 553 if ((oldval & DYNTICK_TASK_NEST_MASK) == DYNTICK_TASK_NEST_VALUE) {
567 rdtp->dynticks_nesting = 0; 554 rdtp->dynticks_nesting = 0;
568 rcu_eqs_enter_common(rdtp, oldval, user); 555 rcu_eqs_enter_common(oldval, user);
569 } else { 556 } else {
570 rdtp->dynticks_nesting -= DYNTICK_TASK_NEST_VALUE; 557 rdtp->dynticks_nesting -= DYNTICK_TASK_NEST_VALUE;
571 } 558 }
@@ -589,7 +576,7 @@ void rcu_idle_enter(void)
589 576
590 local_irq_save(flags); 577 local_irq_save(flags);
591 rcu_eqs_enter(false); 578 rcu_eqs_enter(false);
592 rcu_sysidle_enter(this_cpu_ptr(&rcu_dynticks), 0); 579 rcu_sysidle_enter(0);
593 local_irq_restore(flags); 580 local_irq_restore(flags);
594} 581}
595EXPORT_SYMBOL_GPL(rcu_idle_enter); 582EXPORT_SYMBOL_GPL(rcu_idle_enter);
@@ -639,8 +626,8 @@ void rcu_irq_exit(void)
639 if (rdtp->dynticks_nesting) 626 if (rdtp->dynticks_nesting)
640 trace_rcu_dyntick(TPS("--="), oldval, rdtp->dynticks_nesting); 627 trace_rcu_dyntick(TPS("--="), oldval, rdtp->dynticks_nesting);
641 else 628 else
642 rcu_eqs_enter_common(rdtp, oldval, true); 629 rcu_eqs_enter_common(oldval, true);
643 rcu_sysidle_enter(rdtp, 1); 630 rcu_sysidle_enter(1);
644 local_irq_restore(flags); 631 local_irq_restore(flags);
645} 632}
646 633
@@ -651,16 +638,17 @@ void rcu_irq_exit(void)
651 * we really have exited idle, and must do the appropriate accounting. 638 * we really have exited idle, and must do the appropriate accounting.
652 * The caller must have disabled interrupts. 639 * The caller must have disabled interrupts.
653 */ 640 */
654static void rcu_eqs_exit_common(struct rcu_dynticks *rdtp, long long oldval, 641static void rcu_eqs_exit_common(long long oldval, int user)
655 int user)
656{ 642{
643 struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks);
644
657 rcu_dynticks_task_exit(); 645 rcu_dynticks_task_exit();
658 smp_mb__before_atomic(); /* Force ordering w/previous sojourn. */ 646 smp_mb__before_atomic(); /* Force ordering w/previous sojourn. */
659 atomic_inc(&rdtp->dynticks); 647 atomic_inc(&rdtp->dynticks);
660 /* CPUs seeing atomic_inc() must see later RCU read-side crit sects */ 648 /* CPUs seeing atomic_inc() must see later RCU read-side crit sects */
661 smp_mb__after_atomic(); /* See above. */ 649 smp_mb__after_atomic(); /* See above. */
662 WARN_ON_ONCE(!(atomic_read(&rdtp->dynticks) & 0x1)); 650 WARN_ON_ONCE(!(atomic_read(&rdtp->dynticks) & 0x1));
663 rcu_cleanup_after_idle(smp_processor_id()); 651 rcu_cleanup_after_idle();
664 trace_rcu_dyntick(TPS("End"), oldval, rdtp->dynticks_nesting); 652 trace_rcu_dyntick(TPS("End"), oldval, rdtp->dynticks_nesting);
665 if (!user && !is_idle_task(current)) { 653 if (!user && !is_idle_task(current)) {
666 struct task_struct *idle __maybe_unused = 654 struct task_struct *idle __maybe_unused =
@@ -691,7 +679,7 @@ static void rcu_eqs_exit(bool user)
691 rdtp->dynticks_nesting += DYNTICK_TASK_NEST_VALUE; 679 rdtp->dynticks_nesting += DYNTICK_TASK_NEST_VALUE;
692 } else { 680 } else {
693 rdtp->dynticks_nesting = DYNTICK_TASK_EXIT_IDLE; 681 rdtp->dynticks_nesting = DYNTICK_TASK_EXIT_IDLE;
694 rcu_eqs_exit_common(rdtp, oldval, user); 682 rcu_eqs_exit_common(oldval, user);
695 } 683 }
696} 684}
697 685
@@ -712,7 +700,7 @@ void rcu_idle_exit(void)
712 700
713 local_irq_save(flags); 701 local_irq_save(flags);
714 rcu_eqs_exit(false); 702 rcu_eqs_exit(false);
715 rcu_sysidle_exit(this_cpu_ptr(&rcu_dynticks), 0); 703 rcu_sysidle_exit(0);
716 local_irq_restore(flags); 704 local_irq_restore(flags);
717} 705}
718EXPORT_SYMBOL_GPL(rcu_idle_exit); 706EXPORT_SYMBOL_GPL(rcu_idle_exit);
@@ -763,8 +751,8 @@ void rcu_irq_enter(void)
763 if (oldval) 751 if (oldval)
764 trace_rcu_dyntick(TPS("++="), oldval, rdtp->dynticks_nesting); 752 trace_rcu_dyntick(TPS("++="), oldval, rdtp->dynticks_nesting);
765 else 753 else
766 rcu_eqs_exit_common(rdtp, oldval, true); 754 rcu_eqs_exit_common(oldval, true);
767 rcu_sysidle_exit(rdtp, 1); 755 rcu_sysidle_exit(1);
768 local_irq_restore(flags); 756 local_irq_restore(flags);
769} 757}
770 758
@@ -2387,7 +2375,7 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp)
2387 * invoked from the scheduling-clock interrupt. If rcu_pending returns 2375 * invoked from the scheduling-clock interrupt. If rcu_pending returns
2388 * false, there is no point in invoking rcu_check_callbacks(). 2376 * false, there is no point in invoking rcu_check_callbacks().
2389 */ 2377 */
2390void rcu_check_callbacks(int cpu, int user) 2378void rcu_check_callbacks(int user)
2391{ 2379{
2392 trace_rcu_utilization(TPS("Start scheduler-tick")); 2380 trace_rcu_utilization(TPS("Start scheduler-tick"));
2393 increment_cpu_stall_ticks(); 2381 increment_cpu_stall_ticks();
@@ -2419,8 +2407,8 @@ void rcu_check_callbacks(int cpu, int user)
2419 2407
2420 rcu_bh_qs(); 2408 rcu_bh_qs();
2421 } 2409 }
2422 rcu_preempt_check_callbacks(cpu); 2410 rcu_preempt_check_callbacks();
2423 if (rcu_pending(cpu)) 2411 if (rcu_pending())
2424 invoke_rcu_core(); 2412 invoke_rcu_core();
2425 if (user) 2413 if (user)
2426 rcu_note_voluntary_context_switch(current); 2414 rcu_note_voluntary_context_switch(current);
@@ -2963,6 +2951,9 @@ static int synchronize_sched_expedited_cpu_stop(void *data)
2963 */ 2951 */
2964void synchronize_sched_expedited(void) 2952void synchronize_sched_expedited(void)
2965{ 2953{
2954 cpumask_var_t cm;
2955 bool cma = false;
2956 int cpu;
2966 long firstsnap, s, snap; 2957 long firstsnap, s, snap;
2967 int trycount = 0; 2958 int trycount = 0;
2968 struct rcu_state *rsp = &rcu_sched_state; 2959 struct rcu_state *rsp = &rcu_sched_state;
@@ -2997,11 +2988,26 @@ void synchronize_sched_expedited(void)
2997 } 2988 }
2998 WARN_ON_ONCE(cpu_is_offline(raw_smp_processor_id())); 2989 WARN_ON_ONCE(cpu_is_offline(raw_smp_processor_id()));
2999 2990
2991 /* Offline CPUs, idle CPUs, and any CPU we run on are quiescent. */
2992 cma = zalloc_cpumask_var(&cm, GFP_KERNEL);
2993 if (cma) {
2994 cpumask_copy(cm, cpu_online_mask);
2995 cpumask_clear_cpu(raw_smp_processor_id(), cm);
2996 for_each_cpu(cpu, cm) {
2997 struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu);
2998
2999 if (!(atomic_add_return(0, &rdtp->dynticks) & 0x1))
3000 cpumask_clear_cpu(cpu, cm);
3001 }
3002 if (cpumask_weight(cm) == 0)
3003 goto all_cpus_idle;
3004 }
3005
3000 /* 3006 /*
3001 * Each pass through the following loop attempts to force a 3007 * Each pass through the following loop attempts to force a
3002 * context switch on each CPU. 3008 * context switch on each CPU.
3003 */ 3009 */
3004 while (try_stop_cpus(cpu_online_mask, 3010 while (try_stop_cpus(cma ? cm : cpu_online_mask,
3005 synchronize_sched_expedited_cpu_stop, 3011 synchronize_sched_expedited_cpu_stop,
3006 NULL) == -EAGAIN) { 3012 NULL) == -EAGAIN) {
3007 put_online_cpus(); 3013 put_online_cpus();
@@ -3013,6 +3019,7 @@ void synchronize_sched_expedited(void)
3013 /* ensure test happens before caller kfree */ 3019 /* ensure test happens before caller kfree */
3014 smp_mb__before_atomic(); /* ^^^ */ 3020 smp_mb__before_atomic(); /* ^^^ */
3015 atomic_long_inc(&rsp->expedited_workdone1); 3021 atomic_long_inc(&rsp->expedited_workdone1);
3022 free_cpumask_var(cm);
3016 return; 3023 return;
3017 } 3024 }
3018 3025
@@ -3022,6 +3029,7 @@ void synchronize_sched_expedited(void)
3022 } else { 3029 } else {
3023 wait_rcu_gp(call_rcu_sched); 3030 wait_rcu_gp(call_rcu_sched);
3024 atomic_long_inc(&rsp->expedited_normal); 3031 atomic_long_inc(&rsp->expedited_normal);
3032 free_cpumask_var(cm);
3025 return; 3033 return;
3026 } 3034 }
3027 3035
@@ -3031,6 +3039,7 @@ void synchronize_sched_expedited(void)
3031 /* ensure test happens before caller kfree */ 3039 /* ensure test happens before caller kfree */
3032 smp_mb__before_atomic(); /* ^^^ */ 3040 smp_mb__before_atomic(); /* ^^^ */
3033 atomic_long_inc(&rsp->expedited_workdone2); 3041 atomic_long_inc(&rsp->expedited_workdone2);
3042 free_cpumask_var(cm);
3034 return; 3043 return;
3035 } 3044 }
3036 3045
@@ -3045,6 +3054,7 @@ void synchronize_sched_expedited(void)
3045 /* CPU hotplug operation in flight, use normal GP. */ 3054 /* CPU hotplug operation in flight, use normal GP. */
3046 wait_rcu_gp(call_rcu_sched); 3055 wait_rcu_gp(call_rcu_sched);
3047 atomic_long_inc(&rsp->expedited_normal); 3056 atomic_long_inc(&rsp->expedited_normal);
3057 free_cpumask_var(cm);
3048 return; 3058 return;
3049 } 3059 }
3050 snap = atomic_long_read(&rsp->expedited_start); 3060 snap = atomic_long_read(&rsp->expedited_start);
@@ -3052,6 +3062,9 @@ void synchronize_sched_expedited(void)
3052 } 3062 }
3053 atomic_long_inc(&rsp->expedited_stoppedcpus); 3063 atomic_long_inc(&rsp->expedited_stoppedcpus);
3054 3064
3065all_cpus_idle:
3066 free_cpumask_var(cm);
3067
3055 /* 3068 /*
3056 * Everyone up to our most recent fetch is covered by our grace 3069 * Everyone up to our most recent fetch is covered by our grace
3057 * period. Update the counter, but only if our work is still 3070 * period. Update the counter, but only if our work is still
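The added code builds a scratch cpumask of CPUs that still need forcing: start from cpu_online_mask, drop the current CPU (quiescent by virtue of running this code), and drop every CPU whose dynticks counter is even, i.e. already idle. If the mask cannot be allocated, the code falls back to cpu_online_mask, and every exit path frees the mask. The idiom, reduced to its essentials (the quiescence predicate is an illustrative stand-in for the dynticks check):

        cpumask_var_t cm;
        bool have_mask;
        int cpu;

        have_mask = zalloc_cpumask_var(&cm, GFP_KERNEL);
        if (have_mask) {
                cpumask_copy(cm, cpu_online_mask);
                cpumask_clear_cpu(raw_smp_processor_id(), cm);  /* we are quiescent ourselves */
                for_each_cpu(cpu, cm)
                        if (cpu_already_quiescent(cpu))         /* illustrative predicate */
                                cpumask_clear_cpu(cpu, cm);
        }
        /* ... operate on (have_mask ? cm : cpu_online_mask) ... */
        free_cpumask_var(cm);                                   /* safe even if allocation failed */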
@@ -3143,12 +3156,12 @@ static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp)
3143 * by the current CPU, returning 1 if so. This function is part of the 3156 * by the current CPU, returning 1 if so. This function is part of the
3144 * RCU implementation; it is -not- an exported member of the RCU API. 3157 * RCU implementation; it is -not- an exported member of the RCU API.
3145 */ 3158 */
3146static int rcu_pending(int cpu) 3159static int rcu_pending(void)
3147{ 3160{
3148 struct rcu_state *rsp; 3161 struct rcu_state *rsp;
3149 3162
3150 for_each_rcu_flavor(rsp) 3163 for_each_rcu_flavor(rsp)
3151 if (__rcu_pending(rsp, per_cpu_ptr(rsp->rda, cpu))) 3164 if (__rcu_pending(rsp, this_cpu_ptr(rsp->rda)))
3152 return 1; 3165 return 1;
3153 return 0; 3166 return 0;
3154} 3167}
@@ -3158,7 +3171,7 @@ static int rcu_pending(int cpu)
3158 * non-NULL, store an indication of whether all callbacks are lazy. 3171 * non-NULL, store an indication of whether all callbacks are lazy.
3159 * (If there are no callbacks, all of them are deemed to be lazy.) 3172 * (If there are no callbacks, all of them are deemed to be lazy.)
3160 */ 3173 */
3161static int __maybe_unused rcu_cpu_has_callbacks(int cpu, bool *all_lazy) 3174static int __maybe_unused rcu_cpu_has_callbacks(bool *all_lazy)
3162{ 3175{
3163 bool al = true; 3176 bool al = true;
3164 bool hc = false; 3177 bool hc = false;
@@ -3166,7 +3179,7 @@ static int __maybe_unused rcu_cpu_has_callbacks(int cpu, bool *all_lazy)
3166 struct rcu_state *rsp; 3179 struct rcu_state *rsp;
3167 3180
3168 for_each_rcu_flavor(rsp) { 3181 for_each_rcu_flavor(rsp) {
3169 rdp = per_cpu_ptr(rsp->rda, cpu); 3182 rdp = this_cpu_ptr(rsp->rda);
3170 if (!rdp->nxtlist) 3183 if (!rdp->nxtlist)
3171 continue; 3184 continue;
3172 hc = true; 3185 hc = true;
@@ -3299,11 +3312,16 @@ static void _rcu_barrier(struct rcu_state *rsp)
3299 continue; 3312 continue;
3300 rdp = per_cpu_ptr(rsp->rda, cpu); 3313 rdp = per_cpu_ptr(rsp->rda, cpu);
3301 if (rcu_is_nocb_cpu(cpu)) { 3314 if (rcu_is_nocb_cpu(cpu)) {
3302 _rcu_barrier_trace(rsp, "OnlineNoCB", cpu, 3315 if (!rcu_nocb_cpu_needs_barrier(rsp, cpu)) {
3303 rsp->n_barrier_done); 3316 _rcu_barrier_trace(rsp, "OfflineNoCB", cpu,
3304 atomic_inc(&rsp->barrier_cpu_count); 3317 rsp->n_barrier_done);
3305 __call_rcu(&rdp->barrier_head, rcu_barrier_callback, 3318 } else {
3306 rsp, cpu, 0); 3319 _rcu_barrier_trace(rsp, "OnlineNoCB", cpu,
3320 rsp->n_barrier_done);
3321 atomic_inc(&rsp->barrier_cpu_count);
3322 __call_rcu(&rdp->barrier_head,
3323 rcu_barrier_callback, rsp, cpu, 0);
3324 }
3307 } else if (ACCESS_ONCE(rdp->qlen)) { 3325 } else if (ACCESS_ONCE(rdp->qlen)) {
3308 _rcu_barrier_trace(rsp, "OnlineQ", cpu, 3326 _rcu_barrier_trace(rsp, "OnlineQ", cpu,
3309 rsp->n_barrier_done); 3327 rsp->n_barrier_done);
@@ -3480,8 +3498,10 @@ static int rcu_cpu_notify(struct notifier_block *self,
3480 case CPU_DEAD_FROZEN: 3498 case CPU_DEAD_FROZEN:
3481 case CPU_UP_CANCELED: 3499 case CPU_UP_CANCELED:
3482 case CPU_UP_CANCELED_FROZEN: 3500 case CPU_UP_CANCELED_FROZEN:
3483 for_each_rcu_flavor(rsp) 3501 for_each_rcu_flavor(rsp) {
3484 rcu_cleanup_dead_cpu(cpu, rsp); 3502 rcu_cleanup_dead_cpu(cpu, rsp);
3503 do_nocb_deferred_wakeup(per_cpu_ptr(rsp->rda, cpu));
3504 }
3485 break; 3505 break;
3486 default: 3506 default:
3487 break; 3507 break;
@@ -3761,6 +3781,8 @@ void __init rcu_init(void)
3761 pm_notifier(rcu_pm_notify, 0); 3781 pm_notifier(rcu_pm_notify, 0);
3762 for_each_online_cpu(cpu) 3782 for_each_online_cpu(cpu)
3763 rcu_cpu_notify(NULL, CPU_UP_PREPARE, (void *)(long)cpu); 3783 rcu_cpu_notify(NULL, CPU_UP_PREPARE, (void *)(long)cpu);
3784
3785 rcu_early_boot_tests();
3764} 3786}
3765 3787
3766#include "tree_plugin.h" 3788#include "tree_plugin.h"
diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h
index d03764652d91..8e7b1843896e 100644
--- a/kernel/rcu/tree.h
+++ b/kernel/rcu/tree.h
@@ -139,7 +139,7 @@ struct rcu_node {
139 unsigned long expmask; /* Groups that have ->blkd_tasks */ 139 unsigned long expmask; /* Groups that have ->blkd_tasks */
140 /* elements that need to drain to allow the */ 140 /* elements that need to drain to allow the */
141 /* current expedited grace period to */ 141 /* current expedited grace period to */
142 /* complete (only for TREE_PREEMPT_RCU). */ 142 /* complete (only for PREEMPT_RCU). */
143 unsigned long qsmaskinit; 143 unsigned long qsmaskinit;
144 /* Per-GP initial value for qsmask & expmask. */ 144 /* Per-GP initial value for qsmask & expmask. */
145 unsigned long grpmask; /* Mask to apply to parent qsmask. */ 145 unsigned long grpmask; /* Mask to apply to parent qsmask. */
@@ -530,10 +530,10 @@ DECLARE_PER_CPU(struct rcu_data, rcu_sched_data);
530extern struct rcu_state rcu_bh_state; 530extern struct rcu_state rcu_bh_state;
531DECLARE_PER_CPU(struct rcu_data, rcu_bh_data); 531DECLARE_PER_CPU(struct rcu_data, rcu_bh_data);
532 532
533#ifdef CONFIG_TREE_PREEMPT_RCU 533#ifdef CONFIG_PREEMPT_RCU
534extern struct rcu_state rcu_preempt_state; 534extern struct rcu_state rcu_preempt_state;
535DECLARE_PER_CPU(struct rcu_data, rcu_preempt_data); 535DECLARE_PER_CPU(struct rcu_data, rcu_preempt_data);
536#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */ 536#endif /* #ifdef CONFIG_PREEMPT_RCU */
537 537
538#ifdef CONFIG_RCU_BOOST 538#ifdef CONFIG_RCU_BOOST
539DECLARE_PER_CPU(unsigned int, rcu_cpu_kthread_status); 539DECLARE_PER_CPU(unsigned int, rcu_cpu_kthread_status);
@@ -547,7 +547,7 @@ DECLARE_PER_CPU(char, rcu_cpu_has_work);
547/* Forward declarations for rcutree_plugin.h */ 547/* Forward declarations for rcutree_plugin.h */
548static void rcu_bootup_announce(void); 548static void rcu_bootup_announce(void);
549long rcu_batches_completed(void); 549long rcu_batches_completed(void);
550static void rcu_preempt_note_context_switch(int cpu); 550static void rcu_preempt_note_context_switch(void);
551static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp); 551static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp);
552#ifdef CONFIG_HOTPLUG_CPU 552#ifdef CONFIG_HOTPLUG_CPU
553static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp, 553static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp,
@@ -561,12 +561,12 @@ static int rcu_preempt_offline_tasks(struct rcu_state *rsp,
561 struct rcu_node *rnp, 561 struct rcu_node *rnp,
562 struct rcu_data *rdp); 562 struct rcu_data *rdp);
563#endif /* #ifdef CONFIG_HOTPLUG_CPU */ 563#endif /* #ifdef CONFIG_HOTPLUG_CPU */
564static void rcu_preempt_check_callbacks(int cpu); 564static void rcu_preempt_check_callbacks(void);
565void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu)); 565void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu));
566#if defined(CONFIG_HOTPLUG_CPU) || defined(CONFIG_TREE_PREEMPT_RCU) 566#if defined(CONFIG_HOTPLUG_CPU) || defined(CONFIG_PREEMPT_RCU)
567static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp, 567static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp,
568 bool wake); 568 bool wake);
569#endif /* #if defined(CONFIG_HOTPLUG_CPU) || defined(CONFIG_TREE_PREEMPT_RCU) */ 569#endif /* #if defined(CONFIG_HOTPLUG_CPU) || defined(CONFIG_PREEMPT_RCU) */
570static void __init __rcu_init_preempt(void); 570static void __init __rcu_init_preempt(void);
571static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags); 571static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags);
572static void rcu_preempt_boost_start_gp(struct rcu_node *rnp); 572static void rcu_preempt_boost_start_gp(struct rcu_node *rnp);
@@ -579,14 +579,15 @@ static int rcu_spawn_one_boost_kthread(struct rcu_state *rsp,
579#endif /* #ifdef CONFIG_RCU_BOOST */ 579#endif /* #ifdef CONFIG_RCU_BOOST */
580static void __init rcu_spawn_boost_kthreads(void); 580static void __init rcu_spawn_boost_kthreads(void);
581static void rcu_prepare_kthreads(int cpu); 581static void rcu_prepare_kthreads(int cpu);
582static void rcu_cleanup_after_idle(int cpu); 582static void rcu_cleanup_after_idle(void);
583static void rcu_prepare_for_idle(int cpu); 583static void rcu_prepare_for_idle(void);
584static void rcu_idle_count_callbacks_posted(void); 584static void rcu_idle_count_callbacks_posted(void);
585static void print_cpu_stall_info_begin(void); 585static void print_cpu_stall_info_begin(void);
586static void print_cpu_stall_info(struct rcu_state *rsp, int cpu); 586static void print_cpu_stall_info(struct rcu_state *rsp, int cpu);
587static void print_cpu_stall_info_end(void); 587static void print_cpu_stall_info_end(void);
588static void zero_cpu_stall_ticks(struct rcu_data *rdp); 588static void zero_cpu_stall_ticks(struct rcu_data *rdp);
589static void increment_cpu_stall_ticks(void); 589static void increment_cpu_stall_ticks(void);
590static bool rcu_nocb_cpu_needs_barrier(struct rcu_state *rsp, int cpu);
590static void rcu_nocb_gp_set(struct rcu_node *rnp, int nrq); 591static void rcu_nocb_gp_set(struct rcu_node *rnp, int nrq);
591static void rcu_nocb_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp); 592static void rcu_nocb_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp);
592static void rcu_init_one_nocb(struct rcu_node *rnp); 593static void rcu_init_one_nocb(struct rcu_node *rnp);
@@ -605,8 +606,8 @@ static void __init rcu_organize_nocb_kthreads(struct rcu_state *rsp);
605#endif /* #ifdef CONFIG_RCU_NOCB_CPU */ 606#endif /* #ifdef CONFIG_RCU_NOCB_CPU */
606static void __maybe_unused rcu_kick_nohz_cpu(int cpu); 607static void __maybe_unused rcu_kick_nohz_cpu(int cpu);
607static bool init_nocb_callback_list(struct rcu_data *rdp); 608static bool init_nocb_callback_list(struct rcu_data *rdp);
608static void rcu_sysidle_enter(struct rcu_dynticks *rdtp, int irq); 609static void rcu_sysidle_enter(int irq);
609static void rcu_sysidle_exit(struct rcu_dynticks *rdtp, int irq); 610static void rcu_sysidle_exit(int irq);
610static void rcu_sysidle_check_cpu(struct rcu_data *rdp, bool *isidle, 611static void rcu_sysidle_check_cpu(struct rcu_data *rdp, bool *isidle,
611 unsigned long *maxj); 612 unsigned long *maxj);
612static bool is_sysidle_rcu_state(struct rcu_state *rsp); 613static bool is_sysidle_rcu_state(struct rcu_state *rsp);
diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
index 387dd4599344..3ec85cb5d544 100644
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -30,14 +30,24 @@
30#include <linux/smpboot.h> 30#include <linux/smpboot.h>
31#include "../time/tick-internal.h" 31#include "../time/tick-internal.h"
32 32
33#define RCU_KTHREAD_PRIO 1
34
35#ifdef CONFIG_RCU_BOOST 33#ifdef CONFIG_RCU_BOOST
34
36#include "../locking/rtmutex_common.h" 35#include "../locking/rtmutex_common.h"
37#define RCU_BOOST_PRIO CONFIG_RCU_BOOST_PRIO 36
38#else 37/* rcuc/rcub kthread realtime priority */
39#define RCU_BOOST_PRIO RCU_KTHREAD_PRIO 38static int kthread_prio = CONFIG_RCU_KTHREAD_PRIO;
40#endif 39module_param(kthread_prio, int, 0644);
40
41/*
42 * Control variables for per-CPU and per-rcu_node kthreads. These
43 * handle all flavors of RCU.
44 */
45static DEFINE_PER_CPU(struct task_struct *, rcu_cpu_kthread_task);
46DEFINE_PER_CPU(unsigned int, rcu_cpu_kthread_status);
47DEFINE_PER_CPU(unsigned int, rcu_cpu_kthread_loops);
48DEFINE_PER_CPU(char, rcu_cpu_has_work);
49
50#endif /* #ifdef CONFIG_RCU_BOOST */
41 51
42#ifdef CONFIG_RCU_NOCB_CPU 52#ifdef CONFIG_RCU_NOCB_CPU
43static cpumask_var_t rcu_nocb_mask; /* CPUs to have callbacks offloaded. */ 53static cpumask_var_t rcu_nocb_mask; /* CPUs to have callbacks offloaded. */
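The compile-time RCU_KTHREAD_PRIO / RCU_BOOST_PRIO constants become a single kthread_prio module parameter defaulting to CONFIG_RCU_KTHREAD_PRIO. Assuming this file keeps the usual "rcutree." parameter prefix, the priority can presumably be chosen at boot (e.g. rcutree.kthread_prio=2) instead of by rebuilding; the 0644 mode also exposes it read-write in sysfs, though kthreads that have already been set up keep whatever priority they were given. The later hunks apply it where the kthreads configure their scheduling policy, essentially:

        struct sched_param sp;

        sp.sched_priority = kthread_prio;       /* boot-time tunable, default CONFIG_RCU_KTHREAD_PRIO */
        sched_setscheduler_nocheck(current, SCHED_FIFO, &sp);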
@@ -72,9 +82,6 @@ static void __init rcu_bootup_announce_oddness(void)
72#ifdef CONFIG_RCU_TORTURE_TEST_RUNNABLE 82#ifdef CONFIG_RCU_TORTURE_TEST_RUNNABLE
73 pr_info("\tRCU torture testing starts during boot.\n"); 83 pr_info("\tRCU torture testing starts during boot.\n");
74#endif 84#endif
75#if defined(CONFIG_TREE_PREEMPT_RCU) && !defined(CONFIG_RCU_CPU_STALL_VERBOSE)
76 pr_info("\tDump stacks of tasks blocking RCU-preempt GP.\n");
77#endif
78#if defined(CONFIG_RCU_CPU_STALL_INFO) 85#if defined(CONFIG_RCU_CPU_STALL_INFO)
79 pr_info("\tAdditional per-CPU info printed with stalls.\n"); 86 pr_info("\tAdditional per-CPU info printed with stalls.\n");
80#endif 87#endif
@@ -85,9 +92,12 @@ static void __init rcu_bootup_announce_oddness(void)
85 pr_info("\tBoot-time adjustment of leaf fanout to %d.\n", rcu_fanout_leaf); 92 pr_info("\tBoot-time adjustment of leaf fanout to %d.\n", rcu_fanout_leaf);
86 if (nr_cpu_ids != NR_CPUS) 93 if (nr_cpu_ids != NR_CPUS)
87 pr_info("\tRCU restricting CPUs from NR_CPUS=%d to nr_cpu_ids=%d.\n", NR_CPUS, nr_cpu_ids); 94 pr_info("\tRCU restricting CPUs from NR_CPUS=%d to nr_cpu_ids=%d.\n", NR_CPUS, nr_cpu_ids);
95#ifdef CONFIG_RCU_BOOST
96 pr_info("\tRCU kthread priority: %d.\n", kthread_prio);
97#endif
88} 98}
89 99
90#ifdef CONFIG_TREE_PREEMPT_RCU 100#ifdef CONFIG_PREEMPT_RCU
91 101
92RCU_STATE_INITIALIZER(rcu_preempt, 'p', call_rcu); 102RCU_STATE_INITIALIZER(rcu_preempt, 'p', call_rcu);
93static struct rcu_state *rcu_state_p = &rcu_preempt_state; 103static struct rcu_state *rcu_state_p = &rcu_preempt_state;
@@ -156,7 +166,7 @@ static void rcu_preempt_qs(void)
156 * 166 *
157 * Caller must disable preemption. 167 * Caller must disable preemption.
158 */ 168 */
159static void rcu_preempt_note_context_switch(int cpu) 169static void rcu_preempt_note_context_switch(void)
160{ 170{
161 struct task_struct *t = current; 171 struct task_struct *t = current;
162 unsigned long flags; 172 unsigned long flags;
@@ -167,7 +177,7 @@ static void rcu_preempt_note_context_switch(int cpu)
167 !t->rcu_read_unlock_special.b.blocked) { 177 !t->rcu_read_unlock_special.b.blocked) {
168 178
169 /* Possibly blocking in an RCU read-side critical section. */ 179 /* Possibly blocking in an RCU read-side critical section. */
170 rdp = per_cpu_ptr(rcu_preempt_state.rda, cpu); 180 rdp = this_cpu_ptr(rcu_preempt_state.rda);
171 rnp = rdp->mynode; 181 rnp = rdp->mynode;
172 raw_spin_lock_irqsave(&rnp->lock, flags); 182 raw_spin_lock_irqsave(&rnp->lock, flags);
173 smp_mb__after_unlock_lock(); 183 smp_mb__after_unlock_lock();
@@ -415,8 +425,6 @@ void rcu_read_unlock_special(struct task_struct *t)
415 } 425 }
416} 426}
417 427
418#ifdef CONFIG_RCU_CPU_STALL_VERBOSE
419
420/* 428/*
421 * Dump detailed information for all tasks blocking the current RCU 429 * Dump detailed information for all tasks blocking the current RCU
422 * grace period on the specified rcu_node structure. 430 * grace period on the specified rcu_node structure.
@@ -451,14 +459,6 @@ static void rcu_print_detail_task_stall(struct rcu_state *rsp)
451 rcu_print_detail_task_stall_rnp(rnp); 459 rcu_print_detail_task_stall_rnp(rnp);
452} 460}
453 461
454#else /* #ifdef CONFIG_RCU_CPU_STALL_VERBOSE */
455
456static void rcu_print_detail_task_stall(struct rcu_state *rsp)
457{
458}
459
460#endif /* #else #ifdef CONFIG_RCU_CPU_STALL_VERBOSE */
461
462#ifdef CONFIG_RCU_CPU_STALL_INFO 462#ifdef CONFIG_RCU_CPU_STALL_INFO
463 463
464static void rcu_print_task_stall_begin(struct rcu_node *rnp) 464static void rcu_print_task_stall_begin(struct rcu_node *rnp)
@@ -621,7 +621,7 @@ static int rcu_preempt_offline_tasks(struct rcu_state *rsp,
621 * 621 *
622 * Caller must disable hard irqs. 622 * Caller must disable hard irqs.
623 */ 623 */
624static void rcu_preempt_check_callbacks(int cpu) 624static void rcu_preempt_check_callbacks(void)
625{ 625{
626 struct task_struct *t = current; 626 struct task_struct *t = current;
627 627
@@ -630,8 +630,8 @@ static void rcu_preempt_check_callbacks(int cpu)
630 return; 630 return;
631 } 631 }
632 if (t->rcu_read_lock_nesting > 0 && 632 if (t->rcu_read_lock_nesting > 0 &&
633 per_cpu(rcu_preempt_data, cpu).qs_pending && 633 __this_cpu_read(rcu_preempt_data.qs_pending) &&
634 !per_cpu(rcu_preempt_data, cpu).passed_quiesce) 634 !__this_cpu_read(rcu_preempt_data.passed_quiesce))
635 t->rcu_read_unlock_special.b.need_qs = true; 635 t->rcu_read_unlock_special.b.need_qs = true;
636} 636}
637 637
@@ -919,7 +919,7 @@ void exit_rcu(void)
919 __rcu_read_unlock(); 919 __rcu_read_unlock();
920} 920}
921 921
922#else /* #ifdef CONFIG_TREE_PREEMPT_RCU */ 922#else /* #ifdef CONFIG_PREEMPT_RCU */
923 923
924static struct rcu_state *rcu_state_p = &rcu_sched_state; 924static struct rcu_state *rcu_state_p = &rcu_sched_state;
925 925
@@ -945,7 +945,7 @@ EXPORT_SYMBOL_GPL(rcu_batches_completed);
945 * Because preemptible RCU does not exist, we never have to check for 945 * Because preemptible RCU does not exist, we never have to check for
946 * CPUs being in quiescent states. 946 * CPUs being in quiescent states.
947 */ 947 */
948static void rcu_preempt_note_context_switch(int cpu) 948static void rcu_preempt_note_context_switch(void)
949{ 949{
950} 950}
951 951
@@ -1017,7 +1017,7 @@ static int rcu_preempt_offline_tasks(struct rcu_state *rsp,
1017 * Because preemptible RCU does not exist, it never has any callbacks 1017 * Because preemptible RCU does not exist, it never has any callbacks
1018 * to check. 1018 * to check.
1019 */ 1019 */
1020static void rcu_preempt_check_callbacks(int cpu) 1020static void rcu_preempt_check_callbacks(void)
1021{ 1021{
1022} 1022}
1023 1023
@@ -1070,7 +1070,7 @@ void exit_rcu(void)
1070{ 1070{
1071} 1071}
1072 1072
1073#endif /* #else #ifdef CONFIG_TREE_PREEMPT_RCU */ 1073#endif /* #else #ifdef CONFIG_PREEMPT_RCU */
1074 1074
1075#ifdef CONFIG_RCU_BOOST 1075#ifdef CONFIG_RCU_BOOST
1076 1076
@@ -1326,7 +1326,7 @@ static int rcu_spawn_one_boost_kthread(struct rcu_state *rsp,
1326 smp_mb__after_unlock_lock(); 1326 smp_mb__after_unlock_lock();
1327 rnp->boost_kthread_task = t; 1327 rnp->boost_kthread_task = t;
1328 raw_spin_unlock_irqrestore(&rnp->lock, flags); 1328 raw_spin_unlock_irqrestore(&rnp->lock, flags);
1329 sp.sched_priority = RCU_BOOST_PRIO; 1329 sp.sched_priority = kthread_prio;
1330 sched_setscheduler_nocheck(t, SCHED_FIFO, &sp); 1330 sched_setscheduler_nocheck(t, SCHED_FIFO, &sp);
1331 wake_up_process(t); /* get to TASK_INTERRUPTIBLE quickly. */ 1331 wake_up_process(t); /* get to TASK_INTERRUPTIBLE quickly. */
1332 return 0; 1332 return 0;
@@ -1343,7 +1343,7 @@ static void rcu_cpu_kthread_setup(unsigned int cpu)
1343{ 1343{
1344 struct sched_param sp; 1344 struct sched_param sp;
1345 1345
1346 sp.sched_priority = RCU_KTHREAD_PRIO; 1346 sp.sched_priority = kthread_prio;
1347 sched_setscheduler_nocheck(current, SCHED_FIFO, &sp); 1347 sched_setscheduler_nocheck(current, SCHED_FIFO, &sp);
1348} 1348}
1349 1349
@@ -1512,10 +1512,10 @@ static void rcu_prepare_kthreads(int cpu)
1512 * any flavor of RCU. 1512 * any flavor of RCU.
1513 */ 1513 */
1514#ifndef CONFIG_RCU_NOCB_CPU_ALL 1514#ifndef CONFIG_RCU_NOCB_CPU_ALL
1515int rcu_needs_cpu(int cpu, unsigned long *delta_jiffies) 1515int rcu_needs_cpu(unsigned long *delta_jiffies)
1516{ 1516{
1517 *delta_jiffies = ULONG_MAX; 1517 *delta_jiffies = ULONG_MAX;
1518 return rcu_cpu_has_callbacks(cpu, NULL); 1518 return rcu_cpu_has_callbacks(NULL);
1519} 1519}
1520#endif /* #ifndef CONFIG_RCU_NOCB_CPU_ALL */ 1520#endif /* #ifndef CONFIG_RCU_NOCB_CPU_ALL */
1521 1521
@@ -1523,7 +1523,7 @@ int rcu_needs_cpu(int cpu, unsigned long *delta_jiffies)
1523 * Because we do not have RCU_FAST_NO_HZ, don't bother cleaning up 1523 * Because we do not have RCU_FAST_NO_HZ, don't bother cleaning up
1524 * after it. 1524 * after it.
1525 */ 1525 */
1526static void rcu_cleanup_after_idle(int cpu) 1526static void rcu_cleanup_after_idle(void)
1527{ 1527{
1528} 1528}
1529 1529
@@ -1531,7 +1531,7 @@ static void rcu_cleanup_after_idle(int cpu)
1531 * Do the idle-entry grace-period work, which, because CONFIG_RCU_FAST_NO_HZ=n, 1531 * Do the idle-entry grace-period work, which, because CONFIG_RCU_FAST_NO_HZ=n,
1532 * is nothing. 1532 * is nothing.
1533 */ 1533 */
1534static void rcu_prepare_for_idle(int cpu) 1534static void rcu_prepare_for_idle(void)
1535{ 1535{
1536} 1536}
1537 1537
@@ -1624,15 +1624,15 @@ static bool __maybe_unused rcu_try_advance_all_cbs(void)
1624 * The caller must have disabled interrupts. 1624 * The caller must have disabled interrupts.
1625 */ 1625 */
1626#ifndef CONFIG_RCU_NOCB_CPU_ALL 1626#ifndef CONFIG_RCU_NOCB_CPU_ALL
1627int rcu_needs_cpu(int cpu, unsigned long *dj) 1627int rcu_needs_cpu(unsigned long *dj)
1628{ 1628{
1629 struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu); 1629 struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks);
1630 1630
1631 /* Snapshot to detect later posting of non-lazy callback. */ 1631 /* Snapshot to detect later posting of non-lazy callback. */
1632 rdtp->nonlazy_posted_snap = rdtp->nonlazy_posted; 1632 rdtp->nonlazy_posted_snap = rdtp->nonlazy_posted;
1633 1633
1634 /* If no callbacks, RCU doesn't need the CPU. */ 1634 /* If no callbacks, RCU doesn't need the CPU. */
1635 if (!rcu_cpu_has_callbacks(cpu, &rdtp->all_lazy)) { 1635 if (!rcu_cpu_has_callbacks(&rdtp->all_lazy)) {
1636 *dj = ULONG_MAX; 1636 *dj = ULONG_MAX;
1637 return 0; 1637 return 0;
1638 } 1638 }
@@ -1666,12 +1666,12 @@ int rcu_needs_cpu(int cpu, unsigned long *dj)
1666 * 1666 *
1667 * The caller must have disabled interrupts. 1667 * The caller must have disabled interrupts.
1668 */ 1668 */
1669static void rcu_prepare_for_idle(int cpu) 1669static void rcu_prepare_for_idle(void)
1670{ 1670{
1671#ifndef CONFIG_RCU_NOCB_CPU_ALL 1671#ifndef CONFIG_RCU_NOCB_CPU_ALL
1672 bool needwake; 1672 bool needwake;
1673 struct rcu_data *rdp; 1673 struct rcu_data *rdp;
1674 struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu); 1674 struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks);
1675 struct rcu_node *rnp; 1675 struct rcu_node *rnp;
1676 struct rcu_state *rsp; 1676 struct rcu_state *rsp;
1677 int tne; 1677 int tne;
@@ -1679,7 +1679,7 @@ static void rcu_prepare_for_idle(int cpu)
1679 /* Handle nohz enablement switches conservatively. */ 1679 /* Handle nohz enablement switches conservatively. */
1680 tne = ACCESS_ONCE(tick_nohz_active); 1680 tne = ACCESS_ONCE(tick_nohz_active);
1681 if (tne != rdtp->tick_nohz_enabled_snap) { 1681 if (tne != rdtp->tick_nohz_enabled_snap) {
1682 if (rcu_cpu_has_callbacks(cpu, NULL)) 1682 if (rcu_cpu_has_callbacks(NULL))
1683 invoke_rcu_core(); /* force nohz to see update. */ 1683 invoke_rcu_core(); /* force nohz to see update. */
1684 rdtp->tick_nohz_enabled_snap = tne; 1684 rdtp->tick_nohz_enabled_snap = tne;
1685 return; 1685 return;
@@ -1688,7 +1688,7 @@ static void rcu_prepare_for_idle(int cpu)
1688 return; 1688 return;
1689 1689
1690 /* If this is a no-CBs CPU, no callbacks, just return. */ 1690 /* If this is a no-CBs CPU, no callbacks, just return. */
1691 if (rcu_is_nocb_cpu(cpu)) 1691 if (rcu_is_nocb_cpu(smp_processor_id()))
1692 return; 1692 return;
1693 1693
1694 /* 1694 /*
@@ -1712,7 +1712,7 @@ static void rcu_prepare_for_idle(int cpu)
1712 return; 1712 return;
1713 rdtp->last_accelerate = jiffies; 1713 rdtp->last_accelerate = jiffies;
1714 for_each_rcu_flavor(rsp) { 1714 for_each_rcu_flavor(rsp) {
1715 rdp = per_cpu_ptr(rsp->rda, cpu); 1715 rdp = this_cpu_ptr(rsp->rda);
1716 if (!*rdp->nxttail[RCU_DONE_TAIL]) 1716 if (!*rdp->nxttail[RCU_DONE_TAIL])
1717 continue; 1717 continue;
1718 rnp = rdp->mynode; 1718 rnp = rdp->mynode;
@@ -1731,10 +1731,10 @@ static void rcu_prepare_for_idle(int cpu)
1731 * any grace periods that elapsed while the CPU was idle, and if any 1731 * any grace periods that elapsed while the CPU was idle, and if any
1732 * callbacks are now ready to invoke, initiate invocation. 1732 * callbacks are now ready to invoke, initiate invocation.
1733 */ 1733 */
1734static void rcu_cleanup_after_idle(int cpu) 1734static void rcu_cleanup_after_idle(void)
1735{ 1735{
1736#ifndef CONFIG_RCU_NOCB_CPU_ALL 1736#ifndef CONFIG_RCU_NOCB_CPU_ALL
1737 if (rcu_is_nocb_cpu(cpu)) 1737 if (rcu_is_nocb_cpu(smp_processor_id()))
1738 return; 1738 return;
1739 if (rcu_try_advance_all_cbs()) 1739 if (rcu_try_advance_all_cbs())
1740 invoke_rcu_core(); 1740 invoke_rcu_core();
@@ -2050,6 +2050,33 @@ static void wake_nocb_leader(struct rcu_data *rdp, bool force)
2050} 2050}
2051 2051
2052/* 2052/*
2053 * Does the specified CPU need an RCU callback for the specified flavor
2054 * of rcu_barrier()?
2055 */
2056static bool rcu_nocb_cpu_needs_barrier(struct rcu_state *rsp, int cpu)
2057{
2058 struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu);
2059 struct rcu_head *rhp;
2060
2061 /* No-CBs CPUs might have callbacks on any of three lists. */
2062 rhp = ACCESS_ONCE(rdp->nocb_head);
2063 if (!rhp)
2064 rhp = ACCESS_ONCE(rdp->nocb_gp_head);
2065 if (!rhp)
2066 rhp = ACCESS_ONCE(rdp->nocb_follower_head);
2067
2068 /* Having no rcuo kthread but CBs after scheduler starts is bad! */
2069 if (!ACCESS_ONCE(rdp->nocb_kthread) && rhp) {
2070 /* RCU callback enqueued before CPU first came online??? */
2071 pr_err("RCU: Never-onlined no-CBs CPU %d has CB %p\n",
2072 cpu, rhp->func);
2073 WARN_ON_ONCE(1);
2074 }
2075
2076 return !!rhp;
2077}
2078
2079/*
2053 * Enqueue the specified string of rcu_head structures onto the specified 2080 * Enqueue the specified string of rcu_head structures onto the specified
2054 * CPU's no-CBs lists. The CPU is specified by rdp, the head of the 2081 * CPU's no-CBs lists. The CPU is specified by rdp, the head of the
2055 * string by rhp, and the tail of the string by rhtp. The non-lazy/lazy 2082 * string by rhp, and the tail of the string by rhtp. The non-lazy/lazy
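This helper backs the _rcu_barrier() change earlier in the patch: an offloaded (no-CBs) CPU only gets a barrier callback posted if one of its three callback lists is non-empty. The caller-side shape, heavily condensed (the real loop also distinguishes online non-offloaded CPUs from offline ones):

        for_each_possible_cpu(cpu) {
                struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu);

                if (rcu_is_nocb_cpu(cpu) && !rcu_nocb_cpu_needs_barrier(rsp, cpu))
                        continue;       /* nothing queued, nothing to wait for */
                atomic_inc(&rsp->barrier_cpu_count);
                __call_rcu(&rdp->barrier_head, rcu_barrier_callback, rsp, cpu, 0);
        }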
@@ -2546,9 +2573,13 @@ static void rcu_spawn_one_nocb_kthread(struct rcu_state *rsp, int cpu)
2546 rdp->nocb_leader = rdp_spawn; 2573 rdp->nocb_leader = rdp_spawn;
2547 if (rdp_last && rdp != rdp_spawn) 2574 if (rdp_last && rdp != rdp_spawn)
2548 rdp_last->nocb_next_follower = rdp; 2575 rdp_last->nocb_next_follower = rdp;
2549 rdp_last = rdp; 2576 if (rdp == rdp_spawn) {
2550 rdp = rdp->nocb_next_follower; 2577 rdp = rdp->nocb_next_follower;
2551 rdp_last->nocb_next_follower = NULL; 2578 } else {
2579 rdp_last = rdp;
2580 rdp = rdp->nocb_next_follower;
2581 rdp_last->nocb_next_follower = NULL;
2582 }
2552 } while (rdp); 2583 } while (rdp);
2553 rdp_spawn->nocb_next_follower = rdp_old_leader; 2584 rdp_spawn->nocb_next_follower = rdp_old_leader;
2554 } 2585 }
@@ -2642,6 +2673,12 @@ static bool init_nocb_callback_list(struct rcu_data *rdp)
2642 2673
2643#else /* #ifdef CONFIG_RCU_NOCB_CPU */ 2674#else /* #ifdef CONFIG_RCU_NOCB_CPU */
2644 2675
2676static bool rcu_nocb_cpu_needs_barrier(struct rcu_state *rsp, int cpu)
2677{
2678 WARN_ON_ONCE(1); /* Should be dead code. */
2679 return false;
2680}
2681
2645static void rcu_nocb_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp) 2682static void rcu_nocb_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp)
2646{ 2683{
2647} 2684}
@@ -2728,9 +2765,10 @@ static int full_sysidle_state; /* Current system-idle state. */
2728 * to detect full-system idle states, not RCU quiescent states and grace 2765 * to detect full-system idle states, not RCU quiescent states and grace
2729 * periods. The caller must have disabled interrupts. 2766 * periods. The caller must have disabled interrupts.
2730 */ 2767 */
2731static void rcu_sysidle_enter(struct rcu_dynticks *rdtp, int irq) 2768static void rcu_sysidle_enter(int irq)
2732{ 2769{
2733 unsigned long j; 2770 unsigned long j;
2771 struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks);
2734 2772
2735 /* If there are no nohz_full= CPUs, no need to track this. */ 2773 /* If there are no nohz_full= CPUs, no need to track this. */
2736 if (!tick_nohz_full_enabled()) 2774 if (!tick_nohz_full_enabled())
@@ -2799,8 +2837,10 @@ void rcu_sysidle_force_exit(void)
2799 * usermode execution does -not- count as idle here! The caller must 2837 * usermode execution does -not- count as idle here! The caller must
2800 * have disabled interrupts. 2838 * have disabled interrupts.
2801 */ 2839 */
2802static void rcu_sysidle_exit(struct rcu_dynticks *rdtp, int irq) 2840static void rcu_sysidle_exit(int irq)
2803{ 2841{
2842 struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks);
2843
2804 /* If there are no nohz_full= CPUs, no need to track this. */ 2844 /* If there are no nohz_full= CPUs, no need to track this. */
2805 if (!tick_nohz_full_enabled()) 2845 if (!tick_nohz_full_enabled())
2806 return; 2846 return;
@@ -3094,11 +3134,11 @@ static void rcu_sysidle_init_percpu_data(struct rcu_dynticks *rdtp)
3094 3134
3095#else /* #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */ 3135#else /* #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */
3096 3136
3097static void rcu_sysidle_enter(struct rcu_dynticks *rdtp, int irq) 3137static void rcu_sysidle_enter(int irq)
3098{ 3138{
3099} 3139}
3100 3140
3101static void rcu_sysidle_exit(struct rcu_dynticks *rdtp, int irq) 3141static void rcu_sysidle_exit(int irq)
3102{ 3142{
3103} 3143}
3104 3144
diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c
index 3ef8ba58694e..e0d31a345ee6 100644
--- a/kernel/rcu/update.c
+++ b/kernel/rcu/update.c
@@ -306,7 +306,7 @@ struct debug_obj_descr rcuhead_debug_descr = {
306EXPORT_SYMBOL_GPL(rcuhead_debug_descr); 306EXPORT_SYMBOL_GPL(rcuhead_debug_descr);
307#endif /* #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD */ 307#endif /* #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD */
308 308
309#if defined(CONFIG_TREE_RCU) || defined(CONFIG_TREE_PREEMPT_RCU) || defined(CONFIG_RCU_TRACE) 309#if defined(CONFIG_TREE_RCU) || defined(CONFIG_PREEMPT_RCU) || defined(CONFIG_RCU_TRACE)
310void do_trace_rcu_torture_read(const char *rcutorturename, struct rcu_head *rhp, 310void do_trace_rcu_torture_read(const char *rcutorturename, struct rcu_head *rhp,
311 unsigned long secs, 311 unsigned long secs,
312 unsigned long c_old, unsigned long c) 312 unsigned long c_old, unsigned long c)
@@ -531,7 +531,8 @@ static int __noreturn rcu_tasks_kthread(void *arg)
531 struct rcu_head *next; 531 struct rcu_head *next;
532 LIST_HEAD(rcu_tasks_holdouts); 532 LIST_HEAD(rcu_tasks_holdouts);
533 533
534 /* FIXME: Add housekeeping affinity. */ 534 /* Run on housekeeping CPUs by default. Sysadm can move if desired. */
535 housekeeping_affine(current);
535 536
536 /* 537 /*
537 * Each pass through the following loop makes one check for 538 * Each pass through the following loop makes one check for
@@ -690,3 +691,87 @@ static void rcu_spawn_tasks_kthread(void)
690} 691}
691 692
692#endif /* #ifdef CONFIG_TASKS_RCU */ 693#endif /* #ifdef CONFIG_TASKS_RCU */
694
695#ifdef CONFIG_PROVE_RCU
696
697/*
698 * Early boot self test parameters, one for each flavor
699 */
700static bool rcu_self_test;
701static bool rcu_self_test_bh;
702static bool rcu_self_test_sched;
703
704module_param(rcu_self_test, bool, 0444);
705module_param(rcu_self_test_bh, bool, 0444);
706module_param(rcu_self_test_sched, bool, 0444);
707
708static int rcu_self_test_counter;
709
710static void test_callback(struct rcu_head *r)
711{
712 rcu_self_test_counter++;
713 pr_info("RCU test callback executed %d\n", rcu_self_test_counter);
714}
715
716static void early_boot_test_call_rcu(void)
717{
718 static struct rcu_head head;
719
720 call_rcu(&head, test_callback);
721}
722
723static void early_boot_test_call_rcu_bh(void)
724{
725 static struct rcu_head head;
726
727 call_rcu_bh(&head, test_callback);
728}
729
730static void early_boot_test_call_rcu_sched(void)
731{
732 static struct rcu_head head;
733
734 call_rcu_sched(&head, test_callback);
735}
736
737void rcu_early_boot_tests(void)
738{
739 pr_info("Running RCU self tests\n");
740
741 if (rcu_self_test)
742 early_boot_test_call_rcu();
743 if (rcu_self_test_bh)
744 early_boot_test_call_rcu_bh();
745 if (rcu_self_test_sched)
746 early_boot_test_call_rcu_sched();
747}
748
749static int rcu_verify_early_boot_tests(void)
750{
751 int ret = 0;
752 int early_boot_test_counter = 0;
753
754 if (rcu_self_test) {
755 early_boot_test_counter++;
756 rcu_barrier();
757 }
758 if (rcu_self_test_bh) {
759 early_boot_test_counter++;
760 rcu_barrier_bh();
761 }
762 if (rcu_self_test_sched) {
763 early_boot_test_counter++;
764 rcu_barrier_sched();
765 }
766
767 if (rcu_self_test_counter != early_boot_test_counter) {
768 WARN_ON(1);
769 ret = -1;
770 }
771
772 return ret;
773}
774late_initcall(rcu_verify_early_boot_tests);
775#else
776void rcu_early_boot_tests(void) {}
777#endif /* CONFIG_PROVE_RCU */
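The self tests queue one callback per enabled flavor during early boot and check from a late_initcall that each has run by then. Assuming the usual "rcupdate." parameter prefix for this file, they would presumably be switched on with boot options such as rcupdate.rcu_self_test=1. The verification idiom itself is reusable; a minimal single-flavor analogue with hypothetical names (ordinary kernel code, needing only the usual rcupdate/init headers):

        static int my_cb_ran;
        static struct rcu_head my_head;

        static void my_cb(struct rcu_head *unused)
        {
                my_cb_ran = 1;                  /* runs after a grace period elapses */
        }

        /* Queue the callback somewhere early, once call_rcu() is usable:
         *      call_rcu(&my_head, my_cb);
         */

        static int __init my_selftest_verify(void)
        {
                rcu_barrier();                  /* wait for all outstanding call_rcu() callbacks */
                WARN_ON(!my_cb_ran);
                return 0;
        }
        late_initcall(my_selftest_verify);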
diff --git a/kernel/res_counter.c b/kernel/res_counter.c
deleted file mode 100644
index e791130f85a7..000000000000
--- a/kernel/res_counter.c
+++ /dev/null
@@ -1,211 +0,0 @@
1/*
2 * resource cgroups
3 *
4 * Copyright 2007 OpenVZ SWsoft Inc
5 *
6 * Author: Pavel Emelianov <xemul@openvz.org>
7 *
8 */
9
10#include <linux/types.h>
11#include <linux/parser.h>
12#include <linux/fs.h>
13#include <linux/res_counter.h>
14#include <linux/uaccess.h>
15#include <linux/mm.h>
16
17void res_counter_init(struct res_counter *counter, struct res_counter *parent)
18{
19 spin_lock_init(&counter->lock);
20 counter->limit = RES_COUNTER_MAX;
21 counter->soft_limit = RES_COUNTER_MAX;
22 counter->parent = parent;
23}
24
25static u64 res_counter_uncharge_locked(struct res_counter *counter,
26 unsigned long val)
27{
28 if (WARN_ON(counter->usage < val))
29 val = counter->usage;
30
31 counter->usage -= val;
32 return counter->usage;
33}
34
35static int res_counter_charge_locked(struct res_counter *counter,
36 unsigned long val, bool force)
37{
38 int ret = 0;
39
40 if (counter->usage + val > counter->limit) {
41 counter->failcnt++;
42 ret = -ENOMEM;
43 if (!force)
44 return ret;
45 }
46
47 counter->usage += val;
48 if (counter->usage > counter->max_usage)
49 counter->max_usage = counter->usage;
50 return ret;
51}
52
53static int __res_counter_charge(struct res_counter *counter, unsigned long val,
54 struct res_counter **limit_fail_at, bool force)
55{
56 int ret, r;
57 unsigned long flags;
58 struct res_counter *c, *u;
59
60 r = ret = 0;
61 *limit_fail_at = NULL;
62 local_irq_save(flags);
63 for (c = counter; c != NULL; c = c->parent) {
64 spin_lock(&c->lock);
65 r = res_counter_charge_locked(c, val, force);
66 spin_unlock(&c->lock);
67 if (r < 0 && !ret) {
68 ret = r;
69 *limit_fail_at = c;
70 if (!force)
71 break;
72 }
73 }
74
75 if (ret < 0 && !force) {
76 for (u = counter; u != c; u = u->parent) {
77 spin_lock(&u->lock);
78 res_counter_uncharge_locked(u, val);
79 spin_unlock(&u->lock);
80 }
81 }
82 local_irq_restore(flags);
83
84 return ret;
85}
86
87int res_counter_charge(struct res_counter *counter, unsigned long val,
88 struct res_counter **limit_fail_at)
89{
90 return __res_counter_charge(counter, val, limit_fail_at, false);
91}
92
93int res_counter_charge_nofail(struct res_counter *counter, unsigned long val,
94 struct res_counter **limit_fail_at)
95{
96 return __res_counter_charge(counter, val, limit_fail_at, true);
97}
98
99u64 res_counter_uncharge_until(struct res_counter *counter,
100 struct res_counter *top,
101 unsigned long val)
102{
103 unsigned long flags;
104 struct res_counter *c;
105 u64 ret = 0;
106
107 local_irq_save(flags);
108 for (c = counter; c != top; c = c->parent) {
109 u64 r;
110 spin_lock(&c->lock);
111 r = res_counter_uncharge_locked(c, val);
112 if (c == counter)
113 ret = r;
114 spin_unlock(&c->lock);
115 }
116 local_irq_restore(flags);
117 return ret;
118}
119
120u64 res_counter_uncharge(struct res_counter *counter, unsigned long val)
121{
122 return res_counter_uncharge_until(counter, NULL, val);
123}
124
125static inline unsigned long long *
126res_counter_member(struct res_counter *counter, int member)
127{
128 switch (member) {
129 case RES_USAGE:
130 return &counter->usage;
131 case RES_MAX_USAGE:
132 return &counter->max_usage;
133 case RES_LIMIT:
134 return &counter->limit;
135 case RES_FAILCNT:
136 return &counter->failcnt;
137 case RES_SOFT_LIMIT:
138 return &counter->soft_limit;
139 };
140
141 BUG();
142 return NULL;
143}
144
145ssize_t res_counter_read(struct res_counter *counter, int member,
146 const char __user *userbuf, size_t nbytes, loff_t *pos,
147 int (*read_strategy)(unsigned long long val, char *st_buf))
148{
149 unsigned long long *val;
150 char buf[64], *s;
151
152 s = buf;
153 val = res_counter_member(counter, member);
154 if (read_strategy)
155 s += read_strategy(*val, s);
156 else
157 s += sprintf(s, "%llu\n", *val);
158 return simple_read_from_buffer((void __user *)userbuf, nbytes,
159 pos, buf, s - buf);
160}
161
162#if BITS_PER_LONG == 32
163u64 res_counter_read_u64(struct res_counter *counter, int member)
164{
165 unsigned long flags;
166 u64 ret;
167
168 spin_lock_irqsave(&counter->lock, flags);
169 ret = *res_counter_member(counter, member);
170 spin_unlock_irqrestore(&counter->lock, flags);
171
172 return ret;
173}
174#else
175u64 res_counter_read_u64(struct res_counter *counter, int member)
176{
177 return *res_counter_member(counter, member);
178}
179#endif
180
181int res_counter_memparse_write_strategy(const char *buf,
182 unsigned long long *resp)
183{
184 char *end;
185 unsigned long long res;
186
187 /* return RES_COUNTER_MAX(unlimited) if "-1" is specified */
188 if (*buf == '-') {
189 int rc = kstrtoull(buf + 1, 10, &res);
190
191 if (rc)
192 return rc;
193 if (res != 1)
194 return -EINVAL;
195 *resp = RES_COUNTER_MAX;
196 return 0;
197 }
198
199 res = memparse(buf, &end);
200 if (*end != '\0')
201 return -EINVAL;
202
203 if (PAGE_ALIGN(res) >= res)
204 res = PAGE_ALIGN(res);
205 else
206 res = RES_COUNTER_MAX;
207
208 *resp = res;
209
210 return 0;
211}
diff --git a/kernel/sched/completion.c b/kernel/sched/completion.c
index a63f4dc27909..607f852b4d04 100644
--- a/kernel/sched/completion.c
+++ b/kernel/sched/completion.c
@@ -148,7 +148,7 @@ EXPORT_SYMBOL(wait_for_completion_timeout);
148 * 148 *
149 * This waits to be signaled for completion of a specific task. It is NOT 149 * This waits to be signaled for completion of a specific task. It is NOT
150 * interruptible and there is no timeout. The caller is accounted as waiting 150 * interruptible and there is no timeout. The caller is accounted as waiting
151 * for IO. 151 * for IO (which traditionally means blkio only).
152 */ 152 */
153void __sched wait_for_completion_io(struct completion *x) 153void __sched wait_for_completion_io(struct completion *x)
154{ 154{
@@ -163,7 +163,8 @@ EXPORT_SYMBOL(wait_for_completion_io);
163 * 163 *
164 * This waits for either a completion of a specific task to be signaled or for a 164 * This waits for either a completion of a specific task to be signaled or for a
165 * specified timeout to expire. The timeout is in jiffies. It is not 165 * specified timeout to expire. The timeout is in jiffies. It is not
166 * interruptible. The caller is accounted as waiting for IO. 166 * interruptible. The caller is accounted as waiting for IO (which traditionally
167 * means blkio only).
167 * 168 *
168 * Return: 0 if timed out, and positive (at least 1, or number of jiffies left 169 * Return: 0 if timed out, and positive (at least 1, or number of jiffies left
169 * till timeout) if completed. 170 * till timeout) if completed.
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 44999505e1bf..b5797b78add6 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1008,6 +1008,9 @@ inline int task_curr(const struct task_struct *p)
1008 return cpu_curr(task_cpu(p)) == p; 1008 return cpu_curr(task_cpu(p)) == p;
1009} 1009}
1010 1010
1011/*
1012 * Can drop rq->lock because sched_class::switched_from() methods may drop it.
1013 */
1011static inline void check_class_changed(struct rq *rq, struct task_struct *p, 1014static inline void check_class_changed(struct rq *rq, struct task_struct *p,
1012 const struct sched_class *prev_class, 1015 const struct sched_class *prev_class,
1013 int oldprio) 1016 int oldprio)
@@ -1015,6 +1018,7 @@ static inline void check_class_changed(struct rq *rq, struct task_struct *p,
1015 if (prev_class != p->sched_class) { 1018 if (prev_class != p->sched_class) {
1016 if (prev_class->switched_from) 1019 if (prev_class->switched_from)
1017 prev_class->switched_from(rq, p); 1020 prev_class->switched_from(rq, p);
1021 /* Possible rq->lock 'hole'. */
1018 p->sched_class->switched_to(rq, p); 1022 p->sched_class->switched_to(rq, p);
1019 } else if (oldprio != p->prio || dl_task(p)) 1023 } else if (oldprio != p->prio || dl_task(p))
1020 p->sched_class->prio_changed(rq, p, oldprio); 1024 p->sched_class->prio_changed(rq, p, oldprio);
@@ -1054,7 +1058,7 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
1054 * ttwu() will sort out the placement. 1058 * ttwu() will sort out the placement.
1055 */ 1059 */
1056 WARN_ON_ONCE(p->state != TASK_RUNNING && p->state != TASK_WAKING && 1060 WARN_ON_ONCE(p->state != TASK_RUNNING && p->state != TASK_WAKING &&
1057 !(task_preempt_count(p) & PREEMPT_ACTIVE)); 1061 !p->on_rq);
1058 1062
1059#ifdef CONFIG_LOCKDEP 1063#ifdef CONFIG_LOCKDEP
1060 /* 1064 /*
@@ -1407,7 +1411,8 @@ out:
1407static inline 1411static inline
1408int select_task_rq(struct task_struct *p, int cpu, int sd_flags, int wake_flags) 1412int select_task_rq(struct task_struct *p, int cpu, int sd_flags, int wake_flags)
1409{ 1413{
1410 cpu = p->sched_class->select_task_rq(p, cpu, sd_flags, wake_flags); 1414 if (p->nr_cpus_allowed > 1)
1415 cpu = p->sched_class->select_task_rq(p, cpu, sd_flags, wake_flags);
1411 1416
1412 /* 1417 /*
1413 * In order not to call set_task_cpu() on a blocking task we need 1418 * In order not to call set_task_cpu() on a blocking task we need
@@ -1623,8 +1628,10 @@ void wake_up_if_idle(int cpu)
1623 struct rq *rq = cpu_rq(cpu); 1628 struct rq *rq = cpu_rq(cpu);
1624 unsigned long flags; 1629 unsigned long flags;
1625 1630
1626 if (!is_idle_task(rq->curr)) 1631 rcu_read_lock();
1627 return; 1632
1633 if (!is_idle_task(rcu_dereference(rq->curr)))
1634 goto out;
1628 1635
1629 if (set_nr_if_polling(rq->idle)) { 1636 if (set_nr_if_polling(rq->idle)) {
1630 trace_sched_wake_idle_without_ipi(cpu); 1637 trace_sched_wake_idle_without_ipi(cpu);
@@ -1635,6 +1642,9 @@ void wake_up_if_idle(int cpu)
1635 /* Else cpu is not in idle, do nothing here */ 1642 /* Else cpu is not in idle, do nothing here */
1636 raw_spin_unlock_irqrestore(&rq->lock, flags); 1643 raw_spin_unlock_irqrestore(&rq->lock, flags);
1637 } 1644 }
1645
1646out:
1647 rcu_read_unlock();
1638} 1648}
1639 1649
1640bool cpus_share_cache(int this_cpu, int that_cpu) 1650bool cpus_share_cache(int this_cpu, int that_cpu)
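The unlocked early-return peek at rq->curr now sits inside an RCU read-side section: the task a remote CPU is running can exit and have its task_struct freed at any time, so the pointer is fetched with rcu_dereference() and only dereferenced while rcu_read_lock() is held. Reduced to the bare pattern:

        rcu_read_lock();
        if (!is_idle_task(rcu_dereference(rq->curr)))   /* remote curr may go away after unlock */
                goto out;
        /* ... take rq->lock and do the real wakeup work ... */
        out:
        rcu_read_unlock();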
@@ -1853,12 +1863,10 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
1853 p->numa_scan_seq = p->mm ? p->mm->numa_scan_seq : 0; 1863 p->numa_scan_seq = p->mm ? p->mm->numa_scan_seq : 0;
1854 p->numa_scan_period = sysctl_numa_balancing_scan_delay; 1864 p->numa_scan_period = sysctl_numa_balancing_scan_delay;
1855 p->numa_work.next = &p->numa_work; 1865 p->numa_work.next = &p->numa_work;
1856 p->numa_faults_memory = NULL; 1866 p->numa_faults = NULL;
1857 p->numa_faults_buffer_memory = NULL;
1858 p->last_task_numa_placement = 0; 1867 p->last_task_numa_placement = 0;
1859 p->last_sum_exec_runtime = 0; 1868 p->last_sum_exec_runtime = 0;
1860 1869
1861 INIT_LIST_HEAD(&p->numa_entry);
1862 p->numa_group = NULL; 1870 p->numa_group = NULL;
1863#endif /* CONFIG_NUMA_BALANCING */ 1871#endif /* CONFIG_NUMA_BALANCING */
1864} 1872}
@@ -2034,25 +2042,6 @@ static inline int dl_bw_cpus(int i)
2034} 2042}
2035#endif 2043#endif
2036 2044
2037static inline
2038void __dl_clear(struct dl_bw *dl_b, u64 tsk_bw)
2039{
2040 dl_b->total_bw -= tsk_bw;
2041}
2042
2043static inline
2044void __dl_add(struct dl_bw *dl_b, u64 tsk_bw)
2045{
2046 dl_b->total_bw += tsk_bw;
2047}
2048
2049static inline
2050bool __dl_overflow(struct dl_bw *dl_b, int cpus, u64 old_bw, u64 new_bw)
2051{
2052 return dl_b->bw != -1 &&
2053 dl_b->bw * cpus < dl_b->total_bw - old_bw + new_bw;
2054}
2055
2056/* 2045/*
2057 * We must be sure that accepting a new task (or allowing changing the 2046 * We must be sure that accepting a new task (or allowing changing the
2058 * parameters of an existing one) is consistent with the bandwidth 2047 * parameters of an existing one) is consistent with the bandwidth
@@ -2220,7 +2209,6 @@ prepare_task_switch(struct rq *rq, struct task_struct *prev,
2220 2209
2221/** 2210/**
2222 * finish_task_switch - clean up after a task-switch 2211 * finish_task_switch - clean up after a task-switch
2223 * @rq: runqueue associated with task-switch
2224 * @prev: the thread we just switched away from. 2212 * @prev: the thread we just switched away from.
2225 * 2213 *
2226 * finish_task_switch must be called after the context switch, paired 2214 * finish_task_switch must be called after the context switch, paired
@@ -2232,10 +2220,16 @@ prepare_task_switch(struct rq *rq, struct task_struct *prev,
2232 * so, we finish that here outside of the runqueue lock. (Doing it 2220 * so, we finish that here outside of the runqueue lock. (Doing it
2233 * with the lock held can cause deadlocks; see schedule() for 2221 * with the lock held can cause deadlocks; see schedule() for
2234 * details.) 2222 * details.)
2223 *
2224 * The context switch has flipped the stack from under us and restored the
2225 * local variables which were saved when this task called schedule() in the
2226 * past. prev == current is still correct but we need to recalculate this_rq
2227 * because prev may have moved to another CPU.
2235 */ 2228 */
2236static void finish_task_switch(struct rq *rq, struct task_struct *prev) 2229static struct rq *finish_task_switch(struct task_struct *prev)
2237 __releases(rq->lock) 2230 __releases(rq->lock)
2238{ 2231{
2232 struct rq *rq = this_rq();
2239 struct mm_struct *mm = rq->prev_mm; 2233 struct mm_struct *mm = rq->prev_mm;
2240 long prev_state; 2234 long prev_state;
2241 2235
@@ -2275,6 +2269,7 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev)
2275 } 2269 }
2276 2270
2277 tick_nohz_task_switch(current); 2271 tick_nohz_task_switch(current);
2272 return rq;
2278} 2273}
2279 2274
2280#ifdef CONFIG_SMP 2275#ifdef CONFIG_SMP
@@ -2309,25 +2304,22 @@ static inline void post_schedule(struct rq *rq)
2309asmlinkage __visible void schedule_tail(struct task_struct *prev) 2304asmlinkage __visible void schedule_tail(struct task_struct *prev)
2310 __releases(rq->lock) 2305 __releases(rq->lock)
2311{ 2306{
2312 struct rq *rq = this_rq(); 2307 struct rq *rq;
2313
2314 finish_task_switch(rq, prev);
2315 2308
2316 /* 2309 /* finish_task_switch() drops rq->lock and enables preemption */
2317 * FIXME: do we need to worry about rq being invalidated by the 2310 preempt_disable();
2318 * task_switch? 2311 rq = finish_task_switch(prev);
2319 */
2320 post_schedule(rq); 2312 post_schedule(rq);
2313 preempt_enable();
2321 2314
2322 if (current->set_child_tid) 2315 if (current->set_child_tid)
2323 put_user(task_pid_vnr(current), current->set_child_tid); 2316 put_user(task_pid_vnr(current), current->set_child_tid);
2324} 2317}
2325 2318
2326/* 2319/*
2327 * context_switch - switch to the new MM and the new 2320 * context_switch - switch to the new MM and the new thread's register state.
2328 * thread's register state.
2329 */ 2321 */
2330static inline void 2322static inline struct rq *
2331context_switch(struct rq *rq, struct task_struct *prev, 2323context_switch(struct rq *rq, struct task_struct *prev,
2332 struct task_struct *next) 2324 struct task_struct *next)
2333{ 2325{
@@ -2366,14 +2358,9 @@ context_switch(struct rq *rq, struct task_struct *prev,
2366 context_tracking_task_switch(prev, next); 2358 context_tracking_task_switch(prev, next);
2367 /* Here we just switch the register state and the stack. */ 2359 /* Here we just switch the register state and the stack. */
2368 switch_to(prev, next, prev); 2360 switch_to(prev, next, prev);
2369
2370 barrier(); 2361 barrier();
2371 /* 2362
2372 * this_rq must be evaluated again because prev may have moved 2363 return finish_task_switch(prev);
2373 * CPUs since it called schedule(), thus the 'rq' on its stack
2374 * frame will be invalid.
2375 */
2376 finish_task_switch(this_rq(), prev);
2377} 2364}
2378 2365
2379/* 2366/*
@@ -2475,44 +2462,6 @@ EXPORT_PER_CPU_SYMBOL(kstat);
2475EXPORT_PER_CPU_SYMBOL(kernel_cpustat); 2462EXPORT_PER_CPU_SYMBOL(kernel_cpustat);
2476 2463
2477/* 2464/*
2478 * Return any ns on the sched_clock that have not yet been accounted in
2479 * @p in case that task is currently running.
2480 *
2481 * Called with task_rq_lock() held on @rq.
2482 */
2483static u64 do_task_delta_exec(struct task_struct *p, struct rq *rq)
2484{
2485 u64 ns = 0;
2486
2487 /*
2488 * Must be ->curr _and_ ->on_rq. If dequeued, we would
2489 * project cycles that may never be accounted to this
2490 * thread, breaking clock_gettime().
2491 */
2492 if (task_current(rq, p) && task_on_rq_queued(p)) {
2493 update_rq_clock(rq);
2494 ns = rq_clock_task(rq) - p->se.exec_start;
2495 if ((s64)ns < 0)
2496 ns = 0;
2497 }
2498
2499 return ns;
2500}
2501
2502unsigned long long task_delta_exec(struct task_struct *p)
2503{
2504 unsigned long flags;
2505 struct rq *rq;
2506 u64 ns = 0;
2507
2508 rq = task_rq_lock(p, &flags);
2509 ns = do_task_delta_exec(p, rq);
2510 task_rq_unlock(rq, p, &flags);
2511
2512 return ns;
2513}
2514
2515/*
2516 * Return accounted runtime for the task. 2465 * Return accounted runtime for the task.
2517 * In case the task is currently running, return the runtime plus current's 2466 * In case the task is currently running, return the runtime plus current's
2518 * pending runtime that has not been accounted yet. 2467 * pending runtime that has not been accounted yet.
@@ -2521,7 +2470,7 @@ unsigned long long task_sched_runtime(struct task_struct *p)
2521{ 2470{
2522 unsigned long flags; 2471 unsigned long flags;
2523 struct rq *rq; 2472 struct rq *rq;
2524 u64 ns = 0; 2473 u64 ns;
2525 2474
2526#if defined(CONFIG_64BIT) && defined(CONFIG_SMP) 2475#if defined(CONFIG_64BIT) && defined(CONFIG_SMP)
2527 /* 2476 /*
@@ -2540,7 +2489,16 @@ unsigned long long task_sched_runtime(struct task_struct *p)
2540#endif 2489#endif
2541 2490
2542 rq = task_rq_lock(p, &flags); 2491 rq = task_rq_lock(p, &flags);
2543 ns = p->se.sum_exec_runtime + do_task_delta_exec(p, rq); 2492 /*
2493 * Must be ->curr _and_ ->on_rq. If dequeued, we would
2494 * project cycles that may never be accounted to this
2495 * thread, breaking clock_gettime().
2496 */
2497 if (task_current(rq, p) && task_on_rq_queued(p)) {
2498 update_rq_clock(rq);
2499 p->sched_class->update_curr(rq);
2500 }
2501 ns = p->se.sum_exec_runtime;
2544 task_rq_unlock(rq, p, &flags); 2502 task_rq_unlock(rq, p, &flags);
2545 2503
2546 return ns; 2504 return ns;
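
The clock_gettime() breakage the comment above warns about is visible from user space: CLOCK_THREAD_CPUTIME_ID reads are ultimately served by task_sched_runtime() and must never go backwards. A small stand-alone test, as an illustrative sketch rather than anything from this patch:

/* Build: cc -O2 thread_runtime.c -o thread_runtime */
#include <stdio.h>
#include <time.h>

int main(void)
{
	struct timespec prev = { 0, 0 }, now;
	volatile unsigned long spin = 0;
	int i;

	for (i = 0; i < 5; i++) {
		unsigned long j;

		for (j = 0; j < 50UL * 1000 * 1000; j++)
			spin++;

		/* Served by task_sched_runtime() for the calling thread. */
		clock_gettime(CLOCK_THREAD_CPUTIME_ID, &now);
		printf("cpu time: %ld.%09ld s\n",
		       (long)now.tv_sec, now.tv_nsec);

		/* The reported runtime must be monotonic between reads. */
		if (now.tv_sec < prev.tv_sec ||
		    (now.tv_sec == prev.tv_sec && now.tv_nsec < prev.tv_nsec))
			printf("regression detected!\n");
		prev = now;
	}
	return 0;
}
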
@@ -2802,7 +2760,7 @@ need_resched:
2802 preempt_disable(); 2760 preempt_disable();
2803 cpu = smp_processor_id(); 2761 cpu = smp_processor_id();
2804 rq = cpu_rq(cpu); 2762 rq = cpu_rq(cpu);
2805 rcu_note_context_switch(cpu); 2763 rcu_note_context_switch();
2806 prev = rq->curr; 2764 prev = rq->curr;
2807 2765
2808 schedule_debug(prev); 2766 schedule_debug(prev);
@@ -2855,15 +2813,8 @@ need_resched:
2855 rq->curr = next; 2813 rq->curr = next;
2856 ++*switch_count; 2814 ++*switch_count;
2857 2815
2858 context_switch(rq, prev, next); /* unlocks the rq */ 2816 rq = context_switch(rq, prev, next); /* unlocks the rq */
2859 /* 2817 cpu = cpu_of(rq);
2860 * The context switch have flipped the stack from under us
2861 * and restored the local variables which were saved when
2862 * this task called schedule() in the past. prev == current
2863 * is still correct, but it can be moved to another cpu/rq.
2864 */
2865 cpu = smp_processor_id();
2866 rq = cpu_rq(cpu);
2867 } else 2818 } else
2868 raw_spin_unlock_irq(&rq->lock); 2819 raw_spin_unlock_irq(&rq->lock);
2869 2820
@@ -2903,10 +2854,14 @@ asmlinkage __visible void __sched schedule_user(void)
2903 * or we have been woken up remotely but the IPI has not yet arrived, 2854 * or we have been woken up remotely but the IPI has not yet arrived,
2904 * we haven't yet exited the RCU idle mode. Do it here manually until 2855 * we haven't yet exited the RCU idle mode. Do it here manually until
2905 * we find a better solution. 2856 * we find a better solution.
2857 *
2858 * NB: There are buggy callers of this function. Ideally we
2859 * should warn if prev_state != IN_USER, but that will trigger
2860 * too frequently to make sense yet.
2906 */ 2861 */
2907 user_exit(); 2862 enum ctx_state prev_state = exception_enter();
2908 schedule(); 2863 schedule();
2909 user_enter(); 2864 exception_exit(prev_state);
2910} 2865}
2911#endif 2866#endif
2912 2867
@@ -2951,6 +2906,47 @@ asmlinkage __visible void __sched notrace preempt_schedule(void)
2951} 2906}
2952NOKPROBE_SYMBOL(preempt_schedule); 2907NOKPROBE_SYMBOL(preempt_schedule);
2953EXPORT_SYMBOL(preempt_schedule); 2908EXPORT_SYMBOL(preempt_schedule);
2909
2910#ifdef CONFIG_CONTEXT_TRACKING
2911/**
2912 * preempt_schedule_context - preempt_schedule called by tracing
2913 *
2914 * The tracing infrastructure uses preempt_enable_notrace to prevent
2915 * recursion and tracing preempt enabling caused by the tracing
2916 * infrastructure itself. But as tracing can happen in areas coming
2917 * from userspace or just about to enter userspace, a preempt enable
2918 * can occur before user_exit() is called. This will cause the scheduler
2919 * to be called when the system is still in usermode.
2920 *
2921 * To prevent this, the preempt_enable_notrace will use this function
2922 * instead of preempt_schedule() to exit user context if needed before
2923 * calling the scheduler.
2924 */
2925asmlinkage __visible void __sched notrace preempt_schedule_context(void)
2926{
2927 enum ctx_state prev_ctx;
2928
2929 if (likely(!preemptible()))
2930 return;
2931
2932 do {
2933 __preempt_count_add(PREEMPT_ACTIVE);
2934 /*
2935 * Needs preempt disabled in case user_exit() is traced
2936 * and the tracer calls preempt_enable_notrace() causing
2937 * an infinite recursion.
2938 */
2939 prev_ctx = exception_enter();
2940 __schedule();
2941 exception_exit(prev_ctx);
2942
2943 __preempt_count_sub(PREEMPT_ACTIVE);
2944 barrier();
2945 } while (need_resched());
2946}
2947EXPORT_SYMBOL_GPL(preempt_schedule_context);
2948#endif /* CONFIG_CONTEXT_TRACKING */
2949
2954#endif /* CONFIG_PREEMPT */ 2950#endif /* CONFIG_PREEMPT */
2955 2951
2956/* 2952/*
@@ -4531,8 +4527,10 @@ void sched_show_task(struct task_struct *p)
4531#ifdef CONFIG_DEBUG_STACK_USAGE 4527#ifdef CONFIG_DEBUG_STACK_USAGE
4532 free = stack_not_used(p); 4528 free = stack_not_used(p);
4533#endif 4529#endif
4530 ppid = 0;
4534 rcu_read_lock(); 4531 rcu_read_lock();
4535 ppid = task_pid_nr(rcu_dereference(p->real_parent)); 4532 if (pid_alive(p))
4533 ppid = task_pid_nr(rcu_dereference(p->real_parent));
4536 rcu_read_unlock(); 4534 rcu_read_unlock();
4537 printk(KERN_CONT "%5lu %5d %6d 0x%08lx\n", free, 4535 printk(KERN_CONT "%5lu %5d %6d 0x%08lx\n", free,
4538 task_pid_nr(p), ppid, 4536 task_pid_nr(p), ppid,
@@ -4637,6 +4635,81 @@ void init_idle(struct task_struct *idle, int cpu)
4637#endif 4635#endif
4638} 4636}
4639 4637
4638int cpuset_cpumask_can_shrink(const struct cpumask *cur,
4639 const struct cpumask *trial)
4640{
4641 int ret = 1, trial_cpus;
4642 struct dl_bw *cur_dl_b;
4643 unsigned long flags;
4644
4645 rcu_read_lock_sched();
4646 cur_dl_b = dl_bw_of(cpumask_any(cur));
4647 trial_cpus = cpumask_weight(trial);
4648
4649 raw_spin_lock_irqsave(&cur_dl_b->lock, flags);
4650 if (cur_dl_b->bw != -1 &&
4651 cur_dl_b->bw * trial_cpus < cur_dl_b->total_bw)
4652 ret = 0;
4653 raw_spin_unlock_irqrestore(&cur_dl_b->lock, flags);
4654 rcu_read_unlock_sched();
4655
4656 return ret;
4657}
4658
4659int task_can_attach(struct task_struct *p,
4660 const struct cpumask *cs_cpus_allowed)
4661{
4662 int ret = 0;
4663
4664 /*
4665 * Kthreads which disallow setaffinity shouldn't be moved
4666 * to a new cpuset; we don't want to change their cpu
4667 * affinity and isolating such threads by their set of
4668 * allowed nodes is unnecessary. Thus, cpusets are not
4669 * applicable for such threads. This prevents checking for
4670 * success of set_cpus_allowed_ptr() on all attached tasks
4671 * before cpus_allowed may be changed.
4672 */
4673 if (p->flags & PF_NO_SETAFFINITY) {
4674 ret = -EINVAL;
4675 goto out;
4676 }
4677
4678#ifdef CONFIG_SMP
4679 if (dl_task(p) && !cpumask_intersects(task_rq(p)->rd->span,
4680 cs_cpus_allowed)) {
4681 unsigned int dest_cpu = cpumask_any_and(cpu_active_mask,
4682 cs_cpus_allowed);
4683 struct dl_bw *dl_b;
4684 bool overflow;
4685 int cpus;
4686 unsigned long flags;
4687
4688 rcu_read_lock_sched();
4689 dl_b = dl_bw_of(dest_cpu);
4690 raw_spin_lock_irqsave(&dl_b->lock, flags);
4691 cpus = dl_bw_cpus(dest_cpu);
4692 overflow = __dl_overflow(dl_b, cpus, 0, p->dl.dl_bw);
4693 if (overflow)
4694 ret = -EBUSY;
4695 else {
4696 /*
4697 * We reserve space for this task in the destination
4698 * root_domain, as we can't fail after this point.
4699 * We will free resources in the source root_domain
4700 * later on (see set_cpus_allowed_dl()).
4701 */
4702 __dl_add(dl_b, p->dl.dl_bw);
4703 }
4704 raw_spin_unlock_irqrestore(&dl_b->lock, flags);
4705 rcu_read_unlock_sched();
4706
4707 }
4708#endif
4709out:
4710 return ret;
4711}
4712
4640#ifdef CONFIG_SMP 4713#ifdef CONFIG_SMP
4641/* 4714/*
4642 * move_queued_task - move a queued task to new rq. 4715 * move_queued_task - move a queued task to new rq.
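
The admission test task_can_attach() performs above can be reproduced arithmetically. The sketch below is a user-space approximation: the 2^20 fixed-point scale mirrors the kernel's to_ratio() but is stated here as an assumption, and the CPU count and task parameters are made up.

/* Build: cc -O2 dl_admission.c -o dl_admission */
#include <stdio.h>
#include <stdint.h>

/* Bandwidth as a fixed-point runtime/period ratio (assumed 2^20 scale). */
static uint64_t to_ratio(uint64_t period_ns, uint64_t runtime_ns)
{
	return (runtime_ns << 20) / period_ns;
}

/* Reject if adding new_bw (and removing old_bw) would exceed bw * cpus,
 * i.e. the __dl_overflow() check done against the destination domain. */
static int dl_overflow(int64_t cap_bw, int cpus,
		       uint64_t total_bw, uint64_t old_bw, uint64_t new_bw)
{
	return cap_bw != -1 &&
	       (uint64_t)cap_bw * cpus < total_bw - old_bw + new_bw;
}

int main(void)
{
	/* Toy destination root_domain: 4 CPUs, 95% utilization cap each. */
	int cpus = 4;
	int64_t cap = to_ratio(1000000, 950000);
	uint64_t total = 3 * to_ratio(100000, 90000);	/* three 90% tasks */
	uint64_t incoming = to_ratio(100000, 50000);	/* one 50% task */

	printf("admit the 50%% task: %s\n",
	       dl_overflow(cap, cpus, total, 0, incoming) ? "no" : "yes");
	return 0;
}
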
@@ -6087,7 +6160,9 @@ static void claim_allocations(int cpu, struct sched_domain *sd)
6087 6160
6088#ifdef CONFIG_NUMA 6161#ifdef CONFIG_NUMA
6089static int sched_domains_numa_levels; 6162static int sched_domains_numa_levels;
6163enum numa_topology_type sched_numa_topology_type;
6090static int *sched_domains_numa_distance; 6164static int *sched_domains_numa_distance;
6165int sched_max_numa_distance;
6091static struct cpumask ***sched_domains_numa_masks; 6166static struct cpumask ***sched_domains_numa_masks;
6092static int sched_domains_curr_level; 6167static int sched_domains_curr_level;
6093#endif 6168#endif
@@ -6259,7 +6334,7 @@ static void sched_numa_warn(const char *str)
6259 printk(KERN_WARNING "\n"); 6334 printk(KERN_WARNING "\n");
6260} 6335}
6261 6336
6262static bool find_numa_distance(int distance) 6337bool find_numa_distance(int distance)
6263{ 6338{
6264 int i; 6339 int i;
6265 6340
@@ -6274,6 +6349,56 @@ static bool find_numa_distance(int distance)
6274 return false; 6349 return false;
6275} 6350}
6276 6351
6352/*
6353 * A system can have three types of NUMA topology:
6354 * NUMA_DIRECT: all nodes are directly connected, or not a NUMA system
6355 * NUMA_GLUELESS_MESH: some nodes reachable through intermediary nodes
6356 * NUMA_BACKPLANE: nodes can reach other nodes through a backplane
6357 *
6358 * The difference between a glueless mesh topology and a backplane
6359 * topology lies in whether communication between not directly
6360 * connected nodes goes through intermediary nodes (where programs
6361 * could run), or through backplane controllers. This affects
6362 * placement of programs.
6363 *
6364 * The type of topology can be discerned with the following tests:
6365 * - If the maximum distance between any nodes is 1 hop, the system
6366 * is directly connected.
6367 * - If for two nodes A and B, located N > 1 hops away from each other,
6368 * there is an intermediary node C, which is < N hops away from both
6369 * nodes A and B, the system is a glueless mesh.
6370 */
6371static void init_numa_topology_type(void)
6372{
6373 int a, b, c, n;
6374
6375 n = sched_max_numa_distance;
6376
6377 if (n <= 1)
6378 sched_numa_topology_type = NUMA_DIRECT;
6379
6380 for_each_online_node(a) {
6381 for_each_online_node(b) {
6382 /* Find two nodes furthest removed from each other. */
6383 if (node_distance(a, b) < n)
6384 continue;
6385
6386 /* Is there an intermediary node between a and b? */
6387 for_each_online_node(c) {
6388 if (node_distance(a, c) < n &&
6389 node_distance(b, c) < n) {
6390 sched_numa_topology_type =
6391 NUMA_GLUELESS_MESH;
6392 return;
6393 }
6394 }
6395
6396 sched_numa_topology_type = NUMA_BACKPLANE;
6397 return;
6398 }
6399 }
6400}
6401
6277static void sched_init_numa(void) 6402static void sched_init_numa(void)
6278{ 6403{
6279 int next_distance, curr_distance = node_distance(0, 0); 6404 int next_distance, curr_distance = node_distance(0, 0);
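
The three-way classification described in the comment above can be re-stated in a few lines of user-space C. This is a simplified sketch with an invented distance table; the "one hop" test is approximated as "at most the smallest remote distance", which is not exactly how sched_init_numa() derives it.

/* Build: cc -O2 numa_type.c -o numa_type */
#include <stdio.h>

#define N 4	/* toy node count */

enum topo { DIRECT, GLUELESS_MESH, BACKPLANE };

/* Toy SLIT-style table: 10 = local, 20 = one hop, 30 = two hops. */
static const int dist[N][N] = {
	{ 10, 20, 20, 30 },
	{ 20, 10, 30, 20 },
	{ 20, 30, 10, 20 },
	{ 30, 20, 20, 10 },
};

static enum topo classify(int max_dist)
{
	int a, b, c;

	if (max_dist <= 20)		/* every pair is one hop away */
		return DIRECT;

	for (a = 0; a < N; a++) {
		for (b = 0; b < N; b++) {
			if (dist[a][b] < max_dist)
				continue;
			/* a and b are furthest apart: is there a closer
			 * intermediary c reachable from both? */
			for (c = 0; c < N; c++)
				if (dist[a][c] < max_dist &&
				    dist[b][c] < max_dist)
					return GLUELESS_MESH;
			return BACKPLANE;
		}
	}
	return DIRECT;
}

int main(void)
{
	static const char *name[] = { "direct", "glueless mesh", "backplane" };

	printf("topology: %s\n", name[classify(30)]);	/* glueless mesh */
	return 0;
}
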
@@ -6327,6 +6452,10 @@ static void sched_init_numa(void)
6327 if (!sched_debug()) 6452 if (!sched_debug())
6328 break; 6453 break;
6329 } 6454 }
6455
6456 if (!level)
6457 return;
6458
6330 /* 6459 /*
6331 * 'level' contains the number of unique distances, excluding the 6460 * 'level' contains the number of unique distances, excluding the
6332 * identity distance node_distance(i,i). 6461 * identity distance node_distance(i,i).
@@ -6406,6 +6535,9 @@ static void sched_init_numa(void)
6406 sched_domain_topology = tl; 6535 sched_domain_topology = tl;
6407 6536
6408 sched_domains_numa_levels = level; 6537 sched_domains_numa_levels = level;
6538 sched_max_numa_distance = sched_domains_numa_distance[level - 1];
6539
6540 init_numa_topology_type();
6409} 6541}
6410 6542
6411static void sched_domains_numa_masks_set(int cpu) 6543static void sched_domains_numa_masks_set(int cpu)
@@ -7158,6 +7290,25 @@ static inline int preempt_count_equals(int preempt_offset)
7158 7290
7159void __might_sleep(const char *file, int line, int preempt_offset) 7291void __might_sleep(const char *file, int line, int preempt_offset)
7160{ 7292{
7293 /*
7294 * Blocking primitives will set (and therefore destroy) current->state,
7295 * since we will exit with TASK_RUNNING make sure we enter with it,
7296 * otherwise we will destroy state.
7297 */
7298 if (WARN_ONCE(current->state != TASK_RUNNING,
7299 "do not call blocking ops when !TASK_RUNNING; "
7300 "state=%lx set at [<%p>] %pS\n",
7301 current->state,
7302 (void *)current->task_state_change,
7303 (void *)current->task_state_change))
7304 __set_current_state(TASK_RUNNING);
7305
7306 ___might_sleep(file, line, preempt_offset);
7307}
7308EXPORT_SYMBOL(__might_sleep);
7309
7310void ___might_sleep(const char *file, int line, int preempt_offset)
7311{
7161 static unsigned long prev_jiffy; /* ratelimiting */ 7312 static unsigned long prev_jiffy; /* ratelimiting */
7162 7313
7163 rcu_sleep_check(); /* WARN_ON_ONCE() by default, no rate limit reqd. */ 7314 rcu_sleep_check(); /* WARN_ON_ONCE() by default, no rate limit reqd. */
@@ -7189,7 +7340,7 @@ void __might_sleep(const char *file, int line, int preempt_offset)
7189#endif 7340#endif
7190 dump_stack(); 7341 dump_stack();
7191} 7342}
7192EXPORT_SYMBOL(__might_sleep); 7343EXPORT_SYMBOL(___might_sleep);
7193#endif 7344#endif
7194 7345
7195#ifdef CONFIG_MAGIC_SYSRQ 7346#ifdef CONFIG_MAGIC_SYSRQ
@@ -7403,8 +7554,12 @@ void sched_move_task(struct task_struct *tsk)
7403 if (unlikely(running)) 7554 if (unlikely(running))
7404 put_prev_task(rq, tsk); 7555 put_prev_task(rq, tsk);
7405 7556
7406 tg = container_of(task_css_check(tsk, cpu_cgrp_id, 7557 /*
7407 lockdep_is_held(&tsk->sighand->siglock)), 7558 * All callers are synchronized by task_rq_lock(); we do not use RCU
7559 * which is pointless here. Thus, we pass "true" to task_css_check()
7560 * to prevent lockdep warnings.
7561 */
7562 tg = container_of(task_css_check(tsk, cpu_cgrp_id, true),
7408 struct task_group, css); 7563 struct task_group, css);
7409 tg = autogroup_task_group(tsk, tg); 7564 tg = autogroup_task_group(tsk, tg);
7410 tsk->sched_task_group = tg; 7565 tsk->sched_task_group = tg;
@@ -7833,6 +7988,11 @@ static void cpu_cgroup_css_offline(struct cgroup_subsys_state *css)
7833 sched_offline_group(tg); 7988 sched_offline_group(tg);
7834} 7989}
7835 7990
7991static void cpu_cgroup_fork(struct task_struct *task)
7992{
7993 sched_move_task(task);
7994}
7995
7836static int cpu_cgroup_can_attach(struct cgroup_subsys_state *css, 7996static int cpu_cgroup_can_attach(struct cgroup_subsys_state *css,
7837 struct cgroup_taskset *tset) 7997 struct cgroup_taskset *tset)
7838{ 7998{
@@ -8205,6 +8365,7 @@ struct cgroup_subsys cpu_cgrp_subsys = {
8205 .css_free = cpu_cgroup_css_free, 8365 .css_free = cpu_cgroup_css_free,
8206 .css_online = cpu_cgroup_css_online, 8366 .css_online = cpu_cgroup_css_online,
8207 .css_offline = cpu_cgroup_css_offline, 8367 .css_offline = cpu_cgroup_css_offline,
8368 .fork = cpu_cgroup_fork,
8208 .can_attach = cpu_cgroup_can_attach, 8369 .can_attach = cpu_cgroup_can_attach,
8209 .attach = cpu_cgroup_attach, 8370 .attach = cpu_cgroup_attach,
8210 .exit = cpu_cgroup_exit, 8371 .exit = cpu_cgroup_exit,
diff --git a/kernel/sched/cpudeadline.h b/kernel/sched/cpudeadline.h
index 538c9796ad4a..020039bd1326 100644
--- a/kernel/sched/cpudeadline.h
+++ b/kernel/sched/cpudeadline.h
@@ -25,9 +25,6 @@ int cpudl_find(struct cpudl *cp, struct task_struct *p,
25void cpudl_set(struct cpudl *cp, int cpu, u64 dl, int is_valid); 25void cpudl_set(struct cpudl *cp, int cpu, u64 dl, int is_valid);
26int cpudl_init(struct cpudl *cp); 26int cpudl_init(struct cpudl *cp);
27void cpudl_cleanup(struct cpudl *cp); 27void cpudl_cleanup(struct cpudl *cp);
28#else
29#define cpudl_set(cp, cpu, dl) do { } while (0)
30#define cpudl_init() do { } while (0)
31#endif /* CONFIG_SMP */ 28#endif /* CONFIG_SMP */
32 29
33#endif /* _LINUX_CPUDL_H */ 30#endif /* _LINUX_CPUDL_H */
diff --git a/kernel/sched/cpupri.h b/kernel/sched/cpupri.h
index 6b033347fdfd..63cbb9ca0496 100644
--- a/kernel/sched/cpupri.h
+++ b/kernel/sched/cpupri.h
@@ -26,9 +26,6 @@ int cpupri_find(struct cpupri *cp,
26void cpupri_set(struct cpupri *cp, int cpu, int pri); 26void cpupri_set(struct cpupri *cp, int cpu, int pri);
27int cpupri_init(struct cpupri *cp); 27int cpupri_init(struct cpupri *cp);
28void cpupri_cleanup(struct cpupri *cp); 28void cpupri_cleanup(struct cpupri *cp);
29#else
30#define cpupri_set(cp, cpu, pri) do { } while (0)
31#define cpupri_init() do { } while (0)
32#endif 29#endif
33 30
34#endif /* _LINUX_CPUPRI_H */ 31#endif /* _LINUX_CPUPRI_H */
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index 256e577faf1b..e5db8c6feebd 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -518,12 +518,20 @@ again:
518 } 518 }
519 519
520 /* 520 /*
521 * We need to take care of a possible races here. In fact, the 521 * We need to take care of several possible races here:
522 * task might have changed its scheduling policy to something 522 *
523 * different from SCHED_DEADLINE or changed its reservation 523 * - the task might have changed its scheduling policy
524 * parameters (through sched_setattr()). 524 * to something different than SCHED_DEADLINE
525 * - the task might have changed its reservation parameters
526 * (through sched_setattr())
527 * - the task might have been boosted by someone else and
528 * might be in the boosting/deboosting path
529 *
530 * In all these cases we bail out, as the task is already
531 * in the runqueue or is going to be enqueued back anyway.
525 */ 532 */
526 if (!dl_task(p) || dl_se->dl_new) 533 if (!dl_task(p) || dl_se->dl_new ||
534 dl_se->dl_boosted || !dl_se->dl_throttled)
527 goto unlock; 535 goto unlock;
528 536
529 sched_clock_tick(); 537 sched_clock_tick();
@@ -532,7 +540,7 @@ again:
532 dl_se->dl_yielded = 0; 540 dl_se->dl_yielded = 0;
533 if (task_on_rq_queued(p)) { 541 if (task_on_rq_queued(p)) {
534 enqueue_task_dl(rq, p, ENQUEUE_REPLENISH); 542 enqueue_task_dl(rq, p, ENQUEUE_REPLENISH);
535 if (task_has_dl_policy(rq->curr)) 543 if (dl_task(rq->curr))
536 check_preempt_curr_dl(rq, p, 0); 544 check_preempt_curr_dl(rq, p, 0);
537 else 545 else
538 resched_curr(rq); 546 resched_curr(rq);
@@ -555,11 +563,6 @@ void init_dl_task_timer(struct sched_dl_entity *dl_se)
555{ 563{
556 struct hrtimer *timer = &dl_se->dl_timer; 564 struct hrtimer *timer = &dl_se->dl_timer;
557 565
558 if (hrtimer_active(timer)) {
559 hrtimer_try_to_cancel(timer);
560 return;
561 }
562
563 hrtimer_init(timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); 566 hrtimer_init(timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
564 timer->function = dl_task_timer; 567 timer->function = dl_task_timer;
565} 568}
@@ -625,7 +628,7 @@ static void update_curr_dl(struct rq *rq)
625 628
626 sched_rt_avg_update(rq, delta_exec); 629 sched_rt_avg_update(rq, delta_exec);
627 630
628 dl_se->runtime -= delta_exec; 631 dl_se->runtime -= dl_se->dl_yielded ? 0 : delta_exec;
629 if (dl_runtime_exceeded(rq, dl_se)) { 632 if (dl_runtime_exceeded(rq, dl_se)) {
630 __dequeue_task_dl(rq, curr, 0); 633 __dequeue_task_dl(rq, curr, 0);
631 if (likely(start_dl_timer(dl_se, curr->dl.dl_boosted))) 634 if (likely(start_dl_timer(dl_se, curr->dl.dl_boosted)))
@@ -847,8 +850,19 @@ static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags)
847 * smaller than our one... OTW we keep our runtime and 850 * smaller than our one... OTW we keep our runtime and
848 * deadline. 851 * deadline.
849 */ 852 */
850 if (pi_task && p->dl.dl_boosted && dl_prio(pi_task->normal_prio)) 853 if (pi_task && p->dl.dl_boosted && dl_prio(pi_task->normal_prio)) {
851 pi_se = &pi_task->dl; 854 pi_se = &pi_task->dl;
855 } else if (!dl_prio(p->normal_prio)) {
856 /*
857 * Special case in which we have a !SCHED_DEADLINE task
858 * that is going to be deboosted, but exceeds its
859 * runtime while doing so. No point in replenishing
860 * it, as it's going to return to its original
861 * scheduling class after this.
862 */
863 BUG_ON(!p->dl.dl_boosted || flags != ENQUEUE_REPLENISH);
864 return;
865 }
852 866
853 /* 867 /*
854 * If p is throttled, we do nothing. In fact, if it exhausted 868 * If p is throttled, we do nothing. In fact, if it exhausted
@@ -914,7 +928,7 @@ select_task_rq_dl(struct task_struct *p, int cpu, int sd_flag, int flags)
914 struct task_struct *curr; 928 struct task_struct *curr;
915 struct rq *rq; 929 struct rq *rq;
916 930
917 if (sd_flag != SD_BALANCE_WAKE && sd_flag != SD_BALANCE_FORK) 931 if (sd_flag != SD_BALANCE_WAKE)
918 goto out; 932 goto out;
919 933
920 rq = cpu_rq(cpu); 934 rq = cpu_rq(cpu);
@@ -999,6 +1013,10 @@ static void start_hrtick_dl(struct rq *rq, struct task_struct *p)
999{ 1013{
1000 hrtick_start(rq, p->dl.runtime); 1014 hrtick_start(rq, p->dl.runtime);
1001} 1015}
1016#else /* !CONFIG_SCHED_HRTICK */
1017static void start_hrtick_dl(struct rq *rq, struct task_struct *p)
1018{
1019}
1002#endif 1020#endif
1003 1021
1004static struct sched_dl_entity *pick_next_dl_entity(struct rq *rq, 1022static struct sched_dl_entity *pick_next_dl_entity(struct rq *rq,
@@ -1052,10 +1070,8 @@ struct task_struct *pick_next_task_dl(struct rq *rq, struct task_struct *prev)
1052 /* Running task will never be pushed. */ 1070 /* Running task will never be pushed. */
1053 dequeue_pushable_dl_task(rq, p); 1071 dequeue_pushable_dl_task(rq, p);
1054 1072
1055#ifdef CONFIG_SCHED_HRTICK
1056 if (hrtick_enabled(rq)) 1073 if (hrtick_enabled(rq))
1057 start_hrtick_dl(rq, p); 1074 start_hrtick_dl(rq, p);
1058#endif
1059 1075
1060 set_post_schedule(rq); 1076 set_post_schedule(rq);
1061 1077
@@ -1074,10 +1090,8 @@ static void task_tick_dl(struct rq *rq, struct task_struct *p, int queued)
1074{ 1090{
1075 update_curr_dl(rq); 1091 update_curr_dl(rq);
1076 1092
1077#ifdef CONFIG_SCHED_HRTICK
1078 if (hrtick_enabled(rq) && queued && p->dl.runtime > 0) 1093 if (hrtick_enabled(rq) && queued && p->dl.runtime > 0)
1079 start_hrtick_dl(rq, p); 1094 start_hrtick_dl(rq, p);
1080#endif
1081} 1095}
1082 1096
1083static void task_fork_dl(struct task_struct *p) 1097static void task_fork_dl(struct task_struct *p)
@@ -1314,6 +1328,7 @@ static int push_dl_task(struct rq *rq)
1314{ 1328{
1315 struct task_struct *next_task; 1329 struct task_struct *next_task;
1316 struct rq *later_rq; 1330 struct rq *later_rq;
1331 int ret = 0;
1317 1332
1318 if (!rq->dl.overloaded) 1333 if (!rq->dl.overloaded)
1319 return 0; 1334 return 0;
@@ -1359,7 +1374,6 @@ retry:
1359 * The task is still there. We don't try 1374 * The task is still there. We don't try
1360 * again, some other cpu will pull it when ready. 1375 * again, some other cpu will pull it when ready.
1361 */ 1376 */
1362 dequeue_pushable_dl_task(rq, next_task);
1363 goto out; 1377 goto out;
1364 } 1378 }
1365 1379
@@ -1375,6 +1389,7 @@ retry:
1375 deactivate_task(rq, next_task, 0); 1389 deactivate_task(rq, next_task, 0);
1376 set_task_cpu(next_task, later_rq->cpu); 1390 set_task_cpu(next_task, later_rq->cpu);
1377 activate_task(later_rq, next_task, 0); 1391 activate_task(later_rq, next_task, 0);
1392 ret = 1;
1378 1393
1379 resched_curr(later_rq); 1394 resched_curr(later_rq);
1380 1395
@@ -1383,7 +1398,7 @@ retry:
1383out: 1398out:
1384 put_task_struct(next_task); 1399 put_task_struct(next_task);
1385 1400
1386 return 1; 1401 return ret;
1387} 1402}
1388 1403
1389static void push_dl_tasks(struct rq *rq) 1404static void push_dl_tasks(struct rq *rq)
@@ -1489,7 +1504,7 @@ static void task_woken_dl(struct rq *rq, struct task_struct *p)
1489 p->nr_cpus_allowed > 1 && 1504 p->nr_cpus_allowed > 1 &&
1490 dl_task(rq->curr) && 1505 dl_task(rq->curr) &&
1491 (rq->curr->nr_cpus_allowed < 2 || 1506 (rq->curr->nr_cpus_allowed < 2 ||
1492 dl_entity_preempt(&rq->curr->dl, &p->dl))) { 1507 !dl_entity_preempt(&p->dl, &rq->curr->dl))) {
1493 push_dl_tasks(rq); 1508 push_dl_tasks(rq);
1494 } 1509 }
1495} 1510}
@@ -1498,10 +1513,33 @@ static void set_cpus_allowed_dl(struct task_struct *p,
1498 const struct cpumask *new_mask) 1513 const struct cpumask *new_mask)
1499{ 1514{
1500 struct rq *rq; 1515 struct rq *rq;
1516 struct root_domain *src_rd;
1501 int weight; 1517 int weight;
1502 1518
1503 BUG_ON(!dl_task(p)); 1519 BUG_ON(!dl_task(p));
1504 1520
1521 rq = task_rq(p);
1522 src_rd = rq->rd;
1523 /*
1524 * Migrating a SCHED_DEADLINE task between exclusive
1525 * cpusets (different root_domains) entails a bandwidth
1526 * update. We already made space for us in the destination
1527 * domain (see cpuset_can_attach()).
1528 */
1529 if (!cpumask_intersects(src_rd->span, new_mask)) {
1530 struct dl_bw *src_dl_b;
1531
1532 src_dl_b = dl_bw_of(cpu_of(rq));
1533 /*
1534 * We now free resources of the root_domain we are migrating
1535 * off. In the worst case, sched_setattr() may temporarily fail
1536 * until we complete the update.
1537 */
1538 raw_spin_lock(&src_dl_b->lock);
1539 __dl_clear(src_dl_b, p->dl.dl_bw);
1540 raw_spin_unlock(&src_dl_b->lock);
1541 }
1542
1505 /* 1543 /*
1506 * Update only if the task is actually running (i.e., 1544 * Update only if the task is actually running (i.e.,
1507 * it is on the rq AND it is not throttled). 1545 * it is on the rq AND it is not throttled).
@@ -1518,8 +1556,6 @@ static void set_cpus_allowed_dl(struct task_struct *p,
1518 if ((p->nr_cpus_allowed > 1) == (weight > 1)) 1556 if ((p->nr_cpus_allowed > 1) == (weight > 1))
1519 return; 1557 return;
1520 1558
1521 rq = task_rq(p);
1522
1523 /* 1559 /*
1524 * The process used to be able to migrate OR it can now migrate 1560 * The process used to be able to migrate OR it can now migrate
1525 */ 1561 */
@@ -1567,22 +1603,48 @@ void init_sched_dl_class(void)
1567 1603
1568#endif /* CONFIG_SMP */ 1604#endif /* CONFIG_SMP */
1569 1605
1606/*
1607 * Ensure p's dl_timer is cancelled. May drop rq->lock for a while.
1608 */
1609static void cancel_dl_timer(struct rq *rq, struct task_struct *p)
1610{
1611 struct hrtimer *dl_timer = &p->dl.dl_timer;
1612
1613 /* Nobody will change task's class if pi_lock is held */
1614 lockdep_assert_held(&p->pi_lock);
1615
1616 if (hrtimer_active(dl_timer)) {
1617 int ret = hrtimer_try_to_cancel(dl_timer);
1618
1619 if (unlikely(ret == -1)) {
1620 /*
1621 * Note, p may migrate OR new deadline tasks
1622 * may appear in rq when we are unlocking it.
1623 * Callers must be fine with that.
1624 */
1625 raw_spin_unlock(&rq->lock);
1626 hrtimer_cancel(dl_timer);
1627 raw_spin_lock(&rq->lock);
1628 }
1629 }
1630}
1631
1570static void switched_from_dl(struct rq *rq, struct task_struct *p) 1632static void switched_from_dl(struct rq *rq, struct task_struct *p)
1571{ 1633{
1572 if (hrtimer_active(&p->dl.dl_timer) && !dl_policy(p->policy)) 1634 cancel_dl_timer(rq, p);
1573 hrtimer_try_to_cancel(&p->dl.dl_timer);
1574 1635
1575 __dl_clear_params(p); 1636 __dl_clear_params(p);
1576 1637
1577#ifdef CONFIG_SMP
1578 /* 1638 /*
1579 * Since this might be the only -deadline task on the rq, 1639 * Since this might be the only -deadline task on the rq,
1580 * this is the right place to try to pull some other one 1640 * this is the right place to try to pull some other one
1581 * from an overloaded cpu, if any. 1641 * from an overloaded cpu, if any.
1582 */ 1642 */
1583 if (!rq->dl.dl_nr_running) 1643 if (!task_on_rq_queued(p) || rq->dl.dl_nr_running)
1584 pull_dl_task(rq); 1644 return;
1585#endif 1645
1646 if (pull_dl_task(rq))
1647 resched_curr(rq);
1586} 1648}
1587 1649
1588/* 1650/*
@@ -1603,12 +1665,17 @@ static void switched_to_dl(struct rq *rq, struct task_struct *p)
1603 1665
1604 if (task_on_rq_queued(p) && rq->curr != p) { 1666 if (task_on_rq_queued(p) && rq->curr != p) {
1605#ifdef CONFIG_SMP 1667#ifdef CONFIG_SMP
1606 if (rq->dl.overloaded && push_dl_task(rq) && rq != task_rq(p)) 1668 if (p->nr_cpus_allowed > 1 && rq->dl.overloaded &&
1669 push_dl_task(rq) && rq != task_rq(p))
1607 /* Only reschedule if pushing failed */ 1670 /* Only reschedule if pushing failed */
1608 check_resched = 0; 1671 check_resched = 0;
1609#endif /* CONFIG_SMP */ 1672#endif /* CONFIG_SMP */
1610 if (check_resched && task_has_dl_policy(rq->curr)) 1673 if (check_resched) {
1611 check_preempt_curr_dl(rq, p, 0); 1674 if (dl_task(rq->curr))
1675 check_preempt_curr_dl(rq, p, 0);
1676 else
1677 resched_curr(rq);
1678 }
1612 } 1679 }
1613} 1680}
1614 1681
@@ -1678,4 +1745,15 @@ const struct sched_class dl_sched_class = {
1678 .prio_changed = prio_changed_dl, 1745 .prio_changed = prio_changed_dl,
1679 .switched_from = switched_from_dl, 1746 .switched_from = switched_from_dl,
1680 .switched_to = switched_to_dl, 1747 .switched_to = switched_to_dl,
1748
1749 .update_curr = update_curr_dl,
1681}; 1750};
1751
1752#ifdef CONFIG_SCHED_DEBUG
1753extern void print_dl_rq(struct seq_file *m, int cpu, struct dl_rq *dl_rq);
1754
1755void print_dl_stats(struct seq_file *m, int cpu)
1756{
1757 print_dl_rq(m, cpu, &cpu_rq(cpu)->dl);
1758}
1759#endif /* CONFIG_SCHED_DEBUG */
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index ce33780d8f20..92cc52001e74 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -261,6 +261,12 @@ void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq)
261#undef P 261#undef P
262} 262}
263 263
264void print_dl_rq(struct seq_file *m, int cpu, struct dl_rq *dl_rq)
265{
266 SEQ_printf(m, "\ndl_rq[%d]:\n", cpu);
267 SEQ_printf(m, " .%-30s: %ld\n", "dl_nr_running", dl_rq->dl_nr_running);
268}
269
264extern __read_mostly int sched_clock_running; 270extern __read_mostly int sched_clock_running;
265 271
266static void print_cpu(struct seq_file *m, int cpu) 272static void print_cpu(struct seq_file *m, int cpu)
@@ -329,6 +335,7 @@ do { \
329 spin_lock_irqsave(&sched_debug_lock, flags); 335 spin_lock_irqsave(&sched_debug_lock, flags);
330 print_cfs_stats(m, cpu); 336 print_cfs_stats(m, cpu);
331 print_rt_stats(m, cpu); 337 print_rt_stats(m, cpu);
338 print_dl_stats(m, cpu);
332 339
333 print_rq(m, rq, cpu); 340 print_rq(m, rq, cpu);
334 spin_unlock_irqrestore(&sched_debug_lock, flags); 341 spin_unlock_irqrestore(&sched_debug_lock, flags);
@@ -528,8 +535,8 @@ static void sched_show_numa(struct task_struct *p, struct seq_file *m)
528 unsigned long nr_faults = -1; 535 unsigned long nr_faults = -1;
529 int cpu_current, home_node; 536 int cpu_current, home_node;
530 537
531 if (p->numa_faults_memory) 538 if (p->numa_faults)
532 nr_faults = p->numa_faults_memory[2*node + i]; 539 nr_faults = p->numa_faults[2*node + i];
533 540
534 cpu_current = !i ? (task_node(p) == node) : 541 cpu_current = !i ? (task_node(p) == node) :
535 (pol && node_isset(node, pol->v.nodes)); 542 (pol && node_isset(node, pol->v.nodes));
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 0b069bf3e708..df2cdf77f899 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -726,6 +726,11 @@ static void update_curr(struct cfs_rq *cfs_rq)
726 account_cfs_rq_runtime(cfs_rq, delta_exec); 726 account_cfs_rq_runtime(cfs_rq, delta_exec);
727} 727}
728 728
729static void update_curr_fair(struct rq *rq)
730{
731 update_curr(cfs_rq_of(&rq->curr->se));
732}
733
729static inline void 734static inline void
730update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se) 735update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
731{ 736{
@@ -828,11 +833,12 @@ static unsigned int task_nr_scan_windows(struct task_struct *p)
828 833
829static unsigned int task_scan_min(struct task_struct *p) 834static unsigned int task_scan_min(struct task_struct *p)
830{ 835{
836 unsigned int scan_size = ACCESS_ONCE(sysctl_numa_balancing_scan_size);
831 unsigned int scan, floor; 837 unsigned int scan, floor;
832 unsigned int windows = 1; 838 unsigned int windows = 1;
833 839
834 if (sysctl_numa_balancing_scan_size < MAX_SCAN_WINDOW) 840 if (scan_size < MAX_SCAN_WINDOW)
835 windows = MAX_SCAN_WINDOW / sysctl_numa_balancing_scan_size; 841 windows = MAX_SCAN_WINDOW / scan_size;
836 floor = 1000 / windows; 842 floor = 1000 / windows;
837 843
838 scan = sysctl_numa_balancing_scan_period_min / task_nr_scan_windows(p); 844 scan = sysctl_numa_balancing_scan_period_min / task_nr_scan_windows(p);
@@ -867,7 +873,6 @@ struct numa_group {
867 spinlock_t lock; /* nr_tasks, tasks */ 873 spinlock_t lock; /* nr_tasks, tasks */
868 int nr_tasks; 874 int nr_tasks;
869 pid_t gid; 875 pid_t gid;
870 struct list_head task_list;
871 876
872 struct rcu_head rcu; 877 struct rcu_head rcu;
873 nodemask_t active_nodes; 878 nodemask_t active_nodes;
@@ -895,18 +900,24 @@ pid_t task_numa_group_id(struct task_struct *p)
895 return p->numa_group ? p->numa_group->gid : 0; 900 return p->numa_group ? p->numa_group->gid : 0;
896} 901}
897 902
898static inline int task_faults_idx(int nid, int priv) 903/*
904 * The averaged statistics, shared & private, memory & cpu,
905 * occupy the first half of the array. The second half of the
906 * array is for current counters, which are averaged into the
907 * first set by task_numa_placement.
908 */
909static inline int task_faults_idx(enum numa_faults_stats s, int nid, int priv)
899{ 910{
900 return NR_NUMA_HINT_FAULT_TYPES * nid + priv; 911 return NR_NUMA_HINT_FAULT_TYPES * (s * nr_node_ids + nid) + priv;
901} 912}
902 913
903static inline unsigned long task_faults(struct task_struct *p, int nid) 914static inline unsigned long task_faults(struct task_struct *p, int nid)
904{ 915{
905 if (!p->numa_faults_memory) 916 if (!p->numa_faults)
906 return 0; 917 return 0;
907 918
908 return p->numa_faults_memory[task_faults_idx(nid, 0)] + 919 return p->numa_faults[task_faults_idx(NUMA_MEM, nid, 0)] +
909 p->numa_faults_memory[task_faults_idx(nid, 1)]; 920 p->numa_faults[task_faults_idx(NUMA_MEM, nid, 1)];
910} 921}
911 922
912static inline unsigned long group_faults(struct task_struct *p, int nid) 923static inline unsigned long group_faults(struct task_struct *p, int nid)
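
The layout the new task_faults_idx() comment describes (averaged statistics in the first half, per-scan buffers in the second) is easiest to see with concrete indices. A small sketch follows; the enum ordering is inferred from that layout and should be treated as illustrative rather than authoritative.

/* Build: cc -O2 faults_idx.c -o faults_idx */
#include <stdio.h>

#define NR_NUMA_HINT_FAULT_TYPES 2	/* private, shared */

/* Averages first (MEM, CPU), per-scan buffers second (MEMBUF, CPUBUF). */
enum numa_faults_stats { NUMA_MEM, NUMA_CPU, NUMA_MEMBUF, NUMA_CPUBUF };

static const int nr_node_ids = 4;	/* toy machine */

static int task_faults_idx(enum numa_faults_stats s, int nid, int priv)
{
	return NR_NUMA_HINT_FAULT_TYPES * (s * nr_node_ids + nid) + priv;
}

int main(void)
{
	int nid = 2, priv = 1;

	/* Averaged slot and its per-scan buffer for the same nid/priv. */
	printf("mem_idx    = %d\n", task_faults_idx(NUMA_MEM, nid, priv));
	printf("membuf_idx = %d\n", task_faults_idx(NUMA_MEMBUF, nid, priv));
	printf("array size = %d\n",
	       NR_NUMA_HINT_FAULT_TYPES * 4 /* stats */ * nr_node_ids);
	return 0;
}
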
@@ -914,14 +925,79 @@ static inline unsigned long group_faults(struct task_struct *p, int nid)
914 if (!p->numa_group) 925 if (!p->numa_group)
915 return 0; 926 return 0;
916 927
917 return p->numa_group->faults[task_faults_idx(nid, 0)] + 928 return p->numa_group->faults[task_faults_idx(NUMA_MEM, nid, 0)] +
918 p->numa_group->faults[task_faults_idx(nid, 1)]; 929 p->numa_group->faults[task_faults_idx(NUMA_MEM, nid, 1)];
919} 930}
920 931
921static inline unsigned long group_faults_cpu(struct numa_group *group, int nid) 932static inline unsigned long group_faults_cpu(struct numa_group *group, int nid)
922{ 933{
923 return group->faults_cpu[task_faults_idx(nid, 0)] + 934 return group->faults_cpu[task_faults_idx(NUMA_MEM, nid, 0)] +
924 group->faults_cpu[task_faults_idx(nid, 1)]; 935 group->faults_cpu[task_faults_idx(NUMA_MEM, nid, 1)];
936}
937
938/* Handle placement on systems where not all nodes are directly connected. */
939static unsigned long score_nearby_nodes(struct task_struct *p, int nid,
940 int maxdist, bool task)
941{
942 unsigned long score = 0;
943 int node;
944
945 /*
946 * All nodes are directly connected, and the same distance
947 * from each other. No need for fancy placement algorithms.
948 */
949 if (sched_numa_topology_type == NUMA_DIRECT)
950 return 0;
951
952 /*
953 * This code is called for each node, introducing N^2 complexity,
954 * which should be ok given the number of nodes rarely exceeds 8.
955 */
956 for_each_online_node(node) {
957 unsigned long faults;
958 int dist = node_distance(nid, node);
959
960 /*
961 * The furthest away nodes in the system are not interesting
962 * for placement; nid was already counted.
963 */
964 if (dist == sched_max_numa_distance || node == nid)
965 continue;
966
967 /*
968 * On systems with a backplane NUMA topology, compare groups
969 * of nodes, and move tasks towards the group with the most
970 * memory accesses. When comparing two nodes at distance
971 * "hoplimit", only nodes closer by than "hoplimit" are part
972 * of each group. Skip other nodes.
973 */
974 if (sched_numa_topology_type == NUMA_BACKPLANE &&
975 dist > maxdist)
976 continue;
977
978 /* Add up the faults from nearby nodes. */
979 if (task)
980 faults = task_faults(p, node);
981 else
982 faults = group_faults(p, node);
983
984 /*
985 * On systems with a glueless mesh NUMA topology, there are
986 * no fixed "groups of nodes". Instead, nodes that are not
987 * directly connected bounce traffic through intermediate
988 * nodes; a numa_group can occupy any set of nodes.
989 * The further away a node is, the less the faults count.
990 * This seems to result in good task placement.
991 */
992 if (sched_numa_topology_type == NUMA_GLUELESS_MESH) {
993 faults *= (sched_max_numa_distance - dist);
994 faults /= (sched_max_numa_distance - LOCAL_DISTANCE);
995 }
996
997 score += faults;
998 }
999
1000 return score;
925} 1001}
926 1002
927/* 1003/*
@@ -930,11 +1006,12 @@ static inline unsigned long group_faults_cpu(struct numa_group *group, int nid)
930 * larger multiplier, in order to group tasks together that are almost 1006 * larger multiplier, in order to group tasks together that are almost
931 * evenly spread out between numa nodes. 1007 * evenly spread out between numa nodes.
932 */ 1008 */
933static inline unsigned long task_weight(struct task_struct *p, int nid) 1009static inline unsigned long task_weight(struct task_struct *p, int nid,
1010 int dist)
934{ 1011{
935 unsigned long total_faults; 1012 unsigned long faults, total_faults;
936 1013
937 if (!p->numa_faults_memory) 1014 if (!p->numa_faults)
938 return 0; 1015 return 0;
939 1016
940 total_faults = p->total_numa_faults; 1017 total_faults = p->total_numa_faults;
@@ -942,15 +1019,29 @@ static inline unsigned long task_weight(struct task_struct *p, int nid)
942 if (!total_faults) 1019 if (!total_faults)
943 return 0; 1020 return 0;
944 1021
945 return 1000 * task_faults(p, nid) / total_faults; 1022 faults = task_faults(p, nid);
1023 faults += score_nearby_nodes(p, nid, dist, true);
1024
1025 return 1000 * faults / total_faults;
946} 1026}
947 1027
948static inline unsigned long group_weight(struct task_struct *p, int nid) 1028static inline unsigned long group_weight(struct task_struct *p, int nid,
1029 int dist)
949{ 1030{
950 if (!p->numa_group || !p->numa_group->total_faults) 1031 unsigned long faults, total_faults;
1032
1033 if (!p->numa_group)
1034 return 0;
1035
1036 total_faults = p->numa_group->total_faults;
1037
1038 if (!total_faults)
951 return 0; 1039 return 0;
952 1040
953 return 1000 * group_faults(p, nid) / p->numa_group->total_faults; 1041 faults = group_faults(p, nid);
1042 faults += score_nearby_nodes(p, nid, dist, false);
1043
1044 return 1000 * faults / total_faults;
954} 1045}
955 1046
956bool should_numa_migrate_memory(struct task_struct *p, struct page * page, 1047bool should_numa_migrate_memory(struct task_struct *p, struct page * page,
@@ -1083,6 +1174,7 @@ struct task_numa_env {
1083 struct numa_stats src_stats, dst_stats; 1174 struct numa_stats src_stats, dst_stats;
1084 1175
1085 int imbalance_pct; 1176 int imbalance_pct;
1177 int dist;
1086 1178
1087 struct task_struct *best_task; 1179 struct task_struct *best_task;
1088 long best_imp; 1180 long best_imp;
@@ -1162,11 +1254,29 @@ static void task_numa_compare(struct task_numa_env *env,
1162 long load; 1254 long load;
1163 long imp = env->p->numa_group ? groupimp : taskimp; 1255 long imp = env->p->numa_group ? groupimp : taskimp;
1164 long moveimp = imp; 1256 long moveimp = imp;
1257 int dist = env->dist;
1165 1258
1166 rcu_read_lock(); 1259 rcu_read_lock();
1167 cur = ACCESS_ONCE(dst_rq->curr); 1260
1168 if (cur->pid == 0) /* idle */ 1261 raw_spin_lock_irq(&dst_rq->lock);
1262 cur = dst_rq->curr;
1263 /*
1264 * No need to move the exiting task, and this ensures that ->curr
1265 * wasn't reaped and thus get_task_struct() in task_numa_assign()
1266 * is safe under RCU read lock.
1267 * Note that rcu_read_lock() itself can't protect from the final
1268 * put_task_struct() after the last schedule().
1269 */
1270 if ((cur->flags & PF_EXITING) || is_idle_task(cur))
1169 cur = NULL; 1271 cur = NULL;
1272 raw_spin_unlock_irq(&dst_rq->lock);
1273
1274 /*
1275 * Because we have preemption enabled we can get migrated around and
1276 * end up trying to select ourselves (current == env->p) as a swap candidate.
1277 */
1278 if (cur == env->p)
1279 goto unlock;
1170 1280
1171 /* 1281 /*
1172 * "imp" is the fault differential for the source task between the 1282 * "imp" is the fault differential for the source task between the
@@ -1185,8 +1295,8 @@ static void task_numa_compare(struct task_numa_env *env,
1185 * in any group then look only at task weights. 1295 * in any group then look only at task weights.
1186 */ 1296 */
1187 if (cur->numa_group == env->p->numa_group) { 1297 if (cur->numa_group == env->p->numa_group) {
1188 imp = taskimp + task_weight(cur, env->src_nid) - 1298 imp = taskimp + task_weight(cur, env->src_nid, dist) -
1189 task_weight(cur, env->dst_nid); 1299 task_weight(cur, env->dst_nid, dist);
1190 /* 1300 /*
1191 * Add some hysteresis to prevent swapping the 1301 * Add some hysteresis to prevent swapping the
1192 * tasks within a group over tiny differences. 1302 * tasks within a group over tiny differences.
@@ -1200,11 +1310,11 @@ static void task_numa_compare(struct task_numa_env *env,
1200 * instead. 1310 * instead.
1201 */ 1311 */
1202 if (cur->numa_group) 1312 if (cur->numa_group)
1203 imp += group_weight(cur, env->src_nid) - 1313 imp += group_weight(cur, env->src_nid, dist) -
1204 group_weight(cur, env->dst_nid); 1314 group_weight(cur, env->dst_nid, dist);
1205 else 1315 else
1206 imp += task_weight(cur, env->src_nid) - 1316 imp += task_weight(cur, env->src_nid, dist) -
1207 task_weight(cur, env->dst_nid); 1317 task_weight(cur, env->dst_nid, dist);
1208 } 1318 }
1209 } 1319 }
1210 1320
@@ -1303,7 +1413,7 @@ static int task_numa_migrate(struct task_struct *p)
1303 }; 1413 };
1304 struct sched_domain *sd; 1414 struct sched_domain *sd;
1305 unsigned long taskweight, groupweight; 1415 unsigned long taskweight, groupweight;
1306 int nid, ret; 1416 int nid, ret, dist;
1307 long taskimp, groupimp; 1417 long taskimp, groupimp;
1308 1418
1309 /* 1419 /*
@@ -1331,29 +1441,45 @@ static int task_numa_migrate(struct task_struct *p)
1331 return -EINVAL; 1441 return -EINVAL;
1332 } 1442 }
1333 1443
1334 taskweight = task_weight(p, env.src_nid);
1335 groupweight = group_weight(p, env.src_nid);
1336 update_numa_stats(&env.src_stats, env.src_nid);
1337 env.dst_nid = p->numa_preferred_nid; 1444 env.dst_nid = p->numa_preferred_nid;
1338 taskimp = task_weight(p, env.dst_nid) - taskweight; 1445 dist = env.dist = node_distance(env.src_nid, env.dst_nid);
1339 groupimp = group_weight(p, env.dst_nid) - groupweight; 1446 taskweight = task_weight(p, env.src_nid, dist);
1447 groupweight = group_weight(p, env.src_nid, dist);
1448 update_numa_stats(&env.src_stats, env.src_nid);
1449 taskimp = task_weight(p, env.dst_nid, dist) - taskweight;
1450 groupimp = group_weight(p, env.dst_nid, dist) - groupweight;
1340 update_numa_stats(&env.dst_stats, env.dst_nid); 1451 update_numa_stats(&env.dst_stats, env.dst_nid);
1341 1452
1342 /* Try to find a spot on the preferred nid. */ 1453 /* Try to find a spot on the preferred nid. */
1343 task_numa_find_cpu(&env, taskimp, groupimp); 1454 task_numa_find_cpu(&env, taskimp, groupimp);
1344 1455
1345 /* No space available on the preferred nid. Look elsewhere. */ 1456 /*
1346 if (env.best_cpu == -1) { 1457 * Look at other nodes in these cases:
1458 * - there is no space available on the preferred_nid
1459 * - the task is part of a numa_group that is interleaved across
1460 * multiple NUMA nodes; in order to better consolidate the group,
1461 * we need to check other locations.
1462 */
1463 if (env.best_cpu == -1 || (p->numa_group &&
1464 nodes_weight(p->numa_group->active_nodes) > 1)) {
1347 for_each_online_node(nid) { 1465 for_each_online_node(nid) {
1348 if (nid == env.src_nid || nid == p->numa_preferred_nid) 1466 if (nid == env.src_nid || nid == p->numa_preferred_nid)
1349 continue; 1467 continue;
1350 1468
1469 dist = node_distance(env.src_nid, env.dst_nid);
1470 if (sched_numa_topology_type == NUMA_BACKPLANE &&
1471 dist != env.dist) {
1472 taskweight = task_weight(p, env.src_nid, dist);
1473 groupweight = group_weight(p, env.src_nid, dist);
1474 }
1475
1351 /* Only consider nodes where both task and groups benefit */ 1476 /* Only consider nodes where both task and groups benefit */
1352 taskimp = task_weight(p, nid) - taskweight; 1477 taskimp = task_weight(p, nid, dist) - taskweight;
1353 groupimp = group_weight(p, nid) - groupweight; 1478 groupimp = group_weight(p, nid, dist) - groupweight;
1354 if (taskimp < 0 && groupimp < 0) 1479 if (taskimp < 0 && groupimp < 0)
1355 continue; 1480 continue;
1356 1481
1482 env.dist = dist;
1357 env.dst_nid = nid; 1483 env.dst_nid = nid;
1358 update_numa_stats(&env.dst_stats, env.dst_nid); 1484 update_numa_stats(&env.dst_stats, env.dst_nid);
1359 task_numa_find_cpu(&env, taskimp, groupimp); 1485 task_numa_find_cpu(&env, taskimp, groupimp);
@@ -1408,7 +1534,7 @@ static void numa_migrate_preferred(struct task_struct *p)
1408 unsigned long interval = HZ; 1534 unsigned long interval = HZ;
1409 1535
1410 /* This task has no NUMA fault statistics yet */ 1536 /* This task has no NUMA fault statistics yet */
1411 if (unlikely(p->numa_preferred_nid == -1 || !p->numa_faults_memory)) 1537 if (unlikely(p->numa_preferred_nid == -1 || !p->numa_faults))
1412 return; 1538 return;
1413 1539
1414 /* Periodically retry migrating the task to the preferred node */ 1540 /* Periodically retry migrating the task to the preferred node */
@@ -1520,7 +1646,7 @@ static void update_task_scan_period(struct task_struct *p,
1520 * scanning faster if shared accesses dominate as it may 1646 * scanning faster if shared accesses dominate as it may
1521 * simply bounce migrations uselessly 1647 * simply bounce migrations uselessly
1522 */ 1648 */
1523 ratio = DIV_ROUND_UP(private * NUMA_PERIOD_SLOTS, (private + shared)); 1649 ratio = DIV_ROUND_UP(private * NUMA_PERIOD_SLOTS, (private + shared + 1));
1524 diff = (diff * ratio) / NUMA_PERIOD_SLOTS; 1650 diff = (diff * ratio) / NUMA_PERIOD_SLOTS;
1525 } 1651 }
1526 1652
@@ -1557,6 +1683,92 @@ static u64 numa_get_avg_runtime(struct task_struct *p, u64 *period)
1557 return delta; 1683 return delta;
1558} 1684}
1559 1685
1686/*
1687 * Determine the preferred nid for a task in a numa_group. This needs to
1688 * be done in a way that produces consistent results with group_weight,
1689 * otherwise workloads might not converge.
1690 */
1691static int preferred_group_nid(struct task_struct *p, int nid)
1692{
1693 nodemask_t nodes;
1694 int dist;
1695
1696 /* Direct connections between all NUMA nodes. */
1697 if (sched_numa_topology_type == NUMA_DIRECT)
1698 return nid;
1699
1700 /*
1701 * On a system with glueless mesh NUMA topology, group_weight
1702 * scores nodes according to the number of NUMA hinting faults on
1703 * both the node itself, and on nearby nodes.
1704 */
1705 if (sched_numa_topology_type == NUMA_GLUELESS_MESH) {
1706 unsigned long score, max_score = 0;
1707 int node, max_node = nid;
1708
1709 dist = sched_max_numa_distance;
1710
1711 for_each_online_node(node) {
1712 score = group_weight(p, node, dist);
1713 if (score > max_score) {
1714 max_score = score;
1715 max_node = node;
1716 }
1717 }
1718 return max_node;
1719 }
1720
1721 /*
1722 * Finding the preferred nid in a system with NUMA backplane
1723 * interconnect topology is more involved. The goal is to locate
1724 * tasks from numa_groups near each other in the system, and
1725 * untangle workloads from different sides of the system. This requires
1726 * searching down the hierarchy of node groups, recursively searching
1727 * inside the highest scoring group of nodes. The nodemask tricks
1728 * keep the complexity of the search down.
1729 */
1730 nodes = node_online_map;
1731 for (dist = sched_max_numa_distance; dist > LOCAL_DISTANCE; dist--) {
1732 unsigned long max_faults = 0;
1733 nodemask_t max_group;
1734 int a, b;
1735
1736 /* Are there nodes at this distance from each other? */
1737 if (!find_numa_distance(dist))
1738 continue;
1739
1740 for_each_node_mask(a, nodes) {
1741 unsigned long faults = 0;
1742 nodemask_t this_group;
1743 nodes_clear(this_group);
1744
1745 /* Sum group's NUMA faults; includes a==b case. */
1746 for_each_node_mask(b, nodes) {
1747 if (node_distance(a, b) < dist) {
1748 faults += group_faults(p, b);
1749 node_set(b, this_group);
1750 node_clear(b, nodes);
1751 }
1752 }
1753
1754 /* Remember the top group. */
1755 if (faults > max_faults) {
1756 max_faults = faults;
1757 max_group = this_group;
1758 /*
1759 * subtle: at the smallest distance there is
1760 * just one node left in each "group", the
1761 * winner is the preferred nid.
1762 */
1763 nid = a;
1764 }
1765 }
1766 /* Next round, evaluate the nodes within max_group. */
1767 nodes = max_group;
1768 }
1769 return nid;
1770}
1771
1560static void task_numa_placement(struct task_struct *p) 1772static void task_numa_placement(struct task_struct *p)
1561{ 1773{
1562 int seq, nid, max_nid = -1, max_group_nid = -1; 1774 int seq, nid, max_nid = -1, max_group_nid = -1;
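
The recursive group-narrowing that preferred_group_nid() performs for backplane topologies can be followed with a compressed user-space analogue. Everything below (distance table, fault counts, the 10-step distance walk) is made up for illustration and omits the find_numa_distance() filtering the kernel does.

/* Build: cc -O2 group_nid.c -o group_nid */
#include <stdio.h>

#define N 4
#define LOCAL_DISTANCE 10

static const int dist[N][N] = {
	{ 10, 20, 40, 40 },
	{ 20, 10, 40, 40 },
	{ 40, 40, 10, 20 },
	{ 40, 40, 20, 10 },
};
/* Per-node NUMA-hinting faults for one hypothetical numa_group. */
static const unsigned long faults[N] = { 100, 300, 900, 200 };

int main(void)
{
	unsigned int nodes = (1u << N) - 1;	/* start from all nodes */
	int d, a, b, nid = 0;

	/* Walk distances from far to near, each round keeping only the
	 * group of nodes that accumulates the most faults. */
	for (d = 40; d > LOCAL_DISTANCE; d -= 10) {
		unsigned long max_faults = 0;
		unsigned int max_group = 0, left = nodes;

		for (a = 0; a < N; a++) {
			unsigned long f = 0;
			unsigned int group = 0;

			if (!(left & (1u << a)))
				continue;
			/* Sum the group's faults; includes a == b. */
			for (b = 0; b < N; b++) {
				if ((left & (1u << b)) && dist[a][b] < d) {
					f += faults[b];
					group |= 1u << b;
					left &= ~(1u << b);
				}
			}
			if (f > max_faults) {
				max_faults = f;
				max_group = group;
				/* At the smallest distance each group is a
				 * single node, so this is the winner. */
				nid = a;
			}
		}
		if (max_group)
			nodes = max_group;
	}
	printf("preferred nid: %d\n", nid);	/* expect 2 */
	return 0;
}
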
@@ -1584,18 +1796,23 @@ static void task_numa_placement(struct task_struct *p)
1584 1796
1585 /* Find the node with the highest number of faults */ 1797 /* Find the node with the highest number of faults */
1586 for_each_online_node(nid) { 1798 for_each_online_node(nid) {
1799 /* Keep track of the offsets in numa_faults array */
1800 int mem_idx, membuf_idx, cpu_idx, cpubuf_idx;
1587 unsigned long faults = 0, group_faults = 0; 1801 unsigned long faults = 0, group_faults = 0;
1588 int priv, i; 1802 int priv;
1589 1803
1590 for (priv = 0; priv < NR_NUMA_HINT_FAULT_TYPES; priv++) { 1804 for (priv = 0; priv < NR_NUMA_HINT_FAULT_TYPES; priv++) {
1591 long diff, f_diff, f_weight; 1805 long diff, f_diff, f_weight;
1592 1806
1593 i = task_faults_idx(nid, priv); 1807 mem_idx = task_faults_idx(NUMA_MEM, nid, priv);
1808 membuf_idx = task_faults_idx(NUMA_MEMBUF, nid, priv);
1809 cpu_idx = task_faults_idx(NUMA_CPU, nid, priv);
1810 cpubuf_idx = task_faults_idx(NUMA_CPUBUF, nid, priv);
1594 1811
1595 /* Decay existing window, copy faults since last scan */ 1812 /* Decay existing window, copy faults since last scan */
1596 diff = p->numa_faults_buffer_memory[i] - p->numa_faults_memory[i] / 2; 1813 diff = p->numa_faults[membuf_idx] - p->numa_faults[mem_idx] / 2;
1597 fault_types[priv] += p->numa_faults_buffer_memory[i]; 1814 fault_types[priv] += p->numa_faults[membuf_idx];
1598 p->numa_faults_buffer_memory[i] = 0; 1815 p->numa_faults[membuf_idx] = 0;
1599 1816
1600 /* 1817 /*
1601 * Normalize the faults_from, so all tasks in a group 1818 * Normalize the faults_from, so all tasks in a group
@@ -1605,21 +1822,27 @@ static void task_numa_placement(struct task_struct *p)
1605 * faults are less important. 1822 * faults are less important.
1606 */ 1823 */
1607 f_weight = div64_u64(runtime << 16, period + 1); 1824 f_weight = div64_u64(runtime << 16, period + 1);
1608 f_weight = (f_weight * p->numa_faults_buffer_cpu[i]) / 1825 f_weight = (f_weight * p->numa_faults[cpubuf_idx]) /
1609 (total_faults + 1); 1826 (total_faults + 1);
1610 f_diff = f_weight - p->numa_faults_cpu[i] / 2; 1827 f_diff = f_weight - p->numa_faults[cpu_idx] / 2;
1611 p->numa_faults_buffer_cpu[i] = 0; 1828 p->numa_faults[cpubuf_idx] = 0;
1612 1829
1613 p->numa_faults_memory[i] += diff; 1830 p->numa_faults[mem_idx] += diff;
1614 p->numa_faults_cpu[i] += f_diff; 1831 p->numa_faults[cpu_idx] += f_diff;
1615 faults += p->numa_faults_memory[i]; 1832 faults += p->numa_faults[mem_idx];
1616 p->total_numa_faults += diff; 1833 p->total_numa_faults += diff;
1617 if (p->numa_group) { 1834 if (p->numa_group) {
1618 /* safe because we can only change our own group */ 1835 /*
1619 p->numa_group->faults[i] += diff; 1836 * safe because we can only change our own group
1620 p->numa_group->faults_cpu[i] += f_diff; 1837 *
1838 * mem_idx represents the offset for a given
1839 * nid and priv in a specific region because it
1840 * is at the beginning of the numa_faults array.
1841 */
1842 p->numa_group->faults[mem_idx] += diff;
1843 p->numa_group->faults_cpu[mem_idx] += f_diff;
1621 p->numa_group->total_faults += diff; 1844 p->numa_group->total_faults += diff;
1622 group_faults += p->numa_group->faults[i]; 1845 group_faults += p->numa_group->faults[mem_idx];
1623 } 1846 }
1624 } 1847 }
1625 1848
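The mem_idx/membuf_idx/cpu_idx/cpubuf_idx bookkeeping above relies on the single p->numa_faults array being split into four consecutive regions, one per numa_faults_stats value (NUMA_MEM, NUMA_CPU, NUMA_MEMBUF, NUMA_CPUBUF; see the sched.h hunk further down). The helper that computes the offsets is not part of this hunk, so the following is only a plausible layout, written to match the call sites above:

/* Illustrative layout, not taken from this patch: region 's' first, then the
 * node, then the shared/private slot.  With this scheme the NUMA_MEM region
 * starts at offset 0, which is why mem_idx can also index the per-group
 * faults[] and faults_cpu[] arrays that carry only the MEM and CPU regions.
 */
static inline int faults_idx(int s, int nid, int priv)
{
	return NR_NUMA_HINT_FAULT_TYPES * (s * nr_node_ids + nid) + priv;
}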
@@ -1639,7 +1862,7 @@ static void task_numa_placement(struct task_struct *p)
1639 if (p->numa_group) { 1862 if (p->numa_group) {
1640 update_numa_active_node_mask(p->numa_group); 1863 update_numa_active_node_mask(p->numa_group);
1641 spin_unlock_irq(group_lock); 1864 spin_unlock_irq(group_lock);
1642 max_nid = max_group_nid; 1865 max_nid = preferred_group_nid(p, max_group_nid);
1643 } 1866 }
1644 1867
1645 if (max_faults) { 1868 if (max_faults) {
@@ -1682,7 +1905,6 @@ static void task_numa_group(struct task_struct *p, int cpupid, int flags,
1682 1905
1683 atomic_set(&grp->refcount, 1); 1906 atomic_set(&grp->refcount, 1);
1684 spin_lock_init(&grp->lock); 1907 spin_lock_init(&grp->lock);
1685 INIT_LIST_HEAD(&grp->task_list);
1686 grp->gid = p->pid; 1908 grp->gid = p->pid;
1687 /* Second half of the array tracks nids where faults happen */ 1909 /* Second half of the array tracks nids where faults happen */
1688 grp->faults_cpu = grp->faults + NR_NUMA_HINT_FAULT_TYPES * 1910 grp->faults_cpu = grp->faults + NR_NUMA_HINT_FAULT_TYPES *
@@ -1691,11 +1913,10 @@ static void task_numa_group(struct task_struct *p, int cpupid, int flags,
1691 node_set(task_node(current), grp->active_nodes); 1913 node_set(task_node(current), grp->active_nodes);
1692 1914
1693 for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++) 1915 for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++)
1694 grp->faults[i] = p->numa_faults_memory[i]; 1916 grp->faults[i] = p->numa_faults[i];
1695 1917
1696 grp->total_faults = p->total_numa_faults; 1918 grp->total_faults = p->total_numa_faults;
1697 1919
1698 list_add(&p->numa_entry, &grp->task_list);
1699 grp->nr_tasks++; 1920 grp->nr_tasks++;
1700 rcu_assign_pointer(p->numa_group, grp); 1921 rcu_assign_pointer(p->numa_group, grp);
1701 } 1922 }
@@ -1750,13 +1971,12 @@ static void task_numa_group(struct task_struct *p, int cpupid, int flags,
1750 double_lock_irq(&my_grp->lock, &grp->lock); 1971 double_lock_irq(&my_grp->lock, &grp->lock);
1751 1972
1752 for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++) { 1973 for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++) {
1753 my_grp->faults[i] -= p->numa_faults_memory[i]; 1974 my_grp->faults[i] -= p->numa_faults[i];
1754 grp->faults[i] += p->numa_faults_memory[i]; 1975 grp->faults[i] += p->numa_faults[i];
1755 } 1976 }
1756 my_grp->total_faults -= p->total_numa_faults; 1977 my_grp->total_faults -= p->total_numa_faults;
1757 grp->total_faults += p->total_numa_faults; 1978 grp->total_faults += p->total_numa_faults;
1758 1979
1759 list_move(&p->numa_entry, &grp->task_list);
1760 my_grp->nr_tasks--; 1980 my_grp->nr_tasks--;
1761 grp->nr_tasks++; 1981 grp->nr_tasks++;
1762 1982
@@ -1776,27 +1996,23 @@ no_join:
1776void task_numa_free(struct task_struct *p) 1996void task_numa_free(struct task_struct *p)
1777{ 1997{
1778 struct numa_group *grp = p->numa_group; 1998 struct numa_group *grp = p->numa_group;
1779 void *numa_faults = p->numa_faults_memory; 1999 void *numa_faults = p->numa_faults;
1780 unsigned long flags; 2000 unsigned long flags;
1781 int i; 2001 int i;
1782 2002
1783 if (grp) { 2003 if (grp) {
1784 spin_lock_irqsave(&grp->lock, flags); 2004 spin_lock_irqsave(&grp->lock, flags);
1785 for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++) 2005 for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++)
1786 grp->faults[i] -= p->numa_faults_memory[i]; 2006 grp->faults[i] -= p->numa_faults[i];
1787 grp->total_faults -= p->total_numa_faults; 2007 grp->total_faults -= p->total_numa_faults;
1788 2008
1789 list_del(&p->numa_entry);
1790 grp->nr_tasks--; 2009 grp->nr_tasks--;
1791 spin_unlock_irqrestore(&grp->lock, flags); 2010 spin_unlock_irqrestore(&grp->lock, flags);
1792 RCU_INIT_POINTER(p->numa_group, NULL); 2011 RCU_INIT_POINTER(p->numa_group, NULL);
1793 put_numa_group(grp); 2012 put_numa_group(grp);
1794 } 2013 }
1795 2014
1796 p->numa_faults_memory = NULL; 2015 p->numa_faults = NULL;
1797 p->numa_faults_buffer_memory = NULL;
1798 p->numa_faults_cpu= NULL;
1799 p->numa_faults_buffer_cpu = NULL;
1800 kfree(numa_faults); 2016 kfree(numa_faults);
1801} 2017}
1802 2018
@@ -1819,24 +2035,14 @@ void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags)
1819 return; 2035 return;
1820 2036
1821 /* Allocate buffer to track faults on a per-node basis */ 2037 /* Allocate buffer to track faults on a per-node basis */
1822 if (unlikely(!p->numa_faults_memory)) { 2038 if (unlikely(!p->numa_faults)) {
1823 int size = sizeof(*p->numa_faults_memory) * 2039 int size = sizeof(*p->numa_faults) *
1824 NR_NUMA_HINT_FAULT_BUCKETS * nr_node_ids; 2040 NR_NUMA_HINT_FAULT_BUCKETS * nr_node_ids;
1825 2041
1826 p->numa_faults_memory = kzalloc(size, GFP_KERNEL|__GFP_NOWARN); 2042 p->numa_faults = kzalloc(size, GFP_KERNEL|__GFP_NOWARN);
1827 if (!p->numa_faults_memory) 2043 if (!p->numa_faults)
1828 return; 2044 return;
1829 2045
1830 BUG_ON(p->numa_faults_buffer_memory);
1831 /*
1832 * The averaged statistics, shared & private, memory & cpu,
1833 * occupy the first half of the array. The second half of the
1834 * array is for current counters, which are averaged into the
1835 * first set by task_numa_placement.
1836 */
1837 p->numa_faults_cpu = p->numa_faults_memory + (2 * nr_node_ids);
1838 p->numa_faults_buffer_memory = p->numa_faults_memory + (4 * nr_node_ids);
1839 p->numa_faults_buffer_cpu = p->numa_faults_memory + (6 * nr_node_ids);
1840 p->total_numa_faults = 0; 2046 p->total_numa_faults = 0;
1841 memset(p->numa_faults_locality, 0, sizeof(p->numa_faults_locality)); 2047 memset(p->numa_faults_locality, 0, sizeof(p->numa_faults_locality));
1842 } 2048 }
@@ -1876,8 +2082,8 @@ void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags)
1876 if (migrated) 2082 if (migrated)
1877 p->numa_pages_migrated += pages; 2083 p->numa_pages_migrated += pages;
1878 2084
1879 p->numa_faults_buffer_memory[task_faults_idx(mem_node, priv)] += pages; 2085 p->numa_faults[task_faults_idx(NUMA_MEMBUF, mem_node, priv)] += pages;
1880 p->numa_faults_buffer_cpu[task_faults_idx(cpu_node, priv)] += pages; 2086 p->numa_faults[task_faults_idx(NUMA_CPUBUF, cpu_node, priv)] += pages;
1881 p->numa_faults_locality[local] += pages; 2087 p->numa_faults_locality[local] += pages;
1882} 2088}
1883 2089
@@ -4446,7 +4652,7 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
4446 latest_idle_timestamp = rq->idle_stamp; 4652 latest_idle_timestamp = rq->idle_stamp;
4447 shallowest_idle_cpu = i; 4653 shallowest_idle_cpu = i;
4448 } 4654 }
4449 } else { 4655 } else if (shallowest_idle_cpu == -1) {
4450 load = weighted_cpuload(i); 4656 load = weighted_cpuload(i);
4451 if (load < min_load || (load == min_load && i == this_cpu)) { 4657 if (load < min_load || (load == min_load && i == this_cpu)) {
4452 min_load = load; 4658 min_load = load;
@@ -4524,9 +4730,6 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f
4524 int want_affine = 0; 4730 int want_affine = 0;
4525 int sync = wake_flags & WF_SYNC; 4731 int sync = wake_flags & WF_SYNC;
4526 4732
4527 if (p->nr_cpus_allowed == 1)
4528 return prev_cpu;
4529
4530 if (sd_flag & SD_BALANCE_WAKE) 4733 if (sd_flag & SD_BALANCE_WAKE)
4531 want_affine = cpumask_test_cpu(cpu, tsk_cpus_allowed(p)); 4734 want_affine = cpumask_test_cpu(cpu, tsk_cpus_allowed(p));
4532 4735
@@ -5166,7 +5369,7 @@ static bool migrate_improves_locality(struct task_struct *p, struct lb_env *env)
5166 struct numa_group *numa_group = rcu_dereference(p->numa_group); 5369 struct numa_group *numa_group = rcu_dereference(p->numa_group);
5167 int src_nid, dst_nid; 5370 int src_nid, dst_nid;
5168 5371
5169 if (!sched_feat(NUMA_FAVOUR_HIGHER) || !p->numa_faults_memory || 5372 if (!sched_feat(NUMA_FAVOUR_HIGHER) || !p->numa_faults ||
5170 !(env->sd->flags & SD_NUMA)) { 5373 !(env->sd->flags & SD_NUMA)) {
5171 return false; 5374 return false;
5172 } 5375 }
@@ -5205,7 +5408,7 @@ static bool migrate_degrades_locality(struct task_struct *p, struct lb_env *env)
5205 if (!sched_feat(NUMA) || !sched_feat(NUMA_RESIST_LOWER)) 5408 if (!sched_feat(NUMA) || !sched_feat(NUMA_RESIST_LOWER))
5206 return false; 5409 return false;
5207 5410
5208 if (!p->numa_faults_memory || !(env->sd->flags & SD_NUMA)) 5411 if (!p->numa_faults || !(env->sd->flags & SD_NUMA))
5209 return false; 5412 return false;
5210 5413
5211 src_nid = cpu_to_node(env->src_cpu); 5414 src_nid = cpu_to_node(env->src_cpu);
@@ -6149,8 +6352,10 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd
6149 * with a large weight task outweighs the tasks on the system). 6352 * with a large weight task outweighs the tasks on the system).
6150 */ 6353 */
6151 if (prefer_sibling && sds->local && 6354 if (prefer_sibling && sds->local &&
6152 sds->local_stat.group_has_free_capacity) 6355 sds->local_stat.group_has_free_capacity) {
6153 sgs->group_capacity_factor = min(sgs->group_capacity_factor, 1U); 6356 sgs->group_capacity_factor = min(sgs->group_capacity_factor, 1U);
6357 sgs->group_type = group_classify(sg, sgs);
6358 }
6154 6359
6155 if (update_sd_pick_busiest(env, sds, sg, sgs)) { 6360 if (update_sd_pick_busiest(env, sds, sg, sgs)) {
6156 sds->busiest = sg; 6361 sds->busiest = sg;
@@ -7938,6 +8143,8 @@ const struct sched_class fair_sched_class = {
7938 8143
7939 .get_rr_interval = get_rr_interval_fair, 8144 .get_rr_interval = get_rr_interval_fair,
7940 8145
8146 .update_curr = update_curr_fair,
8147
7941#ifdef CONFIG_FAIR_GROUP_SCHED 8148#ifdef CONFIG_FAIR_GROUP_SCHED
7942 .task_move_group = task_move_group_fair, 8149 .task_move_group = task_move_group_fair,
7943#endif 8150#endif
diff --git a/kernel/sched/idle_task.c b/kernel/sched/idle_task.c
index 67ad4e7f506a..c65dac8c97cd 100644
--- a/kernel/sched/idle_task.c
+++ b/kernel/sched/idle_task.c
@@ -75,6 +75,10 @@ static unsigned int get_rr_interval_idle(struct rq *rq, struct task_struct *task
75 return 0; 75 return 0;
76} 76}
77 77
78static void update_curr_idle(struct rq *rq)
79{
80}
81
78/* 82/*
79 * Simple, special scheduling class for the per-CPU idle tasks: 83 * Simple, special scheduling class for the per-CPU idle tasks:
80 */ 84 */
@@ -101,4 +105,5 @@ const struct sched_class idle_sched_class = {
101 105
102 .prio_changed = prio_changed_idle, 106 .prio_changed = prio_changed_idle,
103 .switched_to = switched_to_idle, 107 .switched_to = switched_to_idle,
108 .update_curr = update_curr_idle,
104}; 109};
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index d024e6ce30ba..ee15f5a0d1c1 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -1301,9 +1301,6 @@ select_task_rq_rt(struct task_struct *p, int cpu, int sd_flag, int flags)
1301 struct task_struct *curr; 1301 struct task_struct *curr;
1302 struct rq *rq; 1302 struct rq *rq;
1303 1303
1304 if (p->nr_cpus_allowed == 1)
1305 goto out;
1306
1307 /* For anything but wake ups, just return the task_cpu */ 1304 /* For anything but wake ups, just return the task_cpu */
1308 if (sd_flag != SD_BALANCE_WAKE && sd_flag != SD_BALANCE_FORK) 1305 if (sd_flag != SD_BALANCE_WAKE && sd_flag != SD_BALANCE_FORK)
1309 goto out; 1306 goto out;
@@ -1351,16 +1348,22 @@ out:
1351 1348
1352static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p) 1349static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p)
1353{ 1350{
1354 if (rq->curr->nr_cpus_allowed == 1) 1351 /*
1352 * Current can't be migrated, useless to reschedule,
1353 * let's hope p can move out.
1354 */
1355 if (rq->curr->nr_cpus_allowed == 1 ||
1356 !cpupri_find(&rq->rd->cpupri, rq->curr, NULL))
1355 return; 1357 return;
1356 1358
1359 /*
1360 * p is migratable, so let's not schedule it and
1361 * see if it is pushed or pulled somewhere else.
1362 */
1357 if (p->nr_cpus_allowed != 1 1363 if (p->nr_cpus_allowed != 1
1358 && cpupri_find(&rq->rd->cpupri, p, NULL)) 1364 && cpupri_find(&rq->rd->cpupri, p, NULL))
1359 return; 1365 return;
1360 1366
1361 if (!cpupri_find(&rq->rd->cpupri, rq->curr, NULL))
1362 return;
1363
1364 /* 1367 /*
1365 * There appears to be other cpus that can accept 1368 * There appears to be other cpus that can accept
1366 * current and none to run 'p', so lets reschedule 1369 * current and none to run 'p', so lets reschedule
@@ -2128,6 +2131,8 @@ const struct sched_class rt_sched_class = {
2128 2131
2129 .prio_changed = prio_changed_rt, 2132 .prio_changed = prio_changed_rt,
2130 .switched_to = switched_to_rt, 2133 .switched_to = switched_to_rt,
2134
2135 .update_curr = update_curr_rt,
2131}; 2136};
2132 2137
2133#ifdef CONFIG_SCHED_DEBUG 2138#ifdef CONFIG_SCHED_DEBUG
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 24156c8434d1..9a2a45c970e7 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -176,6 +176,25 @@ struct dl_bw {
176 u64 bw, total_bw; 176 u64 bw, total_bw;
177}; 177};
178 178
179static inline
180void __dl_clear(struct dl_bw *dl_b, u64 tsk_bw)
181{
182 dl_b->total_bw -= tsk_bw;
183}
184
185static inline
186void __dl_add(struct dl_bw *dl_b, u64 tsk_bw)
187{
188 dl_b->total_bw += tsk_bw;
189}
190
191static inline
192bool __dl_overflow(struct dl_bw *dl_b, int cpus, u64 old_bw, u64 new_bw)
193{
194 return dl_b->bw != -1 &&
195 dl_b->bw * cpus < dl_b->total_bw - old_bw + new_bw;
196}
197
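The three helpers above implement the deadline-bandwidth admission test: a request of new_bw is rejected when, after dropping old_bw, the summed bandwidth would exceed dl_b->bw replicated across the root domain's CPUs (bw == -1 means no limit). Below is a standalone sketch of that arithmetic; the 20-bit fixed-point encoding of runtime/period is an assumption made for the example, not something this hunk defines.

#include <stdio.h>
#include <stdint.h>

#define BW_SHIFT 20	/* assumed fixed-point scale: bw = (runtime << 20) / period */

struct dl_bw { int64_t bw; uint64_t total_bw; };

static uint64_t to_bw(uint64_t runtime_ns, uint64_t period_ns)
{
	return (runtime_ns << BW_SHIFT) / period_ns;
}

static int dl_overflow(const struct dl_bw *dl_b, int cpus,
		       uint64_t old_bw, uint64_t new_bw)
{
	return dl_b->bw != -1 &&
	       (uint64_t)dl_b->bw * cpus < dl_b->total_bw - old_bw + new_bw;
}

int main(void)
{
	/* 95% of each of 4 CPUs reserved for deadline tasks. */
	struct dl_bw dl_b = { .bw = (int64_t)to_bw(950000, 1000000), .total_bw = 0 };
	uint64_t task_bw = to_bw(30000000, 100000000);	/* 30ms every 100ms */
	int admitted = 0;

	while (!dl_overflow(&dl_b, 4, 0, task_bw)) {
		dl_b.total_bw += task_bw;	/* what __dl_add() does */
		admitted++;
	}
	printf("admitted %d tasks\n", admitted);	/* 12: 12 * 0.3 <= 4 * 0.95 */
	return 0;
}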
179extern struct mutex sched_domains_mutex; 198extern struct mutex sched_domains_mutex;
180 199
181#ifdef CONFIG_CGROUP_SCHED 200#ifdef CONFIG_CGROUP_SCHED
@@ -678,7 +697,25 @@ static inline u64 rq_clock_task(struct rq *rq)
678 return rq->clock_task; 697 return rq->clock_task;
679} 698}
680 699
700#ifdef CONFIG_NUMA
701enum numa_topology_type {
702 NUMA_DIRECT,
703 NUMA_GLUELESS_MESH,
704 NUMA_BACKPLANE,
705};
706extern enum numa_topology_type sched_numa_topology_type;
707extern int sched_max_numa_distance;
708extern bool find_numa_distance(int distance);
709#endif
710
681#ifdef CONFIG_NUMA_BALANCING 711#ifdef CONFIG_NUMA_BALANCING
712/* The regions in numa_faults array from task_struct */
713enum numa_faults_stats {
714 NUMA_MEM = 0,
715 NUMA_CPU,
716 NUMA_MEMBUF,
717 NUMA_CPUBUF
718};
682extern void sched_setnuma(struct task_struct *p, int node); 719extern void sched_setnuma(struct task_struct *p, int node);
683extern int migrate_task_to(struct task_struct *p, int cpu); 720extern int migrate_task_to(struct task_struct *p, int cpu);
684extern int migrate_swap(struct task_struct *, struct task_struct *); 721extern int migrate_swap(struct task_struct *, struct task_struct *);
@@ -1127,6 +1164,11 @@ struct sched_class {
1127 void (*task_fork) (struct task_struct *p); 1164 void (*task_fork) (struct task_struct *p);
1128 void (*task_dead) (struct task_struct *p); 1165 void (*task_dead) (struct task_struct *p);
1129 1166
1167 /*
1168 * The switched_from() call is allowed to drop rq->lock, therefore we
1169 * cannot assume the switched_from/switched_to pair is serialized by
1170 * rq->lock. They are however serialized by p->pi_lock.
1171 */
1130 void (*switched_from) (struct rq *this_rq, struct task_struct *task); 1172 void (*switched_from) (struct rq *this_rq, struct task_struct *task);
1131 void (*switched_to) (struct rq *this_rq, struct task_struct *task); 1173 void (*switched_to) (struct rq *this_rq, struct task_struct *task);
1132 void (*prio_changed) (struct rq *this_rq, struct task_struct *task, 1174 void (*prio_changed) (struct rq *this_rq, struct task_struct *task,
@@ -1135,6 +1177,8 @@ struct sched_class {
1135 unsigned int (*get_rr_interval) (struct rq *rq, 1177 unsigned int (*get_rr_interval) (struct rq *rq,
1136 struct task_struct *task); 1178 struct task_struct *task);
1137 1179
1180 void (*update_curr) (struct rq *rq);
1181
1138#ifdef CONFIG_FAIR_GROUP_SCHED 1182#ifdef CONFIG_FAIR_GROUP_SCHED
1139 void (*task_move_group) (struct task_struct *p, int on_rq); 1183 void (*task_move_group) (struct task_struct *p, int on_rq);
1140#endif 1184#endif
@@ -1502,6 +1546,7 @@ extern struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq);
1502extern struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq); 1546extern struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq);
1503extern void print_cfs_stats(struct seq_file *m, int cpu); 1547extern void print_cfs_stats(struct seq_file *m, int cpu);
1504extern void print_rt_stats(struct seq_file *m, int cpu); 1548extern void print_rt_stats(struct seq_file *m, int cpu);
1549extern void print_dl_stats(struct seq_file *m, int cpu);
1505 1550
1506extern void init_cfs_rq(struct cfs_rq *cfs_rq); 1551extern void init_cfs_rq(struct cfs_rq *cfs_rq);
1507extern void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq); 1552extern void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq);
diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c
index 67426e529f59..79ffec45a6ac 100644
--- a/kernel/sched/stop_task.c
+++ b/kernel/sched/stop_task.c
@@ -102,6 +102,10 @@ get_rr_interval_stop(struct rq *rq, struct task_struct *task)
102 return 0; 102 return 0;
103} 103}
104 104
105static void update_curr_stop(struct rq *rq)
106{
107}
108
105/* 109/*
106 * Simple, special scheduling class for the per-CPU stop tasks: 110 * Simple, special scheduling class for the per-CPU stop tasks:
107 */ 111 */
@@ -128,4 +132,5 @@ const struct sched_class stop_sched_class = {
128 132
129 .prio_changed = prio_changed_stop, 133 .prio_changed = prio_changed_stop,
130 .switched_to = switched_to_stop, 134 .switched_to = switched_to_stop,
135 .update_curr = update_curr_stop,
131}; 136};
diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c
index 5a62915f47a8..852143a79f36 100644
--- a/kernel/sched/wait.c
+++ b/kernel/sched/wait.c
@@ -9,6 +9,7 @@
9#include <linux/mm.h> 9#include <linux/mm.h>
10#include <linux/wait.h> 10#include <linux/wait.h>
11#include <linux/hash.h> 11#include <linux/hash.h>
12#include <linux/kthread.h>
12 13
13void __init_waitqueue_head(wait_queue_head_t *q, const char *name, struct lock_class_key *key) 14void __init_waitqueue_head(wait_queue_head_t *q, const char *name, struct lock_class_key *key)
14{ 15{
@@ -297,6 +298,71 @@ int autoremove_wake_function(wait_queue_t *wait, unsigned mode, int sync, void *
297} 298}
298EXPORT_SYMBOL(autoremove_wake_function); 299EXPORT_SYMBOL(autoremove_wake_function);
299 300
301static inline bool is_kthread_should_stop(void)
302{
303 return (current->flags & PF_KTHREAD) && kthread_should_stop();
304}
305
306/*
307 * DEFINE_WAIT_FUNC(wait, woken_wake_function);
308 *
309 * add_wait_queue(&wq, &wait);
310 * for (;;) {
311 * if (condition)
312 * break;
313 *
314 * p->state = mode; condition = true;
315 * smp_mb(); // A smp_wmb(); // C
316 * if (!(wait->flags & WQ_FLAG_WOKEN)) wait->flags |= WQ_FLAG_WOKEN;
317 * schedule() try_to_wake_up();
318 * p->state = TASK_RUNNING; ~~~~~~~~~~~~~~~~~~
319 * wait->flags &= ~WQ_FLAG_WOKEN; condition = true;
320 * smp_mb() // B smp_wmb(); // C
321 * wait->flags |= WQ_FLAG_WOKEN;
322 * }
323 * remove_wait_queue(&wq, &wait);
324 *
325 */
326long wait_woken(wait_queue_t *wait, unsigned mode, long timeout)
327{
328 set_current_state(mode); /* A */
329 /*
330 * The above implies an smp_mb(), which matches with the smp_wmb() from
331 * woken_wake_function() such that if we observe WQ_FLAG_WOKEN we must
332 * also observe all state before the wakeup.
333 */
334 if (!(wait->flags & WQ_FLAG_WOKEN) && !is_kthread_should_stop())
335 timeout = schedule_timeout(timeout);
336 __set_current_state(TASK_RUNNING);
337
338 /*
339 * The below implies an smp_mb(), it too pairs with the smp_wmb() from
340 * woken_wake_function() such that we must either observe the wait
341 * condition being true _OR_ WQ_FLAG_WOKEN such that we will not miss
342 * an event.
343 */
344 set_mb(wait->flags, wait->flags & ~WQ_FLAG_WOKEN); /* B */
345
346 return timeout;
347}
348EXPORT_SYMBOL(wait_woken);
349
350int woken_wake_function(wait_queue_t *wait, unsigned mode, int sync, void *key)
351{
352 /*
353 * Although this function is called under waitqueue lock, LOCK
354 * doesn't imply write barrier and the users expect write
355 * barrier semantics on wakeup functions. The following
356 * smp_wmb() is equivalent to smp_wmb() in try_to_wake_up()
357 * and is paired with set_mb() in wait_woken().
358 */
359 smp_wmb(); /* C */
360 wait->flags |= WQ_FLAG_WOKEN;
361
362 return default_wake_function(wait, mode, sync, key);
363}
364EXPORT_SYMBOL(woken_wake_function);
365
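The intended calling pattern for the new pair, spelled out from the pseudo-code in the comment above. The wait queue, condition flag and timeout below are placeholders rather than names from an in-tree user; the point is that the condition check and the WQ_FLAG_WOKEN handling inside wait_woken() close the wakeup race without an explicit set_current_state() loop in the caller.

/* Waiter side (sketch; 'cond' is whatever condition the caller waits for): */
static long wait_for_cond(wait_queue_head_t *wq, bool *cond, long timeout)
{
	DEFINE_WAIT_FUNC(wait, woken_wake_function);

	add_wait_queue(wq, &wait);
	while (!*cond) {
		timeout = wait_woken(&wait, TASK_INTERRUPTIBLE, timeout);
		if (!timeout || signal_pending(current))
			break;
	}
	remove_wait_queue(wq, &wait);
	return timeout;
}

/* Waker side (sketch): publish the condition, then wake; the smp_wmb() in
 * woken_wake_function() orders the store before WQ_FLAG_WOKEN is set.
 *
 *	*cond = true;
 *	wake_up(wq);
 */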
300int wake_bit_function(wait_queue_t *wait, unsigned mode, int sync, void *arg) 366int wake_bit_function(wait_queue_t *wait, unsigned mode, int sync, void *arg)
301{ 367{
302 struct wait_bit_key *key = arg; 368 struct wait_bit_key *key = arg;
diff --git a/kernel/signal.c b/kernel/signal.c
index 8f0876f9f6dd..16a305295256 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -1275,7 +1275,17 @@ struct sighand_struct *__lock_task_sighand(struct task_struct *tsk,
1275 local_irq_restore(*flags); 1275 local_irq_restore(*flags);
1276 break; 1276 break;
1277 } 1277 }
1278 1278 /*
1279 * This sighand can be already freed and even reused, but
1280 * we rely on SLAB_DESTROY_BY_RCU and sighand_ctor() which
1281 * initializes ->siglock: this slab can't go away, it has
1282 * the same object type, ->siglock can't be reinitialized.
1283 *
1284 * We need to ensure that tsk->sighand is still the same
1285 * after we take the lock, we can race with de_thread() or
1286 * __exit_signal(). In the latter case the next iteration
1287 * must see ->sighand == NULL.
1288 */
1279 spin_lock(&sighand->siglock); 1289 spin_lock(&sighand->siglock);
1280 if (likely(sighand == tsk->sighand)) { 1290 if (likely(sighand == tsk->sighand)) {
1281 rcu_read_unlock(); 1291 rcu_read_unlock();
@@ -1331,23 +1341,21 @@ int kill_pid_info(int sig, struct siginfo *info, struct pid *pid)
1331 int error = -ESRCH; 1341 int error = -ESRCH;
1332 struct task_struct *p; 1342 struct task_struct *p;
1333 1343
1334 rcu_read_lock(); 1344 for (;;) {
1335retry: 1345 rcu_read_lock();
1336 p = pid_task(pid, PIDTYPE_PID); 1346 p = pid_task(pid, PIDTYPE_PID);
1337 if (p) { 1347 if (p)
1338 error = group_send_sig_info(sig, info, p); 1348 error = group_send_sig_info(sig, info, p);
1339 if (unlikely(error == -ESRCH)) 1349 rcu_read_unlock();
1340 /* 1350 if (likely(!p || error != -ESRCH))
1341 * The task was unhashed in between, try again. 1351 return error;
1342 * If it is dead, pid_task() will return NULL,
1343 * if we race with de_thread() it will find the
1344 * new leader.
1345 */
1346 goto retry;
1347 }
1348 rcu_read_unlock();
1349 1352
1350 return error; 1353 /*
1354 * The task was unhashed in between, try again. If it
1355 * is dead, pid_task() will return NULL, if we race with
1356 * de_thread() it will find the new leader.
1357 */
1358 }
1351} 1359}
1352 1360
1353int kill_proc_info(int sig, struct siginfo *info, pid_t pid) 1361int kill_proc_info(int sig, struct siginfo *info, pid_t pid)
@@ -2748,6 +2756,10 @@ int copy_siginfo_to_user(siginfo_t __user *to, const siginfo_t *from)
2748 if (from->si_code == BUS_MCEERR_AR || from->si_code == BUS_MCEERR_AO) 2756 if (from->si_code == BUS_MCEERR_AR || from->si_code == BUS_MCEERR_AO)
2749 err |= __put_user(from->si_addr_lsb, &to->si_addr_lsb); 2757 err |= __put_user(from->si_addr_lsb, &to->si_addr_lsb);
2750#endif 2758#endif
2759#ifdef SEGV_BNDERR
2760 err |= __put_user(from->si_lower, &to->si_lower);
2761 err |= __put_user(from->si_upper, &to->si_upper);
2762#endif
2751 break; 2763 break;
2752 case __SI_CHLD: 2764 case __SI_CHLD:
2753 err |= __put_user(from->si_pid, &to->si_pid); 2765 err |= __put_user(from->si_pid, &to->si_pid);
diff --git a/kernel/smpboot.c b/kernel/smpboot.c
index eb89e1807408..f032fb5284e3 100644
--- a/kernel/smpboot.c
+++ b/kernel/smpboot.c
@@ -110,7 +110,7 @@ static int smpboot_thread_fn(void *data)
110 set_current_state(TASK_INTERRUPTIBLE); 110 set_current_state(TASK_INTERRUPTIBLE);
111 preempt_disable(); 111 preempt_disable();
112 if (kthread_should_stop()) { 112 if (kthread_should_stop()) {
113 set_current_state(TASK_RUNNING); 113 __set_current_state(TASK_RUNNING);
114 preempt_enable(); 114 preempt_enable();
115 if (ht->cleanup) 115 if (ht->cleanup)
116 ht->cleanup(td->cpu, cpu_online(td->cpu)); 116 ht->cleanup(td->cpu, cpu_online(td->cpu));
@@ -136,26 +136,27 @@ static int smpboot_thread_fn(void *data)
136 /* Check for state change setup */ 136 /* Check for state change setup */
137 switch (td->status) { 137 switch (td->status) {
138 case HP_THREAD_NONE: 138 case HP_THREAD_NONE:
139 __set_current_state(TASK_RUNNING);
139 preempt_enable(); 140 preempt_enable();
140 if (ht->setup) 141 if (ht->setup)
141 ht->setup(td->cpu); 142 ht->setup(td->cpu);
142 td->status = HP_THREAD_ACTIVE; 143 td->status = HP_THREAD_ACTIVE;
143 preempt_disable(); 144 continue;
144 break; 145
145 case HP_THREAD_PARKED: 146 case HP_THREAD_PARKED:
147 __set_current_state(TASK_RUNNING);
146 preempt_enable(); 148 preempt_enable();
147 if (ht->unpark) 149 if (ht->unpark)
148 ht->unpark(td->cpu); 150 ht->unpark(td->cpu);
149 td->status = HP_THREAD_ACTIVE; 151 td->status = HP_THREAD_ACTIVE;
150 preempt_disable(); 152 continue;
151 break;
152 } 153 }
153 154
154 if (!ht->thread_should_run(td->cpu)) { 155 if (!ht->thread_should_run(td->cpu)) {
155 preempt_enable(); 156 preempt_enable_no_resched();
156 schedule(); 157 schedule();
157 } else { 158 } else {
158 set_current_state(TASK_RUNNING); 159 __set_current_state(TASK_RUNNING);
159 preempt_enable(); 160 preempt_enable();
160 ht->thread_fn(td->cpu); 161 ht->thread_fn(td->cpu);
161 } 162 }
diff --git a/kernel/softirq.c b/kernel/softirq.c
index 0699add19164..501baa9ac1be 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -656,7 +656,7 @@ static void run_ksoftirqd(unsigned int cpu)
656 * in the task stack here. 656 * in the task stack here.
657 */ 657 */
658 __do_softirq(); 658 __do_softirq();
659 rcu_note_context_switch(cpu); 659 rcu_note_context_switch();
660 local_irq_enable(); 660 local_irq_enable();
661 cond_resched(); 661 cond_resched();
662 return; 662 return;
diff --git a/kernel/sys.c b/kernel/sys.c
index 1eaa2f0b0246..a8c9f5a7dda6 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -91,6 +91,12 @@
91#ifndef SET_TSC_CTL 91#ifndef SET_TSC_CTL
92# define SET_TSC_CTL(a) (-EINVAL) 92# define SET_TSC_CTL(a) (-EINVAL)
93#endif 93#endif
94#ifndef MPX_ENABLE_MANAGEMENT
95# define MPX_ENABLE_MANAGEMENT(a) (-EINVAL)
96#endif
97#ifndef MPX_DISABLE_MANAGEMENT
98# define MPX_DISABLE_MANAGEMENT(a) (-EINVAL)
99#endif
94 100
95/* 101/*
96 * this is where the system-wide overflow UID and GID are defined, for 102 * this is where the system-wide overflow UID and GID are defined, for
@@ -2203,6 +2209,12 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
2203 me->mm->def_flags &= ~VM_NOHUGEPAGE; 2209 me->mm->def_flags &= ~VM_NOHUGEPAGE;
2204 up_write(&me->mm->mmap_sem); 2210 up_write(&me->mm->mmap_sem);
2205 break; 2211 break;
2212 case PR_MPX_ENABLE_MANAGEMENT:
2213 error = MPX_ENABLE_MANAGEMENT(me);
2214 break;
2215 case PR_MPX_DISABLE_MANAGEMENT:
2216 error = MPX_DISABLE_MANAGEMENT(me);
2217 break;
2206 default: 2218 default:
2207 error = -EINVAL; 2219 error = -EINVAL;
2208 break; 2220 break;
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 4aada6d9fe74..7c54ff79afd7 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -387,7 +387,8 @@ static struct ctl_table kern_table[] = {
387 .data = &sysctl_numa_balancing_scan_size, 387 .data = &sysctl_numa_balancing_scan_size,
388 .maxlen = sizeof(unsigned int), 388 .maxlen = sizeof(unsigned int),
389 .mode = 0644, 389 .mode = 0644,
390 .proc_handler = proc_dointvec, 390 .proc_handler = proc_dointvec_minmax,
391 .extra1 = &one,
391 }, 392 },
392 { 393 {
393 .procname = "numa_balancing", 394 .procname = "numa_balancing",
@@ -1103,6 +1104,15 @@ static struct ctl_table kern_table[] = {
1103 .proc_handler = proc_dointvec, 1104 .proc_handler = proc_dointvec,
1104 }, 1105 },
1105#endif 1106#endif
1107 {
1108 .procname = "panic_on_warn",
1109 .data = &panic_on_warn,
1110 .maxlen = sizeof(int),
1111 .mode = 0644,
1112 .proc_handler = proc_dointvec_minmax,
1113 .extra1 = &zero,
1114 .extra2 = &one,
1115 },
1106 { } 1116 { }
1107}; 1117};
1108 1118
diff --git a/kernel/sysctl_binary.c b/kernel/sysctl_binary.c
index 9a4f750a2963..7e7746a42a62 100644
--- a/kernel/sysctl_binary.c
+++ b/kernel/sysctl_binary.c
@@ -137,6 +137,7 @@ static const struct bin_table bin_kern_table[] = {
137 { CTL_INT, KERN_COMPAT_LOG, "compat-log" }, 137 { CTL_INT, KERN_COMPAT_LOG, "compat-log" },
138 { CTL_INT, KERN_MAX_LOCK_DEPTH, "max_lock_depth" }, 138 { CTL_INT, KERN_MAX_LOCK_DEPTH, "max_lock_depth" },
139 { CTL_INT, KERN_PANIC_ON_NMI, "panic_on_unrecovered_nmi" }, 139 { CTL_INT, KERN_PANIC_ON_NMI, "panic_on_unrecovered_nmi" },
140 { CTL_INT, KERN_PANIC_ON_WARN, "panic_on_warn" },
140 {} 141 {}
141}; 142};
142 143
diff --git a/kernel/taskstats.c b/kernel/taskstats.c
index b312fcc73024..670fff88a961 100644
--- a/kernel/taskstats.c
+++ b/kernel/taskstats.c
@@ -459,7 +459,7 @@ static int cgroupstats_user_cmd(struct sk_buff *skb, struct genl_info *info)
459 stats = nla_data(na); 459 stats = nla_data(na);
460 memset(stats, 0, sizeof(*stats)); 460 memset(stats, 0, sizeof(*stats));
461 461
462 rc = cgroupstats_build(stats, f.file->f_dentry); 462 rc = cgroupstats_build(stats, f.file->f_path.dentry);
463 if (rc < 0) { 463 if (rc < 0) {
464 nlmsg_free(rep_skb); 464 nlmsg_free(rep_skb);
465 goto err; 465 goto err;
diff --git a/kernel/time/Makefile b/kernel/time/Makefile
index 7347426fa68d..f622cf28628a 100644
--- a/kernel/time/Makefile
+++ b/kernel/time/Makefile
@@ -13,7 +13,7 @@ obj-$(CONFIG_TICK_ONESHOT) += tick-oneshot.o
13obj-$(CONFIG_TICK_ONESHOT) += tick-sched.o 13obj-$(CONFIG_TICK_ONESHOT) += tick-sched.o
14obj-$(CONFIG_TIMER_STATS) += timer_stats.o 14obj-$(CONFIG_TIMER_STATS) += timer_stats.o
15obj-$(CONFIG_DEBUG_FS) += timekeeping_debug.o 15obj-$(CONFIG_DEBUG_FS) += timekeeping_debug.o
16obj-$(CONFIG_TEST_UDELAY) += udelay_test.o 16obj-$(CONFIG_TEST_UDELAY) += test_udelay.o
17 17
18$(obj)/time.o: $(obj)/timeconst.h 18$(obj)/time.o: $(obj)/timeconst.h
19 19
diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c
index 9c94c19f1305..55449909f114 100644
--- a/kernel/time/clockevents.c
+++ b/kernel/time/clockevents.c
@@ -72,7 +72,7 @@ static u64 cev_delta2ns(unsigned long latch, struct clock_event_device *evt,
72 * Also omit the add if it would overflow the u64 boundary. 72 * Also omit the add if it would overflow the u64 boundary.
73 */ 73 */
74 if ((~0ULL - clc > rnd) && 74 if ((~0ULL - clc > rnd) &&
75 (!ismax || evt->mult <= (1U << evt->shift))) 75 (!ismax || evt->mult <= (1ULL << evt->shift)))
76 clc += rnd; 76 clc += rnd;
77 77
78 do_div(clc, evt->mult); 78 do_div(clc, evt->mult);
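The 1U to 1ULL change above widens the right-hand side of the comparison to 64 bits. If evt->shift can reach 32 (the hunk alone does not show its range, so treat that as an assumption), shifting a 32-bit 1U that far is undefined and the mult check could pass or fail arbitrarily; the 64-bit constant keeps the comparison well defined. A userspace illustration of the width difference:

#include <stdio.h>

int main(void)
{
	unsigned int shift = 32;

	/* (1U << 32) would be undefined behaviour on a 32-bit type; the
	 * 64-bit shift below is well defined and yields 2^32.
	 */
	unsigned long long bound = 1ULL << shift;

	printf("1ULL << %u = %llu\n", shift, bound);	/* 4294967296 */
	return 0;
}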
diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c
index 492b986195d5..a16b67859e2a 100644
--- a/kernel/time/posix-cpu-timers.c
+++ b/kernel/time/posix-cpu-timers.c
@@ -553,7 +553,7 @@ static int cpu_timer_sample_group(const clockid_t which_clock,
553 *sample = cputime_to_expires(cputime.utime); 553 *sample = cputime_to_expires(cputime.utime);
554 break; 554 break;
555 case CPUCLOCK_SCHED: 555 case CPUCLOCK_SCHED:
556 *sample = cputime.sum_exec_runtime + task_delta_exec(p); 556 *sample = cputime.sum_exec_runtime;
557 break; 557 break;
558 } 558 }
559 return 0; 559 return 0;
diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c
index 42b463ad90f2..31ea01f42e1f 100644
--- a/kernel/time/posix-timers.c
+++ b/kernel/time/posix-timers.c
@@ -636,6 +636,7 @@ SYSCALL_DEFINE3(timer_create, const clockid_t, which_clock,
636 goto out; 636 goto out;
637 } 637 }
638 } else { 638 } else {
639 memset(&event.sigev_value, 0, sizeof(event.sigev_value));
639 event.sigev_notify = SIGEV_SIGNAL; 640 event.sigev_notify = SIGEV_SIGNAL;
640 event.sigev_signo = SIGALRM; 641 event.sigev_signo = SIGALRM;
641 event.sigev_value.sival_int = new_timer->it_id; 642 event.sigev_value.sival_int = new_timer->it_id;
diff --git a/kernel/time/udelay_test.c b/kernel/time/test_udelay.c
index e622ba365a13..e622ba365a13 100644
--- a/kernel/time/udelay_test.c
+++ b/kernel/time/test_udelay.c
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 7b5741fc4110..1f4356037a7d 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -585,7 +585,7 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts,
585 last_jiffies = jiffies; 585 last_jiffies = jiffies;
586 } while (read_seqretry(&jiffies_lock, seq)); 586 } while (read_seqretry(&jiffies_lock, seq));
587 587
588 if (rcu_needs_cpu(cpu, &rcu_delta_jiffies) || 588 if (rcu_needs_cpu(&rcu_delta_jiffies) ||
589 arch_needs_cpu() || irq_work_needs_cpu()) { 589 arch_needs_cpu() || irq_work_needs_cpu()) {
590 next_jiffies = last_jiffies + 1; 590 next_jiffies = last_jiffies + 1;
591 delta_jiffies = 1; 591 delta_jiffies = 1;
diff --git a/kernel/time/time.c b/kernel/time/time.c
index a9ae20fb0b11..65015ff2f07c 100644
--- a/kernel/time/time.c
+++ b/kernel/time/time.c
@@ -304,7 +304,9 @@ struct timespec timespec_trunc(struct timespec t, unsigned gran)
304} 304}
305EXPORT_SYMBOL(timespec_trunc); 305EXPORT_SYMBOL(timespec_trunc);
306 306
307/* Converts Gregorian date to seconds since 1970-01-01 00:00:00. 307/*
308 * mktime64 - Converts date to seconds.
309 * Converts Gregorian date to seconds since 1970-01-01 00:00:00.
308 * Assumes input in normal date format, i.e. 1980-12-31 23:59:59 310 * Assumes input in normal date format, i.e. 1980-12-31 23:59:59
309 * => year=1980, mon=12, day=31, hour=23, min=59, sec=59. 311 * => year=1980, mon=12, day=31, hour=23, min=59, sec=59.
310 * 312 *
@@ -314,15 +316,10 @@ EXPORT_SYMBOL(timespec_trunc);
314 * -year/100+year/400 terms, and add 10.] 316 * -year/100+year/400 terms, and add 10.]
315 * 317 *
316 * This algorithm was first published by Gauss (I think). 318 * This algorithm was first published by Gauss (I think).
317 *
318 * WARNING: this function will overflow on 2106-02-07 06:28:16 on
319 * machines where long is 32-bit! (However, as time_t is signed, we
320 * will already get problems at other places on 2038-01-19 03:14:08)
321 */ 319 */
322unsigned long 320time64_t mktime64(const unsigned int year0, const unsigned int mon0,
323mktime(const unsigned int year0, const unsigned int mon0, 321 const unsigned int day, const unsigned int hour,
324 const unsigned int day, const unsigned int hour, 322 const unsigned int min, const unsigned int sec)
325 const unsigned int min, const unsigned int sec)
326{ 323{
327 unsigned int mon = mon0, year = year0; 324 unsigned int mon = mon0, year = year0;
328 325
@@ -332,15 +329,14 @@ mktime(const unsigned int year0, const unsigned int mon0,
332 year -= 1; 329 year -= 1;
333 } 330 }
334 331
335 return ((((unsigned long) 332 return ((((time64_t)
336 (year/4 - year/100 + year/400 + 367*mon/12 + day) + 333 (year/4 - year/100 + year/400 + 367*mon/12 + day) +
337 year*365 - 719499 334 year*365 - 719499
338 )*24 + hour /* now have hours */ 335 )*24 + hour /* now have hours */
339 )*60 + min /* now have minutes */ 336 )*60 + min /* now have minutes */
340 )*60 + sec; /* finally seconds */ 337 )*60 + sec; /* finally seconds */
341} 338}
342 339EXPORT_SYMBOL(mktime64);
343EXPORT_SYMBOL(mktime);
344 340
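A quick sanity check of the formula that mktime64() keeps from mktime(): for the Unix epoch, January is folded into month 11 of the previous year and the day terms cancel to zero, so the result is 0 seconds. The snippet below reproduces just the day arithmetic in userspace:

#include <stdio.h>
#include <stdint.h>

/* Same day arithmetic as mktime64() above, for an epoch sanity check. */
static int64_t epoch_days(unsigned int year, unsigned int mon, unsigned int day)
{
	if (0 >= (int)(mon -= 2)) {	/* Jan/Feb count as month 11/12 of year-1 */
		mon += 12;
		year -= 1;
	}
	return (int64_t)(year/4 - year/100 + year/400 + 367*mon/12 + day)
		+ (int64_t)year*365 - 719499;
}

int main(void)
{
	/* 1970-01-01: (492 - 19 + 4 + 336 + 1) + 718685 - 719499 = 0 days */
	printf("%lld\n", (long long)(((epoch_days(1970, 1, 1)*24)*60)*60));
	return 0;	/* prints 0: the epoch */
}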
345/** 341/**
346 * set_normalized_timespec - set timespec sec and nsec parts and normalize 342 * set_normalized_timespec - set timespec sec and nsec parts and normalize
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index ec1791fae965..6a931852082f 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -417,7 +417,8 @@ EXPORT_SYMBOL_GPL(pvclock_gtod_unregister_notifier);
417 */ 417 */
418static inline void tk_update_ktime_data(struct timekeeper *tk) 418static inline void tk_update_ktime_data(struct timekeeper *tk)
419{ 419{
420 s64 nsec; 420 u64 seconds;
421 u32 nsec;
421 422
422 /* 423 /*
423 * The xtime based monotonic readout is: 424 * The xtime based monotonic readout is:
@@ -426,13 +427,22 @@ static inline void tk_update_ktime_data(struct timekeeper *tk)
426 * nsec = base_mono + now(); 427 * nsec = base_mono + now();
427 * ==> base_mono = (xtime_sec + wtm_sec) * 1e9 + wtm_nsec 428 * ==> base_mono = (xtime_sec + wtm_sec) * 1e9 + wtm_nsec
428 */ 429 */
429 nsec = (s64)(tk->xtime_sec + tk->wall_to_monotonic.tv_sec); 430 seconds = (u64)(tk->xtime_sec + tk->wall_to_monotonic.tv_sec);
430 nsec *= NSEC_PER_SEC; 431 nsec = (u32) tk->wall_to_monotonic.tv_nsec;
431 nsec += tk->wall_to_monotonic.tv_nsec; 432 tk->tkr.base_mono = ns_to_ktime(seconds * NSEC_PER_SEC + nsec);
432 tk->tkr.base_mono = ns_to_ktime(nsec);
433 433
434 /* Update the monotonic raw base */ 434 /* Update the monotonic raw base */
435 tk->base_raw = timespec64_to_ktime(tk->raw_time); 435 tk->base_raw = timespec64_to_ktime(tk->raw_time);
436
437 /*
438 * The sum of the nanoseconds portions of xtime and
439 * wall_to_monotonic can be greater/equal one second. Take
440 * this into account before updating tk->ktime_sec.
441 */
442 nsec += (u32)(tk->tkr.xtime_nsec >> tk->tkr.shift);
443 if (nsec >= NSEC_PER_SEC)
444 seconds++;
445 tk->ktime_sec = seconds;
436} 446}
437 447
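The cached ktime_sec computed above has to account for the two sub-second parts adding up to a full second. A worked example with made-up values shows why the carry matters:

#include <stdio.h>

#define NSEC_PER_SEC 1000000000ULL

int main(void)
{
	/* xtime = 100.9s, wall_to_monotonic = 5.3s  =>  monotonic = 106.2s */
	unsigned long long seconds = 100 + 5;			/* 105 */
	unsigned long long nsec    = 900000000 + 300000000;	/* 1.2e9 */

	if (nsec >= NSEC_PER_SEC)
		seconds++;	/* without the carry ktime_sec would read 105 */

	printf("ktime_sec = %llu\n", seconds);	/* 106 */
	return 0;
}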
438/* must hold timekeeper_lock */ 448/* must hold timekeeper_lock */
@@ -519,9 +529,9 @@ EXPORT_SYMBOL(__getnstimeofday64);
519 529
520/** 530/**
521 * getnstimeofday64 - Returns the time of day in a timespec64. 531 * getnstimeofday64 - Returns the time of day in a timespec64.
522 * @ts: pointer to the timespec to be set 532 * @ts: pointer to the timespec64 to be set
523 * 533 *
524 * Returns the time of day in a timespec (WARN if suspended). 534 * Returns the time of day in a timespec64 (WARN if suspended).
525 */ 535 */
526void getnstimeofday64(struct timespec64 *ts) 536void getnstimeofday64(struct timespec64 *ts)
527{ 537{
@@ -623,7 +633,7 @@ EXPORT_SYMBOL_GPL(ktime_get_raw);
623 * 633 *
624 * The function calculates the monotonic clock from the realtime 634 * The function calculates the monotonic clock from the realtime
625 * clock and the wall_to_monotonic offset and stores the result 635 * clock and the wall_to_monotonic offset and stores the result
626 * in normalized timespec format in the variable pointed to by @ts. 636 * in normalized timespec64 format in the variable pointed to by @ts.
627 */ 637 */
628void ktime_get_ts64(struct timespec64 *ts) 638void ktime_get_ts64(struct timespec64 *ts)
629{ 639{
@@ -648,6 +658,54 @@ void ktime_get_ts64(struct timespec64 *ts)
648} 658}
649EXPORT_SYMBOL_GPL(ktime_get_ts64); 659EXPORT_SYMBOL_GPL(ktime_get_ts64);
650 660
661/**
662 * ktime_get_seconds - Get the seconds portion of CLOCK_MONOTONIC
663 *
664 * Returns the seconds portion of CLOCK_MONOTONIC with a single non
665 * serialized read. tk->ktime_sec is of type 'unsigned long' so this
666 * works on both 32 and 64 bit systems. On 32 bit systems the readout
667 * covers ~136 years of uptime which should be enough to prevent
668 * premature wrap arounds.
669 */
670time64_t ktime_get_seconds(void)
671{
672 struct timekeeper *tk = &tk_core.timekeeper;
673
674 WARN_ON(timekeeping_suspended);
675 return tk->ktime_sec;
676}
677EXPORT_SYMBOL_GPL(ktime_get_seconds);
678
679/**
680 * ktime_get_real_seconds - Get the seconds portion of CLOCK_REALTIME
681 *
682 * Returns the wall clock seconds since 1970. This replaces the
683 * get_seconds() interface which is not y2038 safe on 32bit systems.
684 *
685 * For 64bit systems the fast access to tk->xtime_sec is preserved. On
686 * 32bit systems the access must be protected with the sequence
687 * counter to provide "atomic" access to the 64bit tk->xtime_sec
688 * value.
689 */
690time64_t ktime_get_real_seconds(void)
691{
692 struct timekeeper *tk = &tk_core.timekeeper;
693 time64_t seconds;
694 unsigned int seq;
695
696 if (IS_ENABLED(CONFIG_64BIT))
697 return tk->xtime_sec;
698
699 do {
700 seq = read_seqcount_begin(&tk_core.seq);
701 seconds = tk->xtime_sec;
702
703 } while (read_seqcount_retry(&tk_core.seq, seq));
704
705 return seconds;
706}
707EXPORT_SYMBOL_GPL(ktime_get_real_seconds);
708
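Typical use of the two new accessors, as a sketch only (the driver context below is invented for illustration): ktime_get_seconds() gives cheap monotonic seconds for things like rate limiting, while ktime_get_real_seconds() is the y2038-safe replacement for get_seconds() when wall-clock seconds are wanted.

/* Hypothetical caller, not part of this patch. */
static time64_t last_report;

static void maybe_report(void)
{
	time64_t now = ktime_get_seconds();	/* monotonic, single read */

	if (now - last_report < 60)
		return;
	last_report = now;

	pr_info("wall clock: %lld seconds since 1970\n",
		(long long)ktime_get_real_seconds());
}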
651#ifdef CONFIG_NTP_PPS 709#ifdef CONFIG_NTP_PPS
652 710
653/** 711/**
@@ -703,18 +761,18 @@ void do_gettimeofday(struct timeval *tv)
703EXPORT_SYMBOL(do_gettimeofday); 761EXPORT_SYMBOL(do_gettimeofday);
704 762
705/** 763/**
706 * do_settimeofday - Sets the time of day 764 * do_settimeofday64 - Sets the time of day.
707 * @tv: pointer to the timespec variable containing the new time 765 * @ts: pointer to the timespec64 variable containing the new time
708 * 766 *
709 * Sets the time of day to the new time and update NTP and notify hrtimers 767 * Sets the time of day to the new time and update NTP and notify hrtimers
710 */ 768 */
711int do_settimeofday(const struct timespec *tv) 769int do_settimeofday64(const struct timespec64 *ts)
712{ 770{
713 struct timekeeper *tk = &tk_core.timekeeper; 771 struct timekeeper *tk = &tk_core.timekeeper;
714 struct timespec64 ts_delta, xt, tmp; 772 struct timespec64 ts_delta, xt;
715 unsigned long flags; 773 unsigned long flags;
716 774
717 if (!timespec_valid_strict(tv)) 775 if (!timespec64_valid_strict(ts))
718 return -EINVAL; 776 return -EINVAL;
719 777
720 raw_spin_lock_irqsave(&timekeeper_lock, flags); 778 raw_spin_lock_irqsave(&timekeeper_lock, flags);
@@ -723,13 +781,12 @@ int do_settimeofday(const struct timespec *tv)
723 timekeeping_forward_now(tk); 781 timekeeping_forward_now(tk);
724 782
725 xt = tk_xtime(tk); 783 xt = tk_xtime(tk);
726 ts_delta.tv_sec = tv->tv_sec - xt.tv_sec; 784 ts_delta.tv_sec = ts->tv_sec - xt.tv_sec;
727 ts_delta.tv_nsec = tv->tv_nsec - xt.tv_nsec; 785 ts_delta.tv_nsec = ts->tv_nsec - xt.tv_nsec;
728 786
729 tk_set_wall_to_mono(tk, timespec64_sub(tk->wall_to_monotonic, ts_delta)); 787 tk_set_wall_to_mono(tk, timespec64_sub(tk->wall_to_monotonic, ts_delta));
730 788
731 tmp = timespec_to_timespec64(*tv); 789 tk_set_xtime(tk, ts);
732 tk_set_xtime(tk, &tmp);
733 790
734 timekeeping_update(tk, TK_CLEAR_NTP | TK_MIRROR | TK_CLOCK_WAS_SET); 791 timekeeping_update(tk, TK_CLEAR_NTP | TK_MIRROR | TK_CLOCK_WAS_SET);
735 792
@@ -741,7 +798,7 @@ int do_settimeofday(const struct timespec *tv)
741 798
742 return 0; 799 return 0;
743} 800}
744EXPORT_SYMBOL(do_settimeofday); 801EXPORT_SYMBOL(do_settimeofday64);
745 802
746/** 803/**
747 * timekeeping_inject_offset - Adds or subtracts from the current time. 804 * timekeeping_inject_offset - Adds or subtracts from the current time.
@@ -895,12 +952,12 @@ int timekeeping_notify(struct clocksource *clock)
895} 952}
896 953
897/** 954/**
898 * getrawmonotonic - Returns the raw monotonic time in a timespec 955 * getrawmonotonic64 - Returns the raw monotonic time in a timespec
899 * @ts: pointer to the timespec to be set 956 * @ts: pointer to the timespec64 to be set
900 * 957 *
901 * Returns the raw monotonic time (completely un-modified by ntp) 958 * Returns the raw monotonic time (completely un-modified by ntp)
902 */ 959 */
903void getrawmonotonic(struct timespec *ts) 960void getrawmonotonic64(struct timespec64 *ts)
904{ 961{
905 struct timekeeper *tk = &tk_core.timekeeper; 962 struct timekeeper *tk = &tk_core.timekeeper;
906 struct timespec64 ts64; 963 struct timespec64 ts64;
@@ -915,9 +972,10 @@ void getrawmonotonic(struct timespec *ts)
915 } while (read_seqcount_retry(&tk_core.seq, seq)); 972 } while (read_seqcount_retry(&tk_core.seq, seq));
916 973
917 timespec64_add_ns(&ts64, nsecs); 974 timespec64_add_ns(&ts64, nsecs);
918 *ts = timespec64_to_timespec(ts64); 975 *ts = ts64;
919} 976}
920EXPORT_SYMBOL(getrawmonotonic); 977EXPORT_SYMBOL(getrawmonotonic64);
978
921 979
922/** 980/**
923 * timekeeping_valid_for_hres - Check if timekeeping is suitable for hres 981 * timekeeping_valid_for_hres - Check if timekeeping is suitable for hres
@@ -1068,8 +1126,8 @@ static void __timekeeping_inject_sleeptime(struct timekeeper *tk,
1068} 1126}
1069 1127
1070/** 1128/**
1071 * timekeeping_inject_sleeptime - Adds suspend interval to timekeeping values 1129 * timekeeping_inject_sleeptime64 - Adds suspend interval to timekeeping values
1072 * @delta: pointer to a timespec delta value 1130 * @delta: pointer to a timespec64 delta value
1073 * 1131 *
1074 * This hook is for architectures that cannot support read_persistent_clock 1132 * This hook is for architectures that cannot support read_persistent_clock
1075 * because their RTC/persistent clock is only accessible when irqs are enabled. 1133 * because their RTC/persistent clock is only accessible when irqs are enabled.
@@ -1077,10 +1135,9 @@ static void __timekeeping_inject_sleeptime(struct timekeeper *tk,
1077 * This function should only be called by rtc_resume(), and allows 1135 * This function should only be called by rtc_resume(), and allows
1078 * a suspend offset to be injected into the timekeeping values. 1136 * a suspend offset to be injected into the timekeeping values.
1079 */ 1137 */
1080void timekeeping_inject_sleeptime(struct timespec *delta) 1138void timekeeping_inject_sleeptime64(struct timespec64 *delta)
1081{ 1139{
1082 struct timekeeper *tk = &tk_core.timekeeper; 1140 struct timekeeper *tk = &tk_core.timekeeper;
1083 struct timespec64 tmp;
1084 unsigned long flags; 1141 unsigned long flags;
1085 1142
1086 /* 1143 /*
@@ -1095,8 +1152,7 @@ void timekeeping_inject_sleeptime(struct timespec *delta)
1095 1152
1096 timekeeping_forward_now(tk); 1153 timekeeping_forward_now(tk);
1097 1154
1098 tmp = timespec_to_timespec64(*delta); 1155 __timekeeping_inject_sleeptime(tk, delta);
1099 __timekeeping_inject_sleeptime(tk, &tmp);
1100 1156
1101 timekeeping_update(tk, TK_CLEAR_NTP | TK_MIRROR | TK_CLOCK_WAS_SET); 1157 timekeeping_update(tk, TK_CLEAR_NTP | TK_MIRROR | TK_CLOCK_WAS_SET);
1102 1158
@@ -1332,6 +1388,12 @@ static __always_inline void timekeeping_apply_adjustment(struct timekeeper *tk,
1332 * 1388 *
1333 * XXX - TODO: Doc ntp_error calculation. 1389 * XXX - TODO: Doc ntp_error calculation.
1334 */ 1390 */
1391 if ((mult_adj > 0) && (tk->tkr.mult + mult_adj < mult_adj)) {
1392 /* NTP adjustment caused clocksource mult overflow */
1393 WARN_ON_ONCE(1);
1394 return;
1395 }
1396
1335 tk->tkr.mult += mult_adj; 1397 tk->tkr.mult += mult_adj;
1336 tk->xtime_interval += interval; 1398 tk->xtime_interval += interval;
1337 tk->tkr.xtime_nsec -= offset; 1399 tk->tkr.xtime_nsec -= offset;
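The new guard catches the case where adding a positive mult_adj to the unsigned tkr.mult wraps around. For unsigned arithmetic the wrap is detected by the usual idiom of the sum coming out smaller than one of the operands, which is the shape of the check above; a minimal standalone illustration:

#include <stdio.h>

/* Returns 1 if a + b wraps around in 32-bit unsigned arithmetic. */
static int u32_add_overflows(unsigned int a, unsigned int b)
{
	return a + b < b;	/* same shape as: mult + mult_adj < mult_adj */
}

int main(void)
{
	printf("%d %d\n",
	       u32_add_overflows(0xfffffff0u, 0x20),	/* 1: wraps to 0x10 */
	       u32_add_overflows(1000, 2000));		/* 0: no wrap */
	return 0;
}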
@@ -1397,7 +1459,8 @@ static void timekeeping_adjust(struct timekeeper *tk, s64 offset)
1397 } 1459 }
1398 1460
1399 if (unlikely(tk->tkr.clock->maxadj && 1461 if (unlikely(tk->tkr.clock->maxadj &&
1400 (tk->tkr.mult > tk->tkr.clock->mult + tk->tkr.clock->maxadj))) { 1462 (abs(tk->tkr.mult - tk->tkr.clock->mult)
1463 > tk->tkr.clock->maxadj))) {
1401 printk_once(KERN_WARNING 1464 printk_once(KERN_WARNING
1402 "Adjusting %s more than 11%% (%ld vs %ld)\n", 1465 "Adjusting %s more than 11%% (%ld vs %ld)\n",
1403 tk->tkr.clock->name, (long)tk->tkr.mult, 1466 tk->tkr.clock->name, (long)tk->tkr.mult,
@@ -1646,7 +1709,7 @@ struct timespec current_kernel_time(void)
1646} 1709}
1647EXPORT_SYMBOL(current_kernel_time); 1710EXPORT_SYMBOL(current_kernel_time);
1648 1711
1649struct timespec get_monotonic_coarse(void) 1712struct timespec64 get_monotonic_coarse64(void)
1650{ 1713{
1651 struct timekeeper *tk = &tk_core.timekeeper; 1714 struct timekeeper *tk = &tk_core.timekeeper;
1652 struct timespec64 now, mono; 1715 struct timespec64 now, mono;
@@ -1662,7 +1725,7 @@ struct timespec get_monotonic_coarse(void)
1662 set_normalized_timespec64(&now, now.tv_sec + mono.tv_sec, 1725 set_normalized_timespec64(&now, now.tv_sec + mono.tv_sec,
1663 now.tv_nsec + mono.tv_nsec); 1726 now.tv_nsec + mono.tv_nsec);
1664 1727
1665 return timespec64_to_timespec(now); 1728 return now;
1666} 1729}
1667 1730
1668/* 1731/*
diff --git a/kernel/time/timer.c b/kernel/time/timer.c
index 3260ffdb368f..2d3f5c504939 100644
--- a/kernel/time/timer.c
+++ b/kernel/time/timer.c
@@ -1377,12 +1377,11 @@ unsigned long get_next_timer_interrupt(unsigned long now)
1377void update_process_times(int user_tick) 1377void update_process_times(int user_tick)
1378{ 1378{
1379 struct task_struct *p = current; 1379 struct task_struct *p = current;
1380 int cpu = smp_processor_id();
1381 1380
1382 /* Note: this timer irq context must be accounted for as well. */ 1381 /* Note: this timer irq context must be accounted for as well. */
1383 account_process_tick(p, user_tick); 1382 account_process_tick(p, user_tick);
1384 run_local_timers(); 1383 run_local_timers();
1385 rcu_check_callbacks(cpu, user_tick); 1384 rcu_check_callbacks(user_tick);
1386#ifdef CONFIG_IRQ_WORK 1385#ifdef CONFIG_IRQ_WORK
1387 if (in_irq()) 1386 if (in_irq())
1388 irq_work_tick(); 1387 irq_work_tick();
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index c1bd4ada2a04..11b9cb36092b 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -1142,9 +1142,9 @@ static void get_pdu_remap(const struct trace_entry *ent,
1142 r->sector_from = be64_to_cpu(sector_from); 1142 r->sector_from = be64_to_cpu(sector_from);
1143} 1143}
1144 1144
1145typedef int (blk_log_action_t) (struct trace_iterator *iter, const char *act); 1145typedef void (blk_log_action_t) (struct trace_iterator *iter, const char *act);
1146 1146
1147static int blk_log_action_classic(struct trace_iterator *iter, const char *act) 1147static void blk_log_action_classic(struct trace_iterator *iter, const char *act)
1148{ 1148{
1149 char rwbs[RWBS_LEN]; 1149 char rwbs[RWBS_LEN];
1150 unsigned long long ts = iter->ts; 1150 unsigned long long ts = iter->ts;
@@ -1154,33 +1154,33 @@ static int blk_log_action_classic(struct trace_iterator *iter, const char *act)
1154 1154
1155 fill_rwbs(rwbs, t); 1155 fill_rwbs(rwbs, t);
1156 1156
1157 return trace_seq_printf(&iter->seq, 1157 trace_seq_printf(&iter->seq,
1158 "%3d,%-3d %2d %5d.%09lu %5u %2s %3s ", 1158 "%3d,%-3d %2d %5d.%09lu %5u %2s %3s ",
1159 MAJOR(t->device), MINOR(t->device), iter->cpu, 1159 MAJOR(t->device), MINOR(t->device), iter->cpu,
1160 secs, nsec_rem, iter->ent->pid, act, rwbs); 1160 secs, nsec_rem, iter->ent->pid, act, rwbs);
1161} 1161}
1162 1162
1163static int blk_log_action(struct trace_iterator *iter, const char *act) 1163static void blk_log_action(struct trace_iterator *iter, const char *act)
1164{ 1164{
1165 char rwbs[RWBS_LEN]; 1165 char rwbs[RWBS_LEN];
1166 const struct blk_io_trace *t = te_blk_io_trace(iter->ent); 1166 const struct blk_io_trace *t = te_blk_io_trace(iter->ent);
1167 1167
1168 fill_rwbs(rwbs, t); 1168 fill_rwbs(rwbs, t);
1169 return trace_seq_printf(&iter->seq, "%3d,%-3d %2s %3s ", 1169 trace_seq_printf(&iter->seq, "%3d,%-3d %2s %3s ",
1170 MAJOR(t->device), MINOR(t->device), act, rwbs); 1170 MAJOR(t->device), MINOR(t->device), act, rwbs);
1171} 1171}
1172 1172
1173static int blk_log_dump_pdu(struct trace_seq *s, const struct trace_entry *ent) 1173static void blk_log_dump_pdu(struct trace_seq *s, const struct trace_entry *ent)
1174{ 1174{
1175 const unsigned char *pdu_buf; 1175 const unsigned char *pdu_buf;
1176 int pdu_len; 1176 int pdu_len;
1177 int i, end, ret; 1177 int i, end;
1178 1178
1179 pdu_buf = pdu_start(ent); 1179 pdu_buf = pdu_start(ent);
1180 pdu_len = te_blk_io_trace(ent)->pdu_len; 1180 pdu_len = te_blk_io_trace(ent)->pdu_len;
1181 1181
1182 if (!pdu_len) 1182 if (!pdu_len)
1183 return 1; 1183 return;
1184 1184
1185 /* find the last zero that needs to be printed */ 1185 /* find the last zero that needs to be printed */
1186 for (end = pdu_len - 1; end >= 0; end--) 1186 for (end = pdu_len - 1; end >= 0; end--)
@@ -1188,119 +1188,107 @@ static int blk_log_dump_pdu(struct trace_seq *s, const struct trace_entry *ent)
1188 break; 1188 break;
1189 end++; 1189 end++;
1190 1190
1191 if (!trace_seq_putc(s, '(')) 1191 trace_seq_putc(s, '(');
1192 return 0;
1193 1192
1194 for (i = 0; i < pdu_len; i++) { 1193 for (i = 0; i < pdu_len; i++) {
1195 1194
1196 ret = trace_seq_printf(s, "%s%02x", 1195 trace_seq_printf(s, "%s%02x",
1197 i == 0 ? "" : " ", pdu_buf[i]); 1196 i == 0 ? "" : " ", pdu_buf[i]);
1198 if (!ret)
1199 return ret;
1200 1197
1201 /* 1198 /*
1202 * stop when the rest is just zeroes and indicate so 1199 * stop when the rest is just zeroes and indicate so
1203 * with a ".." appended 1200 * with a ".." appended
1204 */ 1201 */
1205 if (i == end && end != pdu_len - 1) 1202 if (i == end && end != pdu_len - 1) {
1206 return trace_seq_puts(s, " ..) "); 1203 trace_seq_puts(s, " ..) ");
1204 return;
1205 }
1207 } 1206 }
1208 1207
1209 return trace_seq_puts(s, ") "); 1208 trace_seq_puts(s, ") ");
1210} 1209}
1211 1210
1212static int blk_log_generic(struct trace_seq *s, const struct trace_entry *ent) 1211static void blk_log_generic(struct trace_seq *s, const struct trace_entry *ent)
1213{ 1212{
1214 char cmd[TASK_COMM_LEN]; 1213 char cmd[TASK_COMM_LEN];
1215 1214
1216 trace_find_cmdline(ent->pid, cmd); 1215 trace_find_cmdline(ent->pid, cmd);
1217 1216
1218 if (t_action(ent) & BLK_TC_ACT(BLK_TC_PC)) { 1217 if (t_action(ent) & BLK_TC_ACT(BLK_TC_PC)) {
1219 int ret; 1218 trace_seq_printf(s, "%u ", t_bytes(ent));
1220 1219 blk_log_dump_pdu(s, ent);
1221 ret = trace_seq_printf(s, "%u ", t_bytes(ent)); 1220 trace_seq_printf(s, "[%s]\n", cmd);
1222 if (!ret)
1223 return 0;
1224 ret = blk_log_dump_pdu(s, ent);
1225 if (!ret)
1226 return 0;
1227 return trace_seq_printf(s, "[%s]\n", cmd);
1228 } else { 1221 } else {
1229 if (t_sec(ent)) 1222 if (t_sec(ent))
1230 return trace_seq_printf(s, "%llu + %u [%s]\n", 1223 trace_seq_printf(s, "%llu + %u [%s]\n",
1231 t_sector(ent), t_sec(ent), cmd); 1224 t_sector(ent), t_sec(ent), cmd);
1232 return trace_seq_printf(s, "[%s]\n", cmd); 1225 else
1226 trace_seq_printf(s, "[%s]\n", cmd);
1233 } 1227 }
1234} 1228}
1235 1229
1236static int blk_log_with_error(struct trace_seq *s, 1230static void blk_log_with_error(struct trace_seq *s,
1237 const struct trace_entry *ent) 1231 const struct trace_entry *ent)
1238{ 1232{
1239 if (t_action(ent) & BLK_TC_ACT(BLK_TC_PC)) { 1233 if (t_action(ent) & BLK_TC_ACT(BLK_TC_PC)) {
1240 int ret; 1234 blk_log_dump_pdu(s, ent);
1241 1235 trace_seq_printf(s, "[%d]\n", t_error(ent));
1242 ret = blk_log_dump_pdu(s, ent);
1243 if (ret)
1244 return trace_seq_printf(s, "[%d]\n", t_error(ent));
1245 return 0;
1246 } else { 1236 } else {
1247 if (t_sec(ent)) 1237 if (t_sec(ent))
1248 return trace_seq_printf(s, "%llu + %u [%d]\n", 1238 trace_seq_printf(s, "%llu + %u [%d]\n",
1249 t_sector(ent), 1239 t_sector(ent),
1250 t_sec(ent), t_error(ent)); 1240 t_sec(ent), t_error(ent));
1251 return trace_seq_printf(s, "%llu [%d]\n", 1241 else
1252 t_sector(ent), t_error(ent)); 1242 trace_seq_printf(s, "%llu [%d]\n",
1243 t_sector(ent), t_error(ent));
1253 } 1244 }
1254} 1245}
1255 1246
1256static int blk_log_remap(struct trace_seq *s, const struct trace_entry *ent) 1247static void blk_log_remap(struct trace_seq *s, const struct trace_entry *ent)
1257{ 1248{
1258 struct blk_io_trace_remap r = { .device_from = 0, }; 1249 struct blk_io_trace_remap r = { .device_from = 0, };
1259 1250
1260 get_pdu_remap(ent, &r); 1251 get_pdu_remap(ent, &r);
1261 return trace_seq_printf(s, "%llu + %u <- (%d,%d) %llu\n", 1252 trace_seq_printf(s, "%llu + %u <- (%d,%d) %llu\n",
1262 t_sector(ent), t_sec(ent), 1253 t_sector(ent), t_sec(ent),
1263 MAJOR(r.device_from), MINOR(r.device_from), 1254 MAJOR(r.device_from), MINOR(r.device_from),
1264 (unsigned long long)r.sector_from); 1255 (unsigned long long)r.sector_from);
1265} 1256}
1266 1257
1267static int blk_log_plug(struct trace_seq *s, const struct trace_entry *ent) 1258static void blk_log_plug(struct trace_seq *s, const struct trace_entry *ent)
1268{ 1259{
1269 char cmd[TASK_COMM_LEN]; 1260 char cmd[TASK_COMM_LEN];
1270 1261
1271 trace_find_cmdline(ent->pid, cmd); 1262 trace_find_cmdline(ent->pid, cmd);
1272 1263
1273 return trace_seq_printf(s, "[%s]\n", cmd); 1264 trace_seq_printf(s, "[%s]\n", cmd);
1274} 1265}
1275 1266
1276static int blk_log_unplug(struct trace_seq *s, const struct trace_entry *ent) 1267static void blk_log_unplug(struct trace_seq *s, const struct trace_entry *ent)
1277{ 1268{
1278 char cmd[TASK_COMM_LEN]; 1269 char cmd[TASK_COMM_LEN];
1279 1270
1280 trace_find_cmdline(ent->pid, cmd); 1271 trace_find_cmdline(ent->pid, cmd);
1281 1272
1282 return trace_seq_printf(s, "[%s] %llu\n", cmd, get_pdu_int(ent)); 1273 trace_seq_printf(s, "[%s] %llu\n", cmd, get_pdu_int(ent));
1283} 1274}
1284 1275
1285static int blk_log_split(struct trace_seq *s, const struct trace_entry *ent) 1276static void blk_log_split(struct trace_seq *s, const struct trace_entry *ent)
1286{ 1277{
1287 char cmd[TASK_COMM_LEN]; 1278 char cmd[TASK_COMM_LEN];
1288 1279
1289 trace_find_cmdline(ent->pid, cmd); 1280 trace_find_cmdline(ent->pid, cmd);
1290 1281
1291 return trace_seq_printf(s, "%llu / %llu [%s]\n", t_sector(ent), 1282 trace_seq_printf(s, "%llu / %llu [%s]\n", t_sector(ent),
1292 get_pdu_int(ent), cmd); 1283 get_pdu_int(ent), cmd);
1293} 1284}
1294 1285
1295static int blk_log_msg(struct trace_seq *s, const struct trace_entry *ent) 1286static void blk_log_msg(struct trace_seq *s, const struct trace_entry *ent)
1296{ 1287{
1297 int ret;
1298 const struct blk_io_trace *t = te_blk_io_trace(ent); 1288 const struct blk_io_trace *t = te_blk_io_trace(ent);
1299 1289
1300 ret = trace_seq_putmem(s, t + 1, t->pdu_len); 1290 trace_seq_putmem(s, t + 1, t->pdu_len);
1301 if (ret) 1291 trace_seq_putc(s, '\n');
1302 return trace_seq_putc(s, '\n');
1303 return ret;
1304} 1292}
1305 1293
1306/* 1294/*
@@ -1339,7 +1327,7 @@ static void blk_tracer_reset(struct trace_array *tr)
1339 1327
1340static const struct { 1328static const struct {
1341 const char *act[2]; 1329 const char *act[2];
1342 int (*print)(struct trace_seq *s, const struct trace_entry *ent); 1330 void (*print)(struct trace_seq *s, const struct trace_entry *ent);
1343} what2act[] = { 1331} what2act[] = {
1344 [__BLK_TA_QUEUE] = {{ "Q", "queue" }, blk_log_generic }, 1332 [__BLK_TA_QUEUE] = {{ "Q", "queue" }, blk_log_generic },
1345 [__BLK_TA_BACKMERGE] = {{ "M", "backmerge" }, blk_log_generic }, 1333 [__BLK_TA_BACKMERGE] = {{ "M", "backmerge" }, blk_log_generic },
@@ -1364,7 +1352,6 @@ static enum print_line_t print_one_line(struct trace_iterator *iter,
1364 struct trace_seq *s = &iter->seq; 1352 struct trace_seq *s = &iter->seq;
1365 const struct blk_io_trace *t; 1353 const struct blk_io_trace *t;
1366 u16 what; 1354 u16 what;
1367 int ret;
1368 bool long_act; 1355 bool long_act;
1369 blk_log_action_t *log_action; 1356 blk_log_action_t *log_action;
1370 1357
@@ -1374,21 +1361,18 @@ static enum print_line_t print_one_line(struct trace_iterator *iter,
1374 log_action = classic ? &blk_log_action_classic : &blk_log_action; 1361 log_action = classic ? &blk_log_action_classic : &blk_log_action;
1375 1362
1376 if (t->action == BLK_TN_MESSAGE) { 1363 if (t->action == BLK_TN_MESSAGE) {
1377 ret = log_action(iter, long_act ? "message" : "m"); 1364 log_action(iter, long_act ? "message" : "m");
1378 if (ret) 1365 blk_log_msg(s, iter->ent);
1379 ret = blk_log_msg(s, iter->ent);
1380 goto out;
1381 } 1366 }
1382 1367
1383 if (unlikely(what == 0 || what >= ARRAY_SIZE(what2act))) 1368 if (unlikely(what == 0 || what >= ARRAY_SIZE(what2act)))
1384 ret = trace_seq_printf(s, "Unknown action %x\n", what); 1369 trace_seq_printf(s, "Unknown action %x\n", what);
1385 else { 1370 else {
1386 ret = log_action(iter, what2act[what].act[long_act]); 1371 log_action(iter, what2act[what].act[long_act]);
1387 if (ret) 1372 what2act[what].print(s, iter->ent);
1388 ret = what2act[what].print(s, iter->ent);
1389 } 1373 }
1390out: 1374
1391 return ret ? TRACE_TYPE_HANDLED : TRACE_TYPE_PARTIAL_LINE; 1375 return trace_handle_return(s);
1392} 1376}
1393 1377
1394static enum print_line_t blk_trace_event_print(struct trace_iterator *iter, 1378static enum print_line_t blk_trace_event_print(struct trace_iterator *iter,
@@ -1397,7 +1381,7 @@ static enum print_line_t blk_trace_event_print(struct trace_iterator *iter,
1397 return print_one_line(iter, false); 1381 return print_one_line(iter, false);
1398} 1382}
1399 1383
1400static int blk_trace_synthesize_old_trace(struct trace_iterator *iter) 1384static void blk_trace_synthesize_old_trace(struct trace_iterator *iter)
1401{ 1385{
1402 struct trace_seq *s = &iter->seq; 1386 struct trace_seq *s = &iter->seq;
1403 struct blk_io_trace *t = (struct blk_io_trace *)iter->ent; 1387 struct blk_io_trace *t = (struct blk_io_trace *)iter->ent;
@@ -1407,18 +1391,18 @@ static int blk_trace_synthesize_old_trace(struct trace_iterator *iter)
1407 .time = iter->ts, 1391 .time = iter->ts,
1408 }; 1392 };
1409 1393
1410 if (!trace_seq_putmem(s, &old, offset)) 1394 trace_seq_putmem(s, &old, offset);
1411 return 0; 1395 trace_seq_putmem(s, &t->sector,
1412 return trace_seq_putmem(s, &t->sector, 1396 sizeof(old) - offset + t->pdu_len);
1413 sizeof(old) - offset + t->pdu_len);
1414} 1397}
1415 1398
1416static enum print_line_t 1399static enum print_line_t
1417blk_trace_event_print_binary(struct trace_iterator *iter, int flags, 1400blk_trace_event_print_binary(struct trace_iterator *iter, int flags,
1418 struct trace_event *event) 1401 struct trace_event *event)
1419{ 1402{
1420 return blk_trace_synthesize_old_trace(iter) ? 1403 blk_trace_synthesize_old_trace(iter);
1421 TRACE_TYPE_HANDLED : TRACE_TYPE_PARTIAL_LINE; 1404
1405 return trace_handle_return(&iter->seq);
1422} 1406}
1423 1407
1424static enum print_line_t blk_tracer_print_line(struct trace_iterator *iter) 1408static enum print_line_t blk_tracer_print_line(struct trace_iterator *iter)
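Editor's note: the blktrace hunks above follow the pattern this series applies across kernel/trace: the blk_log_*() helpers stop returning a value to be checked after every trace_seq_*() call, and the caller instead tests the sequence buffer once at the end (here via trace_handle_return()). Below is a minimal user-space analogue of that contract, a sketch of the shape only, not the kernel trace_seq API.

#include <stdbool.h>
#include <stdio.h>
#include <string.h>

/* Toy stand-in for struct trace_seq: a fixed buffer plus an overflow flag. */
struct seq {
	char buf[64];
	size_t len;
	bool overflowed;
};

/* Writers return void; on failure they only mark the overflow flag. */
static void seq_puts_demo(struct seq *s, const char *str)
{
	size_t n = strlen(str);

	if (s->len + n >= sizeof(s->buf)) {
		s->overflowed = true;
		return;
	}
	memcpy(s->buf + s->len, str, n);
	s->len += n;
	s->buf[s->len] = '\0';
}

/* The caller checks for overflow once, after all writes. */
static int seq_handle_return_demo(const struct seq *s)
{
	return s->overflowed ? -1 /* partial line */ : 0 /* handled */;
}

int main(void)
{
	struct seq s = { .len = 0, .overflowed = false };

	seq_puts_demo(&s, "8,0 3 1 ");
	seq_puts_demo(&s, "Q R 1234 + 8 [dd]\n");
	printf("%s", s.buf);
	return seq_handle_return_demo(&s);
}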
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index fb186b9ddf51..929a733d302e 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -387,6 +387,8 @@ static int remove_ftrace_list_ops(struct ftrace_ops **list,
387 return ret; 387 return ret;
388} 388}
389 389
390static void ftrace_update_trampoline(struct ftrace_ops *ops);
391
390static int __register_ftrace_function(struct ftrace_ops *ops) 392static int __register_ftrace_function(struct ftrace_ops *ops)
391{ 393{
392 if (ops->flags & FTRACE_OPS_FL_DELETED) 394 if (ops->flags & FTRACE_OPS_FL_DELETED)
@@ -416,9 +418,13 @@ static int __register_ftrace_function(struct ftrace_ops *ops)
416 if (control_ops_alloc(ops)) 418 if (control_ops_alloc(ops))
417 return -ENOMEM; 419 return -ENOMEM;
418 add_ftrace_list_ops(&ftrace_control_list, &control_ops, ops); 420 add_ftrace_list_ops(&ftrace_control_list, &control_ops, ops);
421 /* The control_ops needs the trampoline update */
422 ops = &control_ops;
419 } else 423 } else
420 add_ftrace_ops(&ftrace_ops_list, ops); 424 add_ftrace_ops(&ftrace_ops_list, ops);
421 425
426 ftrace_update_trampoline(ops);
427
422 if (ftrace_enabled) 428 if (ftrace_enabled)
423 update_ftrace_function(); 429 update_ftrace_function();
424 430
@@ -565,13 +571,13 @@ static int function_stat_cmp(void *p1, void *p2)
565static int function_stat_headers(struct seq_file *m) 571static int function_stat_headers(struct seq_file *m)
566{ 572{
567#ifdef CONFIG_FUNCTION_GRAPH_TRACER 573#ifdef CONFIG_FUNCTION_GRAPH_TRACER
568 seq_printf(m, " Function " 574 seq_puts(m, " Function "
569 "Hit Time Avg s^2\n" 575 "Hit Time Avg s^2\n"
570 " -------- " 576 " -------- "
571 "--- ---- --- ---\n"); 577 "--- ---- --- ---\n");
572#else 578#else
573 seq_printf(m, " Function Hit\n" 579 seq_puts(m, " Function Hit\n"
574 " -------- ---\n"); 580 " -------- ---\n");
575#endif 581#endif
576 return 0; 582 return 0;
577} 583}
@@ -598,7 +604,7 @@ static int function_stat_show(struct seq_file *m, void *v)
598 seq_printf(m, " %-30.30s %10lu", str, rec->counter); 604 seq_printf(m, " %-30.30s %10lu", str, rec->counter);
599 605
600#ifdef CONFIG_FUNCTION_GRAPH_TRACER 606#ifdef CONFIG_FUNCTION_GRAPH_TRACER
601 seq_printf(m, " "); 607 seq_puts(m, " ");
602 avg = rec->time; 608 avg = rec->time;
603 do_div(avg, rec->counter); 609 do_div(avg, rec->counter);
604 610
@@ -1111,6 +1117,43 @@ static struct ftrace_ops global_ops = {
1111 FTRACE_OPS_FL_INITIALIZED, 1117 FTRACE_OPS_FL_INITIALIZED,
1112}; 1118};
1113 1119
1120/*
1121 * This is used by __kernel_text_address() to return true if the
1122 * address is on a dynamically allocated trampoline that would
1123 * not return true for either core_kernel_text() or
1124 * is_module_text_address().
1125 */
1126bool is_ftrace_trampoline(unsigned long addr)
1127{
1128 struct ftrace_ops *op;
1129 bool ret = false;
1130
1131 /*
1132 * Some of the ops may be dynamically allocated,
1133 * they are freed after a synchronize_sched().
1134 */
1135 preempt_disable_notrace();
1136
1137 do_for_each_ftrace_op(op, ftrace_ops_list) {
1138 /*
1139 * This is to check for dynamically allocated trampolines.
1140 * Trampolines that are in kernel text will have
1141 * core_kernel_text() return true.
1142 */
1143 if (op->trampoline && op->trampoline_size)
1144 if (addr >= op->trampoline &&
1145 addr < op->trampoline + op->trampoline_size) {
1146 ret = true;
1147 goto out;
1148 }
1149 } while_for_each_ftrace_op(op);
1150
1151 out:
1152 preempt_enable_notrace();
1153
1154 return ret;
1155}
1156
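Editor's note: is_ftrace_trampoline() above is essentially a range test over every registered ops, run with preemption disabled because dynamically allocated ops and their trampolines are freed only after a synchronize_sched(). The sketch below shows just the range walk over a toy list; the locking and the real ftrace data structures are omitted and the names are made up.

#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

/* Toy analogue of an ops list whose entries may own a code region. */
struct region {
	unsigned long start;	/* 0 means "no trampoline" */
	unsigned long size;
	struct region *next;
};

static bool addr_in_regions(const struct region *list, unsigned long addr)
{
	for (const struct region *r = list; r; r = r->next) {
		if (r->start && r->size &&
		    addr >= r->start && addr < r->start + r->size)
			return true;
	}
	return false;
}

int main(void)
{
	struct region b = { .start = 0x2000, .size = 0x100, .next = NULL };
	struct region a = { .start = 0,      .size = 0,     .next = &b };

	printf("%d\n", addr_in_regions(&a, 0x2040));	/* 1: inside b  */
	printf("%d\n", addr_in_regions(&a, 0x3000));	/* 0: outside   */
	return 0;
}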
1114struct ftrace_page { 1157struct ftrace_page {
1115 struct ftrace_page *next; 1158 struct ftrace_page *next;
1116 struct dyn_ftrace *records; 1159 struct dyn_ftrace *records;
@@ -1315,6 +1358,9 @@ ftrace_hash_rec_disable_modify(struct ftrace_ops *ops, int filter_hash);
1315static void 1358static void
1316ftrace_hash_rec_enable_modify(struct ftrace_ops *ops, int filter_hash); 1359ftrace_hash_rec_enable_modify(struct ftrace_ops *ops, int filter_hash);
1317 1360
1361static int ftrace_hash_ipmodify_update(struct ftrace_ops *ops,
1362 struct ftrace_hash *new_hash);
1363
1318static int 1364static int
1319ftrace_hash_move(struct ftrace_ops *ops, int enable, 1365ftrace_hash_move(struct ftrace_ops *ops, int enable,
1320 struct ftrace_hash **dst, struct ftrace_hash *src) 1366 struct ftrace_hash **dst, struct ftrace_hash *src)
@@ -1325,8 +1371,13 @@ ftrace_hash_move(struct ftrace_ops *ops, int enable,
1325 struct ftrace_hash *new_hash; 1371 struct ftrace_hash *new_hash;
1326 int size = src->count; 1372 int size = src->count;
1327 int bits = 0; 1373 int bits = 0;
1374 int ret;
1328 int i; 1375 int i;
1329 1376
1377 /* Reject setting notrace hash on IPMODIFY ftrace_ops */
1378 if (ops->flags & FTRACE_OPS_FL_IPMODIFY && !enable)
1379 return -EINVAL;
1380
1330 /* 1381 /*
1331 * If the new source is empty, just free dst and assign it 1382 * If the new source is empty, just free dst and assign it
1332 * the empty_hash. 1383 * the empty_hash.
@@ -1360,6 +1411,16 @@ ftrace_hash_move(struct ftrace_ops *ops, int enable,
1360 } 1411 }
1361 1412
1362update: 1413update:
1414 /* Make sure this can be applied if it is IPMODIFY ftrace_ops */
1415 if (enable) {
1416 /* IPMODIFY should be updated only when the filter_hash is being updated */
1417 ret = ftrace_hash_ipmodify_update(ops, new_hash);
1418 if (ret < 0) {
1419 free_ftrace_hash(new_hash);
1420 return ret;
1421 }
1422 }
1423
1363 /* 1424 /*
1364 * Remove the current set, update the hash and add 1425 * Remove the current set, update the hash and add
1365 * them back. 1426 * them back.
@@ -1724,6 +1785,114 @@ static void ftrace_hash_rec_enable_modify(struct ftrace_ops *ops,
1724 ftrace_hash_rec_update_modify(ops, filter_hash, 1); 1785 ftrace_hash_rec_update_modify(ops, filter_hash, 1);
1725} 1786}
1726 1787
1788/*
1789 * Try to update IPMODIFY flag on each ftrace_rec. Return 0 if it is OK
1790 * or not needed to update, -EBUSY if it detects a conflict of the flag
1791 * on a ftrace_rec, and -EINVAL if the new_hash tries to trace all recs.
1792 * Note that old_hash and new_hash have the following meanings:
1793 * - If the hash is NULL, it hits all recs (if IPMODIFY is set, this is rejected)
1794 * - If the hash is EMPTY_HASH, it hits nothing
1795 * - Anything else hits the recs which match the hash entries.
1796 */
1797static int __ftrace_hash_update_ipmodify(struct ftrace_ops *ops,
1798 struct ftrace_hash *old_hash,
1799 struct ftrace_hash *new_hash)
1800{
1801 struct ftrace_page *pg;
1802 struct dyn_ftrace *rec, *end = NULL;
1803 int in_old, in_new;
1804
1805 /* Only update if the ops has been registered */
1806 if (!(ops->flags & FTRACE_OPS_FL_ENABLED))
1807 return 0;
1808
1809 if (!(ops->flags & FTRACE_OPS_FL_IPMODIFY))
1810 return 0;
1811
1812 /*
1813 * Since the IPMODIFY is a very address sensitive action, we do not
1814 * allow ftrace_ops to set all functions to new hash.
1815 */
1816 if (!new_hash || !old_hash)
1817 return -EINVAL;
1818
1819 /* Update rec->flags */
1820 do_for_each_ftrace_rec(pg, rec) {
1821 /* We need to update only differences of filter_hash */
1822 in_old = !!ftrace_lookup_ip(old_hash, rec->ip);
1823 in_new = !!ftrace_lookup_ip(new_hash, rec->ip);
1824 if (in_old == in_new)
1825 continue;
1826
1827 if (in_new) {
1828 /* New entries must ensure no others are using it */
1829 if (rec->flags & FTRACE_FL_IPMODIFY)
1830 goto rollback;
1831 rec->flags |= FTRACE_FL_IPMODIFY;
1832 } else /* Removed entry */
1833 rec->flags &= ~FTRACE_FL_IPMODIFY;
1834 } while_for_each_ftrace_rec();
1835
1836 return 0;
1837
1838rollback:
1839 end = rec;
1840
1841 /* Roll back what we did above */
1842 do_for_each_ftrace_rec(pg, rec) {
1843 if (rec == end)
1844 goto err_out;
1845
1846 in_old = !!ftrace_lookup_ip(old_hash, rec->ip);
1847 in_new = !!ftrace_lookup_ip(new_hash, rec->ip);
1848 if (in_old == in_new)
1849 continue;
1850
1851 if (in_new)
1852 rec->flags &= ~FTRACE_FL_IPMODIFY;
1853 else
1854 rec->flags |= FTRACE_FL_IPMODIFY;
1855 } while_for_each_ftrace_rec();
1856
1857err_out:
1858 return -EBUSY;
1859}
1860
1861static int ftrace_hash_ipmodify_enable(struct ftrace_ops *ops)
1862{
1863 struct ftrace_hash *hash = ops->func_hash->filter_hash;
1864
1865 if (ftrace_hash_empty(hash))
1866 hash = NULL;
1867
1868 return __ftrace_hash_update_ipmodify(ops, EMPTY_HASH, hash);
1869}
1870
1871/* Disabling always succeeds */
1872static void ftrace_hash_ipmodify_disable(struct ftrace_ops *ops)
1873{
1874 struct ftrace_hash *hash = ops->func_hash->filter_hash;
1875
1876 if (ftrace_hash_empty(hash))
1877 hash = NULL;
1878
1879 __ftrace_hash_update_ipmodify(ops, hash, EMPTY_HASH);
1880}
1881
1882static int ftrace_hash_ipmodify_update(struct ftrace_ops *ops,
1883 struct ftrace_hash *new_hash)
1884{
1885 struct ftrace_hash *old_hash = ops->func_hash->filter_hash;
1886
1887 if (ftrace_hash_empty(old_hash))
1888 old_hash = NULL;
1889
1890 if (ftrace_hash_empty(new_hash))
1891 new_hash = NULL;
1892
1893 return __ftrace_hash_update_ipmodify(ops, old_hash, new_hash);
1894}
1895
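Editor's note: __ftrace_hash_update_ipmodify() above enforces that at most one IPMODIFY ops can claim any given function, and undoes its partial work when it hits a conflict. The following is a loose, self-contained illustration of that claim-then-rollback shape, with a flat array standing in for the ftrace record pages and the in_old/in_new bookkeeping reduced to a "wanted" mask.

#include <stdbool.h>
#include <stdio.h>

#define NREC 4
#define FL_IPMODIFY 0x1

/* Try to set FL_IPMODIFY on the selected records; on a conflict, undo
 * everything done so far and report -EBUSY, loosely mirroring the
 * rollback loop in the hunk above. */
static int claim_ipmodify(unsigned int flags[], const bool want[], int n)
{
	int i, j;

	for (i = 0; i < n; i++) {
		if (!want[i])
			continue;
		if (flags[i] & FL_IPMODIFY)
			goto rollback;	/* another ops already modifies this ip */
		flags[i] |= FL_IPMODIFY;
	}
	return 0;

rollback:
	for (j = 0; j < i; j++)
		if (want[j])
			flags[j] &= ~FL_IPMODIFY;
	return -16;	/* -EBUSY */
}

int main(void)
{
	unsigned int flags[NREC] = { 0 };
	bool ops_a[NREC] = { false, true, false, false };
	bool ops_b[NREC] = { true,  true, true,  false };

	printf("ops_a claim: %d\n", claim_ipmodify(flags, ops_a, NREC)); /* 0        */
	printf("ops_b claim: %d\n", claim_ipmodify(flags, ops_b, NREC)); /* -16      */
	printf("rec0 after rollback: %u\n", flags[0]);                   /* 0 again  */
	return 0;
}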
1727static void print_ip_ins(const char *fmt, unsigned char *p) 1896static void print_ip_ins(const char *fmt, unsigned char *p)
1728{ 1897{
1729 int i; 1898 int i;
@@ -1734,10 +1903,13 @@ static void print_ip_ins(const char *fmt, unsigned char *p)
1734 printk(KERN_CONT "%s%02x", i ? ":" : "", p[i]); 1903 printk(KERN_CONT "%s%02x", i ? ":" : "", p[i]);
1735} 1904}
1736 1905
1906static struct ftrace_ops *
1907ftrace_find_tramp_ops_any(struct dyn_ftrace *rec);
1908
1737/** 1909/**
1738 * ftrace_bug - report and shutdown function tracer 1910 * ftrace_bug - report and shutdown function tracer
1739 * @failed: The failed type (EFAULT, EINVAL, EPERM) 1911 * @failed: The failed type (EFAULT, EINVAL, EPERM)
1740 * @ip: The address that failed 1912 * @rec: The record that failed
1741 * 1913 *
1742 * The arch code that enables or disables the function tracing 1914 * The arch code that enables or disables the function tracing
1743 * can call ftrace_bug() when it has detected a problem in 1915 * can call ftrace_bug() when it has detected a problem in
@@ -1746,8 +1918,10 @@ static void print_ip_ins(const char *fmt, unsigned char *p)
1746 * EINVAL - if what is read at @ip is not what was expected 1918 * EINVAL - if what is read at @ip is not what was expected
1747 * EPERM - if the problem happens on writing to the @ip address 1919 * EPERM - if the problem happens on writing to the @ip address
1748 */ 1920 */
1749void ftrace_bug(int failed, unsigned long ip) 1921void ftrace_bug(int failed, struct dyn_ftrace *rec)
1750{ 1922{
1923 unsigned long ip = rec ? rec->ip : 0;
1924
1751 switch (failed) { 1925 switch (failed) {
1752 case -EFAULT: 1926 case -EFAULT:
1753 FTRACE_WARN_ON_ONCE(1); 1927 FTRACE_WARN_ON_ONCE(1);
@@ -1759,7 +1933,7 @@ void ftrace_bug(int failed, unsigned long ip)
1759 pr_info("ftrace failed to modify "); 1933 pr_info("ftrace failed to modify ");
1760 print_ip_sym(ip); 1934 print_ip_sym(ip);
1761 print_ip_ins(" actual: ", (unsigned char *)ip); 1935 print_ip_ins(" actual: ", (unsigned char *)ip);
1762 printk(KERN_CONT "\n"); 1936 pr_cont("\n");
1763 break; 1937 break;
1764 case -EPERM: 1938 case -EPERM:
1765 FTRACE_WARN_ON_ONCE(1); 1939 FTRACE_WARN_ON_ONCE(1);
@@ -1771,6 +1945,24 @@ void ftrace_bug(int failed, unsigned long ip)
1771 pr_info("ftrace faulted on unknown error "); 1945 pr_info("ftrace faulted on unknown error ");
1772 print_ip_sym(ip); 1946 print_ip_sym(ip);
1773 } 1947 }
1948 if (rec) {
1949 struct ftrace_ops *ops = NULL;
1950
1951 pr_info("ftrace record flags: %lx\n", rec->flags);
1952 pr_cont(" (%ld)%s", ftrace_rec_count(rec),
1953 rec->flags & FTRACE_FL_REGS ? " R" : " ");
1954 if (rec->flags & FTRACE_FL_TRAMP_EN) {
1955 ops = ftrace_find_tramp_ops_any(rec);
1956 if (ops)
1957 pr_cont("\ttramp: %pS",
1958 (void *)ops->trampoline);
1959 else
1960 pr_cont("\ttramp: ERROR!");
1961
1962 }
1963 ip = ftrace_get_addr_curr(rec);
1964 pr_cont(" expected tramp: %lx\n", ip);
1965 }
1774} 1966}
1775 1967
1776static int ftrace_check_record(struct dyn_ftrace *rec, int enable, int update) 1968static int ftrace_check_record(struct dyn_ftrace *rec, int enable, int update)
@@ -1925,8 +2117,16 @@ ftrace_find_tramp_ops_curr(struct dyn_ftrace *rec)
1925 * when we are adding another op to the rec or removing the 2117 * when we are adding another op to the rec or removing the
1926 * current one. Thus, if the op is being added, we can 2118 * current one. Thus, if the op is being added, we can
1927 * ignore it because it hasn't attached itself to the rec 2119 * ignore it because it hasn't attached itself to the rec
1928 * yet. That means we just need to find the op that has a 2120 * yet.
1929 * trampoline and is not beeing added. 2121 *
2122 * If an ops is being modified (hooking to different functions)
2123 * then we don't care about the new functions that are being
2124 * added, just the old ones (that are probably being removed).
2125 *
2126 * If we are adding an ops to a function that already uses a
2127 * trampoline, that trampoline needs to be removed (trampolines are
2128 * only for a single ops at a time), so an ops that is not being
2129 * modified also needs to be checked.
1930 */ 2130 */
1931 do_for_each_ftrace_op(op, ftrace_ops_list) { 2131 do_for_each_ftrace_op(op, ftrace_ops_list) {
1932 2132
@@ -1940,17 +2140,23 @@ ftrace_find_tramp_ops_curr(struct dyn_ftrace *rec)
1940 if (op->flags & FTRACE_OPS_FL_ADDING) 2140 if (op->flags & FTRACE_OPS_FL_ADDING)
1941 continue; 2141 continue;
1942 2142
2143
1943 /* 2144 /*
1944 * If the ops is not being added and has a trampoline, 2145 * If the ops is being modified and is in the old
1945 * then it must be the one that we want! 2146 * hash, then it is probably being removed from this
2147 * function.
1946 */ 2148 */
1947 if (hash_contains_ip(ip, op->func_hash))
1948 return op;
1949
1950 /* If the ops is being modified, it may be in the old hash. */
1951 if ((op->flags & FTRACE_OPS_FL_MODIFYING) && 2149 if ((op->flags & FTRACE_OPS_FL_MODIFYING) &&
1952 hash_contains_ip(ip, &op->old_hash)) 2150 hash_contains_ip(ip, &op->old_hash))
1953 return op; 2151 return op;
2152 /*
2153 * If the ops is not being added or modified, and it's
2154 * in its normal filter hash, then this must be the one
2155 * we want!
2156 */
2157 if (!(op->flags & FTRACE_OPS_FL_MODIFYING) &&
2158 hash_contains_ip(ip, op->func_hash))
2159 return op;
1954 2160
1955 } while_for_each_ftrace_op(op); 2161 } while_for_each_ftrace_op(op);
1956 2162
@@ -2079,7 +2285,7 @@ void __weak ftrace_replace_code(int enable)
2079 do_for_each_ftrace_rec(pg, rec) { 2285 do_for_each_ftrace_rec(pg, rec) {
2080 failed = __ftrace_replace_code(rec, enable); 2286 failed = __ftrace_replace_code(rec, enable);
2081 if (failed) { 2287 if (failed) {
2082 ftrace_bug(failed, rec->ip); 2288 ftrace_bug(failed, rec);
2083 /* Stop processing */ 2289 /* Stop processing */
2084 return; 2290 return;
2085 } 2291 }
@@ -2161,17 +2367,14 @@ struct dyn_ftrace *ftrace_rec_iter_record(struct ftrace_rec_iter *iter)
2161static int 2367static int
2162ftrace_code_disable(struct module *mod, struct dyn_ftrace *rec) 2368ftrace_code_disable(struct module *mod, struct dyn_ftrace *rec)
2163{ 2369{
2164 unsigned long ip;
2165 int ret; 2370 int ret;
2166 2371
2167 ip = rec->ip;
2168
2169 if (unlikely(ftrace_disabled)) 2372 if (unlikely(ftrace_disabled))
2170 return 0; 2373 return 0;
2171 2374
2172 ret = ftrace_make_nop(mod, rec, MCOUNT_ADDR); 2375 ret = ftrace_make_nop(mod, rec, MCOUNT_ADDR);
2173 if (ret) { 2376 if (ret) {
2174 ftrace_bug(ret, ip); 2377 ftrace_bug(ret, rec);
2175 return 0; 2378 return 0;
2176 } 2379 }
2177 return 1; 2380 return 1;
@@ -2293,16 +2496,23 @@ static void ftrace_run_update_code(int command)
2293 FTRACE_WARN_ON(ret); 2496 FTRACE_WARN_ON(ret);
2294} 2497}
2295 2498
2296static void ftrace_run_modify_code(struct ftrace_ops *ops, int command) 2499static void ftrace_run_modify_code(struct ftrace_ops *ops, int command,
2500 struct ftrace_hash *old_hash)
2297{ 2501{
2298 ops->flags |= FTRACE_OPS_FL_MODIFYING; 2502 ops->flags |= FTRACE_OPS_FL_MODIFYING;
2503 ops->old_hash.filter_hash = old_hash;
2299 ftrace_run_update_code(command); 2504 ftrace_run_update_code(command);
2505 ops->old_hash.filter_hash = NULL;
2300 ops->flags &= ~FTRACE_OPS_FL_MODIFYING; 2506 ops->flags &= ~FTRACE_OPS_FL_MODIFYING;
2301} 2507}
2302 2508
2303static ftrace_func_t saved_ftrace_func; 2509static ftrace_func_t saved_ftrace_func;
2304static int ftrace_start_up; 2510static int ftrace_start_up;
2305 2511
2512void __weak arch_ftrace_trampoline_free(struct ftrace_ops *ops)
2513{
2514}
2515
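Editor's note: arch_ftrace_trampoline_free() above (like arch_ftrace_update_trampoline() further down) is declared __weak so that architectures without dynamic trampolines need no stub of their own; an architecture that does support them links in a strong definition that silently replaces the weak default. A tiny GCC/Clang demonstration of that mechanism, with made-up names and no kernel code:

#include <stdio.h>

/* Weak default: does nothing useful, mirroring the __weak stub above. */
void __attribute__((weak)) arch_hook(void)
{
	puts("generic: no arch hook");
}

/* A second translation unit could define a non-weak arch_hook() and the
 * linker would pick it instead; with only this file, the default runs. */
int main(void)
{
	arch_hook();
	return 0;
}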
2306static void control_ops_free(struct ftrace_ops *ops) 2516static void control_ops_free(struct ftrace_ops *ops)
2307{ 2517{
2308 free_percpu(ops->disabled); 2518 free_percpu(ops->disabled);
@@ -2352,6 +2562,15 @@ static int ftrace_startup(struct ftrace_ops *ops, int command)
2352 */ 2562 */
2353 ops->flags |= FTRACE_OPS_FL_ENABLED | FTRACE_OPS_FL_ADDING; 2563 ops->flags |= FTRACE_OPS_FL_ENABLED | FTRACE_OPS_FL_ADDING;
2354 2564
2565 ret = ftrace_hash_ipmodify_enable(ops);
2566 if (ret < 0) {
2567 /* Rollback registration process */
2568 __unregister_ftrace_function(ops);
2569 ftrace_start_up--;
2570 ops->flags &= ~FTRACE_OPS_FL_ENABLED;
2571 return ret;
2572 }
2573
2355 ftrace_hash_rec_enable(ops, 1); 2574 ftrace_hash_rec_enable(ops, 1);
2356 2575
2357 ftrace_startup_enable(command); 2576 ftrace_startup_enable(command);
@@ -2380,6 +2599,8 @@ static int ftrace_shutdown(struct ftrace_ops *ops, int command)
2380 */ 2599 */
2381 WARN_ON_ONCE(ftrace_start_up < 0); 2600 WARN_ON_ONCE(ftrace_start_up < 0);
2382 2601
2602 /* Disabling ipmodify never fails */
2603 ftrace_hash_ipmodify_disable(ops);
2383 ftrace_hash_rec_disable(ops, 1); 2604 ftrace_hash_rec_disable(ops, 1);
2384 2605
2385 ops->flags &= ~FTRACE_OPS_FL_ENABLED; 2606 ops->flags &= ~FTRACE_OPS_FL_ENABLED;
@@ -2454,6 +2675,8 @@ static int ftrace_shutdown(struct ftrace_ops *ops, int command)
2454 if (ops->flags & (FTRACE_OPS_FL_DYNAMIC | FTRACE_OPS_FL_CONTROL)) { 2675 if (ops->flags & (FTRACE_OPS_FL_DYNAMIC | FTRACE_OPS_FL_CONTROL)) {
2455 schedule_on_each_cpu(ftrace_sync); 2676 schedule_on_each_cpu(ftrace_sync);
2456 2677
2678 arch_ftrace_trampoline_free(ops);
2679
2457 if (ops->flags & FTRACE_OPS_FL_CONTROL) 2680 if (ops->flags & FTRACE_OPS_FL_CONTROL)
2458 control_ops_free(ops); 2681 control_ops_free(ops);
2459 } 2682 }
@@ -2606,7 +2829,7 @@ static int ftrace_update_code(struct module *mod, struct ftrace_page *new_pgs)
2606 if (ftrace_start_up && cnt) { 2829 if (ftrace_start_up && cnt) {
2607 int failed = __ftrace_replace_code(p, 1); 2830 int failed = __ftrace_replace_code(p, 1);
2608 if (failed) 2831 if (failed)
2609 ftrace_bug(failed, p->ip); 2832 ftrace_bug(failed, p);
2610 } 2833 }
2611 } 2834 }
2612 } 2835 }
@@ -2931,6 +3154,22 @@ static void t_stop(struct seq_file *m, void *p)
2931 mutex_unlock(&ftrace_lock); 3154 mutex_unlock(&ftrace_lock);
2932} 3155}
2933 3156
3157void * __weak
3158arch_ftrace_trampoline_func(struct ftrace_ops *ops, struct dyn_ftrace *rec)
3159{
3160 return NULL;
3161}
3162
3163static void add_trampoline_func(struct seq_file *m, struct ftrace_ops *ops,
3164 struct dyn_ftrace *rec)
3165{
3166 void *ptr;
3167
3168 ptr = arch_ftrace_trampoline_func(ops, rec);
3169 if (ptr)
3170 seq_printf(m, " ->%pS", ptr);
3171}
3172
2934static int t_show(struct seq_file *m, void *v) 3173static int t_show(struct seq_file *m, void *v)
2935{ 3174{
2936 struct ftrace_iterator *iter = m->private; 3175 struct ftrace_iterator *iter = m->private;
@@ -2941,9 +3180,9 @@ static int t_show(struct seq_file *m, void *v)
2941 3180
2942 if (iter->flags & FTRACE_ITER_PRINTALL) { 3181 if (iter->flags & FTRACE_ITER_PRINTALL) {
2943 if (iter->flags & FTRACE_ITER_NOTRACE) 3182 if (iter->flags & FTRACE_ITER_NOTRACE)
2944 seq_printf(m, "#### no functions disabled ####\n"); 3183 seq_puts(m, "#### no functions disabled ####\n");
2945 else 3184 else
2946 seq_printf(m, "#### all functions enabled ####\n"); 3185 seq_puts(m, "#### all functions enabled ####\n");
2947 return 0; 3186 return 0;
2948 } 3187 }
2949 3188
@@ -2954,22 +3193,25 @@ static int t_show(struct seq_file *m, void *v)
2954 3193
2955 seq_printf(m, "%ps", (void *)rec->ip); 3194 seq_printf(m, "%ps", (void *)rec->ip);
2956 if (iter->flags & FTRACE_ITER_ENABLED) { 3195 if (iter->flags & FTRACE_ITER_ENABLED) {
2957 seq_printf(m, " (%ld)%s", 3196 struct ftrace_ops *ops = NULL;
3197
3198 seq_printf(m, " (%ld)%s%s",
2958 ftrace_rec_count(rec), 3199 ftrace_rec_count(rec),
2959 rec->flags & FTRACE_FL_REGS ? " R" : " "); 3200 rec->flags & FTRACE_FL_REGS ? " R" : " ",
3201 rec->flags & FTRACE_FL_IPMODIFY ? " I" : " ");
2960 if (rec->flags & FTRACE_FL_TRAMP_EN) { 3202 if (rec->flags & FTRACE_FL_TRAMP_EN) {
2961 struct ftrace_ops *ops;
2962
2963 ops = ftrace_find_tramp_ops_any(rec); 3203 ops = ftrace_find_tramp_ops_any(rec);
2964 if (ops) 3204 if (ops)
2965 seq_printf(m, "\ttramp: %pS", 3205 seq_printf(m, "\ttramp: %pS",
2966 (void *)ops->trampoline); 3206 (void *)ops->trampoline);
2967 else 3207 else
2968 seq_printf(m, "\ttramp: ERROR!"); 3208 seq_puts(m, "\ttramp: ERROR!");
3209
2969 } 3210 }
3211 add_trampoline_func(m, ops, rec);
2970 } 3212 }
2971 3213
2972 seq_printf(m, "\n"); 3214 seq_putc(m, '\n');
2973 3215
2974 return 0; 3216 return 0;
2975} 3217}
@@ -3003,9 +3245,6 @@ ftrace_enabled_open(struct inode *inode, struct file *file)
3003{ 3245{
3004 struct ftrace_iterator *iter; 3246 struct ftrace_iterator *iter;
3005 3247
3006 if (unlikely(ftrace_disabled))
3007 return -ENODEV;
3008
3009 iter = __seq_open_private(file, &show_ftrace_seq_ops, sizeof(*iter)); 3248 iter = __seq_open_private(file, &show_ftrace_seq_ops, sizeof(*iter));
3010 if (iter) { 3249 if (iter) {
3011 iter->pg = ftrace_pages_start; 3250 iter->pg = ftrace_pages_start;
@@ -3340,7 +3579,7 @@ static struct ftrace_ops trace_probe_ops __read_mostly =
3340 3579
3341static int ftrace_probe_registered; 3580static int ftrace_probe_registered;
3342 3581
3343static void __enable_ftrace_function_probe(void) 3582static void __enable_ftrace_function_probe(struct ftrace_hash *old_hash)
3344{ 3583{
3345 int ret; 3584 int ret;
3346 int i; 3585 int i;
@@ -3348,7 +3587,8 @@ static void __enable_ftrace_function_probe(void)
3348 if (ftrace_probe_registered) { 3587 if (ftrace_probe_registered) {
3349 /* still need to update the function call sites */ 3588 /* still need to update the function call sites */
3350 if (ftrace_enabled) 3589 if (ftrace_enabled)
3351 ftrace_run_modify_code(&trace_probe_ops, FTRACE_UPDATE_CALLS); 3590 ftrace_run_modify_code(&trace_probe_ops, FTRACE_UPDATE_CALLS,
3591 old_hash);
3352 return; 3592 return;
3353 } 3593 }
3354 3594
@@ -3477,13 +3717,14 @@ register_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops,
3477 } while_for_each_ftrace_rec(); 3717 } while_for_each_ftrace_rec();
3478 3718
3479 ret = ftrace_hash_move(&trace_probe_ops, 1, orig_hash, hash); 3719 ret = ftrace_hash_move(&trace_probe_ops, 1, orig_hash, hash);
3720
3721 __enable_ftrace_function_probe(old_hash);
3722
3480 if (!ret) 3723 if (!ret)
3481 free_ftrace_hash_rcu(old_hash); 3724 free_ftrace_hash_rcu(old_hash);
3482 else 3725 else
3483 count = ret; 3726 count = ret;
3484 3727
3485 __enable_ftrace_function_probe();
3486
3487 out_unlock: 3728 out_unlock:
3488 mutex_unlock(&ftrace_lock); 3729 mutex_unlock(&ftrace_lock);
3489 out: 3730 out:
@@ -3764,10 +4005,11 @@ ftrace_match_addr(struct ftrace_hash *hash, unsigned long ip, int remove)
3764 return add_hash_entry(hash, ip); 4005 return add_hash_entry(hash, ip);
3765} 4006}
3766 4007
3767static void ftrace_ops_update_code(struct ftrace_ops *ops) 4008static void ftrace_ops_update_code(struct ftrace_ops *ops,
4009 struct ftrace_hash *old_hash)
3768{ 4010{
3769 if (ops->flags & FTRACE_OPS_FL_ENABLED && ftrace_enabled) 4011 if (ops->flags & FTRACE_OPS_FL_ENABLED && ftrace_enabled)
3770 ftrace_run_modify_code(ops, FTRACE_UPDATE_CALLS); 4012 ftrace_run_modify_code(ops, FTRACE_UPDATE_CALLS, old_hash);
3771} 4013}
3772 4014
3773static int 4015static int
@@ -3813,7 +4055,7 @@ ftrace_set_hash(struct ftrace_ops *ops, unsigned char *buf, int len,
3813 old_hash = *orig_hash; 4055 old_hash = *orig_hash;
3814 ret = ftrace_hash_move(ops, enable, orig_hash, hash); 4056 ret = ftrace_hash_move(ops, enable, orig_hash, hash);
3815 if (!ret) { 4057 if (!ret) {
3816 ftrace_ops_update_code(ops); 4058 ftrace_ops_update_code(ops, old_hash);
3817 free_ftrace_hash_rcu(old_hash); 4059 free_ftrace_hash_rcu(old_hash);
3818 } 4060 }
3819 mutex_unlock(&ftrace_lock); 4061 mutex_unlock(&ftrace_lock);
@@ -3955,6 +4197,9 @@ static char ftrace_graph_buf[FTRACE_FILTER_SIZE] __initdata;
3955static char ftrace_graph_notrace_buf[FTRACE_FILTER_SIZE] __initdata; 4197static char ftrace_graph_notrace_buf[FTRACE_FILTER_SIZE] __initdata;
3956static int ftrace_set_func(unsigned long *array, int *idx, int size, char *buffer); 4198static int ftrace_set_func(unsigned long *array, int *idx, int size, char *buffer);
3957 4199
4200static unsigned long save_global_trampoline;
4201static unsigned long save_global_flags;
4202
3958static int __init set_graph_function(char *str) 4203static int __init set_graph_function(char *str)
3959{ 4204{
3960 strlcpy(ftrace_graph_buf, str, FTRACE_FILTER_SIZE); 4205 strlcpy(ftrace_graph_buf, str, FTRACE_FILTER_SIZE);
@@ -4058,7 +4303,7 @@ int ftrace_regex_release(struct inode *inode, struct file *file)
4058 ret = ftrace_hash_move(iter->ops, filter_hash, 4303 ret = ftrace_hash_move(iter->ops, filter_hash,
4059 orig_hash, iter->hash); 4304 orig_hash, iter->hash);
4060 if (!ret) { 4305 if (!ret) {
4061 ftrace_ops_update_code(iter->ops); 4306 ftrace_ops_update_code(iter->ops, old_hash);
4062 free_ftrace_hash_rcu(old_hash); 4307 free_ftrace_hash_rcu(old_hash);
4063 } 4308 }
4064 mutex_unlock(&ftrace_lock); 4309 mutex_unlock(&ftrace_lock);
@@ -4163,9 +4408,9 @@ static int g_show(struct seq_file *m, void *v)
4163 struct ftrace_graph_data *fgd = m->private; 4408 struct ftrace_graph_data *fgd = m->private;
4164 4409
4165 if (fgd->table == ftrace_graph_funcs) 4410 if (fgd->table == ftrace_graph_funcs)
4166 seq_printf(m, "#### all functions enabled ####\n"); 4411 seq_puts(m, "#### all functions enabled ####\n");
4167 else 4412 else
4168 seq_printf(m, "#### no functions disabled ####\n"); 4413 seq_puts(m, "#### no functions disabled ####\n");
4169 return 0; 4414 return 0;
4170 } 4415 }
4171 4416
@@ -4676,6 +4921,32 @@ void __init ftrace_init(void)
4676 ftrace_disabled = 1; 4921 ftrace_disabled = 1;
4677} 4922}
4678 4923
4924/* Do nothing if arch does not support this */
4925void __weak arch_ftrace_update_trampoline(struct ftrace_ops *ops)
4926{
4927}
4928
4929static void ftrace_update_trampoline(struct ftrace_ops *ops)
4930{
4931
4932/*
4933 * Currently there's no safe way to free a trampoline when the kernel
4934 * is configured with PREEMPT. That is because a task could be preempted
4935 * when it has jumped to the trampoline; it may stay preempted for a long time
4936 * depending on the system load, and currently there's no way to know
4937 * when it will be off the trampoline. If the trampoline is freed
4938 * too early, when the task runs again, it will be executing on freed
4939 * memory and crash.
4940 */
4941#ifdef CONFIG_PREEMPT
4942 /* Currently, only non dynamic ops can have a trampoline */
4943 if (ops->flags & FTRACE_OPS_FL_DYNAMIC)
4944 return;
4945#endif
4946
4947 arch_ftrace_update_trampoline(ops);
4948}
4949
4679#else 4950#else
4680 4951
4681static struct ftrace_ops global_ops = { 4952static struct ftrace_ops global_ops = {
@@ -4718,6 +4989,10 @@ ftrace_ops_test(struct ftrace_ops *ops, unsigned long ip, void *regs)
4718 return 1; 4989 return 1;
4719} 4990}
4720 4991
4992static void ftrace_update_trampoline(struct ftrace_ops *ops)
4993{
4994}
4995
4721#endif /* CONFIG_DYNAMIC_FTRACE */ 4996#endif /* CONFIG_DYNAMIC_FTRACE */
4722 4997
4723__init void ftrace_init_global_array_ops(struct trace_array *tr) 4998__init void ftrace_init_global_array_ops(struct trace_array *tr)
@@ -5055,12 +5330,12 @@ static int fpid_show(struct seq_file *m, void *v)
5055 const struct ftrace_pid *fpid = list_entry(v, struct ftrace_pid, list); 5330 const struct ftrace_pid *fpid = list_entry(v, struct ftrace_pid, list);
5056 5331
5057 if (v == (void *)1) { 5332 if (v == (void *)1) {
5058 seq_printf(m, "no pid\n"); 5333 seq_puts(m, "no pid\n");
5059 return 0; 5334 return 0;
5060 } 5335 }
5061 5336
5062 if (fpid->pid == ftrace_swapper_pid) 5337 if (fpid->pid == ftrace_swapper_pid)
5063 seq_printf(m, "swapper tasks\n"); 5338 seq_puts(m, "swapper tasks\n");
5064 else 5339 else
5065 seq_printf(m, "%u\n", pid_vnr(fpid->pid)); 5340 seq_printf(m, "%u\n", pid_vnr(fpid->pid));
5066 5341
@@ -5273,6 +5548,7 @@ static struct ftrace_ops graph_ops = {
5273 FTRACE_OPS_FL_STUB, 5548 FTRACE_OPS_FL_STUB,
5274#ifdef FTRACE_GRAPH_TRAMP_ADDR 5549#ifdef FTRACE_GRAPH_TRAMP_ADDR
5275 .trampoline = FTRACE_GRAPH_TRAMP_ADDR, 5550 .trampoline = FTRACE_GRAPH_TRAMP_ADDR,
5551 /* trampoline_size is only needed for dynamically allocated tramps */
5276#endif 5552#endif
5277 ASSIGN_OPS_HASH(graph_ops, &global_ops.local_hash) 5553 ASSIGN_OPS_HASH(graph_ops, &global_ops.local_hash)
5278}; 5554};
@@ -5502,7 +5778,6 @@ int register_ftrace_graph(trace_func_graph_ret_t retfunc,
5502 update_function_graph_func(); 5778 update_function_graph_func();
5503 5779
5504 ret = ftrace_startup(&graph_ops, FTRACE_START_FUNC_RET); 5780 ret = ftrace_startup(&graph_ops, FTRACE_START_FUNC_RET);
5505
5506out: 5781out:
5507 mutex_unlock(&ftrace_lock); 5782 mutex_unlock(&ftrace_lock);
5508 return ret; 5783 return ret;
@@ -5523,6 +5798,17 @@ void unregister_ftrace_graph(void)
5523 unregister_pm_notifier(&ftrace_suspend_notifier); 5798 unregister_pm_notifier(&ftrace_suspend_notifier);
5524 unregister_trace_sched_switch(ftrace_graph_probe_sched_switch, NULL); 5799 unregister_trace_sched_switch(ftrace_graph_probe_sched_switch, NULL);
5525 5800
5801#ifdef CONFIG_DYNAMIC_FTRACE
5802 /*
5803 * Function graph does not allocate the trampoline, but
5804 * other global_ops do. We need to reset the ALLOC_TRAMP flag
5805 * if one was used.
5806 */
5807 global_ops.trampoline = save_global_trampoline;
5808 if (save_global_flags & FTRACE_OPS_FL_ALLOC_TRAMP)
5809 global_ops.flags |= FTRACE_OPS_FL_ALLOC_TRAMP;
5810#endif
5811
5526 out: 5812 out:
5527 mutex_unlock(&ftrace_lock); 5813 mutex_unlock(&ftrace_lock);
5528} 5814}
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 2d75c94ae87d..7a4104cb95cb 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -34,21 +34,19 @@ static void update_pages_handler(struct work_struct *work);
34 */ 34 */
35int ring_buffer_print_entry_header(struct trace_seq *s) 35int ring_buffer_print_entry_header(struct trace_seq *s)
36{ 36{
37 int ret; 37 trace_seq_puts(s, "# compressed entry header\n");
38 38 trace_seq_puts(s, "\ttype_len : 5 bits\n");
39 ret = trace_seq_puts(s, "# compressed entry header\n"); 39 trace_seq_puts(s, "\ttime_delta : 27 bits\n");
40 ret = trace_seq_puts(s, "\ttype_len : 5 bits\n"); 40 trace_seq_puts(s, "\tarray : 32 bits\n");
41 ret = trace_seq_puts(s, "\ttime_delta : 27 bits\n"); 41 trace_seq_putc(s, '\n');
42 ret = trace_seq_puts(s, "\tarray : 32 bits\n"); 42 trace_seq_printf(s, "\tpadding : type == %d\n",
43 ret = trace_seq_putc(s, '\n'); 43 RINGBUF_TYPE_PADDING);
44 ret = trace_seq_printf(s, "\tpadding : type == %d\n", 44 trace_seq_printf(s, "\ttime_extend : type == %d\n",
45 RINGBUF_TYPE_PADDING); 45 RINGBUF_TYPE_TIME_EXTEND);
46 ret = trace_seq_printf(s, "\ttime_extend : type == %d\n", 46 trace_seq_printf(s, "\tdata max type_len == %d\n",
47 RINGBUF_TYPE_TIME_EXTEND); 47 RINGBUF_TYPE_DATA_TYPE_LEN_MAX);
48 ret = trace_seq_printf(s, "\tdata max type_len == %d\n",
49 RINGBUF_TYPE_DATA_TYPE_LEN_MAX);
50 48
51 return ret; 49 return !trace_seq_has_overflowed(s);
52} 50}
53 51
54/* 52/*
@@ -419,32 +417,31 @@ static inline int test_time_stamp(u64 delta)
419int ring_buffer_print_page_header(struct trace_seq *s) 417int ring_buffer_print_page_header(struct trace_seq *s)
420{ 418{
421 struct buffer_data_page field; 419 struct buffer_data_page field;
422 int ret;
423 420
424 ret = trace_seq_printf(s, "\tfield: u64 timestamp;\t" 421 trace_seq_printf(s, "\tfield: u64 timestamp;\t"
425 "offset:0;\tsize:%u;\tsigned:%u;\n", 422 "offset:0;\tsize:%u;\tsigned:%u;\n",
426 (unsigned int)sizeof(field.time_stamp), 423 (unsigned int)sizeof(field.time_stamp),
427 (unsigned int)is_signed_type(u64)); 424 (unsigned int)is_signed_type(u64));
428
429 ret = trace_seq_printf(s, "\tfield: local_t commit;\t"
430 "offset:%u;\tsize:%u;\tsigned:%u;\n",
431 (unsigned int)offsetof(typeof(field), commit),
432 (unsigned int)sizeof(field.commit),
433 (unsigned int)is_signed_type(long));
434
435 ret = trace_seq_printf(s, "\tfield: int overwrite;\t"
436 "offset:%u;\tsize:%u;\tsigned:%u;\n",
437 (unsigned int)offsetof(typeof(field), commit),
438 1,
439 (unsigned int)is_signed_type(long));
440
441 ret = trace_seq_printf(s, "\tfield: char data;\t"
442 "offset:%u;\tsize:%u;\tsigned:%u;\n",
443 (unsigned int)offsetof(typeof(field), data),
444 (unsigned int)BUF_PAGE_SIZE,
445 (unsigned int)is_signed_type(char));
446 425
447 return ret; 426 trace_seq_printf(s, "\tfield: local_t commit;\t"
427 "offset:%u;\tsize:%u;\tsigned:%u;\n",
428 (unsigned int)offsetof(typeof(field), commit),
429 (unsigned int)sizeof(field.commit),
430 (unsigned int)is_signed_type(long));
431
432 trace_seq_printf(s, "\tfield: int overwrite;\t"
433 "offset:%u;\tsize:%u;\tsigned:%u;\n",
434 (unsigned int)offsetof(typeof(field), commit),
435 1,
436 (unsigned int)is_signed_type(long));
437
438 trace_seq_printf(s, "\tfield: char data;\t"
439 "offset:%u;\tsize:%u;\tsigned:%u;\n",
440 (unsigned int)offsetof(typeof(field), data),
441 (unsigned int)BUF_PAGE_SIZE,
442 (unsigned int)is_signed_type(char));
443
444 return !trace_seq_has_overflowed(s);
448} 445}
449 446
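Editor's note: ring_buffer_print_page_header() derives its description of the page layout from offsetof()/sizeof() rather than hard-coded numbers, and now reports success with a single trace_seq_has_overflowed() test at the end. A small stand-alone sketch of the layout-reporting part, using a toy struct rather than the real buffer_data_page:

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

/* Toy layout standing in for buffer_data_page; only the offsetof()/sizeof()
 * idea is shown here, the real field types live in the kernel. */
struct data_page {
	uint64_t time_stamp;
	long     commit;
	char     data[];
};

int main(void)
{
	printf("\tfield: u64 timestamp;\toffset:0;\tsize:%zu;\n",
	       sizeof(uint64_t));
	printf("\tfield: local_t commit;\toffset:%zu;\tsize:%zu;\n",
	       offsetof(struct data_page, commit), sizeof(long));
	printf("\tfield: char data;\toffset:%zu;\n",
	       offsetof(struct data_page, data));
	return 0;
}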
450struct rb_irq_work { 447struct rb_irq_work {
@@ -538,16 +535,18 @@ static void rb_wake_up_waiters(struct irq_work *work)
538 * ring_buffer_wait - wait for input to the ring buffer 535 * ring_buffer_wait - wait for input to the ring buffer
539 * @buffer: buffer to wait on 536 * @buffer: buffer to wait on
540 * @cpu: the cpu buffer to wait on 537 * @cpu: the cpu buffer to wait on
538 * @full: wait until a full page is available, if @cpu != RING_BUFFER_ALL_CPUS
541 * 539 *
542 * If @cpu == RING_BUFFER_ALL_CPUS then the task will wake up as soon 540 * If @cpu == RING_BUFFER_ALL_CPUS then the task will wake up as soon
543 * as data is added to any of the @buffer's cpu buffers. Otherwise 541 * as data is added to any of the @buffer's cpu buffers. Otherwise
544 * it will wait for data to be added to a specific cpu buffer. 542 * it will wait for data to be added to a specific cpu buffer.
545 */ 543 */
546int ring_buffer_wait(struct ring_buffer *buffer, int cpu) 544int ring_buffer_wait(struct ring_buffer *buffer, int cpu, bool full)
547{ 545{
548 struct ring_buffer_per_cpu *cpu_buffer; 546 struct ring_buffer_per_cpu *uninitialized_var(cpu_buffer);
549 DEFINE_WAIT(wait); 547 DEFINE_WAIT(wait);
550 struct rb_irq_work *work; 548 struct rb_irq_work *work;
549 int ret = 0;
551 550
552 /* 551 /*
553 * Depending on what the caller is waiting for, either any 552 * Depending on what the caller is waiting for, either any
@@ -564,36 +563,61 @@ int ring_buffer_wait(struct ring_buffer *buffer, int cpu)
564 } 563 }
565 564
566 565
567 prepare_to_wait(&work->waiters, &wait, TASK_INTERRUPTIBLE); 566 while (true) {
567 prepare_to_wait(&work->waiters, &wait, TASK_INTERRUPTIBLE);
568 568
569 /* 569 /*
570 * The events can happen in critical sections where 570 * The events can happen in critical sections where
571 * checking a work queue can cause deadlocks. 571 * checking a work queue can cause deadlocks.
572 * After adding a task to the queue, this flag is set 572 * After adding a task to the queue, this flag is set
573 * only to notify events to try to wake up the queue 573 * only to notify events to try to wake up the queue
574 * using irq_work. 574 * using irq_work.
575 * 575 *
576 * We don't clear it even if the buffer is no longer 576 * We don't clear it even if the buffer is no longer
577 * empty. The flag only causes the next event to run 577 * empty. The flag only causes the next event to run
578 * irq_work to do the work queue wake up. The worse 578 * irq_work to do the work queue wake up. The worse
579 * that can happen if we race with !trace_empty() is that 579 * that can happen if we race with !trace_empty() is that
580 * an event will cause an irq_work to try to wake up 580 * an event will cause an irq_work to try to wake up
581 * an empty queue. 581 * an empty queue.
582 * 582 *
583 * There's no reason to protect this flag either, as 583 * There's no reason to protect this flag either, as
584 * the work queue and irq_work logic will do the necessary 584 * the work queue and irq_work logic will do the necessary
585 * synchronization for the wake ups. The only thing 585 * synchronization for the wake ups. The only thing
586 * that is necessary is that the wake up happens after 586 * that is necessary is that the wake up happens after
587 * a task has been queued. It's OK for spurious wake ups. 587 * a task has been queued. It's OK for spurious wake ups.
588 */ 588 */
589 work->waiters_pending = true; 589 work->waiters_pending = true;
590
591 if (signal_pending(current)) {
592 ret = -EINTR;
593 break;
594 }
595
596 if (cpu == RING_BUFFER_ALL_CPUS && !ring_buffer_empty(buffer))
597 break;
598
599 if (cpu != RING_BUFFER_ALL_CPUS &&
600 !ring_buffer_empty_cpu(buffer, cpu)) {
601 unsigned long flags;
602 bool pagebusy;
603
604 if (!full)
605 break;
606
607 raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
608 pagebusy = cpu_buffer->reader_page == cpu_buffer->commit_page;
609 raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
610
611 if (!pagebusy)
612 break;
613 }
590 614
591 if ((cpu == RING_BUFFER_ALL_CPUS && ring_buffer_empty(buffer)) ||
592 (cpu != RING_BUFFER_ALL_CPUS && ring_buffer_empty_cpu(buffer, cpu)))
593 schedule(); 615 schedule();
616 }
594 617
595 finish_wait(&work->waiters, &wait); 618 finish_wait(&work->waiters, &wait);
596 return 0; 619
620 return ret;
597} 621}
598 622
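Editor's note: the reworked ring_buffer_wait() turns a single sleep into a loop that re-checks its wake-up condition on every pass, honours the new "full page" requirement, and bails out with -EINTR on a pending signal. The kernel version uses prepare_to_wait()/schedule()/finish_wait() and the reader_lock to test whether the reader page is still the commit page; the sketch below is only a user-space analogue of the loop shape, built on pthreads with invented names.

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

struct waiter {
	pthread_mutex_t lock;
	pthread_cond_t  cond;
	int  available;    /* data queued by a producer            */
	int  page_size;    /* "full" means at least one whole page */
	bool interrupted;  /* stand-in for signal_pending()        */
};

static int wait_for_data(struct waiter *w, bool full)
{
	int ret = 0;

	pthread_mutex_lock(&w->lock);
	while (true) {
		if (w->interrupted) {		/* like returning -EINTR */
			ret = -4;
			break;
		}
		if (w->available > 0 &&
		    (!full || w->available >= w->page_size))
			break;			/* condition met, stop waiting */
		pthread_cond_wait(&w->cond, &w->lock);	/* like schedule() */
	}
	pthread_mutex_unlock(&w->lock);
	return ret;
}

int main(void)
{
	struct waiter w = {
		.lock = PTHREAD_MUTEX_INITIALIZER,
		.cond = PTHREAD_COND_INITIALIZER,
		.available = 512,
		.page_size = 4096,
		.interrupted = false,
	};

	/* Some data is queued, so a non-"full" wait returns immediately. */
	printf("wait(full=false) -> %d\n", wait_for_data(&w, false));

	/* A "full" wait would block here, so top the buffer up first to
	 * keep the demo single-threaded. */
	w.available = w.page_size;
	printf("wait(full=true)  -> %d\n", wait_for_data(&w, true));
	return 0;
}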
599/** 623/**
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 8a528392b1f4..1af4f8f2ab5d 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -155,10 +155,11 @@ __setup("ftrace_dump_on_oops", set_ftrace_dump_on_oops);
155 155
156static int __init stop_trace_on_warning(char *str) 156static int __init stop_trace_on_warning(char *str)
157{ 157{
158 __disable_trace_on_warning = 1; 158 if ((strcmp(str, "=0") != 0 && strcmp(str, "=off") != 0))
159 __disable_trace_on_warning = 1;
159 return 1; 160 return 1;
160} 161}
161__setup("traceoff_on_warning=", stop_trace_on_warning); 162__setup("traceoff_on_warning", stop_trace_on_warning);
162 163
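Editor's note: the __setup() change above makes traceoff_on_warning usable both as a bare flag and with a value, treating "=0" and "=off" as "leave tracing on" and anything else as "turn tracing off on a warning". A quick user-space check of just that parsing rule, with the __setup() plumbing itself omitted:

#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>
#include <string.h>

/* str is whatever follows the option name on the command line, so it is
 * either empty or starts with '=', mirroring the strcmp()s in the hunk. */
static bool traceoff_on_warning_enabled(const char *str)
{
	return strcmp(str, "=0") != 0 && strcmp(str, "=off") != 0;
}

int main(void)
{
	const char *cases[] = { "", "=1", "=0", "=off" };

	for (size_t i = 0; i < sizeof(cases) / sizeof(cases[0]); i++)
		printf("traceoff_on_warning%s -> %s\n", cases[i],
		       traceoff_on_warning_enabled(cases[i]) ? "on" : "off");
	return 0;
}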
163static int __init boot_alloc_snapshot(char *str) 164static int __init boot_alloc_snapshot(char *str)
164{ 165{
@@ -938,19 +939,20 @@ out:
938 return ret; 939 return ret;
939} 940}
940 941
942/* TODO add a seq_buf_to_buffer() */
941static ssize_t trace_seq_to_buffer(struct trace_seq *s, void *buf, size_t cnt) 943static ssize_t trace_seq_to_buffer(struct trace_seq *s, void *buf, size_t cnt)
942{ 944{
943 int len; 945 int len;
944 946
945 if (s->len <= s->readpos) 947 if (trace_seq_used(s) <= s->seq.readpos)
946 return -EBUSY; 948 return -EBUSY;
947 949
948 len = s->len - s->readpos; 950 len = trace_seq_used(s) - s->seq.readpos;
949 if (cnt > len) 951 if (cnt > len)
950 cnt = len; 952 cnt = len;
951 memcpy(buf, s->buffer + s->readpos, cnt); 953 memcpy(buf, s->buffer + s->seq.readpos, cnt);
952 954
953 s->readpos += cnt; 955 s->seq.readpos += cnt;
954 return cnt; 956 return cnt;
955} 957}
956 958
@@ -1076,13 +1078,14 @@ update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu)
1076} 1078}
1077#endif /* CONFIG_TRACER_MAX_TRACE */ 1079#endif /* CONFIG_TRACER_MAX_TRACE */
1078 1080
1079static int wait_on_pipe(struct trace_iterator *iter) 1081static int wait_on_pipe(struct trace_iterator *iter, bool full)
1080{ 1082{
1081 /* Iterators are static, they should be filled or empty */ 1083 /* Iterators are static, they should be filled or empty */
1082 if (trace_buffer_iter(iter, iter->cpu_file)) 1084 if (trace_buffer_iter(iter, iter->cpu_file))
1083 return 0; 1085 return 0;
1084 1086
1085 return ring_buffer_wait(iter->trace_buffer->buffer, iter->cpu_file); 1087 return ring_buffer_wait(iter->trace_buffer->buffer, iter->cpu_file,
1088 full);
1086} 1089}
1087 1090
1088#ifdef CONFIG_FTRACE_STARTUP_TEST 1091#ifdef CONFIG_FTRACE_STARTUP_TEST
@@ -2157,9 +2160,7 @@ __trace_array_vprintk(struct ring_buffer *buffer,
2157 goto out; 2160 goto out;
2158 } 2161 }
2159 2162
2160 len = vsnprintf(tbuffer, TRACE_BUF_SIZE, fmt, args); 2163 len = vscnprintf(tbuffer, TRACE_BUF_SIZE, fmt, args);
2161 if (len > TRACE_BUF_SIZE)
2162 goto out;
2163 2164
2164 local_save_flags(flags); 2165 local_save_flags(flags);
2165 size = sizeof(*entry) + len + 1; 2166 size = sizeof(*entry) + len + 1;
@@ -2170,8 +2171,7 @@ __trace_array_vprintk(struct ring_buffer *buffer,
2170 entry = ring_buffer_event_data(event); 2171 entry = ring_buffer_event_data(event);
2171 entry->ip = ip; 2172 entry->ip = ip;
2172 2173
2173 memcpy(&entry->buf, tbuffer, len); 2174 memcpy(&entry->buf, tbuffer, len + 1);
2174 entry->buf[len] = '\0';
2175 if (!call_filter_check_discard(call, entry, buffer, event)) { 2175 if (!call_filter_check_discard(call, entry, buffer, event)) {
2176 __buffer_unlock_commit(buffer, event); 2176 __buffer_unlock_commit(buffer, event);
2177 ftrace_trace_stack(buffer, flags, 6, pc); 2177 ftrace_trace_stack(buffer, flags, 6, pc);
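Editor's note: the hunk above swaps vsnprintf() for vscnprintf(), whose return value is already clamped to what actually fits in the buffer (excluding the trailing NUL). That is what lets it drop the "len > TRACE_BUF_SIZE" check and copy len + 1 bytes, data plus NUL, in a single memcpy(). Below is a user-space sketch of that contract; scnprintf_demo() is invented here, the kernel's vscnprintf() provides the behaviour natively.

#include <stdarg.h>
#include <stdio.h>

/* Unlike plain vsnprintf(), the value returned is the number of characters
 * actually stored, never the would-be length of the untruncated string. */
static int scnprintf_demo(char *buf, size_t size, const char *fmt, ...)
{
	va_list args;
	int len;

	if (size == 0)
		return 0;

	va_start(args, fmt);
	len = vsnprintf(buf, size, fmt, args);
	va_end(args);

	if (len < 0)
		return 0;
	return (size_t)len < size ? len : (int)(size - 1);
}

int main(void)
{
	char small[8];
	int len = scnprintf_demo(small, sizeof(small), "%s", "truncated output");

	/* Prints: stored 7 bytes: "truncat" */
	printf("stored %d bytes: \"%s\"\n", len, small);
	return 0;
}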
@@ -2508,14 +2508,14 @@ get_total_entries(struct trace_buffer *buf,
2508 2508
2509static void print_lat_help_header(struct seq_file *m) 2509static void print_lat_help_header(struct seq_file *m)
2510{ 2510{
2511 seq_puts(m, "# _------=> CPU# \n"); 2511 seq_puts(m, "# _------=> CPU# \n"
2512 seq_puts(m, "# / _-----=> irqs-off \n"); 2512 "# / _-----=> irqs-off \n"
2513 seq_puts(m, "# | / _----=> need-resched \n"); 2513 "# | / _----=> need-resched \n"
2514 seq_puts(m, "# || / _---=> hardirq/softirq \n"); 2514 "# || / _---=> hardirq/softirq \n"
2515 seq_puts(m, "# ||| / _--=> preempt-depth \n"); 2515 "# ||| / _--=> preempt-depth \n"
2516 seq_puts(m, "# |||| / delay \n"); 2516 "# |||| / delay \n"
2517 seq_puts(m, "# cmd pid ||||| time | caller \n"); 2517 "# cmd pid ||||| time | caller \n"
2518 seq_puts(m, "# \\ / ||||| \\ | / \n"); 2518 "# \\ / ||||| \\ | / \n");
2519} 2519}
2520 2520
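Editor's note: the header hunks here and below collapse a run of seq_puts()/seq_printf() calls into one call carrying several string lines. That works because adjacent string literals are concatenated at compile time, as this trivial example shows:

#include <stdio.h>

int main(void)
{
	/* Adjacent string literals become one string, so a single call can
	 * emit the whole multi-line header. */
	fputs("# _------=> CPU#\n"
	      "# / _-----=> irqs-off\n"
	      "# | / _----=> need-resched\n", stdout);
	return 0;
}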
2521static void print_event_info(struct trace_buffer *buf, struct seq_file *m) 2521static void print_event_info(struct trace_buffer *buf, struct seq_file *m)
@@ -2532,20 +2532,20 @@ static void print_event_info(struct trace_buffer *buf, struct seq_file *m)
2532static void print_func_help_header(struct trace_buffer *buf, struct seq_file *m) 2532static void print_func_help_header(struct trace_buffer *buf, struct seq_file *m)
2533{ 2533{
2534 print_event_info(buf, m); 2534 print_event_info(buf, m);
2535 seq_puts(m, "# TASK-PID CPU# TIMESTAMP FUNCTION\n"); 2535 seq_puts(m, "# TASK-PID CPU# TIMESTAMP FUNCTION\n"
2536 seq_puts(m, "# | | | | |\n"); 2536 "# | | | | |\n");
2537} 2537}
2538 2538
2539static void print_func_help_header_irq(struct trace_buffer *buf, struct seq_file *m) 2539static void print_func_help_header_irq(struct trace_buffer *buf, struct seq_file *m)
2540{ 2540{
2541 print_event_info(buf, m); 2541 print_event_info(buf, m);
2542 seq_puts(m, "# _-----=> irqs-off\n"); 2542 seq_puts(m, "# _-----=> irqs-off\n"
2543 seq_puts(m, "# / _----=> need-resched\n"); 2543 "# / _----=> need-resched\n"
2544 seq_puts(m, "# | / _---=> hardirq/softirq\n"); 2544 "# | / _---=> hardirq/softirq\n"
2545 seq_puts(m, "# || / _--=> preempt-depth\n"); 2545 "# || / _--=> preempt-depth\n"
2546 seq_puts(m, "# ||| / delay\n"); 2546 "# ||| / delay\n"
2547 seq_puts(m, "# TASK-PID CPU# |||| TIMESTAMP FUNCTION\n"); 2547 "# TASK-PID CPU# |||| TIMESTAMP FUNCTION\n"
2548 seq_puts(m, "# | | | |||| | |\n"); 2548 "# | | | |||| | |\n");
2549} 2549}
2550 2550
2551void 2551void
@@ -2648,24 +2648,21 @@ static enum print_line_t print_trace_fmt(struct trace_iterator *iter)
2648 event = ftrace_find_event(entry->type); 2648 event = ftrace_find_event(entry->type);
2649 2649
2650 if (trace_flags & TRACE_ITER_CONTEXT_INFO) { 2650 if (trace_flags & TRACE_ITER_CONTEXT_INFO) {
2651 if (iter->iter_flags & TRACE_FILE_LAT_FMT) { 2651 if (iter->iter_flags & TRACE_FILE_LAT_FMT)
2652 if (!trace_print_lat_context(iter)) 2652 trace_print_lat_context(iter);
2653 goto partial; 2653 else
2654 } else { 2654 trace_print_context(iter);
2655 if (!trace_print_context(iter))
2656 goto partial;
2657 }
2658 } 2655 }
2659 2656
2657 if (trace_seq_has_overflowed(s))
2658 return TRACE_TYPE_PARTIAL_LINE;
2659
2660 if (event) 2660 if (event)
2661 return event->funcs->trace(iter, sym_flags, event); 2661 return event->funcs->trace(iter, sym_flags, event);
2662 2662
2663 if (!trace_seq_printf(s, "Unknown type %d\n", entry->type)) 2663 trace_seq_printf(s, "Unknown type %d\n", entry->type);
2664 goto partial;
2665 2664
2666 return TRACE_TYPE_HANDLED; 2665 return trace_handle_return(s);
2667partial:
2668 return TRACE_TYPE_PARTIAL_LINE;
2669} 2666}
2670 2667
2671static enum print_line_t print_raw_fmt(struct trace_iterator *iter) 2668static enum print_line_t print_raw_fmt(struct trace_iterator *iter)
@@ -2676,22 +2673,20 @@ static enum print_line_t print_raw_fmt(struct trace_iterator *iter)
2676 2673
2677 entry = iter->ent; 2674 entry = iter->ent;
2678 2675
2679 if (trace_flags & TRACE_ITER_CONTEXT_INFO) { 2676 if (trace_flags & TRACE_ITER_CONTEXT_INFO)
2680 if (!trace_seq_printf(s, "%d %d %llu ", 2677 trace_seq_printf(s, "%d %d %llu ",
2681 entry->pid, iter->cpu, iter->ts)) 2678 entry->pid, iter->cpu, iter->ts);
2682 goto partial; 2679
2683 } 2680 if (trace_seq_has_overflowed(s))
2681 return TRACE_TYPE_PARTIAL_LINE;
2684 2682
2685 event = ftrace_find_event(entry->type); 2683 event = ftrace_find_event(entry->type);
2686 if (event) 2684 if (event)
2687 return event->funcs->raw(iter, 0, event); 2685 return event->funcs->raw(iter, 0, event);
2688 2686
2689 if (!trace_seq_printf(s, "%d ?\n", entry->type)) 2687 trace_seq_printf(s, "%d ?\n", entry->type);
2690 goto partial;
2691 2688
2692 return TRACE_TYPE_HANDLED; 2689 return trace_handle_return(s);
2693partial:
2694 return TRACE_TYPE_PARTIAL_LINE;
2695} 2690}
2696 2691
2697static enum print_line_t print_hex_fmt(struct trace_iterator *iter) 2692static enum print_line_t print_hex_fmt(struct trace_iterator *iter)
@@ -2704,9 +2699,11 @@ static enum print_line_t print_hex_fmt(struct trace_iterator *iter)
2704 entry = iter->ent; 2699 entry = iter->ent;
2705 2700
2706 if (trace_flags & TRACE_ITER_CONTEXT_INFO) { 2701 if (trace_flags & TRACE_ITER_CONTEXT_INFO) {
2707 SEQ_PUT_HEX_FIELD_RET(s, entry->pid); 2702 SEQ_PUT_HEX_FIELD(s, entry->pid);
2708 SEQ_PUT_HEX_FIELD_RET(s, iter->cpu); 2703 SEQ_PUT_HEX_FIELD(s, iter->cpu);
2709 SEQ_PUT_HEX_FIELD_RET(s, iter->ts); 2704 SEQ_PUT_HEX_FIELD(s, iter->ts);
2705 if (trace_seq_has_overflowed(s))
2706 return TRACE_TYPE_PARTIAL_LINE;
2710 } 2707 }
2711 2708
2712 event = ftrace_find_event(entry->type); 2709 event = ftrace_find_event(entry->type);
@@ -2716,9 +2713,9 @@ static enum print_line_t print_hex_fmt(struct trace_iterator *iter)
2716 return ret; 2713 return ret;
2717 } 2714 }
2718 2715
2719 SEQ_PUT_FIELD_RET(s, newline); 2716 SEQ_PUT_FIELD(s, newline);
2720 2717
2721 return TRACE_TYPE_HANDLED; 2718 return trace_handle_return(s);
2722} 2719}
2723 2720
2724static enum print_line_t print_bin_fmt(struct trace_iterator *iter) 2721static enum print_line_t print_bin_fmt(struct trace_iterator *iter)
@@ -2730,9 +2727,11 @@ static enum print_line_t print_bin_fmt(struct trace_iterator *iter)
2730 entry = iter->ent; 2727 entry = iter->ent;
2731 2728
2732 if (trace_flags & TRACE_ITER_CONTEXT_INFO) { 2729 if (trace_flags & TRACE_ITER_CONTEXT_INFO) {
2733 SEQ_PUT_FIELD_RET(s, entry->pid); 2730 SEQ_PUT_FIELD(s, entry->pid);
2734 SEQ_PUT_FIELD_RET(s, iter->cpu); 2731 SEQ_PUT_FIELD(s, iter->cpu);
2735 SEQ_PUT_FIELD_RET(s, iter->ts); 2732 SEQ_PUT_FIELD(s, iter->ts);
2733 if (trace_seq_has_overflowed(s))
2734 return TRACE_TYPE_PARTIAL_LINE;
2736 } 2735 }
2737 2736
2738 event = ftrace_find_event(entry->type); 2737 event = ftrace_find_event(entry->type);
@@ -2778,10 +2777,12 @@ enum print_line_t print_trace_line(struct trace_iterator *iter)
2778{ 2777{
2779 enum print_line_t ret; 2778 enum print_line_t ret;
2780 2779
2781 if (iter->lost_events && 2780 if (iter->lost_events) {
2782 !trace_seq_printf(&iter->seq, "CPU:%d [LOST %lu EVENTS]\n", 2781 trace_seq_printf(&iter->seq, "CPU:%d [LOST %lu EVENTS]\n",
2783 iter->cpu, iter->lost_events)) 2782 iter->cpu, iter->lost_events);
2784 return TRACE_TYPE_PARTIAL_LINE; 2783 if (trace_seq_has_overflowed(&iter->seq))
2784 return TRACE_TYPE_PARTIAL_LINE;
2785 }
2785 2786
2786 if (iter->trace && iter->trace->print_line) { 2787 if (iter->trace && iter->trace->print_line) {
2787 ret = iter->trace->print_line(iter); 2788 ret = iter->trace->print_line(iter);
@@ -2859,44 +2860,44 @@ static void test_ftrace_alive(struct seq_file *m)
2859{ 2860{
2860 if (!ftrace_is_dead()) 2861 if (!ftrace_is_dead())
2861 return; 2862 return;
2862 seq_printf(m, "# WARNING: FUNCTION TRACING IS CORRUPTED\n"); 2863 seq_puts(m, "# WARNING: FUNCTION TRACING IS CORRUPTED\n"
2863 seq_printf(m, "# MAY BE MISSING FUNCTION EVENTS\n"); 2864 "# MAY BE MISSING FUNCTION EVENTS\n");
2864} 2865}
2865 2866
2866#ifdef CONFIG_TRACER_MAX_TRACE 2867#ifdef CONFIG_TRACER_MAX_TRACE
2867static void show_snapshot_main_help(struct seq_file *m) 2868static void show_snapshot_main_help(struct seq_file *m)
2868{ 2869{
2869 seq_printf(m, "# echo 0 > snapshot : Clears and frees snapshot buffer\n"); 2870 seq_puts(m, "# echo 0 > snapshot : Clears and frees snapshot buffer\n"
2870 seq_printf(m, "# echo 1 > snapshot : Allocates snapshot buffer, if not already allocated.\n"); 2871 "# echo 1 > snapshot : Allocates snapshot buffer, if not already allocated.\n"
2871 seq_printf(m, "# Takes a snapshot of the main buffer.\n"); 2872 "# Takes a snapshot of the main buffer.\n"
2872 seq_printf(m, "# echo 2 > snapshot : Clears snapshot buffer (but does not allocate or free)\n"); 2873 "# echo 2 > snapshot : Clears snapshot buffer (but does not allocate or free)\n"
2873 seq_printf(m, "# (Doesn't have to be '2' works with any number that\n"); 2874 "# (Doesn't have to be '2' works with any number that\n"
2874 seq_printf(m, "# is not a '0' or '1')\n"); 2875 "# is not a '0' or '1')\n");
2875} 2876}
2876 2877
2877static void show_snapshot_percpu_help(struct seq_file *m) 2878static void show_snapshot_percpu_help(struct seq_file *m)
2878{ 2879{
2879 seq_printf(m, "# echo 0 > snapshot : Invalid for per_cpu snapshot file.\n"); 2880 seq_puts(m, "# echo 0 > snapshot : Invalid for per_cpu snapshot file.\n");
2880#ifdef CONFIG_RING_BUFFER_ALLOW_SWAP 2881#ifdef CONFIG_RING_BUFFER_ALLOW_SWAP
2881 seq_printf(m, "# echo 1 > snapshot : Allocates snapshot buffer, if not already allocated.\n"); 2882 seq_puts(m, "# echo 1 > snapshot : Allocates snapshot buffer, if not already allocated.\n"
2882 seq_printf(m, "# Takes a snapshot of the main buffer for this cpu.\n"); 2883 "# Takes a snapshot of the main buffer for this cpu.\n");
2883#else 2884#else
2884 seq_printf(m, "# echo 1 > snapshot : Not supported with this kernel.\n"); 2885 seq_puts(m, "# echo 1 > snapshot : Not supported with this kernel.\n"
2885 seq_printf(m, "# Must use main snapshot file to allocate.\n"); 2886 "# Must use main snapshot file to allocate.\n");
2886#endif 2887#endif
2887 seq_printf(m, "# echo 2 > snapshot : Clears this cpu's snapshot buffer (but does not allocate)\n"); 2888 seq_puts(m, "# echo 2 > snapshot : Clears this cpu's snapshot buffer (but does not allocate)\n"
2888 seq_printf(m, "# (Doesn't have to be '2' works with any number that\n"); 2889 "# (Doesn't have to be '2' works with any number that\n"
2889 seq_printf(m, "# is not a '0' or '1')\n"); 2890 "# is not a '0' or '1')\n");
2890} 2891}
2891 2892
2892static void print_snapshot_help(struct seq_file *m, struct trace_iterator *iter) 2893static void print_snapshot_help(struct seq_file *m, struct trace_iterator *iter)
2893{ 2894{
2894 if (iter->tr->allocated_snapshot) 2895 if (iter->tr->allocated_snapshot)
2895 seq_printf(m, "#\n# * Snapshot is allocated *\n#\n"); 2896 seq_puts(m, "#\n# * Snapshot is allocated *\n#\n");
2896 else 2897 else
2897 seq_printf(m, "#\n# * Snapshot is freed *\n#\n"); 2898 seq_puts(m, "#\n# * Snapshot is freed *\n#\n");
2898 2899
2899 seq_printf(m, "# Snapshot commands:\n"); 2900 seq_puts(m, "# Snapshot commands:\n");
2900 if (iter->cpu_file == RING_BUFFER_ALL_CPUS) 2901 if (iter->cpu_file == RING_BUFFER_ALL_CPUS)
2901 show_snapshot_main_help(m); 2902 show_snapshot_main_help(m);
2902 else 2903 else
@@ -3250,7 +3251,7 @@ static int t_show(struct seq_file *m, void *v)
3250 if (!t) 3251 if (!t)
3251 return 0; 3252 return 0;
3252 3253
3253 seq_printf(m, "%s", t->name); 3254 seq_puts(m, t->name);
3254 if (t->next) 3255 if (t->next)
3255 seq_putc(m, ' '); 3256 seq_putc(m, ' ');
3256 else 3257 else
@@ -4313,6 +4314,8 @@ static int tracing_open_pipe(struct inode *inode, struct file *filp)
4313 goto out; 4314 goto out;
4314 } 4315 }
4315 4316
4317 trace_seq_init(&iter->seq);
4318
4316 /* 4319 /*
4317 * We make a copy of the current tracer to avoid concurrent 4320 * We make a copy of the current tracer to avoid concurrent
4318 * changes on it while we are reading. 4321 * changes on it while we are reading.
@@ -4434,15 +4437,12 @@ static int tracing_wait_pipe(struct file *filp)
4434 4437
4435 mutex_unlock(&iter->mutex); 4438 mutex_unlock(&iter->mutex);
4436 4439
4437 ret = wait_on_pipe(iter); 4440 ret = wait_on_pipe(iter, false);
4438 4441
4439 mutex_lock(&iter->mutex); 4442 mutex_lock(&iter->mutex);
4440 4443
4441 if (ret) 4444 if (ret)
4442 return ret; 4445 return ret;
4443
4444 if (signal_pending(current))
4445 return -EINTR;
4446 } 4446 }
4447 4447
4448 return 1; 4448 return 1;
@@ -4509,18 +4509,18 @@ waitagain:
4509 trace_access_lock(iter->cpu_file); 4509 trace_access_lock(iter->cpu_file);
4510 while (trace_find_next_entry_inc(iter) != NULL) { 4510 while (trace_find_next_entry_inc(iter) != NULL) {
4511 enum print_line_t ret; 4511 enum print_line_t ret;
4512 int len = iter->seq.len; 4512 int save_len = iter->seq.seq.len;
4513 4513
4514 ret = print_trace_line(iter); 4514 ret = print_trace_line(iter);
4515 if (ret == TRACE_TYPE_PARTIAL_LINE) { 4515 if (ret == TRACE_TYPE_PARTIAL_LINE) {
4516 /* don't print partial lines */ 4516 /* don't print partial lines */
4517 iter->seq.len = len; 4517 iter->seq.seq.len = save_len;
4518 break; 4518 break;
4519 } 4519 }
4520 if (ret != TRACE_TYPE_NO_CONSUME) 4520 if (ret != TRACE_TYPE_NO_CONSUME)
4521 trace_consume(iter); 4521 trace_consume(iter);
4522 4522
4523 if (iter->seq.len >= cnt) 4523 if (trace_seq_used(&iter->seq) >= cnt)
4524 break; 4524 break;
4525 4525
4526 /* 4526 /*
@@ -4536,7 +4536,7 @@ waitagain:
4536 4536
4537 /* Now copy what we have to the user */ 4537 /* Now copy what we have to the user */
4538 sret = trace_seq_to_user(&iter->seq, ubuf, cnt); 4538 sret = trace_seq_to_user(&iter->seq, ubuf, cnt);
4539 if (iter->seq.readpos >= iter->seq.len) 4539 if (iter->seq.seq.readpos >= trace_seq_used(&iter->seq))
4540 trace_seq_init(&iter->seq); 4540 trace_seq_init(&iter->seq);
4541 4541
4542 /* 4542 /*
@@ -4570,20 +4570,33 @@ static size_t
4570tracing_fill_pipe_page(size_t rem, struct trace_iterator *iter) 4570tracing_fill_pipe_page(size_t rem, struct trace_iterator *iter)
4571{ 4571{
4572 size_t count; 4572 size_t count;
4573 int save_len;
4573 int ret; 4574 int ret;
4574 4575
4575 /* Seq buffer is page-sized, exactly what we need. */ 4576 /* Seq buffer is page-sized, exactly what we need. */
4576 for (;;) { 4577 for (;;) {
4577 count = iter->seq.len; 4578 save_len = iter->seq.seq.len;
4578 ret = print_trace_line(iter); 4579 ret = print_trace_line(iter);
4579 count = iter->seq.len - count; 4580
4580 if (rem < count) { 4581 if (trace_seq_has_overflowed(&iter->seq)) {
4581 rem = 0; 4582 iter->seq.seq.len = save_len;
4582 iter->seq.len -= count;
4583 break; 4583 break;
4584 } 4584 }
4585
4586 /*
4587 * This should not be hit, because it should only
4588 * be set if the iter->seq overflowed. But check it
4589 * anyway to be safe.
4590 */
4585 if (ret == TRACE_TYPE_PARTIAL_LINE) { 4591 if (ret == TRACE_TYPE_PARTIAL_LINE) {
4586 iter->seq.len -= count; 4592 iter->seq.seq.len = save_len;
4593 break;
4594 }
4595
4596 count = trace_seq_used(&iter->seq) - save_len;
4597 if (rem < count) {
4598 rem = 0;
4599 iter->seq.seq.len = save_len;
4587 break; 4600 break;
4588 } 4601 }
4589 4602
@@ -4664,13 +4677,13 @@ static ssize_t tracing_splice_read_pipe(struct file *filp,
4664 /* Copy the data into the page, so we can start over. */ 4677 /* Copy the data into the page, so we can start over. */
4665 ret = trace_seq_to_buffer(&iter->seq, 4678 ret = trace_seq_to_buffer(&iter->seq,
4666 page_address(spd.pages[i]), 4679 page_address(spd.pages[i]),
4667 iter->seq.len); 4680 trace_seq_used(&iter->seq));
4668 if (ret < 0) { 4681 if (ret < 0) {
4669 __free_page(spd.pages[i]); 4682 __free_page(spd.pages[i]);
4670 break; 4683 break;
4671 } 4684 }
4672 spd.partial[i].offset = 0; 4685 spd.partial[i].offset = 0;
4673 spd.partial[i].len = iter->seq.len; 4686 spd.partial[i].len = trace_seq_used(&iter->seq);
4674 4687
4675 trace_seq_init(&iter->seq); 4688 trace_seq_init(&iter->seq);
4676 } 4689 }
@@ -5372,16 +5385,12 @@ tracing_buffers_read(struct file *filp, char __user *ubuf,
5372 goto out_unlock; 5385 goto out_unlock;
5373 } 5386 }
5374 mutex_unlock(&trace_types_lock); 5387 mutex_unlock(&trace_types_lock);
5375 ret = wait_on_pipe(iter); 5388 ret = wait_on_pipe(iter, false);
5376 mutex_lock(&trace_types_lock); 5389 mutex_lock(&trace_types_lock);
5377 if (ret) { 5390 if (ret) {
5378 size = ret; 5391 size = ret;
5379 goto out_unlock; 5392 goto out_unlock;
5380 } 5393 }
5381 if (signal_pending(current)) {
5382 size = -EINTR;
5383 goto out_unlock;
5384 }
5385 goto again; 5394 goto again;
5386 } 5395 }
5387 size = 0; 5396 size = 0;
@@ -5500,7 +5509,7 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
5500 }; 5509 };
5501 struct buffer_ref *ref; 5510 struct buffer_ref *ref;
5502 int entries, size, i; 5511 int entries, size, i;
5503 ssize_t ret; 5512 ssize_t ret = 0;
5504 5513
5505 mutex_lock(&trace_types_lock); 5514 mutex_lock(&trace_types_lock);
5506 5515
@@ -5538,13 +5547,16 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
5538 int r; 5547 int r;
5539 5548
5540 ref = kzalloc(sizeof(*ref), GFP_KERNEL); 5549 ref = kzalloc(sizeof(*ref), GFP_KERNEL);
5541 if (!ref) 5550 if (!ref) {
5551 ret = -ENOMEM;
5542 break; 5552 break;
5553 }
5543 5554
5544 ref->ref = 1; 5555 ref->ref = 1;
5545 ref->buffer = iter->trace_buffer->buffer; 5556 ref->buffer = iter->trace_buffer->buffer;
5546 ref->page = ring_buffer_alloc_read_page(ref->buffer, iter->cpu_file); 5557 ref->page = ring_buffer_alloc_read_page(ref->buffer, iter->cpu_file);
5547 if (!ref->page) { 5558 if (!ref->page) {
5559 ret = -ENOMEM;
5548 kfree(ref); 5560 kfree(ref);
5549 break; 5561 break;
5550 } 5562 }
@@ -5582,19 +5594,19 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
5582 5594
5583 /* did we read anything? */ 5595 /* did we read anything? */
5584 if (!spd.nr_pages) { 5596 if (!spd.nr_pages) {
5597 if (ret)
5598 goto out;
5599
5585 if ((file->f_flags & O_NONBLOCK) || (flags & SPLICE_F_NONBLOCK)) { 5600 if ((file->f_flags & O_NONBLOCK) || (flags & SPLICE_F_NONBLOCK)) {
5586 ret = -EAGAIN; 5601 ret = -EAGAIN;
5587 goto out; 5602 goto out;
5588 } 5603 }
5589 mutex_unlock(&trace_types_lock); 5604 mutex_unlock(&trace_types_lock);
5590 ret = wait_on_pipe(iter); 5605 ret = wait_on_pipe(iter, true);
5591 mutex_lock(&trace_types_lock); 5606 mutex_lock(&trace_types_lock);
5592 if (ret) 5607 if (ret)
5593 goto out; 5608 goto out;
5594 if (signal_pending(current)) { 5609
5595 ret = -EINTR;
5596 goto out;
5597 }
5598 goto again; 5610 goto again;
5599 } 5611 }
5600 5612
@@ -5671,7 +5683,8 @@ tracing_stats_read(struct file *filp, char __user *ubuf,
5671 cnt = ring_buffer_read_events_cpu(trace_buf->buffer, cpu); 5683 cnt = ring_buffer_read_events_cpu(trace_buf->buffer, cpu);
5672 trace_seq_printf(s, "read events: %ld\n", cnt); 5684 trace_seq_printf(s, "read events: %ld\n", cnt);
5673 5685
5674 count = simple_read_from_buffer(ubuf, count, ppos, s->buffer, s->len); 5686 count = simple_read_from_buffer(ubuf, count, ppos,
5687 s->buffer, trace_seq_used(s));
5675 5688
5676 kfree(s); 5689 kfree(s);
5677 5690
@@ -5752,10 +5765,10 @@ ftrace_snapshot_print(struct seq_file *m, unsigned long ip,
5752 5765
5753 seq_printf(m, "%ps:", (void *)ip); 5766 seq_printf(m, "%ps:", (void *)ip);
5754 5767
5755 seq_printf(m, "snapshot"); 5768 seq_puts(m, "snapshot");
5756 5769
5757 if (count == -1) 5770 if (count == -1)
5758 seq_printf(m, ":unlimited\n"); 5771 seq_puts(m, ":unlimited\n");
5759 else 5772 else
5760 seq_printf(m, ":count=%ld\n", count); 5773 seq_printf(m, ":count=%ld\n", count);
5761 5774
@@ -6420,7 +6433,7 @@ static int instance_mkdir (struct inode *inode, struct dentry *dentry, umode_t m
6420 int ret; 6433 int ret;
6421 6434
6422 /* Paranoid: Make sure the parent is the "instances" directory */ 6435 /* Paranoid: Make sure the parent is the "instances" directory */
6423 parent = hlist_entry(inode->i_dentry.first, struct dentry, d_alias); 6436 parent = hlist_entry(inode->i_dentry.first, struct dentry, d_u.d_alias);
6424 if (WARN_ON_ONCE(parent != trace_instance_dir)) 6437 if (WARN_ON_ONCE(parent != trace_instance_dir))
6425 return -ENOENT; 6438 return -ENOENT;
6426 6439
@@ -6447,7 +6460,7 @@ static int instance_rmdir(struct inode *inode, struct dentry *dentry)
6447 int ret; 6460 int ret;
6448 6461
6449 /* Paranoid: Make sure the parent is the "instances" directory */ 6462 /* Paranoid: Make sure the parent is the "instances" directory */
6450 parent = hlist_entry(inode->i_dentry.first, struct dentry, d_alias); 6463 parent = hlist_entry(inode->i_dentry.first, struct dentry, d_u.d_alias);
6451 if (WARN_ON_ONCE(parent != trace_instance_dir)) 6464 if (WARN_ON_ONCE(parent != trace_instance_dir))
6452 return -ENOENT; 6465 return -ENOENT;
6453 6466
@@ -6634,11 +6647,19 @@ void
6634trace_printk_seq(struct trace_seq *s) 6647trace_printk_seq(struct trace_seq *s)
6635{ 6648{
6636 /* Probably should print a warning here. */ 6649 /* Probably should print a warning here. */
6637 if (s->len >= TRACE_MAX_PRINT) 6650 if (s->seq.len >= TRACE_MAX_PRINT)
6638 s->len = TRACE_MAX_PRINT; 6651 s->seq.len = TRACE_MAX_PRINT;
6652
6653 /*
6654 * More paranoid code. Although the buffer size is set to
6655 * PAGE_SIZE, and TRACE_MAX_PRINT is 1000, this is just
6656 * an extra layer of protection.
6657 */
6658 if (WARN_ON_ONCE(s->seq.len >= s->seq.size))
6659 s->seq.len = s->seq.size - 1;
6639 6660
6640 /* should be zero-terminated, but we are paranoid. */ 6661 /* should be zero-terminated, but we are paranoid. */
6641 s->buffer[s->len] = 0; 6662 s->buffer[s->seq.len] = 0;
6642 6663
6643 printk(KERN_TRACE "%s", s->buffer); 6664 printk(KERN_TRACE "%s", s->buffer);
6644 6665
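
The hunks above all follow the same conversion: trace_seq writers no longer report success per call, so printers emit everything unconditionally and check for overflow once at the end, either with trace_seq_has_overflowed() or by letting trace_handle_return() map the overflow state to a print_line_t. A minimal sketch of that pattern, using a made-up print_example_fmt() rather than any function in this patch:

/* Illustrative only: a hypothetical printer written in the new style. */
static enum print_line_t print_example_fmt(struct trace_iterator *iter)
{
        struct trace_seq *s = &iter->seq;
        struct trace_entry *entry = iter->ent;

        /* Writes no longer return a length; they quietly stop once the
         * page-sized buffer is full and mark the trace_seq as overflowed.
         */
        trace_seq_printf(s, "%d %d %llu ", entry->pid, iter->cpu, iter->ts);
        trace_seq_puts(s, "example\n");

        /* One check at the end replaces a check after every write:
         * TRACE_TYPE_PARTIAL_LINE if the seq overflowed, else HANDLED.
         */
        return trace_handle_return(s);
}
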
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 385391fb1d3b..3255dfb054a0 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -14,6 +14,7 @@
14#include <linux/trace_seq.h> 14#include <linux/trace_seq.h>
15#include <linux/ftrace_event.h> 15#include <linux/ftrace_event.h>
16#include <linux/compiler.h> 16#include <linux/compiler.h>
17#include <linux/trace_seq.h>
17 18
18#ifdef CONFIG_FTRACE_SYSCALLS 19#ifdef CONFIG_FTRACE_SYSCALLS
19#include <asm/unistd.h> /* For NR_SYSCALLS */ 20#include <asm/unistd.h> /* For NR_SYSCALLS */
@@ -569,15 +570,6 @@ void trace_init_global_iter(struct trace_iterator *iter);
569 570
570void tracing_iter_reset(struct trace_iterator *iter, int cpu); 571void tracing_iter_reset(struct trace_iterator *iter, int cpu);
571 572
572void tracing_sched_switch_trace(struct trace_array *tr,
573 struct task_struct *prev,
574 struct task_struct *next,
575 unsigned long flags, int pc);
576
577void tracing_sched_wakeup_trace(struct trace_array *tr,
578 struct task_struct *wakee,
579 struct task_struct *cur,
580 unsigned long flags, int pc);
581void trace_function(struct trace_array *tr, 573void trace_function(struct trace_array *tr,
582 unsigned long ip, 574 unsigned long ip,
583 unsigned long parent_ip, 575 unsigned long parent_ip,
@@ -597,9 +589,6 @@ void set_graph_array(struct trace_array *tr);
597 589
598void tracing_start_cmdline_record(void); 590void tracing_start_cmdline_record(void);
599void tracing_stop_cmdline_record(void); 591void tracing_stop_cmdline_record(void);
600void tracing_sched_switch_assign_trace(struct trace_array *tr);
601void tracing_stop_sched_switch_record(void);
602void tracing_start_sched_switch_record(void);
603int register_tracer(struct tracer *type); 592int register_tracer(struct tracer *type);
604int is_tracing_stopped(void); 593int is_tracing_stopped(void);
605 594
@@ -719,6 +708,8 @@ enum print_line_t print_trace_line(struct trace_iterator *iter);
719 708
720extern unsigned long trace_flags; 709extern unsigned long trace_flags;
721 710
711extern char trace_find_mark(unsigned long long duration);
712
722/* Standard output formatting function used for function return traces */ 713/* Standard output formatting function used for function return traces */
723#ifdef CONFIG_FUNCTION_GRAPH_TRACER 714#ifdef CONFIG_FUNCTION_GRAPH_TRACER
724 715
@@ -737,7 +728,7 @@ extern unsigned long trace_flags;
737extern enum print_line_t 728extern enum print_line_t
738print_graph_function_flags(struct trace_iterator *iter, u32 flags); 729print_graph_function_flags(struct trace_iterator *iter, u32 flags);
739extern void print_graph_headers_flags(struct seq_file *s, u32 flags); 730extern void print_graph_headers_flags(struct seq_file *s, u32 flags);
740extern enum print_line_t 731extern void
741trace_print_graph_duration(unsigned long long duration, struct trace_seq *s); 732trace_print_graph_duration(unsigned long long duration, struct trace_seq *s);
742extern void graph_trace_open(struct trace_iterator *iter); 733extern void graph_trace_open(struct trace_iterator *iter);
743extern void graph_trace_close(struct trace_iterator *iter); 734extern void graph_trace_close(struct trace_iterator *iter);
diff --git a/kernel/trace/trace_branch.c b/kernel/trace/trace_branch.c
index 697fb9bac8f0..7d6e2afde669 100644
--- a/kernel/trace/trace_branch.c
+++ b/kernel/trace/trace_branch.c
@@ -151,22 +151,21 @@ static enum print_line_t trace_branch_print(struct trace_iterator *iter,
151 151
152 trace_assign_type(field, iter->ent); 152 trace_assign_type(field, iter->ent);
153 153
154 if (trace_seq_printf(&iter->seq, "[%s] %s:%s:%d\n", 154 trace_seq_printf(&iter->seq, "[%s] %s:%s:%d\n",
155 field->correct ? " ok " : " MISS ", 155 field->correct ? " ok " : " MISS ",
156 field->func, 156 field->func,
157 field->file, 157 field->file,
158 field->line)) 158 field->line);
159 return TRACE_TYPE_PARTIAL_LINE; 159
160 160 return trace_handle_return(&iter->seq);
161 return TRACE_TYPE_HANDLED;
162} 161}
163 162
164static void branch_print_header(struct seq_file *s) 163static void branch_print_header(struct seq_file *s)
165{ 164{
166 seq_puts(s, "# TASK-PID CPU# TIMESTAMP CORRECT" 165 seq_puts(s, "# TASK-PID CPU# TIMESTAMP CORRECT"
167 " FUNC:FILE:LINE\n"); 166 " FUNC:FILE:LINE\n"
168 seq_puts(s, "# | | | | | " 167 "# | | | | | "
169 " |\n"); 168 " |\n");
170} 169}
171 170
172static struct trace_event_functions trace_branch_funcs = { 171static struct trace_event_functions trace_branch_funcs = {
@@ -233,12 +232,12 @@ extern unsigned long __stop_annotated_branch_profile[];
233 232
234static int annotated_branch_stat_headers(struct seq_file *m) 233static int annotated_branch_stat_headers(struct seq_file *m)
235{ 234{
236 seq_printf(m, " correct incorrect %% "); 235 seq_puts(m, " correct incorrect % "
237 seq_printf(m, " Function " 236 " Function "
238 " File Line\n" 237 " File Line\n"
239 " ------- --------- - " 238 " ------- --------- - "
240 " -------- " 239 " -------- "
241 " ---- ----\n"); 240 " ---- ----\n");
242 return 0; 241 return 0;
243} 242}
244 243
@@ -274,7 +273,7 @@ static int branch_stat_show(struct seq_file *m, void *v)
274 273
275 seq_printf(m, "%8lu %8lu ", p->correct, p->incorrect); 274 seq_printf(m, "%8lu %8lu ", p->correct, p->incorrect);
276 if (percent < 0) 275 if (percent < 0)
277 seq_printf(m, " X "); 276 seq_puts(m, " X ");
278 else 277 else
279 seq_printf(m, "%3ld ", percent); 278 seq_printf(m, "%3ld ", percent);
280 seq_printf(m, "%-30.30s %-20.20s %d\n", p->func, f, p->line); 279 seq_printf(m, "%-30.30s %-20.20s %d\n", p->func, f, p->line);
@@ -362,12 +361,12 @@ extern unsigned long __stop_branch_profile[];
362 361
363static int all_branch_stat_headers(struct seq_file *m) 362static int all_branch_stat_headers(struct seq_file *m)
364{ 363{
365 seq_printf(m, " miss hit %% "); 364 seq_puts(m, " miss hit % "
366 seq_printf(m, " Function " 365 " Function "
367 " File Line\n" 366 " File Line\n"
368 " ------- --------- - " 367 " ------- --------- - "
369 " -------- " 368 " -------- "
370 " ---- ----\n"); 369 " ---- ----\n");
371 return 0; 370 return 0;
372} 371}
373 372
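
The header consolidation in trace_branch.c relies on C's compile-time concatenation of adjacent string literals: a run of seq_printf() calls that carry no format specifiers collapses into a single seq_puts(), which also means '%' no longer has to be escaped as "%%". A small before/after sketch with hypothetical header text:

/* Before: one formatted call per fragment, '%' escaped for printf. */
seq_printf(m, " correct incorrect %% ");
seq_printf(m, " Function File Line\n");

/* After: adjacent literals merge into one string at compile time and
 * seq_puts() copies it without parsing a format.
 */
seq_puts(m, " correct incorrect % "
            " Function File Line\n");
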
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 0cc51edde3a8..d0e4f92b5eb6 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -461,7 +461,7 @@ static void remove_event_file_dir(struct ftrace_event_file *file)
461 461
462 if (dir) { 462 if (dir) {
463 spin_lock(&dir->d_lock); /* probably unneeded */ 463 spin_lock(&dir->d_lock); /* probably unneeded */
464 list_for_each_entry(child, &dir->d_subdirs, d_u.d_child) { 464 list_for_each_entry(child, &dir->d_subdirs, d_child) {
465 if (child->d_inode) /* probably unneeded */ 465 if (child->d_inode) /* probably unneeded */
466 child->d_inode->i_private = NULL; 466 child->d_inode->i_private = NULL;
467 } 467 }
@@ -918,7 +918,7 @@ static int f_show(struct seq_file *m, void *v)
918 case FORMAT_HEADER: 918 case FORMAT_HEADER:
919 seq_printf(m, "name: %s\n", ftrace_event_name(call)); 919 seq_printf(m, "name: %s\n", ftrace_event_name(call));
920 seq_printf(m, "ID: %d\n", call->event.type); 920 seq_printf(m, "ID: %d\n", call->event.type);
921 seq_printf(m, "format:\n"); 921 seq_puts(m, "format:\n");
922 return 0; 922 return 0;
923 923
924 case FORMAT_FIELD_SEPERATOR: 924 case FORMAT_FIELD_SEPERATOR:
@@ -1044,7 +1044,8 @@ event_filter_read(struct file *filp, char __user *ubuf, size_t cnt,
1044 mutex_unlock(&event_mutex); 1044 mutex_unlock(&event_mutex);
1045 1045
1046 if (file) 1046 if (file)
1047 r = simple_read_from_buffer(ubuf, cnt, ppos, s->buffer, s->len); 1047 r = simple_read_from_buffer(ubuf, cnt, ppos,
1048 s->buffer, trace_seq_used(s));
1048 1049
1049 kfree(s); 1050 kfree(s);
1050 1051
@@ -1210,7 +1211,8 @@ subsystem_filter_read(struct file *filp, char __user *ubuf, size_t cnt,
1210 trace_seq_init(s); 1211 trace_seq_init(s);
1211 1212
1212 print_subsystem_event_filter(system, s); 1213 print_subsystem_event_filter(system, s);
1213 r = simple_read_from_buffer(ubuf, cnt, ppos, s->buffer, s->len); 1214 r = simple_read_from_buffer(ubuf, cnt, ppos,
1215 s->buffer, trace_seq_used(s));
1214 1216
1215 kfree(s); 1217 kfree(s);
1216 1218
@@ -1265,7 +1267,8 @@ show_header(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos)
1265 trace_seq_init(s); 1267 trace_seq_init(s);
1266 1268
1267 func(s); 1269 func(s);
1268 r = simple_read_from_buffer(ubuf, cnt, ppos, s->buffer, s->len); 1270 r = simple_read_from_buffer(ubuf, cnt, ppos,
1271 s->buffer, trace_seq_used(s));
1269 1272
1270 kfree(s); 1273 kfree(s);
1271 1274
@@ -1988,7 +1991,7 @@ event_enable_print(struct seq_file *m, unsigned long ip,
1988 ftrace_event_name(data->file->event_call)); 1991 ftrace_event_name(data->file->event_call));
1989 1992
1990 if (data->count == -1) 1993 if (data->count == -1)
1991 seq_printf(m, ":unlimited\n"); 1994 seq_puts(m, ":unlimited\n");
1992 else 1995 else
1993 seq_printf(m, ":count=%ld\n", data->count); 1996 seq_printf(m, ":count=%ld\n", data->count);
1994 1997
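
The reads above stop passing the raw length field to simple_read_from_buffer() and use trace_seq_used() instead. The point, as this series presents it, is that once a trace_seq has overflowed its length can no longer be trusted to stay within the buffer, so anything that copies the buffer out should ask for a clamped count. A sketch of the calling pattern (example_show_read() is hypothetical; the helper itself is defined with the trace_seq code, not here):

/* Copy a filled trace_seq to user space without trusting the raw length. */
static ssize_t example_show_read(struct trace_seq *s, char __user *ubuf,
                                 size_t cnt, loff_t *ppos)
{
        /* trace_seq_used() never reports more than the buffer holds,
         * even if the seq overflowed while it was being filled.
         */
        return simple_read_from_buffer(ubuf, cnt, ppos,
                                       s->buffer, trace_seq_used(s));
}
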
diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c
index 7a8c1528e141..ced69da0ff55 100644
--- a/kernel/trace/trace_events_filter.c
+++ b/kernel/trace/trace_events_filter.c
@@ -45,6 +45,7 @@ enum filter_op_ids
45 OP_GT, 45 OP_GT,
46 OP_GE, 46 OP_GE,
47 OP_BAND, 47 OP_BAND,
48 OP_NOT,
48 OP_NONE, 49 OP_NONE,
49 OP_OPEN_PAREN, 50 OP_OPEN_PAREN,
50}; 51};
@@ -67,6 +68,7 @@ static struct filter_op filter_ops[] = {
67 { OP_GT, ">", 5 }, 68 { OP_GT, ">", 5 },
68 { OP_GE, ">=", 5 }, 69 { OP_GE, ">=", 5 },
69 { OP_BAND, "&", 6 }, 70 { OP_BAND, "&", 6 },
71 { OP_NOT, "!", 6 },
70 { OP_NONE, "OP_NONE", 0 }, 72 { OP_NONE, "OP_NONE", 0 },
71 { OP_OPEN_PAREN, "(", 0 }, 73 { OP_OPEN_PAREN, "(", 0 },
72}; 74};
@@ -85,6 +87,7 @@ enum {
85 FILT_ERR_MISSING_FIELD, 87 FILT_ERR_MISSING_FIELD,
86 FILT_ERR_INVALID_FILTER, 88 FILT_ERR_INVALID_FILTER,
87 FILT_ERR_IP_FIELD_ONLY, 89 FILT_ERR_IP_FIELD_ONLY,
90 FILT_ERR_ILLEGAL_NOT_OP,
88}; 91};
89 92
90static char *err_text[] = { 93static char *err_text[] = {
@@ -101,6 +104,7 @@ static char *err_text[] = {
101 "Missing field name and/or value", 104 "Missing field name and/or value",
102 "Meaningless filter expression", 105 "Meaningless filter expression",
103 "Only 'ip' field is supported for function trace", 106 "Only 'ip' field is supported for function trace",
107 "Illegal use of '!'",
104}; 108};
105 109
106struct opstack_op { 110struct opstack_op {
@@ -139,6 +143,7 @@ struct pred_stack {
139 int index; 143 int index;
140}; 144};
141 145
146/* A hit is when the double negation of the result (!!match) equals the negation of the 'not' flag */
142#define DEFINE_COMPARISON_PRED(type) \ 147#define DEFINE_COMPARISON_PRED(type) \
143static int filter_pred_##type(struct filter_pred *pred, void *event) \ 148static int filter_pred_##type(struct filter_pred *pred, void *event) \
144{ \ 149{ \
@@ -166,7 +171,7 @@ static int filter_pred_##type(struct filter_pred *pred, void *event) \
166 break; \ 171 break; \
167 } \ 172 } \
168 \ 173 \
169 return match; \ 174 return !!match == !pred->not; \
170} 175}
171 176
172#define DEFINE_EQUALITY_PRED(size) \ 177#define DEFINE_EQUALITY_PRED(size) \
@@ -484,9 +489,10 @@ static int process_ops(struct filter_pred *preds,
484 if (!WARN_ON_ONCE(!pred->fn)) 489 if (!WARN_ON_ONCE(!pred->fn))
485 match = pred->fn(pred, rec); 490 match = pred->fn(pred, rec);
486 if (!!match == type) 491 if (!!match == type)
487 return match; 492 break;
488 } 493 }
489 return match; 494 /* A hit is when the double negation of the result (!!match) equals the negation of the 'not' flag */
495 return !!match == !op->not;
490} 496}
491 497
492struct filter_match_preds_data { 498struct filter_match_preds_data {
@@ -735,10 +741,10 @@ static int filter_set_pred(struct event_filter *filter,
735 * then this op can be folded. 741 * then this op can be folded.
736 */ 742 */
737 if (left->index & FILTER_PRED_FOLD && 743 if (left->index & FILTER_PRED_FOLD &&
738 (left->op == dest->op || 744 ((left->op == dest->op && !left->not) ||
739 left->left == FILTER_PRED_INVALID) && 745 left->left == FILTER_PRED_INVALID) &&
740 right->index & FILTER_PRED_FOLD && 746 right->index & FILTER_PRED_FOLD &&
741 (right->op == dest->op || 747 ((right->op == dest->op && !right->not) ||
742 right->left == FILTER_PRED_INVALID)) 748 right->left == FILTER_PRED_INVALID))
743 dest->index |= FILTER_PRED_FOLD; 749 dest->index |= FILTER_PRED_FOLD;
744 750
@@ -1028,7 +1034,7 @@ static int init_pred(struct filter_parse_state *ps,
1028 } 1034 }
1029 1035
1030 if (pred->op == OP_NE) 1036 if (pred->op == OP_NE)
1031 pred->not = 1; 1037 pred->not ^= 1;
1032 1038
1033 pred->fn = fn; 1039 pred->fn = fn;
1034 return 0; 1040 return 0;
@@ -1590,6 +1596,17 @@ static int replace_preds(struct ftrace_event_call *call,
1590 continue; 1596 continue;
1591 } 1597 }
1592 1598
1599 if (elt->op == OP_NOT) {
1600 if (!n_preds || operand1 || operand2) {
1601 parse_error(ps, FILT_ERR_ILLEGAL_NOT_OP, 0);
1602 err = -EINVAL;
1603 goto fail;
1604 }
1605 if (!dry_run)
1606 filter->preds[n_preds - 1].not ^= 1;
1607 continue;
1608 }
1609
1593 if (WARN_ON(n_preds++ == MAX_FILTER_PRED)) { 1610 if (WARN_ON(n_preds++ == MAX_FILTER_PRED)) {
1594 parse_error(ps, FILT_ERR_TOO_MANY_PREDS, 0); 1611 parse_error(ps, FILT_ERR_TOO_MANY_PREDS, 0);
1595 err = -ENOSPC; 1612 err = -ENOSPC;
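
The OP_NOT handling added above folds a leading '!' into the predicate that follows it by toggling that predicate's 'not' flag, and the comparison predicates then decide a hit with the double-negation test !!match == !not. A tiny stand-alone check of that identity (plain userspace C, independent of the tracing code):

#include <stdio.h>

/* Mirrors the expression added to DEFINE_COMPARISON_PRED(): the
 * predicate is a hit when the normalized raw result (!!match)
 * equals the complement of the 'not' flag (!not).
 */
static int pred_hit(int match, int not)
{
        return !!match == !not;
}

int main(void)
{
        printf("%d\n", pred_hit(5, 0)); /* true result, no '!'  -> 1 (hit)  */
        printf("%d\n", pred_hit(0, 1)); /* false result, '!'    -> 1 (hit)  */
        printf("%d\n", pred_hit(3, 1)); /* true result, '!'     -> 0 (miss) */
        return 0;
}
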
diff --git a/kernel/trace/trace_events_trigger.c b/kernel/trace/trace_events_trigger.c
index 4747b476a030..8712df9decb4 100644
--- a/kernel/trace/trace_events_trigger.c
+++ b/kernel/trace/trace_events_trigger.c
@@ -373,7 +373,7 @@ event_trigger_print(const char *name, struct seq_file *m,
373{ 373{
374 long count = (long)data; 374 long count = (long)data;
375 375
376 seq_printf(m, "%s", name); 376 seq_puts(m, name);
377 377
378 if (count == -1) 378 if (count == -1)
379 seq_puts(m, ":unlimited"); 379 seq_puts(m, ":unlimited");
@@ -383,7 +383,7 @@ event_trigger_print(const char *name, struct seq_file *m,
383 if (filter_str) 383 if (filter_str)
384 seq_printf(m, " if %s\n", filter_str); 384 seq_printf(m, " if %s\n", filter_str);
385 else 385 else
386 seq_puts(m, "\n"); 386 seq_putc(m, '\n');
387 387
388 return 0; 388 return 0;
389} 389}
@@ -1105,7 +1105,7 @@ event_enable_trigger_print(struct seq_file *m, struct event_trigger_ops *ops,
1105 if (data->filter_str) 1105 if (data->filter_str)
1106 seq_printf(m, " if %s\n", data->filter_str); 1106 seq_printf(m, " if %s\n", data->filter_str);
1107 else 1107 else
1108 seq_puts(m, "\n"); 1108 seq_putc(m, '\n');
1109 1109
1110 return 0; 1110 return 0;
1111} 1111}
diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c
index 57f0ec962d2c..fcd41a166405 100644
--- a/kernel/trace/trace_functions.c
+++ b/kernel/trace/trace_functions.c
@@ -261,37 +261,74 @@ static struct tracer function_trace __tracer_data =
261}; 261};
262 262
263#ifdef CONFIG_DYNAMIC_FTRACE 263#ifdef CONFIG_DYNAMIC_FTRACE
264static int update_count(void **data) 264static void update_traceon_count(void **data, bool on)
265{ 265{
266 unsigned long *count = (long *)data; 266 long *count = (long *)data;
267 long old_count = *count;
267 268
268 if (!*count) 269 /*
269 return 0; 270 * Tracing gets disabled (or enabled) once per count.
271 * This function can be called at the same time on multiple CPUs.
272 * It is fine if both disable (or enable) tracing, as disabling
273 * (or enabling) the second time doesn't do anything as the
274 * state of the tracer is already disabled (or enabled).
275 * What needs to be synchronized in this case is that the count
276 * only gets decremented once, even if the tracer is disabled
277 * (or enabled) twice, as the second one is really a nop.
278 *
279 * The memory barriers guarantee that we only decrement the
280 * counter once. First the count is read to a local variable
281 * and a read barrier is used to make sure that it is loaded
282 * before checking if the tracer is in the state we want.
283 * If the tracer is not in the state we want, then the count
284 * is guaranteed to be the old count.
285 *
286 * Next the tracer is set to the state we want (disabled or enabled)
287 * then a write memory barrier is used to make sure that
288 * the new state is visible before changing the counter by
289 * one minus the old counter. This guarantees that another CPU
290 * executing this code will see the new state before seeing
291 * the new counter value, and would not do anything if the new
292 * counter is seen.
293 *
294 * Note, there is no synchronization between this and a user
295 * setting the tracing_on file. But we currently don't care
296 * about that.
297 */
298 if (!old_count)
299 return;
270 300
271 if (*count != -1) 301 /* Make sure we see count before checking tracing state */
272 (*count)--; 302 smp_rmb();
273 303
274 return 1; 304 if (on == !!tracing_is_on())
305 return;
306
307 if (on)
308 tracing_on();
309 else
310 tracing_off();
311
312 /* unlimited? */
313 if (old_count == -1)
314 return;
315
316 /* Make sure tracing state is visible before updating count */
317 smp_wmb();
318
319 *count = old_count - 1;
275} 320}
276 321
277static void 322static void
278ftrace_traceon_count(unsigned long ip, unsigned long parent_ip, void **data) 323ftrace_traceon_count(unsigned long ip, unsigned long parent_ip, void **data)
279{ 324{
280 if (tracing_is_on()) 325 update_traceon_count(data, 1);
281 return;
282
283 if (update_count(data))
284 tracing_on();
285} 326}
286 327
287static void 328static void
288ftrace_traceoff_count(unsigned long ip, unsigned long parent_ip, void **data) 329ftrace_traceoff_count(unsigned long ip, unsigned long parent_ip, void **data)
289{ 330{
290 if (!tracing_is_on()) 331 update_traceon_count(data, 0);
291 return;
292
293 if (update_count(data))
294 tracing_off();
295} 332}
296 333
297static void 334static void
@@ -330,11 +367,49 @@ ftrace_stacktrace(unsigned long ip, unsigned long parent_ip, void **data)
330static void 367static void
331ftrace_stacktrace_count(unsigned long ip, unsigned long parent_ip, void **data) 368ftrace_stacktrace_count(unsigned long ip, unsigned long parent_ip, void **data)
332{ 369{
333 if (!tracing_is_on()) 370 long *count = (long *)data;
334 return; 371 long old_count;
372 long new_count;
335 373
336 if (update_count(data)) 374 /*
337 trace_dump_stack(STACK_SKIP); 375 * Stack traces should only execute the number of times the
376 * user specified in the counter.
377 */
378 do {
379
380 if (!tracing_is_on())
381 return;
382
383 old_count = *count;
384
385 if (!old_count)
386 return;
387
388 /* unlimited? */
389 if (old_count == -1) {
390 trace_dump_stack(STACK_SKIP);
391 return;
392 }
393
394 new_count = old_count - 1;
395 new_count = cmpxchg(count, old_count, new_count);
396 if (new_count == old_count)
397 trace_dump_stack(STACK_SKIP);
398
399 } while (new_count != old_count);
400}
401
402static int update_count(void **data)
403{
404 unsigned long *count = (long *)data;
405
406 if (!*count)
407 return 0;
408
409 if (*count != -1)
410 (*count)--;
411
412 return 1;
338} 413}
339 414
340static void 415static void
@@ -361,7 +436,7 @@ ftrace_probe_print(const char *name, struct seq_file *m,
361 seq_printf(m, "%ps:%s", (void *)ip, name); 436 seq_printf(m, "%ps:%s", (void *)ip, name);
362 437
363 if (count == -1) 438 if (count == -1)
364 seq_printf(m, ":unlimited\n"); 439 seq_puts(m, ":unlimited\n");
365 else 440 else
366 seq_printf(m, ":count=%ld\n", count); 441 seq_printf(m, ":count=%ld\n", count);
367 442
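
Two counting schemes replace the old update_count() probe above: update_traceon_count() orders its reads and writes with smp_rmb()/smp_wmb() so the count is decremented only once per toggle, and ftrace_stacktrace_count() uses a cmpxchg() loop so that, when several CPUs hit the probe at once, only the CPU whose compare-and-swap wins both consumes a unit of the count and dumps the stack. Stripped of the tracing specifics, the cmpxchg pattern is roughly the sketch below (consume_and_act() and do_action() are made-up names; the real probe also treats -1 as unlimited):

#include <linux/atomic.h>       /* cmpxchg() */

static void do_action(void)
{
        /* stands in for trace_dump_stack(STACK_SKIP) in the real probe */
}

/* Lockless "consume one token, act exactly once" loop. */
static void consume_and_act(long *count)
{
        long old, new;

        do {
                old = *count;
                if (!old)
                        return;         /* budget exhausted, do nothing */
                new = old - 1;
                /* cmpxchg() returns the value it found; if that is still
                 * 'old', this CPU owns the decrement and may act once.
                 */
        } while (cmpxchg(count, old, new) != old);

        do_action();
}
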
diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c
index f0a0c982cde3..ba476009e5de 100644
--- a/kernel/trace/trace_functions_graph.c
+++ b/kernel/trace/trace_functions_graph.c
@@ -107,7 +107,7 @@ enum {
107 FLAGS_FILL_END = 3 << TRACE_GRAPH_PRINT_FILL_SHIFT, 107 FLAGS_FILL_END = 3 << TRACE_GRAPH_PRINT_FILL_SHIFT,
108}; 108};
109 109
110static enum print_line_t 110static void
111print_graph_duration(unsigned long long duration, struct trace_seq *s, 111print_graph_duration(unsigned long long duration, struct trace_seq *s,
112 u32 flags); 112 u32 flags);
113 113
@@ -483,33 +483,24 @@ static int graph_trace_update_thresh(struct trace_array *tr)
483 483
484static int max_bytes_for_cpu; 484static int max_bytes_for_cpu;
485 485
486static enum print_line_t 486static void print_graph_cpu(struct trace_seq *s, int cpu)
487print_graph_cpu(struct trace_seq *s, int cpu)
488{ 487{
489 int ret;
490
491 /* 488 /*
492 * Start with a space character - to make it stand out 489 * Start with a space character - to make it stand out
493 * to the right a bit when trace output is pasted into 490 * to the right a bit when trace output is pasted into
494 * email: 491 * email:
495 */ 492 */
496 ret = trace_seq_printf(s, " %*d) ", max_bytes_for_cpu, cpu); 493 trace_seq_printf(s, " %*d) ", max_bytes_for_cpu, cpu);
497 if (!ret)
498 return TRACE_TYPE_PARTIAL_LINE;
499
500 return TRACE_TYPE_HANDLED;
501} 494}
502 495
503#define TRACE_GRAPH_PROCINFO_LENGTH 14 496#define TRACE_GRAPH_PROCINFO_LENGTH 14
504 497
505static enum print_line_t 498static void print_graph_proc(struct trace_seq *s, pid_t pid)
506print_graph_proc(struct trace_seq *s, pid_t pid)
507{ 499{
508 char comm[TASK_COMM_LEN]; 500 char comm[TASK_COMM_LEN];
509 /* sign + log10(MAX_INT) + '\0' */ 501 /* sign + log10(MAX_INT) + '\0' */
510 char pid_str[11]; 502 char pid_str[11];
511 int spaces = 0; 503 int spaces = 0;
512 int ret;
513 int len; 504 int len;
514 int i; 505 int i;
515 506
@@ -524,56 +515,43 @@ print_graph_proc(struct trace_seq *s, pid_t pid)
524 spaces = TRACE_GRAPH_PROCINFO_LENGTH - len; 515 spaces = TRACE_GRAPH_PROCINFO_LENGTH - len;
525 516
526 /* First spaces to align center */ 517 /* First spaces to align center */
527 for (i = 0; i < spaces / 2; i++) { 518 for (i = 0; i < spaces / 2; i++)
528 ret = trace_seq_putc(s, ' '); 519 trace_seq_putc(s, ' ');
529 if (!ret)
530 return TRACE_TYPE_PARTIAL_LINE;
531 }
532 520
533 ret = trace_seq_printf(s, "%s-%s", comm, pid_str); 521 trace_seq_printf(s, "%s-%s", comm, pid_str);
534 if (!ret)
535 return TRACE_TYPE_PARTIAL_LINE;
536 522
537 /* Last spaces to align center */ 523 /* Last spaces to align center */
538 for (i = 0; i < spaces - (spaces / 2); i++) { 524 for (i = 0; i < spaces - (spaces / 2); i++)
539 ret = trace_seq_putc(s, ' '); 525 trace_seq_putc(s, ' ');
540 if (!ret)
541 return TRACE_TYPE_PARTIAL_LINE;
542 }
543 return TRACE_TYPE_HANDLED;
544} 526}
545 527
546 528
547static enum print_line_t 529static void print_graph_lat_fmt(struct trace_seq *s, struct trace_entry *entry)
548print_graph_lat_fmt(struct trace_seq *s, struct trace_entry *entry)
549{ 530{
550 if (!trace_seq_putc(s, ' ')) 531 trace_seq_putc(s, ' ');
551 return 0; 532 trace_print_lat_fmt(s, entry);
552
553 return trace_print_lat_fmt(s, entry);
554} 533}
555 534
556/* If the pid changed since the last trace, output this event */ 535/* If the pid changed since the last trace, output this event */
557static enum print_line_t 536static void
558verif_pid(struct trace_seq *s, pid_t pid, int cpu, struct fgraph_data *data) 537verif_pid(struct trace_seq *s, pid_t pid, int cpu, struct fgraph_data *data)
559{ 538{
560 pid_t prev_pid; 539 pid_t prev_pid;
561 pid_t *last_pid; 540 pid_t *last_pid;
562 int ret;
563 541
564 if (!data) 542 if (!data)
565 return TRACE_TYPE_HANDLED; 543 return;
566 544
567 last_pid = &(per_cpu_ptr(data->cpu_data, cpu)->last_pid); 545 last_pid = &(per_cpu_ptr(data->cpu_data, cpu)->last_pid);
568 546
569 if (*last_pid == pid) 547 if (*last_pid == pid)
570 return TRACE_TYPE_HANDLED; 548 return;
571 549
572 prev_pid = *last_pid; 550 prev_pid = *last_pid;
573 *last_pid = pid; 551 *last_pid = pid;
574 552
575 if (prev_pid == -1) 553 if (prev_pid == -1)
576 return TRACE_TYPE_HANDLED; 554 return;
577/* 555/*
578 * Context-switch trace line: 556 * Context-switch trace line:
579 557
@@ -582,33 +560,12 @@ verif_pid(struct trace_seq *s, pid_t pid, int cpu, struct fgraph_data *data)
582 ------------------------------------------ 560 ------------------------------------------
583 561
584 */ 562 */
585 ret = trace_seq_puts(s, 563 trace_seq_puts(s, " ------------------------------------------\n");
586 " ------------------------------------------\n"); 564 print_graph_cpu(s, cpu);
587 if (!ret) 565 print_graph_proc(s, prev_pid);
588 return TRACE_TYPE_PARTIAL_LINE; 566 trace_seq_puts(s, " => ");
589 567 print_graph_proc(s, pid);
590 ret = print_graph_cpu(s, cpu); 568 trace_seq_puts(s, "\n ------------------------------------------\n\n");
591 if (ret == TRACE_TYPE_PARTIAL_LINE)
592 return TRACE_TYPE_PARTIAL_LINE;
593
594 ret = print_graph_proc(s, prev_pid);
595 if (ret == TRACE_TYPE_PARTIAL_LINE)
596 return TRACE_TYPE_PARTIAL_LINE;
597
598 ret = trace_seq_puts(s, " => ");
599 if (!ret)
600 return TRACE_TYPE_PARTIAL_LINE;
601
602 ret = print_graph_proc(s, pid);
603 if (ret == TRACE_TYPE_PARTIAL_LINE)
604 return TRACE_TYPE_PARTIAL_LINE;
605
606 ret = trace_seq_puts(s,
607 "\n ------------------------------------------\n\n");
608 if (!ret)
609 return TRACE_TYPE_PARTIAL_LINE;
610
611 return TRACE_TYPE_HANDLED;
612} 569}
613 570
614static struct ftrace_graph_ret_entry * 571static struct ftrace_graph_ret_entry *
@@ -682,175 +639,122 @@ get_return_for_leaf(struct trace_iterator *iter,
682 return next; 639 return next;
683} 640}
684 641
685static int print_graph_abs_time(u64 t, struct trace_seq *s) 642static void print_graph_abs_time(u64 t, struct trace_seq *s)
686{ 643{
687 unsigned long usecs_rem; 644 unsigned long usecs_rem;
688 645
689 usecs_rem = do_div(t, NSEC_PER_SEC); 646 usecs_rem = do_div(t, NSEC_PER_SEC);
690 usecs_rem /= 1000; 647 usecs_rem /= 1000;
691 648
692 return trace_seq_printf(s, "%5lu.%06lu | ", 649 trace_seq_printf(s, "%5lu.%06lu | ",
693 (unsigned long)t, usecs_rem); 650 (unsigned long)t, usecs_rem);
694} 651}
695 652
696static enum print_line_t 653static void
697print_graph_irq(struct trace_iterator *iter, unsigned long addr, 654print_graph_irq(struct trace_iterator *iter, unsigned long addr,
698 enum trace_type type, int cpu, pid_t pid, u32 flags) 655 enum trace_type type, int cpu, pid_t pid, u32 flags)
699{ 656{
700 int ret;
701 struct trace_seq *s = &iter->seq; 657 struct trace_seq *s = &iter->seq;
658 struct trace_entry *ent = iter->ent;
702 659
703 if (addr < (unsigned long)__irqentry_text_start || 660 if (addr < (unsigned long)__irqentry_text_start ||
704 addr >= (unsigned long)__irqentry_text_end) 661 addr >= (unsigned long)__irqentry_text_end)
705 return TRACE_TYPE_UNHANDLED; 662 return;
706 663
707 if (trace_flags & TRACE_ITER_CONTEXT_INFO) { 664 if (trace_flags & TRACE_ITER_CONTEXT_INFO) {
708 /* Absolute time */ 665 /* Absolute time */
709 if (flags & TRACE_GRAPH_PRINT_ABS_TIME) { 666 if (flags & TRACE_GRAPH_PRINT_ABS_TIME)
710 ret = print_graph_abs_time(iter->ts, s); 667 print_graph_abs_time(iter->ts, s);
711 if (!ret)
712 return TRACE_TYPE_PARTIAL_LINE;
713 }
714 668
715 /* Cpu */ 669 /* Cpu */
716 if (flags & TRACE_GRAPH_PRINT_CPU) { 670 if (flags & TRACE_GRAPH_PRINT_CPU)
717 ret = print_graph_cpu(s, cpu); 671 print_graph_cpu(s, cpu);
718 if (ret == TRACE_TYPE_PARTIAL_LINE)
719 return TRACE_TYPE_PARTIAL_LINE;
720 }
721 672
722 /* Proc */ 673 /* Proc */
723 if (flags & TRACE_GRAPH_PRINT_PROC) { 674 if (flags & TRACE_GRAPH_PRINT_PROC) {
724 ret = print_graph_proc(s, pid); 675 print_graph_proc(s, pid);
725 if (ret == TRACE_TYPE_PARTIAL_LINE) 676 trace_seq_puts(s, " | ");
726 return TRACE_TYPE_PARTIAL_LINE;
727 ret = trace_seq_puts(s, " | ");
728 if (!ret)
729 return TRACE_TYPE_PARTIAL_LINE;
730 } 677 }
678
679 /* Latency format */
680 if (trace_flags & TRACE_ITER_LATENCY_FMT)
681 print_graph_lat_fmt(s, ent);
731 } 682 }
732 683
733 /* No overhead */ 684 /* No overhead */
734 ret = print_graph_duration(0, s, flags | FLAGS_FILL_START); 685 print_graph_duration(0, s, flags | FLAGS_FILL_START);
735 if (ret != TRACE_TYPE_HANDLED)
736 return ret;
737 686
738 if (type == TRACE_GRAPH_ENT) 687 if (type == TRACE_GRAPH_ENT)
739 ret = trace_seq_puts(s, "==========>"); 688 trace_seq_puts(s, "==========>");
740 else 689 else
741 ret = trace_seq_puts(s, "<=========="); 690 trace_seq_puts(s, "<==========");
742
743 if (!ret)
744 return TRACE_TYPE_PARTIAL_LINE;
745
746 ret = print_graph_duration(0, s, flags | FLAGS_FILL_END);
747 if (ret != TRACE_TYPE_HANDLED)
748 return ret;
749
750 ret = trace_seq_putc(s, '\n');
751 691
752 if (!ret) 692 print_graph_duration(0, s, flags | FLAGS_FILL_END);
753 return TRACE_TYPE_PARTIAL_LINE; 693 trace_seq_putc(s, '\n');
754 return TRACE_TYPE_HANDLED;
755} 694}
756 695
757enum print_line_t 696void
758trace_print_graph_duration(unsigned long long duration, struct trace_seq *s) 697trace_print_graph_duration(unsigned long long duration, struct trace_seq *s)
759{ 698{
760 unsigned long nsecs_rem = do_div(duration, 1000); 699 unsigned long nsecs_rem = do_div(duration, 1000);
761 /* log10(ULONG_MAX) + '\0' */ 700 /* log10(ULONG_MAX) + '\0' */
762 char msecs_str[21]; 701 char usecs_str[21];
763 char nsecs_str[5]; 702 char nsecs_str[5];
764 int ret, len; 703 int len;
765 int i; 704 int i;
766 705
767 sprintf(msecs_str, "%lu", (unsigned long) duration); 706 sprintf(usecs_str, "%lu", (unsigned long) duration);
768 707
769 /* Print msecs */ 708 /* Print msecs */
770 ret = trace_seq_printf(s, "%s", msecs_str); 709 trace_seq_printf(s, "%s", usecs_str);
771 if (!ret)
772 return TRACE_TYPE_PARTIAL_LINE;
773 710
774 len = strlen(msecs_str); 711 len = strlen(usecs_str);
775 712
776 /* Print nsecs (we don't want to exceed 7 numbers) */ 713 /* Print nsecs (we don't want to exceed 7 numbers) */
777 if (len < 7) { 714 if (len < 7) {
778 size_t slen = min_t(size_t, sizeof(nsecs_str), 8UL - len); 715 size_t slen = min_t(size_t, sizeof(nsecs_str), 8UL - len);
779 716
780 snprintf(nsecs_str, slen, "%03lu", nsecs_rem); 717 snprintf(nsecs_str, slen, "%03lu", nsecs_rem);
781 ret = trace_seq_printf(s, ".%s", nsecs_str); 718 trace_seq_printf(s, ".%s", nsecs_str);
782 if (!ret)
783 return TRACE_TYPE_PARTIAL_LINE;
784 len += strlen(nsecs_str); 719 len += strlen(nsecs_str);
785 } 720 }
786 721
787 ret = trace_seq_puts(s, " us "); 722 trace_seq_puts(s, " us ");
788 if (!ret)
789 return TRACE_TYPE_PARTIAL_LINE;
790 723
791 /* Print remaining spaces to fit the row's width */ 724 /* Print remaining spaces to fit the row's width */
792 for (i = len; i < 7; i++) { 725 for (i = len; i < 7; i++)
793 ret = trace_seq_putc(s, ' '); 726 trace_seq_putc(s, ' ');
794 if (!ret)
795 return TRACE_TYPE_PARTIAL_LINE;
796 }
797 return TRACE_TYPE_HANDLED;
798} 727}
799 728
800static enum print_line_t 729static void
801print_graph_duration(unsigned long long duration, struct trace_seq *s, 730print_graph_duration(unsigned long long duration, struct trace_seq *s,
802 u32 flags) 731 u32 flags)
803{ 732{
804 int ret = -1;
805
806 if (!(flags & TRACE_GRAPH_PRINT_DURATION) || 733 if (!(flags & TRACE_GRAPH_PRINT_DURATION) ||
807 !(trace_flags & TRACE_ITER_CONTEXT_INFO)) 734 !(trace_flags & TRACE_ITER_CONTEXT_INFO))
808 return TRACE_TYPE_HANDLED; 735 return;
809 736
810 /* No real data, just filling the column with spaces */ 737 /* No real data, just filling the column with spaces */
811 switch (flags & TRACE_GRAPH_PRINT_FILL_MASK) { 738 switch (flags & TRACE_GRAPH_PRINT_FILL_MASK) {
812 case FLAGS_FILL_FULL: 739 case FLAGS_FILL_FULL:
813 ret = trace_seq_puts(s, " | "); 740 trace_seq_puts(s, " | ");
814 return ret ? TRACE_TYPE_HANDLED : TRACE_TYPE_PARTIAL_LINE; 741 return;
815 case FLAGS_FILL_START: 742 case FLAGS_FILL_START:
816 ret = trace_seq_puts(s, " "); 743 trace_seq_puts(s, " ");
817 return ret ? TRACE_TYPE_HANDLED : TRACE_TYPE_PARTIAL_LINE; 744 return;
818 case FLAGS_FILL_END: 745 case FLAGS_FILL_END:
819 ret = trace_seq_puts(s, " |"); 746 trace_seq_puts(s, " |");
820 return ret ? TRACE_TYPE_HANDLED : TRACE_TYPE_PARTIAL_LINE; 747 return;
821 } 748 }
822 749
823 /* Signal an execution-time overhead in the output */ 750 /* Signal an execution-time overhead in the output */
824 if (flags & TRACE_GRAPH_PRINT_OVERHEAD) { 751 if (flags & TRACE_GRAPH_PRINT_OVERHEAD)
825 /* Duration exceeded 100 usecs */ 752 trace_seq_printf(s, "%c ", trace_find_mark(duration));
826 if (duration > 100000ULL) 753 else
827 ret = trace_seq_puts(s, "! "); 754 trace_seq_puts(s, " ");
828 /* Duration exceeded 10 usecs */
829 else if (duration > 10000ULL)
830 ret = trace_seq_puts(s, "+ ");
831 }
832
833 /*
834 * The -1 means we either did not exceed the duration thresholds
835 * or we don't want to print out the overhead. Either way we need
836 * to fill out the space.
837 */
838 if (ret == -1)
839 ret = trace_seq_puts(s, " ");
840
841 /* Catch here any failure that happened above */
842 if (!ret)
843 return TRACE_TYPE_PARTIAL_LINE;
844
845 ret = trace_print_graph_duration(duration, s);
846 if (ret != TRACE_TYPE_HANDLED)
847 return ret;
848
849 ret = trace_seq_puts(s, "| ");
850 if (!ret)
851 return TRACE_TYPE_PARTIAL_LINE;
852 755
853 return TRACE_TYPE_HANDLED; 756 trace_print_graph_duration(duration, s);
757 trace_seq_puts(s, "| ");
854} 758}
855 759
856/* Case of a leaf function on its call entry */ 760/* Case of a leaf function on its call entry */
@@ -864,7 +768,6 @@ print_graph_entry_leaf(struct trace_iterator *iter,
864 struct ftrace_graph_ret *graph_ret; 768 struct ftrace_graph_ret *graph_ret;
865 struct ftrace_graph_ent *call; 769 struct ftrace_graph_ent *call;
866 unsigned long long duration; 770 unsigned long long duration;
867 int ret;
868 int i; 771 int i;
869 772
870 graph_ret = &ret_entry->ret; 773 graph_ret = &ret_entry->ret;
@@ -890,22 +793,15 @@ print_graph_entry_leaf(struct trace_iterator *iter,
890 } 793 }
891 794
892 /* Overhead and duration */ 795 /* Overhead and duration */
893 ret = print_graph_duration(duration, s, flags); 796 print_graph_duration(duration, s, flags);
894 if (ret == TRACE_TYPE_PARTIAL_LINE)
895 return TRACE_TYPE_PARTIAL_LINE;
896 797
897 /* Function */ 798 /* Function */
898 for (i = 0; i < call->depth * TRACE_GRAPH_INDENT; i++) { 799 for (i = 0; i < call->depth * TRACE_GRAPH_INDENT; i++)
899 ret = trace_seq_putc(s, ' '); 800 trace_seq_putc(s, ' ');
900 if (!ret)
901 return TRACE_TYPE_PARTIAL_LINE;
902 }
903 801
904 ret = trace_seq_printf(s, "%ps();\n", (void *)call->func); 802 trace_seq_printf(s, "%ps();\n", (void *)call->func);
905 if (!ret)
906 return TRACE_TYPE_PARTIAL_LINE;
907 803
908 return TRACE_TYPE_HANDLED; 804 return trace_handle_return(s);
909} 805}
910 806
911static enum print_line_t 807static enum print_line_t
@@ -915,7 +811,6 @@ print_graph_entry_nested(struct trace_iterator *iter,
915{ 811{
916 struct ftrace_graph_ent *call = &entry->graph_ent; 812 struct ftrace_graph_ent *call = &entry->graph_ent;
917 struct fgraph_data *data = iter->private; 813 struct fgraph_data *data = iter->private;
918 int ret;
919 int i; 814 int i;
920 815
921 if (data) { 816 if (data) {
@@ -931,19 +826,15 @@ print_graph_entry_nested(struct trace_iterator *iter,
931 } 826 }
932 827
933 /* No time */ 828 /* No time */
934 ret = print_graph_duration(0, s, flags | FLAGS_FILL_FULL); 829 print_graph_duration(0, s, flags | FLAGS_FILL_FULL);
935 if (ret != TRACE_TYPE_HANDLED)
936 return ret;
937 830
938 /* Function */ 831 /* Function */
939 for (i = 0; i < call->depth * TRACE_GRAPH_INDENT; i++) { 832 for (i = 0; i < call->depth * TRACE_GRAPH_INDENT; i++)
940 ret = trace_seq_putc(s, ' '); 833 trace_seq_putc(s, ' ');
941 if (!ret) 834
942 return TRACE_TYPE_PARTIAL_LINE; 835 trace_seq_printf(s, "%ps() {\n", (void *)call->func);
943 }
944 836
945 ret = trace_seq_printf(s, "%ps() {\n", (void *)call->func); 837 if (trace_seq_has_overflowed(s))
946 if (!ret)
947 return TRACE_TYPE_PARTIAL_LINE; 838 return TRACE_TYPE_PARTIAL_LINE;
948 839
949 /* 840 /*
@@ -953,62 +844,43 @@ print_graph_entry_nested(struct trace_iterator *iter,
953 return TRACE_TYPE_NO_CONSUME; 844 return TRACE_TYPE_NO_CONSUME;
954} 845}
955 846
956static enum print_line_t 847static void
957print_graph_prologue(struct trace_iterator *iter, struct trace_seq *s, 848print_graph_prologue(struct trace_iterator *iter, struct trace_seq *s,
958 int type, unsigned long addr, u32 flags) 849 int type, unsigned long addr, u32 flags)
959{ 850{
960 struct fgraph_data *data = iter->private; 851 struct fgraph_data *data = iter->private;
961 struct trace_entry *ent = iter->ent; 852 struct trace_entry *ent = iter->ent;
962 int cpu = iter->cpu; 853 int cpu = iter->cpu;
963 int ret;
964 854
965 /* Pid */ 855 /* Pid */
966 if (verif_pid(s, ent->pid, cpu, data) == TRACE_TYPE_PARTIAL_LINE) 856 verif_pid(s, ent->pid, cpu, data);
967 return TRACE_TYPE_PARTIAL_LINE;
968 857
969 if (type) { 858 if (type)
970 /* Interrupt */ 859 /* Interrupt */
971 ret = print_graph_irq(iter, addr, type, cpu, ent->pid, flags); 860 print_graph_irq(iter, addr, type, cpu, ent->pid, flags);
972 if (ret == TRACE_TYPE_PARTIAL_LINE)
973 return TRACE_TYPE_PARTIAL_LINE;
974 }
975 861
976 if (!(trace_flags & TRACE_ITER_CONTEXT_INFO)) 862 if (!(trace_flags & TRACE_ITER_CONTEXT_INFO))
977 return 0; 863 return;
978 864
979 /* Absolute time */ 865 /* Absolute time */
980 if (flags & TRACE_GRAPH_PRINT_ABS_TIME) { 866 if (flags & TRACE_GRAPH_PRINT_ABS_TIME)
981 ret = print_graph_abs_time(iter->ts, s); 867 print_graph_abs_time(iter->ts, s);
982 if (!ret)
983 return TRACE_TYPE_PARTIAL_LINE;
984 }
985 868
986 /* Cpu */ 869 /* Cpu */
987 if (flags & TRACE_GRAPH_PRINT_CPU) { 870 if (flags & TRACE_GRAPH_PRINT_CPU)
988 ret = print_graph_cpu(s, cpu); 871 print_graph_cpu(s, cpu);
989 if (ret == TRACE_TYPE_PARTIAL_LINE)
990 return TRACE_TYPE_PARTIAL_LINE;
991 }
992 872
993 /* Proc */ 873 /* Proc */
994 if (flags & TRACE_GRAPH_PRINT_PROC) { 874 if (flags & TRACE_GRAPH_PRINT_PROC) {
995 ret = print_graph_proc(s, ent->pid); 875 print_graph_proc(s, ent->pid);
996 if (ret == TRACE_TYPE_PARTIAL_LINE) 876 trace_seq_puts(s, " | ");
997 return TRACE_TYPE_PARTIAL_LINE;
998
999 ret = trace_seq_puts(s, " | ");
1000 if (!ret)
1001 return TRACE_TYPE_PARTIAL_LINE;
1002 } 877 }
1003 878
1004 /* Latency format */ 879 /* Latency format */
1005 if (trace_flags & TRACE_ITER_LATENCY_FMT) { 880 if (trace_flags & TRACE_ITER_LATENCY_FMT)
1006 ret = print_graph_lat_fmt(s, ent); 881 print_graph_lat_fmt(s, ent);
1007 if (ret == TRACE_TYPE_PARTIAL_LINE)
1008 return TRACE_TYPE_PARTIAL_LINE;
1009 }
1010 882
1011 return 0; 883 return;
1012} 884}
1013 885
1014/* 886/*
@@ -1126,8 +998,7 @@ print_graph_entry(struct ftrace_graph_ent_entry *field, struct trace_seq *s,
1126 if (check_irq_entry(iter, flags, call->func, call->depth)) 998 if (check_irq_entry(iter, flags, call->func, call->depth))
1127 return TRACE_TYPE_HANDLED; 999 return TRACE_TYPE_HANDLED;
1128 1000
1129 if (print_graph_prologue(iter, s, TRACE_GRAPH_ENT, call->func, flags)) 1001 print_graph_prologue(iter, s, TRACE_GRAPH_ENT, call->func, flags);
1130 return TRACE_TYPE_PARTIAL_LINE;
1131 1002
1132 leaf_ret = get_return_for_leaf(iter, field); 1003 leaf_ret = get_return_for_leaf(iter, field);
1133 if (leaf_ret) 1004 if (leaf_ret)
@@ -1160,7 +1031,6 @@ print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s,
1160 pid_t pid = ent->pid; 1031 pid_t pid = ent->pid;
1161 int cpu = iter->cpu; 1032 int cpu = iter->cpu;
1162 int func_match = 1; 1033 int func_match = 1;
1163 int ret;
1164 int i; 1034 int i;
1165 1035
1166 if (check_irq_return(iter, flags, trace->depth)) 1036 if (check_irq_return(iter, flags, trace->depth))
@@ -1186,20 +1056,14 @@ print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s,
1186 } 1056 }
1187 } 1057 }
1188 1058
1189 if (print_graph_prologue(iter, s, 0, 0, flags)) 1059 print_graph_prologue(iter, s, 0, 0, flags);
1190 return TRACE_TYPE_PARTIAL_LINE;
1191 1060
1192 /* Overhead and duration */ 1061 /* Overhead and duration */
1193 ret = print_graph_duration(duration, s, flags); 1062 print_graph_duration(duration, s, flags);
1194 if (ret == TRACE_TYPE_PARTIAL_LINE)
1195 return TRACE_TYPE_PARTIAL_LINE;
1196 1063
1197 /* Closing brace */ 1064 /* Closing brace */
1198 for (i = 0; i < trace->depth * TRACE_GRAPH_INDENT; i++) { 1065 for (i = 0; i < trace->depth * TRACE_GRAPH_INDENT; i++)
1199 ret = trace_seq_putc(s, ' '); 1066 trace_seq_putc(s, ' ');
1200 if (!ret)
1201 return TRACE_TYPE_PARTIAL_LINE;
1202 }
1203 1067
1204 /* 1068 /*
1205 * If the return function does not have a matching entry, 1069 * If the return function does not have a matching entry,
@@ -1208,30 +1072,20 @@ print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s,
1208 * belongs to, write out the function name. Always do 1072 * belongs to, write out the function name. Always do
1209 * that if the funcgraph-tail option is enabled. 1073 * that if the funcgraph-tail option is enabled.
1210 */ 1074 */
1211 if (func_match && !(flags & TRACE_GRAPH_PRINT_TAIL)) { 1075 if (func_match && !(flags & TRACE_GRAPH_PRINT_TAIL))
1212 ret = trace_seq_puts(s, "}\n"); 1076 trace_seq_puts(s, "}\n");
1213 if (!ret) 1077 else
1214 return TRACE_TYPE_PARTIAL_LINE; 1078 trace_seq_printf(s, "} /* %ps */\n", (void *)trace->func);
1215 } else {
1216 ret = trace_seq_printf(s, "} /* %ps */\n", (void *)trace->func);
1217 if (!ret)
1218 return TRACE_TYPE_PARTIAL_LINE;
1219 }
1220 1079
1221 /* Overrun */ 1080 /* Overrun */
1222 if (flags & TRACE_GRAPH_PRINT_OVERRUN) { 1081 if (flags & TRACE_GRAPH_PRINT_OVERRUN)
1223 ret = trace_seq_printf(s, " (Overruns: %lu)\n", 1082 trace_seq_printf(s, " (Overruns: %lu)\n",
1224 trace->overrun); 1083 trace->overrun);
1225 if (!ret)
1226 return TRACE_TYPE_PARTIAL_LINE;
1227 }
1228 1084
1229 ret = print_graph_irq(iter, trace->func, TRACE_GRAPH_RET, 1085 print_graph_irq(iter, trace->func, TRACE_GRAPH_RET,
1230 cpu, pid, flags); 1086 cpu, pid, flags);
1231 if (ret == TRACE_TYPE_PARTIAL_LINE)
1232 return TRACE_TYPE_PARTIAL_LINE;
1233 1087
1234 return TRACE_TYPE_HANDLED; 1088 return trace_handle_return(s);
1235} 1089}
1236 1090
1237static enum print_line_t 1091static enum print_line_t
@@ -1248,26 +1102,18 @@ print_graph_comment(struct trace_seq *s, struct trace_entry *ent,
1248 if (data) 1102 if (data)
1249 depth = per_cpu_ptr(data->cpu_data, iter->cpu)->depth; 1103 depth = per_cpu_ptr(data->cpu_data, iter->cpu)->depth;
1250 1104
1251 if (print_graph_prologue(iter, s, 0, 0, flags)) 1105 print_graph_prologue(iter, s, 0, 0, flags);
1252 return TRACE_TYPE_PARTIAL_LINE;
1253 1106
1254 /* No time */ 1107 /* No time */
1255 ret = print_graph_duration(0, s, flags | FLAGS_FILL_FULL); 1108 print_graph_duration(0, s, flags | FLAGS_FILL_FULL);
1256 if (ret != TRACE_TYPE_HANDLED)
1257 return ret;
1258 1109
1259 /* Indentation */ 1110 /* Indentation */
1260 if (depth > 0) 1111 if (depth > 0)
1261 for (i = 0; i < (depth + 1) * TRACE_GRAPH_INDENT; i++) { 1112 for (i = 0; i < (depth + 1) * TRACE_GRAPH_INDENT; i++)
1262 ret = trace_seq_putc(s, ' '); 1113 trace_seq_putc(s, ' ');
1263 if (!ret)
1264 return TRACE_TYPE_PARTIAL_LINE;
1265 }
1266 1114
1267 /* The comment */ 1115 /* The comment */
1268 ret = trace_seq_puts(s, "/* "); 1116 trace_seq_puts(s, "/* ");
1269 if (!ret)
1270 return TRACE_TYPE_PARTIAL_LINE;
1271 1117
1272 switch (iter->ent->type) { 1118 switch (iter->ent->type) {
1273 case TRACE_BPRINT: 1119 case TRACE_BPRINT:
@@ -1290,17 +1136,18 @@ print_graph_comment(struct trace_seq *s, struct trace_entry *ent,
1290 return ret; 1136 return ret;
1291 } 1137 }
1292 1138
1139 if (trace_seq_has_overflowed(s))
1140 goto out;
1141
1293 /* Strip ending newline */ 1142 /* Strip ending newline */
1294 if (s->buffer[s->len - 1] == '\n') { 1143 if (s->buffer[s->seq.len - 1] == '\n') {
1295 s->buffer[s->len - 1] = '\0'; 1144 s->buffer[s->seq.len - 1] = '\0';
1296 s->len--; 1145 s->seq.len--;
1297 } 1146 }
1298 1147
1299 ret = trace_seq_puts(s, " */\n"); 1148 trace_seq_puts(s, " */\n");
1300 if (!ret) 1149 out:
1301 return TRACE_TYPE_PARTIAL_LINE; 1150 return trace_handle_return(s);
1302
1303 return TRACE_TYPE_HANDLED;
1304} 1151}
1305 1152
1306 1153
@@ -1407,32 +1254,32 @@ static void __print_graph_headers_flags(struct seq_file *s, u32 flags)
1407 print_lat_header(s, flags); 1254 print_lat_header(s, flags);
1408 1255
1409 /* 1st line */ 1256 /* 1st line */
1410 seq_printf(s, "#"); 1257 seq_putc(s, '#');
1411 if (flags & TRACE_GRAPH_PRINT_ABS_TIME) 1258 if (flags & TRACE_GRAPH_PRINT_ABS_TIME)
1412 seq_printf(s, " TIME "); 1259 seq_puts(s, " TIME ");
1413 if (flags & TRACE_GRAPH_PRINT_CPU) 1260 if (flags & TRACE_GRAPH_PRINT_CPU)
1414 seq_printf(s, " CPU"); 1261 seq_puts(s, " CPU");
1415 if (flags & TRACE_GRAPH_PRINT_PROC) 1262 if (flags & TRACE_GRAPH_PRINT_PROC)
1416 seq_printf(s, " TASK/PID "); 1263 seq_puts(s, " TASK/PID ");
1417 if (lat) 1264 if (lat)
1418 seq_printf(s, "||||"); 1265 seq_puts(s, "||||");
1419 if (flags & TRACE_GRAPH_PRINT_DURATION) 1266 if (flags & TRACE_GRAPH_PRINT_DURATION)
1420 seq_printf(s, " DURATION "); 1267 seq_puts(s, " DURATION ");
1421 seq_printf(s, " FUNCTION CALLS\n"); 1268 seq_puts(s, " FUNCTION CALLS\n");
1422 1269
1423 /* 2nd line */ 1270 /* 2nd line */
1424 seq_printf(s, "#"); 1271 seq_putc(s, '#');
1425 if (flags & TRACE_GRAPH_PRINT_ABS_TIME) 1272 if (flags & TRACE_GRAPH_PRINT_ABS_TIME)
1426 seq_printf(s, " | "); 1273 seq_puts(s, " | ");
1427 if (flags & TRACE_GRAPH_PRINT_CPU) 1274 if (flags & TRACE_GRAPH_PRINT_CPU)
1428 seq_printf(s, " | "); 1275 seq_puts(s, " | ");
1429 if (flags & TRACE_GRAPH_PRINT_PROC) 1276 if (flags & TRACE_GRAPH_PRINT_PROC)
1430 seq_printf(s, " | | "); 1277 seq_puts(s, " | | ");
1431 if (lat) 1278 if (lat)
1432 seq_printf(s, "||||"); 1279 seq_puts(s, "||||");
1433 if (flags & TRACE_GRAPH_PRINT_DURATION) 1280 if (flags & TRACE_GRAPH_PRINT_DURATION)
1434 seq_printf(s, " | | "); 1281 seq_puts(s, " | | ");
1435 seq_printf(s, " | | | |\n"); 1282 seq_puts(s, " | | | |\n");
1436} 1283}
1437 1284
1438static void print_graph_headers(struct seq_file *s) 1285static void print_graph_headers(struct seq_file *s)
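The function-graph hunks above follow the pattern used throughout this series: the trace_seq writers no longer return a length, so the per-call "if (!ret) return TRACE_TYPE_PARTIAL_LINE" checks disappear and each output callback simply ends with trace_handle_return(). As a rough sketch of what that helper does (it is a static inline in the trace headers; shown here under that assumption):

static inline enum print_line_t trace_handle_return(struct trace_seq *s)
{
        return trace_seq_has_overflowed(s) ?
                TRACE_TYPE_PARTIAL_LINE : TRACE_TYPE_HANDLED;
}

A printer can therefore issue any number of trace_seq_printf()/trace_seq_puts() calls unconditionally; once the buffer fills, later writes become no-ops and the single check at the end reports the partial line.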
diff --git a/kernel/trace/trace_kdb.c b/kernel/trace/trace_kdb.c
index bd90e1b06088..b0b1c44e923a 100644
--- a/kernel/trace/trace_kdb.c
+++ b/kernel/trace/trace_kdb.c
@@ -20,10 +20,12 @@ static void ftrace_dump_buf(int skip_lines, long cpu_file)
20{ 20{
21 /* use static because iter can be a bit big for the stack */ 21 /* use static because iter can be a bit big for the stack */
22 static struct trace_iterator iter; 22 static struct trace_iterator iter;
23 static struct ring_buffer_iter *buffer_iter[CONFIG_NR_CPUS];
23 unsigned int old_userobj; 24 unsigned int old_userobj;
24 int cnt = 0, cpu; 25 int cnt = 0, cpu;
25 26
26 trace_init_global_iter(&iter); 27 trace_init_global_iter(&iter);
28 iter.buffer_iter = buffer_iter;
27 29
28 for_each_tracing_cpu(cpu) { 30 for_each_tracing_cpu(cpu) {
29 atomic_inc(&per_cpu_ptr(iter.trace_buffer->data, cpu)->disabled); 31 atomic_inc(&per_cpu_ptr(iter.trace_buffer->data, cpu)->disabled);
@@ -57,19 +59,19 @@ static void ftrace_dump_buf(int skip_lines, long cpu_file)
57 ring_buffer_read_start(iter.buffer_iter[cpu_file]); 59 ring_buffer_read_start(iter.buffer_iter[cpu_file]);
58 tracing_iter_reset(&iter, cpu_file); 60 tracing_iter_reset(&iter, cpu_file);
59 } 61 }
60 if (!trace_empty(&iter)) 62
61 trace_find_next_entry_inc(&iter); 63 while (trace_find_next_entry_inc(&iter)) {
62 while (!trace_empty(&iter)) {
63 if (!cnt) 64 if (!cnt)
64 kdb_printf("---------------------------------\n"); 65 kdb_printf("---------------------------------\n");
65 cnt++; 66 cnt++;
66 67
67 if (trace_find_next_entry_inc(&iter) != NULL && !skip_lines) 68 if (!skip_lines) {
68 print_trace_line(&iter); 69 print_trace_line(&iter);
69 if (!skip_lines)
70 trace_printk_seq(&iter.seq); 70 trace_printk_seq(&iter.seq);
71 else 71 } else {
72 skip_lines--; 72 skip_lines--;
73 }
74
73 if (KDB_FLAG(CMD_INTERRUPT)) 75 if (KDB_FLAG(CMD_INTERRUPT))
74 goto out; 76 goto out;
75 } 77 }
@@ -86,9 +88,12 @@ out:
86 atomic_dec(&per_cpu_ptr(iter.trace_buffer->data, cpu)->disabled); 88 atomic_dec(&per_cpu_ptr(iter.trace_buffer->data, cpu)->disabled);
87 } 89 }
88 90
89 for_each_tracing_cpu(cpu) 91 for_each_tracing_cpu(cpu) {
90 if (iter.buffer_iter[cpu]) 92 if (iter.buffer_iter[cpu]) {
91 ring_buffer_read_finish(iter.buffer_iter[cpu]); 93 ring_buffer_read_finish(iter.buffer_iter[cpu]);
94 iter.buffer_iter[cpu] = NULL;
95 }
96 }
92} 97}
93 98
94/* 99/*
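The kdb dump above also gains a local buffer_iter[] array and a simpler loop: the old code tested trace_empty() and then advanced the iterator again inside the body, which could consume an entry without printing it when skip_lines was in use. A condensed sketch of the new shape (dump_buf_sketch is a hypothetical name; the per-CPU disable/enable and ring-buffer start/finish steps are elided, as in the hunk they bracket the loop):

static void dump_buf_sketch(int skip_lines)
{
        /* static: the iterator and its per-CPU cursors are too big for the stack */
        static struct trace_iterator iter;
        static struct ring_buffer_iter *buffer_iter[CONFIG_NR_CPUS];

        trace_init_global_iter(&iter);
        iter.buffer_iter = buffer_iter;

        /* ... disable per-CPU tracing, ring_buffer_read_start() ... */

        while (trace_find_next_entry_inc(&iter)) {
                if (!skip_lines) {
                        print_trace_line(&iter);
                        trace_printk_seq(&iter.seq);
                } else {
                        skip_lines--;
                }
        }

        /* ... re-enable tracing, ring_buffer_read_finish(), clear buffer_iter[cpu] ... */
}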
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index 282f6e4e5539..5edb518be345 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -826,7 +826,7 @@ static int probes_seq_show(struct seq_file *m, void *v)
826 struct trace_kprobe *tk = v; 826 struct trace_kprobe *tk = v;
827 int i; 827 int i;
828 828
829 seq_printf(m, "%c", trace_kprobe_is_return(tk) ? 'r' : 'p'); 829 seq_putc(m, trace_kprobe_is_return(tk) ? 'r' : 'p');
830 seq_printf(m, ":%s/%s", tk->tp.call.class->system, 830 seq_printf(m, ":%s/%s", tk->tp.call.class->system,
831 ftrace_event_name(&tk->tp.call)); 831 ftrace_event_name(&tk->tp.call));
832 832
@@ -840,7 +840,7 @@ static int probes_seq_show(struct seq_file *m, void *v)
840 840
841 for (i = 0; i < tk->tp.nr_args; i++) 841 for (i = 0; i < tk->tp.nr_args; i++)
842 seq_printf(m, " %s=%s", tk->tp.args[i].name, tk->tp.args[i].comm); 842 seq_printf(m, " %s=%s", tk->tp.args[i].name, tk->tp.args[i].comm);
843 seq_printf(m, "\n"); 843 seq_putc(m, '\n');
844 844
845 return 0; 845 return 0;
846} 846}
@@ -1024,27 +1024,22 @@ print_kprobe_event(struct trace_iterator *iter, int flags,
1024 field = (struct kprobe_trace_entry_head *)iter->ent; 1024 field = (struct kprobe_trace_entry_head *)iter->ent;
1025 tp = container_of(event, struct trace_probe, call.event); 1025 tp = container_of(event, struct trace_probe, call.event);
1026 1026
1027 if (!trace_seq_printf(s, "%s: (", ftrace_event_name(&tp->call))) 1027 trace_seq_printf(s, "%s: (", ftrace_event_name(&tp->call));
1028 goto partial;
1029 1028
1030 if (!seq_print_ip_sym(s, field->ip, flags | TRACE_ITER_SYM_OFFSET)) 1029 if (!seq_print_ip_sym(s, field->ip, flags | TRACE_ITER_SYM_OFFSET))
1031 goto partial; 1030 goto out;
1032 1031
1033 if (!trace_seq_puts(s, ")")) 1032 trace_seq_putc(s, ')');
1034 goto partial;
1035 1033
1036 data = (u8 *)&field[1]; 1034 data = (u8 *)&field[1];
1037 for (i = 0; i < tp->nr_args; i++) 1035 for (i = 0; i < tp->nr_args; i++)
1038 if (!tp->args[i].type->print(s, tp->args[i].name, 1036 if (!tp->args[i].type->print(s, tp->args[i].name,
1039 data + tp->args[i].offset, field)) 1037 data + tp->args[i].offset, field))
1040 goto partial; 1038 goto out;
1041
1042 if (!trace_seq_puts(s, "\n"))
1043 goto partial;
1044 1039
1045 return TRACE_TYPE_HANDLED; 1040 trace_seq_putc(s, '\n');
1046partial: 1041 out:
1047 return TRACE_TYPE_PARTIAL_LINE; 1042 return trace_handle_return(s);
1048} 1043}
1049 1044
1050static enum print_line_t 1045static enum print_line_t
@@ -1060,33 +1055,28 @@ print_kretprobe_event(struct trace_iterator *iter, int flags,
1060 field = (struct kretprobe_trace_entry_head *)iter->ent; 1055 field = (struct kretprobe_trace_entry_head *)iter->ent;
1061 tp = container_of(event, struct trace_probe, call.event); 1056 tp = container_of(event, struct trace_probe, call.event);
1062 1057
1063 if (!trace_seq_printf(s, "%s: (", ftrace_event_name(&tp->call))) 1058 trace_seq_printf(s, "%s: (", ftrace_event_name(&tp->call));
1064 goto partial;
1065 1059
1066 if (!seq_print_ip_sym(s, field->ret_ip, flags | TRACE_ITER_SYM_OFFSET)) 1060 if (!seq_print_ip_sym(s, field->ret_ip, flags | TRACE_ITER_SYM_OFFSET))
1067 goto partial; 1061 goto out;
1068 1062
1069 if (!trace_seq_puts(s, " <- ")) 1063 trace_seq_puts(s, " <- ");
1070 goto partial;
1071 1064
1072 if (!seq_print_ip_sym(s, field->func, flags & ~TRACE_ITER_SYM_OFFSET)) 1065 if (!seq_print_ip_sym(s, field->func, flags & ~TRACE_ITER_SYM_OFFSET))
1073 goto partial; 1066 goto out;
1074 1067
1075 if (!trace_seq_puts(s, ")")) 1068 trace_seq_putc(s, ')');
1076 goto partial;
1077 1069
1078 data = (u8 *)&field[1]; 1070 data = (u8 *)&field[1];
1079 for (i = 0; i < tp->nr_args; i++) 1071 for (i = 0; i < tp->nr_args; i++)
1080 if (!tp->args[i].type->print(s, tp->args[i].name, 1072 if (!tp->args[i].type->print(s, tp->args[i].name,
1081 data + tp->args[i].offset, field)) 1073 data + tp->args[i].offset, field))
1082 goto partial; 1074 goto out;
1083 1075
1084 if (!trace_seq_puts(s, "\n")) 1076 trace_seq_putc(s, '\n');
1085 goto partial;
1086 1077
1087 return TRACE_TYPE_HANDLED; 1078 out:
1088partial: 1079 return trace_handle_return(s);
1089 return TRACE_TYPE_PARTIAL_LINE;
1090} 1080}
1091 1081
1092 1082
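The kprobe and kretprobe printers keep one early-exit path: seq_print_ip_sym() and the per-argument type printers still return a boolean, so a failed helper jumps to a single out: label and trace_handle_return() decides the final status there. Condensing the hunk above into one sketch (print_probe_sketch is a hypothetical name; the fields are those of the kprobe entry):

static enum print_line_t
print_probe_sketch(struct trace_iterator *iter, struct trace_probe *tp,
                   struct kprobe_trace_entry_head *field, int flags)
{
        struct trace_seq *s = &iter->seq;
        u8 *data;
        int i;

        /* plain writes: nothing to check per call */
        trace_seq_printf(s, "%s: (", ftrace_event_name(&tp->call));

        /* helpers that can fail still return a boolean for early exit */
        if (!seq_print_ip_sym(s, field->ip, flags | TRACE_ITER_SYM_OFFSET))
                goto out;

        trace_seq_putc(s, ')');

        data = (u8 *)&field[1];
        for (i = 0; i < tp->nr_args; i++)
                if (!tp->args[i].type->print(s, tp->args[i].name,
                                             data + tp->args[i].offset, field))
                        goto out;

        trace_seq_putc(s, '\n');
 out:
        return trace_handle_return(s);
}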
diff --git a/kernel/trace/trace_mmiotrace.c b/kernel/trace/trace_mmiotrace.c
index 0abd9b863474..7a9ba62e9fef 100644
--- a/kernel/trace/trace_mmiotrace.c
+++ b/kernel/trace/trace_mmiotrace.c
@@ -59,17 +59,15 @@ static void mmio_trace_start(struct trace_array *tr)
59 mmio_reset_data(tr); 59 mmio_reset_data(tr);
60} 60}
61 61
62static int mmio_print_pcidev(struct trace_seq *s, const struct pci_dev *dev) 62static void mmio_print_pcidev(struct trace_seq *s, const struct pci_dev *dev)
63{ 63{
64 int ret = 0;
65 int i; 64 int i;
66 resource_size_t start, end; 65 resource_size_t start, end;
67 const struct pci_driver *drv = pci_dev_driver(dev); 66 const struct pci_driver *drv = pci_dev_driver(dev);
68 67
69 /* XXX: incomplete checks for trace_seq_printf() return value */ 68 trace_seq_printf(s, "PCIDEV %02x%02x %04x%04x %x",
70 ret += trace_seq_printf(s, "PCIDEV %02x%02x %04x%04x %x", 69 dev->bus->number, dev->devfn,
71 dev->bus->number, dev->devfn, 70 dev->vendor, dev->device, dev->irq);
72 dev->vendor, dev->device, dev->irq);
73 /* 71 /*
74 * XXX: is pci_resource_to_user() appropriate, since we are 72 * XXX: is pci_resource_to_user() appropriate, since we are
75 * supposed to interpret the __ioremap() phys_addr argument based on 73 * supposed to interpret the __ioremap() phys_addr argument based on
@@ -77,21 +75,20 @@ static int mmio_print_pcidev(struct trace_seq *s, const struct pci_dev *dev)
77 */ 75 */
78 for (i = 0; i < 7; i++) { 76 for (i = 0; i < 7; i++) {
79 pci_resource_to_user(dev, i, &dev->resource[i], &start, &end); 77 pci_resource_to_user(dev, i, &dev->resource[i], &start, &end);
80 ret += trace_seq_printf(s, " %llx", 78 trace_seq_printf(s, " %llx",
81 (unsigned long long)(start | 79 (unsigned long long)(start |
82 (dev->resource[i].flags & PCI_REGION_FLAG_MASK))); 80 (dev->resource[i].flags & PCI_REGION_FLAG_MASK)));
83 } 81 }
84 for (i = 0; i < 7; i++) { 82 for (i = 0; i < 7; i++) {
85 pci_resource_to_user(dev, i, &dev->resource[i], &start, &end); 83 pci_resource_to_user(dev, i, &dev->resource[i], &start, &end);
86 ret += trace_seq_printf(s, " %llx", 84 trace_seq_printf(s, " %llx",
87 dev->resource[i].start < dev->resource[i].end ? 85 dev->resource[i].start < dev->resource[i].end ?
88 (unsigned long long)(end - start) + 1 : 0); 86 (unsigned long long)(end - start) + 1 : 0);
89 } 87 }
90 if (drv) 88 if (drv)
91 ret += trace_seq_printf(s, " %s\n", drv->name); 89 trace_seq_printf(s, " %s\n", drv->name);
92 else 90 else
93 ret += trace_seq_puts(s, " \n"); 91 trace_seq_puts(s, " \n");
94 return ret;
95} 92}
96 93
97static void destroy_header_iter(struct header_iter *hiter) 94static void destroy_header_iter(struct header_iter *hiter)
@@ -179,28 +176,27 @@ static enum print_line_t mmio_print_rw(struct trace_iterator *iter)
179 unsigned long long t = ns2usecs(iter->ts); 176 unsigned long long t = ns2usecs(iter->ts);
180 unsigned long usec_rem = do_div(t, USEC_PER_SEC); 177 unsigned long usec_rem = do_div(t, USEC_PER_SEC);
181 unsigned secs = (unsigned long)t; 178 unsigned secs = (unsigned long)t;
182 int ret = 1;
183 179
184 trace_assign_type(field, entry); 180 trace_assign_type(field, entry);
185 rw = &field->rw; 181 rw = &field->rw;
186 182
187 switch (rw->opcode) { 183 switch (rw->opcode) {
188 case MMIO_READ: 184 case MMIO_READ:
189 ret = trace_seq_printf(s, 185 trace_seq_printf(s,
190 "R %d %u.%06lu %d 0x%llx 0x%lx 0x%lx %d\n", 186 "R %d %u.%06lu %d 0x%llx 0x%lx 0x%lx %d\n",
191 rw->width, secs, usec_rem, rw->map_id, 187 rw->width, secs, usec_rem, rw->map_id,
192 (unsigned long long)rw->phys, 188 (unsigned long long)rw->phys,
193 rw->value, rw->pc, 0); 189 rw->value, rw->pc, 0);
194 break; 190 break;
195 case MMIO_WRITE: 191 case MMIO_WRITE:
196 ret = trace_seq_printf(s, 192 trace_seq_printf(s,
197 "W %d %u.%06lu %d 0x%llx 0x%lx 0x%lx %d\n", 193 "W %d %u.%06lu %d 0x%llx 0x%lx 0x%lx %d\n",
198 rw->width, secs, usec_rem, rw->map_id, 194 rw->width, secs, usec_rem, rw->map_id,
199 (unsigned long long)rw->phys, 195 (unsigned long long)rw->phys,
200 rw->value, rw->pc, 0); 196 rw->value, rw->pc, 0);
201 break; 197 break;
202 case MMIO_UNKNOWN_OP: 198 case MMIO_UNKNOWN_OP:
203 ret = trace_seq_printf(s, 199 trace_seq_printf(s,
204 "UNKNOWN %u.%06lu %d 0x%llx %02lx,%02lx," 200 "UNKNOWN %u.%06lu %d 0x%llx %02lx,%02lx,"
205 "%02lx 0x%lx %d\n", 201 "%02lx 0x%lx %d\n",
206 secs, usec_rem, rw->map_id, 202 secs, usec_rem, rw->map_id,
@@ -209,12 +205,11 @@ static enum print_line_t mmio_print_rw(struct trace_iterator *iter)
209 (rw->value >> 0) & 0xff, rw->pc, 0); 205 (rw->value >> 0) & 0xff, rw->pc, 0);
210 break; 206 break;
211 default: 207 default:
212 ret = trace_seq_puts(s, "rw what?\n"); 208 trace_seq_puts(s, "rw what?\n");
213 break; 209 break;
214 } 210 }
215 if (ret) 211
216 return TRACE_TYPE_HANDLED; 212 return trace_handle_return(s);
217 return TRACE_TYPE_PARTIAL_LINE;
218} 213}
219 214
220static enum print_line_t mmio_print_map(struct trace_iterator *iter) 215static enum print_line_t mmio_print_map(struct trace_iterator *iter)
@@ -226,31 +221,29 @@ static enum print_line_t mmio_print_map(struct trace_iterator *iter)
226 unsigned long long t = ns2usecs(iter->ts); 221 unsigned long long t = ns2usecs(iter->ts);
227 unsigned long usec_rem = do_div(t, USEC_PER_SEC); 222 unsigned long usec_rem = do_div(t, USEC_PER_SEC);
228 unsigned secs = (unsigned long)t; 223 unsigned secs = (unsigned long)t;
229 int ret;
230 224
231 trace_assign_type(field, entry); 225 trace_assign_type(field, entry);
232 m = &field->map; 226 m = &field->map;
233 227
234 switch (m->opcode) { 228 switch (m->opcode) {
235 case MMIO_PROBE: 229 case MMIO_PROBE:
236 ret = trace_seq_printf(s, 230 trace_seq_printf(s,
237 "MAP %u.%06lu %d 0x%llx 0x%lx 0x%lx 0x%lx %d\n", 231 "MAP %u.%06lu %d 0x%llx 0x%lx 0x%lx 0x%lx %d\n",
238 secs, usec_rem, m->map_id, 232 secs, usec_rem, m->map_id,
239 (unsigned long long)m->phys, m->virt, m->len, 233 (unsigned long long)m->phys, m->virt, m->len,
240 0UL, 0); 234 0UL, 0);
241 break; 235 break;
242 case MMIO_UNPROBE: 236 case MMIO_UNPROBE:
243 ret = trace_seq_printf(s, 237 trace_seq_printf(s,
244 "UNMAP %u.%06lu %d 0x%lx %d\n", 238 "UNMAP %u.%06lu %d 0x%lx %d\n",
245 secs, usec_rem, m->map_id, 0UL, 0); 239 secs, usec_rem, m->map_id, 0UL, 0);
246 break; 240 break;
247 default: 241 default:
248 ret = trace_seq_puts(s, "map what?\n"); 242 trace_seq_puts(s, "map what?\n");
249 break; 243 break;
250 } 244 }
251 if (ret) 245
252 return TRACE_TYPE_HANDLED; 246 return trace_handle_return(s);
253 return TRACE_TYPE_PARTIAL_LINE;
254} 247}
255 248
256static enum print_line_t mmio_print_mark(struct trace_iterator *iter) 249static enum print_line_t mmio_print_mark(struct trace_iterator *iter)
@@ -262,14 +255,11 @@ static enum print_line_t mmio_print_mark(struct trace_iterator *iter)
262 unsigned long long t = ns2usecs(iter->ts); 255 unsigned long long t = ns2usecs(iter->ts);
263 unsigned long usec_rem = do_div(t, USEC_PER_SEC); 256 unsigned long usec_rem = do_div(t, USEC_PER_SEC);
264 unsigned secs = (unsigned long)t; 257 unsigned secs = (unsigned long)t;
265 int ret;
266 258
267 /* The trailing newline must be in the message. */ 259 /* The trailing newline must be in the message. */
268 ret = trace_seq_printf(s, "MARK %u.%06lu %s", secs, usec_rem, msg); 260 trace_seq_printf(s, "MARK %u.%06lu %s", secs, usec_rem, msg);
269 if (!ret)
270 return TRACE_TYPE_PARTIAL_LINE;
271 261
272 return TRACE_TYPE_HANDLED; 262 return trace_handle_return(s);
273} 263}
274 264
275static enum print_line_t mmio_print_line(struct trace_iterator *iter) 265static enum print_line_t mmio_print_line(struct trace_iterator *iter)
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index c6977d5a9b12..b77b9a697619 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -25,15 +25,12 @@ enum print_line_t trace_print_bputs_msg_only(struct trace_iterator *iter)
25 struct trace_seq *s = &iter->seq; 25 struct trace_seq *s = &iter->seq;
26 struct trace_entry *entry = iter->ent; 26 struct trace_entry *entry = iter->ent;
27 struct bputs_entry *field; 27 struct bputs_entry *field;
28 int ret;
29 28
30 trace_assign_type(field, entry); 29 trace_assign_type(field, entry);
31 30
32 ret = trace_seq_puts(s, field->str); 31 trace_seq_puts(s, field->str);
33 if (!ret)
34 return TRACE_TYPE_PARTIAL_LINE;
35 32
36 return TRACE_TYPE_HANDLED; 33 return trace_handle_return(s);
37} 34}
38 35
39enum print_line_t trace_print_bprintk_msg_only(struct trace_iterator *iter) 36enum print_line_t trace_print_bprintk_msg_only(struct trace_iterator *iter)
@@ -41,15 +38,12 @@ enum print_line_t trace_print_bprintk_msg_only(struct trace_iterator *iter)
41 struct trace_seq *s = &iter->seq; 38 struct trace_seq *s = &iter->seq;
42 struct trace_entry *entry = iter->ent; 39 struct trace_entry *entry = iter->ent;
43 struct bprint_entry *field; 40 struct bprint_entry *field;
44 int ret;
45 41
46 trace_assign_type(field, entry); 42 trace_assign_type(field, entry);
47 43
48 ret = trace_seq_bprintf(s, field->fmt, field->buf); 44 trace_seq_bprintf(s, field->fmt, field->buf);
49 if (!ret)
50 return TRACE_TYPE_PARTIAL_LINE;
51 45
52 return TRACE_TYPE_HANDLED; 46 return trace_handle_return(s);
53} 47}
54 48
55enum print_line_t trace_print_printk_msg_only(struct trace_iterator *iter) 49enum print_line_t trace_print_printk_msg_only(struct trace_iterator *iter)
@@ -57,15 +51,12 @@ enum print_line_t trace_print_printk_msg_only(struct trace_iterator *iter)
57 struct trace_seq *s = &iter->seq; 51 struct trace_seq *s = &iter->seq;
58 struct trace_entry *entry = iter->ent; 52 struct trace_entry *entry = iter->ent;
59 struct print_entry *field; 53 struct print_entry *field;
60 int ret;
61 54
62 trace_assign_type(field, entry); 55 trace_assign_type(field, entry);
63 56
64 ret = trace_seq_puts(s, field->buf); 57 trace_seq_puts(s, field->buf);
65 if (!ret)
66 return TRACE_TYPE_PARTIAL_LINE;
67 58
68 return TRACE_TYPE_HANDLED; 59 return trace_handle_return(s);
69} 60}
70 61
71const char * 62const char *
@@ -124,7 +115,7 @@ ftrace_print_symbols_seq(struct trace_seq *p, unsigned long val,
124 115
125 if (ret == (const char *)(trace_seq_buffer_ptr(p))) 116 if (ret == (const char *)(trace_seq_buffer_ptr(p)))
126 trace_seq_printf(p, "0x%lx", val); 117 trace_seq_printf(p, "0x%lx", val);
127 118
128 trace_seq_putc(p, 0); 119 trace_seq_putc(p, 0);
129 120
130 return ret; 121 return ret;
@@ -193,7 +184,6 @@ int ftrace_raw_output_prep(struct trace_iterator *iter,
193 struct trace_seq *s = &iter->seq; 184 struct trace_seq *s = &iter->seq;
194 struct trace_seq *p = &iter->tmp_seq; 185 struct trace_seq *p = &iter->tmp_seq;
195 struct trace_entry *entry; 186 struct trace_entry *entry;
196 int ret;
197 187
198 event = container_of(trace_event, struct ftrace_event_call, event); 188 event = container_of(trace_event, struct ftrace_event_call, event);
199 entry = iter->ent; 189 entry = iter->ent;
@@ -204,11 +194,9 @@ int ftrace_raw_output_prep(struct trace_iterator *iter,
204 } 194 }
205 195
206 trace_seq_init(p); 196 trace_seq_init(p);
207 ret = trace_seq_printf(s, "%s: ", ftrace_event_name(event)); 197 trace_seq_printf(s, "%s: ", ftrace_event_name(event));
208 if (!ret)
209 return TRACE_TYPE_PARTIAL_LINE;
210 198
211 return 0; 199 return trace_handle_return(s);
212} 200}
213EXPORT_SYMBOL(ftrace_raw_output_prep); 201EXPORT_SYMBOL(ftrace_raw_output_prep);
214 202
@@ -216,18 +204,11 @@ static int ftrace_output_raw(struct trace_iterator *iter, char *name,
216 char *fmt, va_list ap) 204 char *fmt, va_list ap)
217{ 205{
218 struct trace_seq *s = &iter->seq; 206 struct trace_seq *s = &iter->seq;
219 int ret;
220
221 ret = trace_seq_printf(s, "%s: ", name);
222 if (!ret)
223 return TRACE_TYPE_PARTIAL_LINE;
224
225 ret = trace_seq_vprintf(s, fmt, ap);
226 207
227 if (!ret) 208 trace_seq_printf(s, "%s: ", name);
228 return TRACE_TYPE_PARTIAL_LINE; 209 trace_seq_vprintf(s, fmt, ap);
229 210
230 return TRACE_TYPE_HANDLED; 211 return trace_handle_return(s);
231} 212}
232 213
233int ftrace_output_call(struct trace_iterator *iter, char *name, char *fmt, ...) 214int ftrace_output_call(struct trace_iterator *iter, char *name, char *fmt, ...)
@@ -260,7 +241,7 @@ static inline const char *kretprobed(const char *name)
260} 241}
261#endif /* CONFIG_KRETPROBES */ 242#endif /* CONFIG_KRETPROBES */
262 243
263static int 244static void
264seq_print_sym_short(struct trace_seq *s, const char *fmt, unsigned long address) 245seq_print_sym_short(struct trace_seq *s, const char *fmt, unsigned long address)
265{ 246{
266#ifdef CONFIG_KALLSYMS 247#ifdef CONFIG_KALLSYMS
@@ -271,12 +252,11 @@ seq_print_sym_short(struct trace_seq *s, const char *fmt, unsigned long address)
271 252
272 name = kretprobed(str); 253 name = kretprobed(str);
273 254
274 return trace_seq_printf(s, fmt, name); 255 trace_seq_printf(s, fmt, name);
275#endif 256#endif
276 return 1;
277} 257}
278 258
279static int 259static void
280seq_print_sym_offset(struct trace_seq *s, const char *fmt, 260seq_print_sym_offset(struct trace_seq *s, const char *fmt,
281 unsigned long address) 261 unsigned long address)
282{ 262{
@@ -287,9 +267,8 @@ seq_print_sym_offset(struct trace_seq *s, const char *fmt,
287 sprint_symbol(str, address); 267 sprint_symbol(str, address);
288 name = kretprobed(str); 268 name = kretprobed(str);
289 269
290 return trace_seq_printf(s, fmt, name); 270 trace_seq_printf(s, fmt, name);
291#endif 271#endif
292 return 1;
293} 272}
294 273
295#ifndef CONFIG_64BIT 274#ifndef CONFIG_64BIT
@@ -320,14 +299,14 @@ int seq_print_user_ip(struct trace_seq *s, struct mm_struct *mm,
320 if (file) { 299 if (file) {
321 ret = trace_seq_path(s, &file->f_path); 300 ret = trace_seq_path(s, &file->f_path);
322 if (ret) 301 if (ret)
323 ret = trace_seq_printf(s, "[+0x%lx]", 302 trace_seq_printf(s, "[+0x%lx]",
324 ip - vmstart); 303 ip - vmstart);
325 } 304 }
326 up_read(&mm->mmap_sem); 305 up_read(&mm->mmap_sem);
327 } 306 }
328 if (ret && ((sym_flags & TRACE_ITER_SYM_ADDR) || !file)) 307 if (ret && ((sym_flags & TRACE_ITER_SYM_ADDR) || !file))
329 ret = trace_seq_printf(s, " <" IP_FMT ">", ip); 308 trace_seq_printf(s, " <" IP_FMT ">", ip);
330 return ret; 309 return !trace_seq_has_overflowed(s);
331} 310}
332 311
333int 312int
@@ -335,7 +314,6 @@ seq_print_userip_objs(const struct userstack_entry *entry, struct trace_seq *s,
335 unsigned long sym_flags) 314 unsigned long sym_flags)
336{ 315{
337 struct mm_struct *mm = NULL; 316 struct mm_struct *mm = NULL;
338 int ret = 1;
339 unsigned int i; 317 unsigned int i;
340 318
341 if (trace_flags & TRACE_ITER_SYM_USEROBJ) { 319 if (trace_flags & TRACE_ITER_SYM_USEROBJ) {
@@ -354,48 +332,45 @@ seq_print_userip_objs(const struct userstack_entry *entry, struct trace_seq *s,
354 for (i = 0; i < FTRACE_STACK_ENTRIES; i++) { 332 for (i = 0; i < FTRACE_STACK_ENTRIES; i++) {
355 unsigned long ip = entry->caller[i]; 333 unsigned long ip = entry->caller[i];
356 334
357 if (ip == ULONG_MAX || !ret) 335 if (ip == ULONG_MAX || trace_seq_has_overflowed(s))
358 break; 336 break;
359 if (ret) 337
360 ret = trace_seq_puts(s, " => "); 338 trace_seq_puts(s, " => ");
339
361 if (!ip) { 340 if (!ip) {
362 if (ret) 341 trace_seq_puts(s, "??");
363 ret = trace_seq_puts(s, "??"); 342 trace_seq_putc(s, '\n');
364 if (ret)
365 ret = trace_seq_putc(s, '\n');
366 continue; 343 continue;
367 } 344 }
368 if (!ret) 345
369 break; 346 seq_print_user_ip(s, mm, ip, sym_flags);
370 if (ret) 347 trace_seq_putc(s, '\n');
371 ret = seq_print_user_ip(s, mm, ip, sym_flags);
372 ret = trace_seq_putc(s, '\n');
373 } 348 }
374 349
375 if (mm) 350 if (mm)
376 mmput(mm); 351 mmput(mm);
377 return ret; 352
353 return !trace_seq_has_overflowed(s);
378} 354}
379 355
380int 356int
381seq_print_ip_sym(struct trace_seq *s, unsigned long ip, unsigned long sym_flags) 357seq_print_ip_sym(struct trace_seq *s, unsigned long ip, unsigned long sym_flags)
382{ 358{
383 int ret; 359 if (!ip) {
384 360 trace_seq_putc(s, '0');
385 if (!ip) 361 goto out;
386 return trace_seq_putc(s, '0'); 362 }
387 363
388 if (sym_flags & TRACE_ITER_SYM_OFFSET) 364 if (sym_flags & TRACE_ITER_SYM_OFFSET)
389 ret = seq_print_sym_offset(s, "%s", ip); 365 seq_print_sym_offset(s, "%s", ip);
390 else 366 else
391 ret = seq_print_sym_short(s, "%s", ip); 367 seq_print_sym_short(s, "%s", ip);
392
393 if (!ret)
394 return 0;
395 368
396 if (sym_flags & TRACE_ITER_SYM_ADDR) 369 if (sym_flags & TRACE_ITER_SYM_ADDR)
397 ret = trace_seq_printf(s, " <" IP_FMT ">", ip); 370 trace_seq_printf(s, " <" IP_FMT ">", ip);
398 return ret; 371
372 out:
373 return !trace_seq_has_overflowed(s);
399} 374}
400 375
401/** 376/**
@@ -413,7 +388,6 @@ int trace_print_lat_fmt(struct trace_seq *s, struct trace_entry *entry)
413 char irqs_off; 388 char irqs_off;
414 int hardirq; 389 int hardirq;
415 int softirq; 390 int softirq;
416 int ret;
417 391
418 hardirq = entry->flags & TRACE_FLAG_HARDIRQ; 392 hardirq = entry->flags & TRACE_FLAG_HARDIRQ;
419 softirq = entry->flags & TRACE_FLAG_SOFTIRQ; 393 softirq = entry->flags & TRACE_FLAG_SOFTIRQ;
@@ -445,16 +419,15 @@ int trace_print_lat_fmt(struct trace_seq *s, struct trace_entry *entry)
445 softirq ? 's' : 419 softirq ? 's' :
446 '.'; 420 '.';
447 421
448 if (!trace_seq_printf(s, "%c%c%c", 422 trace_seq_printf(s, "%c%c%c",
449 irqs_off, need_resched, hardsoft_irq)) 423 irqs_off, need_resched, hardsoft_irq);
450 return 0;
451 424
452 if (entry->preempt_count) 425 if (entry->preempt_count)
453 ret = trace_seq_printf(s, "%x", entry->preempt_count); 426 trace_seq_printf(s, "%x", entry->preempt_count);
454 else 427 else
455 ret = trace_seq_putc(s, '.'); 428 trace_seq_putc(s, '.');
456 429
457 return ret; 430 return !trace_seq_has_overflowed(s);
458} 431}
459 432
460static int 433static int
@@ -464,14 +437,38 @@ lat_print_generic(struct trace_seq *s, struct trace_entry *entry, int cpu)
464 437
465 trace_find_cmdline(entry->pid, comm); 438 trace_find_cmdline(entry->pid, comm);
466 439
467 if (!trace_seq_printf(s, "%8.8s-%-5d %3d", 440 trace_seq_printf(s, "%8.8s-%-5d %3d",
468 comm, entry->pid, cpu)) 441 comm, entry->pid, cpu);
469 return 0;
470 442
471 return trace_print_lat_fmt(s, entry); 443 return trace_print_lat_fmt(s, entry);
472} 444}
473 445
474static unsigned long preempt_mark_thresh_us = 100; 446#undef MARK
447#define MARK(v, s) {.val = v, .sym = s}
448/* trace overhead mark */
449static const struct trace_mark {
450 unsigned long long val; /* unit: nsec */
451 char sym;
452} mark[] = {
453 MARK(1000000000ULL , '$'), /* 1 sec */
454 MARK(1000000ULL , '#'), /* 1000 usecs */
455 MARK(100000ULL , '!'), /* 100 usecs */
456 MARK(10000ULL , '+'), /* 10 usecs */
457};
458#undef MARK
459
460char trace_find_mark(unsigned long long d)
461{
462 int i;
463 int size = ARRAY_SIZE(mark);
464
465 for (i = 0; i < size; i++) {
466 if (d >= mark[i].val)
467 break;
468 }
469
470 return (i == size) ? ' ' : mark[i].sym;
471}
475 472
476static int 473static int
477lat_print_timestamp(struct trace_iterator *iter, u64 next_ts) 474lat_print_timestamp(struct trace_iterator *iter, u64 next_ts)
@@ -493,24 +490,28 @@ lat_print_timestamp(struct trace_iterator *iter, u64 next_ts)
493 unsigned long rel_usec = do_div(rel_ts, USEC_PER_MSEC); 490 unsigned long rel_usec = do_div(rel_ts, USEC_PER_MSEC);
494 unsigned long rel_msec = (unsigned long)rel_ts; 491 unsigned long rel_msec = (unsigned long)rel_ts;
495 492
496 return trace_seq_printf( 493 trace_seq_printf(
497 s, "[%08llx] %ld.%03ldms (+%ld.%03ldms): ", 494 s, "[%08llx] %ld.%03ldms (+%ld.%03ldms): ",
498 ns2usecs(iter->ts), 495 ns2usecs(iter->ts),
499 abs_msec, abs_usec, 496 abs_msec, abs_usec,
500 rel_msec, rel_usec); 497 rel_msec, rel_usec);
498
501 } else if (verbose && !in_ns) { 499 } else if (verbose && !in_ns) {
502 return trace_seq_printf( 500 trace_seq_printf(
503 s, "[%016llx] %lld (+%lld): ", 501 s, "[%016llx] %lld (+%lld): ",
504 iter->ts, abs_ts, rel_ts); 502 iter->ts, abs_ts, rel_ts);
503
505 } else if (!verbose && in_ns) { 504 } else if (!verbose && in_ns) {
506 return trace_seq_printf( 505 trace_seq_printf(
507 s, " %4lldus%c: ", 506 s, " %4lldus%c: ",
508 abs_ts, 507 abs_ts,
509 rel_ts > preempt_mark_thresh_us ? '!' : 508 trace_find_mark(rel_ts * NSEC_PER_USEC));
510 rel_ts > 1 ? '+' : ' '); 509
511 } else { /* !verbose && !in_ns */ 510 } else { /* !verbose && !in_ns */
512 return trace_seq_printf(s, " %4lld: ", abs_ts); 511 trace_seq_printf(s, " %4lld: ", abs_ts);
513 } 512 }
513
514 return !trace_seq_has_overflowed(s);
514} 515}
515 516
516int trace_print_context(struct trace_iterator *iter) 517int trace_print_context(struct trace_iterator *iter)
@@ -520,34 +521,29 @@ int trace_print_context(struct trace_iterator *iter)
520 unsigned long long t; 521 unsigned long long t;
521 unsigned long secs, usec_rem; 522 unsigned long secs, usec_rem;
522 char comm[TASK_COMM_LEN]; 523 char comm[TASK_COMM_LEN];
523 int ret;
524 524
525 trace_find_cmdline(entry->pid, comm); 525 trace_find_cmdline(entry->pid, comm);
526 526
527 ret = trace_seq_printf(s, "%16s-%-5d [%03d] ", 527 trace_seq_printf(s, "%16s-%-5d [%03d] ",
528 comm, entry->pid, iter->cpu); 528 comm, entry->pid, iter->cpu);
529 if (!ret)
530 return 0;
531 529
532 if (trace_flags & TRACE_ITER_IRQ_INFO) { 530 if (trace_flags & TRACE_ITER_IRQ_INFO)
533 ret = trace_print_lat_fmt(s, entry); 531 trace_print_lat_fmt(s, entry);
534 if (!ret)
535 return 0;
536 }
537 532
538 if (iter->iter_flags & TRACE_FILE_TIME_IN_NS) { 533 if (iter->iter_flags & TRACE_FILE_TIME_IN_NS) {
539 t = ns2usecs(iter->ts); 534 t = ns2usecs(iter->ts);
540 usec_rem = do_div(t, USEC_PER_SEC); 535 usec_rem = do_div(t, USEC_PER_SEC);
541 secs = (unsigned long)t; 536 secs = (unsigned long)t;
542 return trace_seq_printf(s, " %5lu.%06lu: ", secs, usec_rem); 537 trace_seq_printf(s, " %5lu.%06lu: ", secs, usec_rem);
543 } else 538 } else
544 return trace_seq_printf(s, " %12llu: ", iter->ts); 539 trace_seq_printf(s, " %12llu: ", iter->ts);
540
541 return !trace_seq_has_overflowed(s);
545} 542}
546 543
547int trace_print_lat_context(struct trace_iterator *iter) 544int trace_print_lat_context(struct trace_iterator *iter)
548{ 545{
549 u64 next_ts; 546 u64 next_ts;
550 int ret;
551 /* trace_find_next_entry will reset ent_size */ 547 /* trace_find_next_entry will reset ent_size */
552 int ent_size = iter->ent_size; 548 int ent_size = iter->ent_size;
553 struct trace_seq *s = &iter->seq; 549 struct trace_seq *s = &iter->seq;
@@ -567,18 +563,17 @@ int trace_print_lat_context(struct trace_iterator *iter)
567 563
568 trace_find_cmdline(entry->pid, comm); 564 trace_find_cmdline(entry->pid, comm);
569 565
570 ret = trace_seq_printf( 566 trace_seq_printf(
571 s, "%16s %5d %3d %d %08x %08lx ", 567 s, "%16s %5d %3d %d %08x %08lx ",
572 comm, entry->pid, iter->cpu, entry->flags, 568 comm, entry->pid, iter->cpu, entry->flags,
573 entry->preempt_count, iter->idx); 569 entry->preempt_count, iter->idx);
574 } else { 570 } else {
575 ret = lat_print_generic(s, entry, iter->cpu); 571 lat_print_generic(s, entry, iter->cpu);
576 } 572 }
577 573
578 if (ret) 574 lat_print_timestamp(iter, next_ts);
579 ret = lat_print_timestamp(iter, next_ts);
580 575
581 return ret; 576 return !trace_seq_has_overflowed(s);
582} 577}
583 578
584static const char state_to_char[] = TASK_STATE_TO_CHAR_STR; 579static const char state_to_char[] = TASK_STATE_TO_CHAR_STR;
@@ -692,7 +687,7 @@ int register_ftrace_event(struct trace_event *event)
692 goto out; 687 goto out;
693 688
694 } else { 689 } else {
695 690
696 event->type = next_event_type++; 691 event->type = next_event_type++;
697 list = &ftrace_event_list; 692 list = &ftrace_event_list;
698 } 693 }
@@ -764,10 +759,9 @@ EXPORT_SYMBOL_GPL(unregister_ftrace_event);
764enum print_line_t trace_nop_print(struct trace_iterator *iter, int flags, 759enum print_line_t trace_nop_print(struct trace_iterator *iter, int flags,
765 struct trace_event *event) 760 struct trace_event *event)
766{ 761{
767 if (!trace_seq_printf(&iter->seq, "type: %d\n", iter->ent->type)) 762 trace_seq_printf(&iter->seq, "type: %d\n", iter->ent->type);
768 return TRACE_TYPE_PARTIAL_LINE;
769 763
770 return TRACE_TYPE_HANDLED; 764 return trace_handle_return(&iter->seq);
771} 765}
772 766
773/* TRACE_FN */ 767/* TRACE_FN */
@@ -779,24 +773,16 @@ static enum print_line_t trace_fn_trace(struct trace_iterator *iter, int flags,
779 773
780 trace_assign_type(field, iter->ent); 774 trace_assign_type(field, iter->ent);
781 775
782 if (!seq_print_ip_sym(s, field->ip, flags)) 776 seq_print_ip_sym(s, field->ip, flags);
783 goto partial;
784 777
785 if ((flags & TRACE_ITER_PRINT_PARENT) && field->parent_ip) { 778 if ((flags & TRACE_ITER_PRINT_PARENT) && field->parent_ip) {
786 if (!trace_seq_puts(s, " <-")) 779 trace_seq_puts(s, " <-");
787 goto partial; 780 seq_print_ip_sym(s, field->parent_ip, flags);
788 if (!seq_print_ip_sym(s,
789 field->parent_ip,
790 flags))
791 goto partial;
792 } 781 }
793 if (!trace_seq_putc(s, '\n'))
794 goto partial;
795 782
796 return TRACE_TYPE_HANDLED; 783 trace_seq_putc(s, '\n');
797 784
798 partial: 785 return trace_handle_return(s);
799 return TRACE_TYPE_PARTIAL_LINE;
800} 786}
801 787
802static enum print_line_t trace_fn_raw(struct trace_iterator *iter, int flags, 788static enum print_line_t trace_fn_raw(struct trace_iterator *iter, int flags,
@@ -806,12 +792,11 @@ static enum print_line_t trace_fn_raw(struct trace_iterator *iter, int flags,
806 792
807 trace_assign_type(field, iter->ent); 793 trace_assign_type(field, iter->ent);
808 794
809 if (!trace_seq_printf(&iter->seq, "%lx %lx\n", 795 trace_seq_printf(&iter->seq, "%lx %lx\n",
810 field->ip, 796 field->ip,
811 field->parent_ip)) 797 field->parent_ip);
812 return TRACE_TYPE_PARTIAL_LINE;
813 798
814 return TRACE_TYPE_HANDLED; 799 return trace_handle_return(&iter->seq);
815} 800}
816 801
817static enum print_line_t trace_fn_hex(struct trace_iterator *iter, int flags, 802static enum print_line_t trace_fn_hex(struct trace_iterator *iter, int flags,
@@ -822,10 +807,10 @@ static enum print_line_t trace_fn_hex(struct trace_iterator *iter, int flags,
822 807
823 trace_assign_type(field, iter->ent); 808 trace_assign_type(field, iter->ent);
824 809
825 SEQ_PUT_HEX_FIELD_RET(s, field->ip); 810 SEQ_PUT_HEX_FIELD(s, field->ip);
826 SEQ_PUT_HEX_FIELD_RET(s, field->parent_ip); 811 SEQ_PUT_HEX_FIELD(s, field->parent_ip);
827 812
828 return TRACE_TYPE_HANDLED; 813 return trace_handle_return(s);
829} 814}
830 815
831static enum print_line_t trace_fn_bin(struct trace_iterator *iter, int flags, 816static enum print_line_t trace_fn_bin(struct trace_iterator *iter, int flags,
@@ -836,10 +821,10 @@ static enum print_line_t trace_fn_bin(struct trace_iterator *iter, int flags,
836 821
837 trace_assign_type(field, iter->ent); 822 trace_assign_type(field, iter->ent);
838 823
839 SEQ_PUT_FIELD_RET(s, field->ip); 824 SEQ_PUT_FIELD(s, field->ip);
840 SEQ_PUT_FIELD_RET(s, field->parent_ip); 825 SEQ_PUT_FIELD(s, field->parent_ip);
841 826
842 return TRACE_TYPE_HANDLED; 827 return trace_handle_return(s);
843} 828}
844 829
845static struct trace_event_functions trace_fn_funcs = { 830static struct trace_event_functions trace_fn_funcs = {
@@ -868,18 +853,17 @@ static enum print_line_t trace_ctxwake_print(struct trace_iterator *iter,
868 T = task_state_char(field->next_state); 853 T = task_state_char(field->next_state);
869 S = task_state_char(field->prev_state); 854 S = task_state_char(field->prev_state);
870 trace_find_cmdline(field->next_pid, comm); 855 trace_find_cmdline(field->next_pid, comm);
871 if (!trace_seq_printf(&iter->seq, 856 trace_seq_printf(&iter->seq,
872 " %5d:%3d:%c %s [%03d] %5d:%3d:%c %s\n", 857 " %5d:%3d:%c %s [%03d] %5d:%3d:%c %s\n",
873 field->prev_pid, 858 field->prev_pid,
874 field->prev_prio, 859 field->prev_prio,
875 S, delim, 860 S, delim,
876 field->next_cpu, 861 field->next_cpu,
877 field->next_pid, 862 field->next_pid,
878 field->next_prio, 863 field->next_prio,
879 T, comm)) 864 T, comm);
880 return TRACE_TYPE_PARTIAL_LINE; 865
881 866 return trace_handle_return(&iter->seq);
882 return TRACE_TYPE_HANDLED;
883} 867}
884 868
885static enum print_line_t trace_ctx_print(struct trace_iterator *iter, int flags, 869static enum print_line_t trace_ctx_print(struct trace_iterator *iter, int flags,
@@ -904,17 +888,16 @@ static int trace_ctxwake_raw(struct trace_iterator *iter, char S)
904 if (!S) 888 if (!S)
905 S = task_state_char(field->prev_state); 889 S = task_state_char(field->prev_state);
906 T = task_state_char(field->next_state); 890 T = task_state_char(field->next_state);
907 if (!trace_seq_printf(&iter->seq, "%d %d %c %d %d %d %c\n", 891 trace_seq_printf(&iter->seq, "%d %d %c %d %d %d %c\n",
908 field->prev_pid, 892 field->prev_pid,
909 field->prev_prio, 893 field->prev_prio,
910 S, 894 S,
911 field->next_cpu, 895 field->next_cpu,
912 field->next_pid, 896 field->next_pid,
913 field->next_prio, 897 field->next_prio,
914 T)) 898 T);
915 return TRACE_TYPE_PARTIAL_LINE; 899
916 900 return trace_handle_return(&iter->seq);
917 return TRACE_TYPE_HANDLED;
918} 901}
919 902
920static enum print_line_t trace_ctx_raw(struct trace_iterator *iter, int flags, 903static enum print_line_t trace_ctx_raw(struct trace_iterator *iter, int flags,
@@ -942,15 +925,15 @@ static int trace_ctxwake_hex(struct trace_iterator *iter, char S)
942 S = task_state_char(field->prev_state); 925 S = task_state_char(field->prev_state);
943 T = task_state_char(field->next_state); 926 T = task_state_char(field->next_state);
944 927
945 SEQ_PUT_HEX_FIELD_RET(s, field->prev_pid); 928 SEQ_PUT_HEX_FIELD(s, field->prev_pid);
946 SEQ_PUT_HEX_FIELD_RET(s, field->prev_prio); 929 SEQ_PUT_HEX_FIELD(s, field->prev_prio);
947 SEQ_PUT_HEX_FIELD_RET(s, S); 930 SEQ_PUT_HEX_FIELD(s, S);
948 SEQ_PUT_HEX_FIELD_RET(s, field->next_cpu); 931 SEQ_PUT_HEX_FIELD(s, field->next_cpu);
949 SEQ_PUT_HEX_FIELD_RET(s, field->next_pid); 932 SEQ_PUT_HEX_FIELD(s, field->next_pid);
950 SEQ_PUT_HEX_FIELD_RET(s, field->next_prio); 933 SEQ_PUT_HEX_FIELD(s, field->next_prio);
951 SEQ_PUT_HEX_FIELD_RET(s, T); 934 SEQ_PUT_HEX_FIELD(s, T);
952 935
953 return TRACE_TYPE_HANDLED; 936 return trace_handle_return(s);
954} 937}
955 938
956static enum print_line_t trace_ctx_hex(struct trace_iterator *iter, int flags, 939static enum print_line_t trace_ctx_hex(struct trace_iterator *iter, int flags,
@@ -973,14 +956,15 @@ static enum print_line_t trace_ctxwake_bin(struct trace_iterator *iter,
973 956
974 trace_assign_type(field, iter->ent); 957 trace_assign_type(field, iter->ent);
975 958
976 SEQ_PUT_FIELD_RET(s, field->prev_pid); 959 SEQ_PUT_FIELD(s, field->prev_pid);
977 SEQ_PUT_FIELD_RET(s, field->prev_prio); 960 SEQ_PUT_FIELD(s, field->prev_prio);
978 SEQ_PUT_FIELD_RET(s, field->prev_state); 961 SEQ_PUT_FIELD(s, field->prev_state);
979 SEQ_PUT_FIELD_RET(s, field->next_pid); 962 SEQ_PUT_FIELD(s, field->next_cpu);
980 SEQ_PUT_FIELD_RET(s, field->next_prio); 963 SEQ_PUT_FIELD(s, field->next_pid);
981 SEQ_PUT_FIELD_RET(s, field->next_state); 964 SEQ_PUT_FIELD(s, field->next_prio);
965 SEQ_PUT_FIELD(s, field->next_state);
982 966
983 return TRACE_TYPE_HANDLED; 967 return trace_handle_return(s);
984} 968}
985 969
986static struct trace_event_functions trace_ctx_funcs = { 970static struct trace_event_functions trace_ctx_funcs = {
@@ -1020,23 +1004,19 @@ static enum print_line_t trace_stack_print(struct trace_iterator *iter,
1020 trace_assign_type(field, iter->ent); 1004 trace_assign_type(field, iter->ent);
1021 end = (unsigned long *)((long)iter->ent + iter->ent_size); 1005 end = (unsigned long *)((long)iter->ent + iter->ent_size);
1022 1006
1023 if (!trace_seq_puts(s, "<stack trace>\n")) 1007 trace_seq_puts(s, "<stack trace>\n");
1024 goto partial;
1025 1008
1026 for (p = field->caller; p && *p != ULONG_MAX && p < end; p++) { 1009 for (p = field->caller; p && *p != ULONG_MAX && p < end; p++) {
1027 if (!trace_seq_puts(s, " => "))
1028 goto partial;
1029 1010
1030 if (!seq_print_ip_sym(s, *p, flags)) 1011 if (trace_seq_has_overflowed(s))
1031 goto partial; 1012 break;
1032 if (!trace_seq_putc(s, '\n'))
1033 goto partial;
1034 }
1035 1013
1036 return TRACE_TYPE_HANDLED; 1014 trace_seq_puts(s, " => ");
1015 seq_print_ip_sym(s, *p, flags);
1016 trace_seq_putc(s, '\n');
1017 }
1037 1018
1038 partial: 1019 return trace_handle_return(s);
1039 return TRACE_TYPE_PARTIAL_LINE;
1040} 1020}
1041 1021
1042static struct trace_event_functions trace_stack_funcs = { 1022static struct trace_event_functions trace_stack_funcs = {
@@ -1057,16 +1037,10 @@ static enum print_line_t trace_user_stack_print(struct trace_iterator *iter,
1057 1037
1058 trace_assign_type(field, iter->ent); 1038 trace_assign_type(field, iter->ent);
1059 1039
1060 if (!trace_seq_puts(s, "<user stack trace>\n")) 1040 trace_seq_puts(s, "<user stack trace>\n");
1061 goto partial; 1041 seq_print_userip_objs(field, s, flags);
1062
1063 if (!seq_print_userip_objs(field, s, flags))
1064 goto partial;
1065
1066 return TRACE_TYPE_HANDLED;
1067 1042
1068 partial: 1043 return trace_handle_return(s);
1069 return TRACE_TYPE_PARTIAL_LINE;
1070} 1044}
1071 1045
1072static struct trace_event_functions trace_user_stack_funcs = { 1046static struct trace_event_functions trace_user_stack_funcs = {
@@ -1089,19 +1063,11 @@ trace_bputs_print(struct trace_iterator *iter, int flags,
1089 1063
1090 trace_assign_type(field, entry); 1064 trace_assign_type(field, entry);
1091 1065
1092 if (!seq_print_ip_sym(s, field->ip, flags)) 1066 seq_print_ip_sym(s, field->ip, flags);
1093 goto partial; 1067 trace_seq_puts(s, ": ");
1068 trace_seq_puts(s, field->str);
1094 1069
1095 if (!trace_seq_puts(s, ": ")) 1070 return trace_handle_return(s);
1096 goto partial;
1097
1098 if (!trace_seq_puts(s, field->str))
1099 goto partial;
1100
1101 return TRACE_TYPE_HANDLED;
1102
1103 partial:
1104 return TRACE_TYPE_PARTIAL_LINE;
1105} 1071}
1106 1072
1107 1073
@@ -1114,16 +1080,10 @@ trace_bputs_raw(struct trace_iterator *iter, int flags,
1114 1080
1115 trace_assign_type(field, iter->ent); 1081 trace_assign_type(field, iter->ent);
1116 1082
1117 if (!trace_seq_printf(s, ": %lx : ", field->ip)) 1083 trace_seq_printf(s, ": %lx : ", field->ip);
1118 goto partial; 1084 trace_seq_puts(s, field->str);
1119
1120 if (!trace_seq_puts(s, field->str))
1121 goto partial;
1122 1085
1123 return TRACE_TYPE_HANDLED; 1086 return trace_handle_return(s);
1124
1125 partial:
1126 return TRACE_TYPE_PARTIAL_LINE;
1127} 1087}
1128 1088
1129static struct trace_event_functions trace_bputs_funcs = { 1089static struct trace_event_functions trace_bputs_funcs = {
@@ -1147,19 +1107,11 @@ trace_bprint_print(struct trace_iterator *iter, int flags,
1147 1107
1148 trace_assign_type(field, entry); 1108 trace_assign_type(field, entry);
1149 1109
1150 if (!seq_print_ip_sym(s, field->ip, flags)) 1110 seq_print_ip_sym(s, field->ip, flags);
1151 goto partial; 1111 trace_seq_puts(s, ": ");
1152 1112 trace_seq_bprintf(s, field->fmt, field->buf);
1153 if (!trace_seq_puts(s, ": "))
1154 goto partial;
1155
1156 if (!trace_seq_bprintf(s, field->fmt, field->buf))
1157 goto partial;
1158 1113
1159 return TRACE_TYPE_HANDLED; 1114 return trace_handle_return(s);
1160
1161 partial:
1162 return TRACE_TYPE_PARTIAL_LINE;
1163} 1115}
1164 1116
1165 1117
@@ -1172,16 +1124,10 @@ trace_bprint_raw(struct trace_iterator *iter, int flags,
1172 1124
1173 trace_assign_type(field, iter->ent); 1125 trace_assign_type(field, iter->ent);
1174 1126
1175 if (!trace_seq_printf(s, ": %lx : ", field->ip)) 1127 trace_seq_printf(s, ": %lx : ", field->ip);
1176 goto partial; 1128 trace_seq_bprintf(s, field->fmt, field->buf);
1177
1178 if (!trace_seq_bprintf(s, field->fmt, field->buf))
1179 goto partial;
1180 1129
1181 return TRACE_TYPE_HANDLED; 1130 return trace_handle_return(s);
1182
1183 partial:
1184 return TRACE_TYPE_PARTIAL_LINE;
1185} 1131}
1186 1132
1187static struct trace_event_functions trace_bprint_funcs = { 1133static struct trace_event_functions trace_bprint_funcs = {
@@ -1203,16 +1149,10 @@ static enum print_line_t trace_print_print(struct trace_iterator *iter,
1203 1149
1204 trace_assign_type(field, iter->ent); 1150 trace_assign_type(field, iter->ent);
1205 1151
1206 if (!seq_print_ip_sym(s, field->ip, flags)) 1152 seq_print_ip_sym(s, field->ip, flags);
1207 goto partial; 1153 trace_seq_printf(s, ": %s", field->buf);
1208
1209 if (!trace_seq_printf(s, ": %s", field->buf))
1210 goto partial;
1211 1154
1212 return TRACE_TYPE_HANDLED; 1155 return trace_handle_return(s);
1213
1214 partial:
1215 return TRACE_TYPE_PARTIAL_LINE;
1216} 1156}
1217 1157
1218static enum print_line_t trace_print_raw(struct trace_iterator *iter, int flags, 1158static enum print_line_t trace_print_raw(struct trace_iterator *iter, int flags,
@@ -1222,13 +1162,9 @@ static enum print_line_t trace_print_raw(struct trace_iterator *iter, int flags,
1222 1162
1223 trace_assign_type(field, iter->ent); 1163 trace_assign_type(field, iter->ent);
1224 1164
1225 if (!trace_seq_printf(&iter->seq, "# %lx %s", field->ip, field->buf)) 1165 trace_seq_printf(&iter->seq, "# %lx %s", field->ip, field->buf);
1226 goto partial;
1227
1228 return TRACE_TYPE_HANDLED;
1229 1166
1230 partial: 1167 return trace_handle_return(&iter->seq);
1231 return TRACE_TYPE_PARTIAL_LINE;
1232} 1168}
1233 1169
1234static struct trace_event_functions trace_print_funcs = { 1170static struct trace_event_functions trace_print_funcs = {
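One genuinely new piece in trace_output.c is the overhead-mark table: instead of the old fixed preempt_mark_thresh_us, lat_print_timestamp() now calls trace_find_mark(rel_ts * NSEC_PER_USEC) and gets back '$', '#', '!', '+' or ' ' depending on the gap to the previous event. The lookup is small enough to demonstrate standalone (thresholds copied from the hunk above; the kernel version is identical in logic):

#include <stdio.h>

struct trace_mark {
        unsigned long long val;         /* threshold, in nanoseconds */
        char sym;                       /* marker printed in the latency column */
};

static const struct trace_mark mark[] = {
        { 1000000000ULL, '$' },         /* 1 sec */
        {    1000000ULL, '#' },         /* 1000 usecs */
        {     100000ULL, '!' },         /* 100 usecs */
        {      10000ULL, '+' },         /* 10 usecs */
};

static char trace_find_mark(unsigned long long d)
{
        unsigned int i;

        for (i = 0; i < sizeof(mark) / sizeof(mark[0]); i++)
                if (d >= mark[i].val)
                        return mark[i].sym;

        return ' ';                     /* below every threshold: no marker */
}

int main(void)
{
        /* a 250 usec gap between events earns a '!' */
        printf("%c\n", trace_find_mark(250000ULL));
        return 0;
}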
diff --git a/kernel/trace/trace_output.h b/kernel/trace/trace_output.h
index 80b25b585a70..8ef2c40efb3c 100644
--- a/kernel/trace/trace_output.h
+++ b/kernel/trace/trace_output.h
@@ -35,17 +35,11 @@ trace_print_lat_fmt(struct trace_seq *s, struct trace_entry *entry);
35extern int __unregister_ftrace_event(struct trace_event *event); 35extern int __unregister_ftrace_event(struct trace_event *event);
36extern struct rw_semaphore trace_event_sem; 36extern struct rw_semaphore trace_event_sem;
37 37
38#define SEQ_PUT_FIELD_RET(s, x) \ 38#define SEQ_PUT_FIELD(s, x) \
39do { \ 39 trace_seq_putmem(s, &(x), sizeof(x))
40 if (!trace_seq_putmem(s, &(x), sizeof(x))) \ 40
41 return TRACE_TYPE_PARTIAL_LINE; \ 41#define SEQ_PUT_HEX_FIELD(s, x) \
42} while (0) 42 trace_seq_putmem_hex(s, &(x), sizeof(x))
43
44#define SEQ_PUT_HEX_FIELD_RET(s, x) \
45do { \
46 if (!trace_seq_putmem_hex(s, &(x), sizeof(x))) \
47 return TRACE_TYPE_PARTIAL_LINE; \
48} while (0)
49 43
50#endif 44#endif
51 45
diff --git a/kernel/trace/trace_printk.c b/kernel/trace/trace_printk.c
index 2900817ba65c..c4e70b6bd7fa 100644
--- a/kernel/trace/trace_printk.c
+++ b/kernel/trace/trace_printk.c
@@ -305,7 +305,7 @@ static int t_show(struct seq_file *m, void *v)
305 seq_puts(m, "\\t"); 305 seq_puts(m, "\\t");
306 break; 306 break;
307 case '\\': 307 case '\\':
308 seq_puts(m, "\\"); 308 seq_putc(m, '\\');
309 break; 309 break;
310 case '"': 310 case '"':
311 seq_puts(m, "\\\""); 311 seq_puts(m, "\\\"");
diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c
index d4b9fc22cd27..b983b2fd2ca1 100644
--- a/kernel/trace/trace_probe.c
+++ b/kernel/trace/trace_probe.c
@@ -40,7 +40,8 @@ const char *reserved_field_names[] = {
40int PRINT_TYPE_FUNC_NAME(type)(struct trace_seq *s, const char *name, \ 40int PRINT_TYPE_FUNC_NAME(type)(struct trace_seq *s, const char *name, \
41 void *data, void *ent) \ 41 void *data, void *ent) \
42{ \ 42{ \
43 return trace_seq_printf(s, " %s=" fmt, name, *(type *)data); \ 43 trace_seq_printf(s, " %s=" fmt, name, *(type *)data); \
44 return !trace_seq_has_overflowed(s); \
44} \ 45} \
45const char PRINT_TYPE_FMT_NAME(type)[] = fmt; \ 46const char PRINT_TYPE_FMT_NAME(type)[] = fmt; \
46NOKPROBE_SYMBOL(PRINT_TYPE_FUNC_NAME(type)); 47NOKPROBE_SYMBOL(PRINT_TYPE_FUNC_NAME(type));
@@ -61,10 +62,11 @@ int PRINT_TYPE_FUNC_NAME(string)(struct trace_seq *s, const char *name,
61 int len = *(u32 *)data >> 16; 62 int len = *(u32 *)data >> 16;
62 63
63 if (!len) 64 if (!len)
64 return trace_seq_printf(s, " %s=(fault)", name); 65 trace_seq_printf(s, " %s=(fault)", name);
65 else 66 else
66 return trace_seq_printf(s, " %s=\"%s\"", name, 67 trace_seq_printf(s, " %s=\"%s\"", name,
67 (const char *)get_loc_data(data, ent)); 68 (const char *)get_loc_data(data, ent));
69 return !trace_seq_has_overflowed(s);
68} 70}
69NOKPROBE_SYMBOL(PRINT_TYPE_FUNC_NAME(string)); 71NOKPROBE_SYMBOL(PRINT_TYPE_FUNC_NAME(string));
70 72
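Because the basic probe type printers are generated by a macro, the old single-line "return trace_seq_printf(...)" had to be split into a write plus an explicit overflow test. Assuming the DEFINE_BASIC_PRINT_TYPE_FUNC() machinery in this file, the generated printer for, say, u32 now expands to roughly the following (the "%u" format is purely illustrative; each type registers its own fmt):

int PRINT_TYPE_FUNC_NAME(u32)(struct trace_seq *s, const char *name,
                               void *data, void *ent)
{
        trace_seq_printf(s, " %s=%u", name, *(u32 *)data);
        return !trace_seq_has_overflowed(s);
}

The boolean return is what print_kprobe_event() and print_kretprobe_event() test in their argument loops above.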
diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c
index 3f34dc9b40f3..2e293beb186e 100644
--- a/kernel/trace/trace_sched_switch.c
+++ b/kernel/trace/trace_sched_switch.c
@@ -14,122 +14,26 @@
14 14
15#include "trace.h" 15#include "trace.h"
16 16
17static struct trace_array *ctx_trace;
18static int __read_mostly tracer_enabled;
19static int sched_ref; 17static int sched_ref;
20static DEFINE_MUTEX(sched_register_mutex); 18static DEFINE_MUTEX(sched_register_mutex);
21static int sched_stopped;
22
23
24void
25tracing_sched_switch_trace(struct trace_array *tr,
26 struct task_struct *prev,
27 struct task_struct *next,
28 unsigned long flags, int pc)
29{
30 struct ftrace_event_call *call = &event_context_switch;
31 struct ring_buffer *buffer = tr->trace_buffer.buffer;
32 struct ring_buffer_event *event;
33 struct ctx_switch_entry *entry;
34
35 event = trace_buffer_lock_reserve(buffer, TRACE_CTX,
36 sizeof(*entry), flags, pc);
37 if (!event)
38 return;
39 entry = ring_buffer_event_data(event);
40 entry->prev_pid = prev->pid;
41 entry->prev_prio = prev->prio;
42 entry->prev_state = prev->state;
43 entry->next_pid = next->pid;
44 entry->next_prio = next->prio;
45 entry->next_state = next->state;
46 entry->next_cpu = task_cpu(next);
47
48 if (!call_filter_check_discard(call, entry, buffer, event))
49 trace_buffer_unlock_commit(buffer, event, flags, pc);
50}
51 19
52static void 20static void
53probe_sched_switch(void *ignore, struct task_struct *prev, struct task_struct *next) 21probe_sched_switch(void *ignore, struct task_struct *prev, struct task_struct *next)
54{ 22{
55 struct trace_array_cpu *data;
56 unsigned long flags;
57 int cpu;
58 int pc;
59
60 if (unlikely(!sched_ref)) 23 if (unlikely(!sched_ref))
61 return; 24 return;
62 25
63 tracing_record_cmdline(prev); 26 tracing_record_cmdline(prev);
64 tracing_record_cmdline(next); 27 tracing_record_cmdline(next);
65
66 if (!tracer_enabled || sched_stopped)
67 return;
68
69 pc = preempt_count();
70 local_irq_save(flags);
71 cpu = raw_smp_processor_id();
72 data = per_cpu_ptr(ctx_trace->trace_buffer.data, cpu);
73
74 if (likely(!atomic_read(&data->disabled)))
75 tracing_sched_switch_trace(ctx_trace, prev, next, flags, pc);
76
77 local_irq_restore(flags);
78}
79
80void
81tracing_sched_wakeup_trace(struct trace_array *tr,
82 struct task_struct *wakee,
83 struct task_struct *curr,
84 unsigned long flags, int pc)
85{
86 struct ftrace_event_call *call = &event_wakeup;
87 struct ring_buffer_event *event;
88 struct ctx_switch_entry *entry;
89 struct ring_buffer *buffer = tr->trace_buffer.buffer;
90
91 event = trace_buffer_lock_reserve(buffer, TRACE_WAKE,
92 sizeof(*entry), flags, pc);
93 if (!event)
94 return;
95 entry = ring_buffer_event_data(event);
96 entry->prev_pid = curr->pid;
97 entry->prev_prio = curr->prio;
98 entry->prev_state = curr->state;
99 entry->next_pid = wakee->pid;
100 entry->next_prio = wakee->prio;
101 entry->next_state = wakee->state;
102 entry->next_cpu = task_cpu(wakee);
103
104 if (!call_filter_check_discard(call, entry, buffer, event))
105 trace_buffer_unlock_commit(buffer, event, flags, pc);
106} 28}
107 29
108static void 30static void
109probe_sched_wakeup(void *ignore, struct task_struct *wakee, int success) 31probe_sched_wakeup(void *ignore, struct task_struct *wakee, int success)
110{ 32{
111 struct trace_array_cpu *data;
112 unsigned long flags;
113 int cpu, pc;
114
115 if (unlikely(!sched_ref)) 33 if (unlikely(!sched_ref))
116 return; 34 return;
117 35
118 tracing_record_cmdline(current); 36 tracing_record_cmdline(current);
119
120 if (!tracer_enabled || sched_stopped)
121 return;
122
123 pc = preempt_count();
124 local_irq_save(flags);
125 cpu = raw_smp_processor_id();
126 data = per_cpu_ptr(ctx_trace->trace_buffer.data, cpu);
127
128 if (likely(!atomic_read(&data->disabled)))
129 tracing_sched_wakeup_trace(ctx_trace, wakee, current,
130 flags, pc);
131
132 local_irq_restore(flags);
133} 37}
134 38
135static int tracing_sched_register(void) 39static int tracing_sched_register(void)
@@ -197,51 +101,3 @@ void tracing_stop_cmdline_record(void)
197{ 101{
198 tracing_stop_sched_switch(); 102 tracing_stop_sched_switch();
199} 103}
200
201/**
202 * tracing_start_sched_switch_record - start tracing context switches
203 *
204 * Turns on context switch tracing for a tracer.
205 */
206void tracing_start_sched_switch_record(void)
207{
208 if (unlikely(!ctx_trace)) {
209 WARN_ON(1);
210 return;
211 }
212
213 tracing_start_sched_switch();
214
215 mutex_lock(&sched_register_mutex);
216 tracer_enabled++;
217 mutex_unlock(&sched_register_mutex);
218}
219
220/**
221 * tracing_stop_sched_switch_record - start tracing context switches
222 *
223 * Turns off context switch tracing for a tracer.
224 */
225void tracing_stop_sched_switch_record(void)
226{
227 mutex_lock(&sched_register_mutex);
228 tracer_enabled--;
229 WARN_ON(tracer_enabled < 0);
230 mutex_unlock(&sched_register_mutex);
231
232 tracing_stop_sched_switch();
233}
234
235/**
236 * tracing_sched_switch_assign_trace - assign a trace array for ctx switch
237 * @tr: trace array pointer to assign
238 *
239 * Some tracers might want to record the context switches in their
240 * trace. This function lets those tracers assign the trace array
241 * to use.
242 */
243void tracing_sched_switch_assign_trace(struct trace_array *tr)
244{
245 ctx_trace = tr;
246}
247
diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c
index 19bd8928ce94..8fb84b362816 100644
--- a/kernel/trace/trace_sched_wakeup.c
+++ b/kernel/trace/trace_sched_wakeup.c
@@ -365,6 +365,62 @@ probe_wakeup_migrate_task(void *ignore, struct task_struct *task, int cpu)
365 wakeup_current_cpu = cpu; 365 wakeup_current_cpu = cpu;
366} 366}
367 367
368static void
369tracing_sched_switch_trace(struct trace_array *tr,
370 struct task_struct *prev,
371 struct task_struct *next,
372 unsigned long flags, int pc)
373{
374 struct ftrace_event_call *call = &event_context_switch;
375 struct ring_buffer *buffer = tr->trace_buffer.buffer;
376 struct ring_buffer_event *event;
377 struct ctx_switch_entry *entry;
378
379 event = trace_buffer_lock_reserve(buffer, TRACE_CTX,
380 sizeof(*entry), flags, pc);
381 if (!event)
382 return;
383 entry = ring_buffer_event_data(event);
384 entry->prev_pid = prev->pid;
385 entry->prev_prio = prev->prio;
386 entry->prev_state = prev->state;
387 entry->next_pid = next->pid;
388 entry->next_prio = next->prio;
389 entry->next_state = next->state;
390 entry->next_cpu = task_cpu(next);
391
392 if (!call_filter_check_discard(call, entry, buffer, event))
393 trace_buffer_unlock_commit(buffer, event, flags, pc);
394}
395
396static void
397tracing_sched_wakeup_trace(struct trace_array *tr,
398 struct task_struct *wakee,
399 struct task_struct *curr,
400 unsigned long flags, int pc)
401{
402 struct ftrace_event_call *call = &event_wakeup;
403 struct ring_buffer_event *event;
404 struct ctx_switch_entry *entry;
405 struct ring_buffer *buffer = tr->trace_buffer.buffer;
406
407 event = trace_buffer_lock_reserve(buffer, TRACE_WAKE,
408 sizeof(*entry), flags, pc);
409 if (!event)
410 return;
411 entry = ring_buffer_event_data(event);
412 entry->prev_pid = curr->pid;
413 entry->prev_prio = curr->prio;
414 entry->prev_state = curr->state;
415 entry->next_pid = wakee->pid;
416 entry->next_prio = wakee->prio;
417 entry->next_state = wakee->state;
418 entry->next_cpu = task_cpu(wakee);
419
420 if (!call_filter_check_discard(call, entry, buffer, event))
421 trace_buffer_unlock_commit(buffer, event, flags, pc);
422}
423
368static void notrace 424static void notrace
369probe_wakeup_sched_switch(void *ignore, 425probe_wakeup_sched_switch(void *ignore,
370 struct task_struct *prev, struct task_struct *next) 426 struct task_struct *prev, struct task_struct *next)
diff --git a/kernel/trace/trace_seq.c b/kernel/trace/trace_seq.c
index 1f24ed99dca2..f8b45d8792f9 100644
--- a/kernel/trace/trace_seq.c
+++ b/kernel/trace/trace_seq.c
@@ -27,10 +27,19 @@
27#include <linux/trace_seq.h> 27#include <linux/trace_seq.h>
28 28
29/* How much buffer is left on the trace_seq? */ 29/* How much buffer is left on the trace_seq? */
30#define TRACE_SEQ_BUF_LEFT(s) ((PAGE_SIZE - 1) - (s)->len) 30#define TRACE_SEQ_BUF_LEFT(s) seq_buf_buffer_left(&(s)->seq)
31 31
32/* How much buffer is written? */ 32/* How much buffer is written? */
33#define TRACE_SEQ_BUF_USED(s) min((s)->len, (unsigned int)(PAGE_SIZE - 1)) 33#define TRACE_SEQ_BUF_USED(s) seq_buf_used(&(s)->seq)
34
35/*
36 * trace_seq should work with being initialized with 0s.
37 */
38static inline void __trace_seq_init(struct trace_seq *s)
39{
40 if (unlikely(!s->seq.size))
41 trace_seq_init(s);
42}
34 43
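__trace_seq_init() exists so that a zero-filled trace_seq is usable without an explicit init call: the first write notices seq.size == 0 and initializes the embedded seq_buf. A hedged sketch of what that initializer is assumed to look like (the real definition lives in include/linux/trace_seq.h):

/* Editor's sketch of the assumed initializer; not quoted from the patch. */
static inline void trace_seq_init(struct trace_seq *s)
{
	/* point the embedded seq_buf at the page-sized buffer in trace_seq */
	seq_buf_init(&s->seq, s->buffer, PAGE_SIZE);
	s->full = 0;
}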
35/** 44/**
36 * trace_print_seq - move the contents of trace_seq into a seq_file 45 * trace_print_seq - move the contents of trace_seq into a seq_file
@@ -43,10 +52,11 @@
43 */ 52 */
44int trace_print_seq(struct seq_file *m, struct trace_seq *s) 53int trace_print_seq(struct seq_file *m, struct trace_seq *s)
45{ 54{
46 unsigned int len = TRACE_SEQ_BUF_USED(s);
47 int ret; 55 int ret;
48 56
49 ret = seq_write(m, s->buffer, len); 57 __trace_seq_init(s);
58
59 ret = seq_buf_print_seq(m, &s->seq);
50 60
51 /* 61 /*
52 * Only reset this buffer if we successfully wrote to the 62 * Only reset this buffer if we successfully wrote to the
@@ -69,34 +79,26 @@ int trace_print_seq(struct seq_file *m, struct trace_seq *s)
69 * trace_seq_printf() is used to store strings into a special 79 * trace_seq_printf() is used to store strings into a special
70 * buffer (@s). Then the output may be either used by 80 * buffer (@s). Then the output may be either used by
71 * the sequencer or pulled into another buffer. 81 * the sequencer or pulled into another buffer.
72 *
73 * Returns 1 if we successfully written all the contents to
74 * the buffer.
75 * Returns 0 if we the length to write is bigger than the
76 * reserved buffer space. In this case, nothing gets written.
77 */ 82 */
78int trace_seq_printf(struct trace_seq *s, const char *fmt, ...) 83void trace_seq_printf(struct trace_seq *s, const char *fmt, ...)
79{ 84{
80 unsigned int len = TRACE_SEQ_BUF_LEFT(s); 85 unsigned int save_len = s->seq.len;
81 va_list ap; 86 va_list ap;
82 int ret;
83 87
84 if (s->full || !len) 88 if (s->full)
85 return 0; 89 return;
90
91 __trace_seq_init(s);
86 92
87 va_start(ap, fmt); 93 va_start(ap, fmt);
88 ret = vsnprintf(s->buffer + s->len, len, fmt, ap); 94 seq_buf_vprintf(&s->seq, fmt, ap);
89 va_end(ap); 95 va_end(ap);
90 96
91 /* If we can't write it all, don't bother writing anything */ 97 /* If we can't write it all, don't bother writing anything */
92 if (ret >= len) { 98 if (unlikely(seq_buf_has_overflowed(&s->seq))) {
99 s->seq.len = save_len;
93 s->full = 1; 100 s->full = 1;
94 return 0;
95 } 101 }
96
97 s->len += ret;
98
99 return 1;
100} 102}
101EXPORT_SYMBOL_GPL(trace_seq_printf); 103EXPORT_SYMBOL_GPL(trace_seq_printf);
102 104
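The switch from int to void moves error handling from per-call return checks to a single overflow test at the end of a print pass. A minimal sketch of a converted caller, using only trace_seq_printf(), trace_seq_putc() and trace_seq_has_overflowed() from this series; the callback name and event fields are invented.

/* Editor's sketch: hypothetical output callback on the new void API. */
static enum print_line_t example_print_line(struct trace_iterator *iter)
{
	struct trace_seq *s = &iter->seq;

	/* no return values to collect; an overflow is latched in s->full */
	trace_seq_printf(s, "example: pid=%d", 1234);
	trace_seq_putc(s, '\n');

	return trace_seq_has_overflowed(s) ?
		TRACE_TYPE_PARTIAL_LINE : TRACE_TYPE_HANDLED;
}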
@@ -107,25 +109,23 @@ EXPORT_SYMBOL_GPL(trace_seq_printf);
107 * @nmaskbits: The number of bits that are valid in @maskp 109 * @nmaskbits: The number of bits that are valid in @maskp
108 * 110 *
 109 * Writes an ASCII representation of a bitmask string into @s. 111
110 *
111 * Returns 1 if we successfully written all the contents to
112 * the buffer.
113 * Returns 0 if we the length to write is bigger than the
114 * reserved buffer space. In this case, nothing gets written.
115 */ 112 */
116int trace_seq_bitmask(struct trace_seq *s, const unsigned long *maskp, 113void trace_seq_bitmask(struct trace_seq *s, const unsigned long *maskp,
117 int nmaskbits) 114 int nmaskbits)
118{ 115{
119 unsigned int len = TRACE_SEQ_BUF_LEFT(s); 116 unsigned int save_len = s->seq.len;
120 int ret;
121 117
122 if (s->full || !len) 118 if (s->full)
123 return 0; 119 return;
124 120
125 ret = bitmap_scnprintf(s->buffer, len, maskp, nmaskbits); 121 __trace_seq_init(s);
126 s->len += ret;
127 122
128 return 1; 123 seq_buf_bitmask(&s->seq, maskp, nmaskbits);
124
125 if (unlikely(seq_buf_has_overflowed(&s->seq))) {
126 s->seq.len = save_len;
127 s->full = 1;
128 }
129} 129}
130EXPORT_SYMBOL_GPL(trace_seq_bitmask); 130EXPORT_SYMBOL_GPL(trace_seq_bitmask);
131 131
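As with the printf variants, callers of trace_seq_bitmask() now rely on the latched overflow state rather than a return value. A short usage sketch; the wrapper function is illustrative.

/* Editor's sketch: dump a cpumask as an ASCII bitmask. */
static void example_show_mask(struct trace_seq *s, const struct cpumask *mask)
{
	/* e.g. "f" for CPUs 0-3; overflow, if any, is caught by the caller */
	trace_seq_bitmask(s, cpumask_bits(mask), nr_cpu_ids);
	trace_seq_putc(s, '\n');
}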
@@ -139,28 +139,23 @@ EXPORT_SYMBOL_GPL(trace_seq_bitmask);
139 * trace_seq_printf is used to store strings into a special 139 * trace_seq_printf is used to store strings into a special
140 * buffer (@s). Then the output may be either used by 140 * buffer (@s). Then the output may be either used by
141 * the sequencer or pulled into another buffer. 141 * the sequencer or pulled into another buffer.
142 *
143 * Returns how much it wrote to the buffer.
144 */ 142 */
145int trace_seq_vprintf(struct trace_seq *s, const char *fmt, va_list args) 143void trace_seq_vprintf(struct trace_seq *s, const char *fmt, va_list args)
146{ 144{
147 unsigned int len = TRACE_SEQ_BUF_LEFT(s); 145 unsigned int save_len = s->seq.len;
148 int ret;
149 146
150 if (s->full || !len) 147 if (s->full)
151 return 0; 148 return;
152 149
153 ret = vsnprintf(s->buffer + s->len, len, fmt, args); 150 __trace_seq_init(s);
151
152 seq_buf_vprintf(&s->seq, fmt, args);
154 153
155 /* If we can't write it all, don't bother writing anything */ 154 /* If we can't write it all, don't bother writing anything */
156 if (ret >= len) { 155 if (unlikely(seq_buf_has_overflowed(&s->seq))) {
156 s->seq.len = save_len;
157 s->full = 1; 157 s->full = 1;
158 return 0;
159 } 158 }
160
161 s->len += ret;
162
163 return len;
164} 159}
165EXPORT_SYMBOL_GPL(trace_seq_vprintf); 160EXPORT_SYMBOL_GPL(trace_seq_vprintf);
166 161
@@ -178,28 +173,24 @@ EXPORT_SYMBOL_GPL(trace_seq_vprintf);
178 * 173 *
179 * This function will take the format and the binary array and finish 174 * This function will take the format and the binary array and finish
180 * the conversion into the ASCII string within the buffer. 175 * the conversion into the ASCII string within the buffer.
181 *
182 * Returns how much it wrote to the buffer.
183 */ 176 */
184int trace_seq_bprintf(struct trace_seq *s, const char *fmt, const u32 *binary) 177void trace_seq_bprintf(struct trace_seq *s, const char *fmt, const u32 *binary)
185{ 178{
186 unsigned int len = TRACE_SEQ_BUF_LEFT(s); 179 unsigned int save_len = s->seq.len;
187 int ret;
188 180
189 if (s->full || !len) 181 if (s->full)
190 return 0; 182 return;
183
184 __trace_seq_init(s);
191 185
192 ret = bstr_printf(s->buffer + s->len, len, fmt, binary); 186 seq_buf_bprintf(&s->seq, fmt, binary);
193 187
194 /* If we can't write it all, don't bother writing anything */ 188 /* If we can't write it all, don't bother writing anything */
195 if (ret >= len) { 189 if (unlikely(seq_buf_has_overflowed(&s->seq))) {
190 s->seq.len = save_len;
196 s->full = 1; 191 s->full = 1;
197 return 0; 192 return;
198 } 193 }
199
200 s->len += ret;
201
202 return len;
203} 194}
204EXPORT_SYMBOL_GPL(trace_seq_bprintf); 195EXPORT_SYMBOL_GPL(trace_seq_bprintf);
205 196
@@ -212,25 +203,22 @@ EXPORT_SYMBOL_GPL(trace_seq_bprintf);
212 * copy to user routines. This function records a simple string 203 * copy to user routines. This function records a simple string
213 * into a special buffer (@s) for later retrieval by a sequencer 204 * into a special buffer (@s) for later retrieval by a sequencer
214 * or other mechanism. 205 * or other mechanism.
215 *
216 * Returns how much it wrote to the buffer.
217 */ 206 */
218int trace_seq_puts(struct trace_seq *s, const char *str) 207void trace_seq_puts(struct trace_seq *s, const char *str)
219{ 208{
220 unsigned int len = strlen(str); 209 unsigned int len = strlen(str);
221 210
222 if (s->full) 211 if (s->full)
223 return 0; 212 return;
213
214 __trace_seq_init(s);
224 215
225 if (len > TRACE_SEQ_BUF_LEFT(s)) { 216 if (len > TRACE_SEQ_BUF_LEFT(s)) {
226 s->full = 1; 217 s->full = 1;
227 return 0; 218 return;
228 } 219 }
229 220
230 memcpy(s->buffer + s->len, str, len); 221 seq_buf_putmem(&s->seq, str, len);
231 s->len += len;
232
233 return len;
234} 222}
235EXPORT_SYMBOL_GPL(trace_seq_puts); 223EXPORT_SYMBOL_GPL(trace_seq_puts);
236 224
@@ -243,22 +231,20 @@ EXPORT_SYMBOL_GPL(trace_seq_puts);
 243 * copy to user routines. This function records a simple character 231
244 * into a special buffer (@s) for later retrieval by a sequencer 232 * into a special buffer (@s) for later retrieval by a sequencer
245 * or other mechanism. 233 * or other mechanism.
246 *
247 * Returns how much it wrote to the buffer.
248 */ 234 */
249int trace_seq_putc(struct trace_seq *s, unsigned char c) 235void trace_seq_putc(struct trace_seq *s, unsigned char c)
250{ 236{
251 if (s->full) 237 if (s->full)
252 return 0; 238 return;
239
240 __trace_seq_init(s);
253 241
254 if (TRACE_SEQ_BUF_LEFT(s) < 1) { 242 if (TRACE_SEQ_BUF_LEFT(s) < 1) {
255 s->full = 1; 243 s->full = 1;
256 return 0; 244 return;
257 } 245 }
258 246
259 s->buffer[s->len++] = c; 247 seq_buf_putc(&s->seq, c);
260
261 return 1;
262} 248}
263EXPORT_SYMBOL_GPL(trace_seq_putc); 249EXPORT_SYMBOL_GPL(trace_seq_putc);
264 250
@@ -271,29 +257,23 @@ EXPORT_SYMBOL_GPL(trace_seq_putc);
271 * There may be cases where raw memory needs to be written into the 257 * There may be cases where raw memory needs to be written into the
272 * buffer and a strcpy() would not work. Using this function allows 258 * buffer and a strcpy() would not work. Using this function allows
273 * for such cases. 259 * for such cases.
274 *
275 * Returns how much it wrote to the buffer.
276 */ 260 */
277int trace_seq_putmem(struct trace_seq *s, const void *mem, unsigned int len) 261void trace_seq_putmem(struct trace_seq *s, const void *mem, unsigned int len)
278{ 262{
279 if (s->full) 263 if (s->full)
280 return 0; 264 return;
265
266 __trace_seq_init(s);
281 267
282 if (len > TRACE_SEQ_BUF_LEFT(s)) { 268 if (len > TRACE_SEQ_BUF_LEFT(s)) {
283 s->full = 1; 269 s->full = 1;
284 return 0; 270 return;
285 } 271 }
286 272
287 memcpy(s->buffer + s->len, mem, len); 273 seq_buf_putmem(&s->seq, mem, len);
288 s->len += len;
289
290 return len;
291} 274}
292EXPORT_SYMBOL_GPL(trace_seq_putmem); 275EXPORT_SYMBOL_GPL(trace_seq_putmem);
293 276
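A short sketch tying the plain-copy helpers together under the new void API; the tag string and payload are invented for illustration.

/* Editor's sketch: string, raw bytes, then a single character. */
static void example_emit_record(struct trace_seq *s,
				const void *payload, unsigned int len)
{
	trace_seq_puts(s, "payload: ");     /* NUL-terminated string */
	trace_seq_putmem(s, payload, len);  /* raw bytes, explicit length */
	trace_seq_putc(s, '\n');            /* single character */
}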
294#define MAX_MEMHEX_BYTES 8U
295#define HEX_CHARS (MAX_MEMHEX_BYTES*2 + 1)
296
297/** 277/**
298 * trace_seq_putmem_hex - write raw memory into the buffer in ASCII hex 278 * trace_seq_putmem_hex - write raw memory into the buffer in ASCII hex
299 * @s: trace sequence descriptor 279 * @s: trace sequence descriptor
@@ -303,41 +283,31 @@ EXPORT_SYMBOL_GPL(trace_seq_putmem);
303 * This is similar to trace_seq_putmem() except instead of just copying the 283 * This is similar to trace_seq_putmem() except instead of just copying the
304 * raw memory into the buffer it writes its ASCII representation of it 284 * raw memory into the buffer it writes its ASCII representation of it
305 * in hex characters. 285 * in hex characters.
306 *
307 * Returns how much it wrote to the buffer.
308 */ 286 */
309int trace_seq_putmem_hex(struct trace_seq *s, const void *mem, 287void trace_seq_putmem_hex(struct trace_seq *s, const void *mem,
310 unsigned int len) 288 unsigned int len)
311{ 289{
312 unsigned char hex[HEX_CHARS]; 290 unsigned int save_len = s->seq.len;
313 const unsigned char *data = mem;
314 unsigned int start_len;
315 int i, j;
316 int cnt = 0;
317 291
318 if (s->full) 292 if (s->full)
319 return 0; 293 return;
320 294
321 while (len) { 295 __trace_seq_init(s);
322 start_len = min(len, HEX_CHARS - 1); 296
323#ifdef __BIG_ENDIAN 297 /* Each byte is represented by two chars */
324 for (i = 0, j = 0; i < start_len; i++) { 298 if (len * 2 > TRACE_SEQ_BUF_LEFT(s)) {
325#else 299 s->full = 1;
326 for (i = start_len-1, j = 0; i >= 0; i--) { 300 return;
327#endif 301 }
328 hex[j++] = hex_asc_hi(data[i]); 302
329 hex[j++] = hex_asc_lo(data[i]); 303 /* The added spaces can still cause an overflow */
330 } 304 seq_buf_putmem_hex(&s->seq, mem, len);
331 if (WARN_ON_ONCE(j == 0 || j/2 > len)) 305
332 break; 306 if (unlikely(seq_buf_has_overflowed(&s->seq))) {
333 307 s->seq.len = save_len;
334 /* j increments twice per loop */ 308 s->full = 1;
335 len -= j / 2; 309 return;
336 hex[j++] = ' ';
337
338 cnt += trace_seq_putmem(s, hex, j);
339 } 310 }
340 return cnt;
341} 311}
342EXPORT_SYMBOL_GPL(trace_seq_putmem_hex); 312EXPORT_SYMBOL_GPL(trace_seq_putmem_hex);
343 313
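The new length pre-check only guarantees room for two hex digits per byte; seq_buf_putmem_hex() may still add separator spaces, which is why the overflow test afterwards is kept. A hedged usage sketch with an invented payload:

/* Editor's sketch: hex-dump a small buffer into the trace_seq. */
static void example_emit_cookie(struct trace_seq *s)
{
	const u8 cookie[4] = { 0xde, 0xad, 0xbe, 0xef };

	/* two hex digits per byte; seq_buf may add grouping spaces on top */
	trace_seq_putmem_hex(s, cookie, sizeof(cookie));
	trace_seq_putc(s, '\n');
}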
@@ -355,30 +325,27 @@ EXPORT_SYMBOL_GPL(trace_seq_putmem_hex);
355 */ 325 */
356int trace_seq_path(struct trace_seq *s, const struct path *path) 326int trace_seq_path(struct trace_seq *s, const struct path *path)
357{ 327{
358 unsigned char *p; 328 unsigned int save_len = s->seq.len;
359 329
360 if (s->full) 330 if (s->full)
361 return 0; 331 return 0;
362 332
333 __trace_seq_init(s);
334
363 if (TRACE_SEQ_BUF_LEFT(s) < 1) { 335 if (TRACE_SEQ_BUF_LEFT(s) < 1) {
364 s->full = 1; 336 s->full = 1;
365 return 0; 337 return 0;
366 } 338 }
367 339
368 p = d_path(path, s->buffer + s->len, PAGE_SIZE - s->len); 340 seq_buf_path(&s->seq, path, "\n");
369 if (!IS_ERR(p)) { 341
370 p = mangle_path(s->buffer + s->len, p, "\n"); 342 if (unlikely(seq_buf_has_overflowed(&s->seq))) {
371 if (p) { 343 s->seq.len = save_len;
372 s->len = p - s->buffer; 344 s->full = 1;
373 return 1; 345 return 0;
374 }
375 } else {
376 s->buffer[s->len++] = '?';
377 return 1;
378 } 346 }
379 347
380 s->full = 1; 348 return 1;
381 return 0;
382} 349}
383EXPORT_SYMBOL_GPL(trace_seq_path); 350EXPORT_SYMBOL_GPL(trace_seq_path);
384 351
@@ -404,25 +371,7 @@ EXPORT_SYMBOL_GPL(trace_seq_path);
404 */ 371 */
405int trace_seq_to_user(struct trace_seq *s, char __user *ubuf, int cnt) 372int trace_seq_to_user(struct trace_seq *s, char __user *ubuf, int cnt)
406{ 373{
407 int len; 374 __trace_seq_init(s);
408 int ret; 375 return seq_buf_to_user(&s->seq, ubuf, cnt);
409
410 if (!cnt)
411 return 0;
412
413 if (s->len <= s->readpos)
414 return -EBUSY;
415
416 len = s->len - s->readpos;
417 if (cnt > len)
418 cnt = len;
419 ret = copy_to_user(ubuf, s->buffer + s->readpos, cnt);
420 if (ret == cnt)
421 return -EFAULT;
422
423 cnt -= ret;
424
425 s->readpos += cnt;
426 return cnt;
427} 376}
428EXPORT_SYMBOL_GPL(trace_seq_to_user); 377EXPORT_SYMBOL_GPL(trace_seq_to_user);
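trace_seq_to_user() now simply forwards to seq_buf_to_user(), which tracks the read position itself. A hedged sketch of the kind of read() handler that drains it; the file-private layout is hypothetical.

/* Editor's sketch: hypothetical debugfs read op consuming a trace_seq. */
static ssize_t example_read(struct file *filp, char __user *ubuf,
			    size_t cnt, loff_t *ppos)
{
	struct trace_seq *s = filp->private_data;

	/* negative on fault or when nothing is left, else bytes copied */
	return trace_seq_to_user(s, ubuf, cnt);
}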
diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c
index 4dc8b79c5f75..dfe00a4f3f3e 100644
--- a/kernel/trace/trace_syscalls.c
+++ b/kernel/trace/trace_syscalls.c
@@ -114,7 +114,7 @@ print_syscall_enter(struct trace_iterator *iter, int flags,
114 struct trace_entry *ent = iter->ent; 114 struct trace_entry *ent = iter->ent;
115 struct syscall_trace_enter *trace; 115 struct syscall_trace_enter *trace;
116 struct syscall_metadata *entry; 116 struct syscall_metadata *entry;
117 int i, ret, syscall; 117 int i, syscall;
118 118
119 trace = (typeof(trace))ent; 119 trace = (typeof(trace))ent;
120 syscall = trace->nr; 120 syscall = trace->nr;
@@ -128,35 +128,28 @@ print_syscall_enter(struct trace_iterator *iter, int flags,
128 goto end; 128 goto end;
129 } 129 }
130 130
131 ret = trace_seq_printf(s, "%s(", entry->name); 131 trace_seq_printf(s, "%s(", entry->name);
132 if (!ret)
133 return TRACE_TYPE_PARTIAL_LINE;
134 132
135 for (i = 0; i < entry->nb_args; i++) { 133 for (i = 0; i < entry->nb_args; i++) {
134
135 if (trace_seq_has_overflowed(s))
136 goto end;
137
136 /* parameter types */ 138 /* parameter types */
137 if (trace_flags & TRACE_ITER_VERBOSE) { 139 if (trace_flags & TRACE_ITER_VERBOSE)
138 ret = trace_seq_printf(s, "%s ", entry->types[i]); 140 trace_seq_printf(s, "%s ", entry->types[i]);
139 if (!ret) 141
140 return TRACE_TYPE_PARTIAL_LINE;
141 }
142 /* parameter values */ 142 /* parameter values */
143 ret = trace_seq_printf(s, "%s: %lx%s", entry->args[i], 143 trace_seq_printf(s, "%s: %lx%s", entry->args[i],
144 trace->args[i], 144 trace->args[i],
145 i == entry->nb_args - 1 ? "" : ", "); 145 i == entry->nb_args - 1 ? "" : ", ");
146 if (!ret)
147 return TRACE_TYPE_PARTIAL_LINE;
148 } 146 }
149 147
150 ret = trace_seq_putc(s, ')'); 148 trace_seq_putc(s, ')');
151 if (!ret)
152 return TRACE_TYPE_PARTIAL_LINE;
153
154end: 149end:
155 ret = trace_seq_putc(s, '\n'); 150 trace_seq_putc(s, '\n');
156 if (!ret)
157 return TRACE_TYPE_PARTIAL_LINE;
158 151
159 return TRACE_TYPE_HANDLED; 152 return trace_handle_return(s);
160} 153}
161 154
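trace_handle_return() collapses the old per-call TRACE_TYPE_PARTIAL_LINE returns into one final check. Assuming it is the obvious wrapper around trace_seq_has_overflowed() (sketch; the real helper lives in kernel/trace/trace.h):

/* Editor's sketch of the assumed helper; not quoted from the patch. */
static inline enum print_line_t trace_handle_return(struct trace_seq *s)
{
	return trace_seq_has_overflowed(s) ?
		TRACE_TYPE_PARTIAL_LINE : TRACE_TYPE_HANDLED;
}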
162static enum print_line_t 155static enum print_line_t
@@ -168,7 +161,6 @@ print_syscall_exit(struct trace_iterator *iter, int flags,
168 struct syscall_trace_exit *trace; 161 struct syscall_trace_exit *trace;
169 int syscall; 162 int syscall;
170 struct syscall_metadata *entry; 163 struct syscall_metadata *entry;
171 int ret;
172 164
173 trace = (typeof(trace))ent; 165 trace = (typeof(trace))ent;
174 syscall = trace->nr; 166 syscall = trace->nr;
@@ -176,7 +168,7 @@ print_syscall_exit(struct trace_iterator *iter, int flags,
176 168
177 if (!entry) { 169 if (!entry) {
178 trace_seq_putc(s, '\n'); 170 trace_seq_putc(s, '\n');
179 return TRACE_TYPE_HANDLED; 171 goto out;
180 } 172 }
181 173
182 if (entry->exit_event->event.type != ent->type) { 174 if (entry->exit_event->event.type != ent->type) {
@@ -184,12 +176,11 @@ print_syscall_exit(struct trace_iterator *iter, int flags,
184 return TRACE_TYPE_UNHANDLED; 176 return TRACE_TYPE_UNHANDLED;
185 } 177 }
186 178
187 ret = trace_seq_printf(s, "%s -> 0x%lx\n", entry->name, 179 trace_seq_printf(s, "%s -> 0x%lx\n", entry->name,
188 trace->ret); 180 trace->ret);
189 if (!ret)
190 return TRACE_TYPE_PARTIAL_LINE;
191 181
192 return TRACE_TYPE_HANDLED; 182 out:
183 return trace_handle_return(s);
193} 184}
194 185
195extern char *__bad_type_size(void); 186extern char *__bad_type_size(void);
@@ -313,7 +304,7 @@ static void ftrace_syscall_enter(void *data, struct pt_regs *regs, long id)
313 int size; 304 int size;
314 305
315 syscall_nr = trace_get_syscall_nr(current, regs); 306 syscall_nr = trace_get_syscall_nr(current, regs);
316 if (syscall_nr < 0) 307 if (syscall_nr < 0 || syscall_nr >= NR_syscalls)
317 return; 308 return;
318 309
319 /* Here we're inside tp handler's rcu_read_lock_sched (__DO_TRACE) */ 310 /* Here we're inside tp handler's rcu_read_lock_sched (__DO_TRACE) */
@@ -360,7 +351,7 @@ static void ftrace_syscall_exit(void *data, struct pt_regs *regs, long ret)
360 int syscall_nr; 351 int syscall_nr;
361 352
362 syscall_nr = trace_get_syscall_nr(current, regs); 353 syscall_nr = trace_get_syscall_nr(current, regs);
363 if (syscall_nr < 0) 354 if (syscall_nr < 0 || syscall_nr >= NR_syscalls)
364 return; 355 return;
365 356
366 /* Here we're inside tp handler's rcu_read_lock_sched (__DO_TRACE()) */ 357 /* Here we're inside tp handler's rcu_read_lock_sched (__DO_TRACE()) */
@@ -567,7 +558,7 @@ static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id)
567 int size; 558 int size;
568 559
569 syscall_nr = trace_get_syscall_nr(current, regs); 560 syscall_nr = trace_get_syscall_nr(current, regs);
570 if (syscall_nr < 0) 561 if (syscall_nr < 0 || syscall_nr >= NR_syscalls)
571 return; 562 return;
572 if (!test_bit(syscall_nr, enabled_perf_enter_syscalls)) 563 if (!test_bit(syscall_nr, enabled_perf_enter_syscalls))
573 return; 564 return;
@@ -641,7 +632,7 @@ static void perf_syscall_exit(void *ignore, struct pt_regs *regs, long ret)
641 int size; 632 int size;
642 633
643 syscall_nr = trace_get_syscall_nr(current, regs); 634 syscall_nr = trace_get_syscall_nr(current, regs);
644 if (syscall_nr < 0) 635 if (syscall_nr < 0 || syscall_nr >= NR_syscalls)
645 return; 636 return;
646 if (!test_bit(syscall_nr, enabled_perf_exit_syscalls)) 637 if (!test_bit(syscall_nr, enabled_perf_exit_syscalls))
647 return; 638 return;
diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c
index 33ff6a24b802..8520acc34b18 100644
--- a/kernel/trace/trace_uprobe.c
+++ b/kernel/trace/trace_uprobe.c
@@ -552,8 +552,7 @@ error:
552 return ret; 552 return ret;
553 553
554fail_address_parse: 554fail_address_parse:
555 if (inode) 555 iput(inode);
556 iput(inode);
557 556
558 pr_info("Failed to parse address or file.\n"); 557 pr_info("Failed to parse address or file.\n");
559 558
@@ -606,7 +605,7 @@ static int probes_seq_show(struct seq_file *m, void *v)
606 for (i = 0; i < tu->tp.nr_args; i++) 605 for (i = 0; i < tu->tp.nr_args; i++)
607 seq_printf(m, " %s=%s", tu->tp.args[i].name, tu->tp.args[i].comm); 606 seq_printf(m, " %s=%s", tu->tp.args[i].name, tu->tp.args[i].comm);
608 607
609 seq_printf(m, "\n"); 608 seq_putc(m, '\n');
610 return 0; 609 return 0;
611} 610}
612 611
@@ -852,16 +851,14 @@ print_uprobe_event(struct trace_iterator *iter, int flags, struct trace_event *e
852 tu = container_of(event, struct trace_uprobe, tp.call.event); 851 tu = container_of(event, struct trace_uprobe, tp.call.event);
853 852
854 if (is_ret_probe(tu)) { 853 if (is_ret_probe(tu)) {
855 if (!trace_seq_printf(s, "%s: (0x%lx <- 0x%lx)", 854 trace_seq_printf(s, "%s: (0x%lx <- 0x%lx)",
856 ftrace_event_name(&tu->tp.call), 855 ftrace_event_name(&tu->tp.call),
857 entry->vaddr[1], entry->vaddr[0])) 856 entry->vaddr[1], entry->vaddr[0]);
858 goto partial;
859 data = DATAOF_TRACE_ENTRY(entry, true); 857 data = DATAOF_TRACE_ENTRY(entry, true);
860 } else { 858 } else {
861 if (!trace_seq_printf(s, "%s: (0x%lx)", 859 trace_seq_printf(s, "%s: (0x%lx)",
862 ftrace_event_name(&tu->tp.call), 860 ftrace_event_name(&tu->tp.call),
863 entry->vaddr[0])) 861 entry->vaddr[0]);
864 goto partial;
865 data = DATAOF_TRACE_ENTRY(entry, false); 862 data = DATAOF_TRACE_ENTRY(entry, false);
866 } 863 }
867 864
@@ -869,14 +866,13 @@ print_uprobe_event(struct trace_iterator *iter, int flags, struct trace_event *e
869 struct probe_arg *parg = &tu->tp.args[i]; 866 struct probe_arg *parg = &tu->tp.args[i];
870 867
871 if (!parg->type->print(s, parg->name, data + parg->offset, entry)) 868 if (!parg->type->print(s, parg->name, data + parg->offset, entry))
872 goto partial; 869 goto out;
873 } 870 }
874 871
875 if (trace_seq_puts(s, "\n")) 872 trace_seq_putc(s, '\n');
876 return TRACE_TYPE_HANDLED;
877 873
878partial: 874 out:
879 return TRACE_TYPE_PARTIAL_LINE; 875 return trace_handle_return(s);
880} 876}
881 877
882typedef bool (*filter_func_t)(struct uprobe_consumer *self, 878typedef bool (*filter_func_t)(struct uprobe_consumer *self,