Diffstat (limited to 'kernel')
162 files changed, 15552 insertions, 10806 deletions
diff --git a/kernel/Makefile b/kernel/Makefile
index 057472fbc272..0b5ff083fa22 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -10,8 +10,7 @@ obj-y = sched.o fork.o exec_domain.o panic.o printk.o \
 	    kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \
 	    hrtimer.o rwsem.o nsproxy.o srcu.o semaphore.o \
 	    notifier.o ksysfs.o pm_qos_params.o sched_clock.o cred.o \
-	    async.o range.o
-obj-$(CONFIG_HAVE_EARLY_RES) += early_res.o
+	    async.o range.o jump_label.o
 obj-y += groups.o
 
 ifdef CONFIG_FUNCTION_TRACER
@@ -23,6 +22,7 @@ CFLAGS_REMOVE_rtmutex-debug.o = -pg
 CFLAGS_REMOVE_cgroup-debug.o = -pg
 CFLAGS_REMOVE_sched_clock.o = -pg
 CFLAGS_REMOVE_perf_event.o = -pg
+CFLAGS_REMOVE_irq_work.o = -pg
 endif
 
 obj-$(CONFIG_FREEZER) += freezer.o
@@ -70,14 +70,15 @@ obj-$(CONFIG_IKCONFIG) += configs.o
 obj-$(CONFIG_RESOURCE_COUNTERS) += res_counter.o
 obj-$(CONFIG_SMP) += stop_machine.o
 obj-$(CONFIG_KPROBES_SANITY_TEST) += test_kprobes.o
-obj-$(CONFIG_AUDIT) += audit.o auditfilter.o audit_watch.o
+obj-$(CONFIG_AUDIT) += audit.o auditfilter.o
 obj-$(CONFIG_AUDITSYSCALL) += auditsc.o
-obj-$(CONFIG_GCOV_KERNEL) += gcov/
+obj-$(CONFIG_AUDIT_WATCH) += audit_watch.o
 obj-$(CONFIG_AUDIT_TREE) += audit_tree.o
+obj-$(CONFIG_GCOV_KERNEL) += gcov/
 obj-$(CONFIG_KPROBES) += kprobes.o
 obj-$(CONFIG_KGDB) += debug/
-obj-$(CONFIG_DETECT_SOFTLOCKUP) += softlockup.o
 obj-$(CONFIG_DETECT_HUNG_TASK) += hung_task.o
+obj-$(CONFIG_LOCKUP_DETECTOR) += watchdog.o
 obj-$(CONFIG_GENERIC_HARDIRQS) += irq/
 obj-$(CONFIG_SECCOMP) += seccomp.o
 obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o
@@ -85,6 +86,7 @@ obj-$(CONFIG_TREE_RCU) += rcutree.o
 obj-$(CONFIG_TREE_PREEMPT_RCU) += rcutree.o
 obj-$(CONFIG_TREE_RCU_TRACE) += rcutree_trace.o
 obj-$(CONFIG_TINY_RCU) += rcutiny.o
+obj-$(CONFIG_TINY_PREEMPT_RCU) += rcutiny.o
 obj-$(CONFIG_RELAY) += relay.o
 obj-$(CONFIG_SYSCTL) += utsname_sysctl.o
 obj-$(CONFIG_TASK_DELAY_ACCT) += delayacct.o
@@ -99,8 +101,7 @@ obj-$(CONFIG_TRACING) += trace/
 obj-$(CONFIG_X86_DS) += trace/
 obj-$(CONFIG_RING_BUFFER) += trace/
 obj-$(CONFIG_SMP) += sched_cpupri.o
-obj-$(CONFIG_SLOW_WORK) += slow-work.o
-obj-$(CONFIG_SLOW_WORK_DEBUG) += slow-work-debugfs.o
+obj-$(CONFIG_IRQ_WORK) += irq_work.o
 obj-$(CONFIG_PERF_EVENTS) += perf_event.o
 obj-$(CONFIG_HAVE_HW_BREAKPOINT) += hw_breakpoint.o
 obj-$(CONFIG_USER_RETURN_NOTIFIER) += user-return-notifier.o
diff --git a/kernel/acct.c b/kernel/acct.c
index 385b88461c29..fa7eb3de2ddc 100644
--- a/kernel/acct.c
+++ b/kernel/acct.c
@@ -122,7 +122,7 @@ static int check_free_space(struct bsd_acct_struct *acct, struct file *file)
 	spin_unlock(&acct_lock);
 
 	/* May block */
-	if (vfs_statfs(file->f_path.dentry, &sbuf))
+	if (vfs_statfs(&file->f_path, &sbuf))
 		return res;
 	suspend = sbuf.f_blocks * SUSPEND;
 	resume = sbuf.f_blocks * RESUME;
diff --git a/kernel/async.c b/kernel/async.c
index 15319d6c18fe..cd9dbb913c77 100644
--- a/kernel/async.c
+++ b/kernel/async.c
@@ -49,40 +49,33 @@ asynchronous and synchronous parts of the kernel.
 */
 
 #include <linux/async.h>
-#include <linux/bug.h>
 #include <linux/module.h>
 #include <linux/wait.h>
 #include <linux/sched.h>
-#include <linux/init.h>
-#include <linux/kthread.h>
-#include <linux/delay.h>
 #include <linux/slab.h>
+#include <linux/workqueue.h>
 #include <asm/atomic.h>
 
 static async_cookie_t next_cookie = 1;
 
-#define MAX_THREADS 256
 #define MAX_WORK 32768
 
 static LIST_HEAD(async_pending);
 static LIST_HEAD(async_running);
 static DEFINE_SPINLOCK(async_lock);
 
-static int async_enabled = 0;
-
 struct async_entry {
 	struct list_head list;
-	async_cookie_t cookie;
-	async_func_ptr *func;
-	void *data;
-	struct list_head *running;
+	struct work_struct work;
+	async_cookie_t cookie;
+	async_func_ptr *func;
+	void *data;
+	struct list_head *running;
 };
 
 static DECLARE_WAIT_QUEUE_HEAD(async_done);
-static DECLARE_WAIT_QUEUE_HEAD(async_new);
 
 static atomic_t entry_count;
-static atomic_t thread_count;
 
 extern int initcall_debug;
 
@@ -117,27 +110,23 @@ static async_cookie_t lowest_in_progress(struct list_head *running)
 	spin_unlock_irqrestore(&async_lock, flags);
 	return ret;
 }
+
 /*
  * pick the first pending entry and run it
  */
-static void run_one_entry(void)
+static void async_run_entry_fn(struct work_struct *work)
 {
+	struct async_entry *entry =
+		container_of(work, struct async_entry, work);
 	unsigned long flags;
-	struct async_entry *entry;
 	ktime_t calltime, delta, rettime;
 
-	/* 1) pick one task from the pending queue */
-
+	/* 1) move self to the running queue */
 	spin_lock_irqsave(&async_lock, flags);
-	if (list_empty(&async_pending))
-		goto out;
-	entry = list_first_entry(&async_pending, struct async_entry, list);
-
-	/* 2) move it to the running queue */
 	list_move_tail(&entry->list, entry->running);
 	spin_unlock_irqrestore(&async_lock, flags);
 
-	/* 3) run it (and print duration)*/
+	/* 2) run (and print duration) */
 	if (initcall_debug && system_state == SYSTEM_BOOTING) {
 		printk("calling %lli_%pF @ %i\n", (long long)entry->cookie,
 			entry->func, task_pid_nr(current));
@@ -153,31 +142,25 @@ static void run_one_entry(void)
 			(long long)ktime_to_ns(delta) >> 10);
 	}
 
-	/* 4) remove it from the running queue */
+	/* 3) remove self from the running queue */
 	spin_lock_irqsave(&async_lock, flags);
 	list_del(&entry->list);
 
-	/* 5) free the entry */
+	/* 4) free the entry */
 	kfree(entry);
 	atomic_dec(&entry_count);
 
 	spin_unlock_irqrestore(&async_lock, flags);
 
-	/* 6) wake up any waiters. */
+	/* 5) wake up any waiters */
 	wake_up(&async_done);
-	return;
-
-out:
-	spin_unlock_irqrestore(&async_lock, flags);
 }
 
-
 static async_cookie_t __async_schedule(async_func_ptr *ptr, void *data, struct list_head *running)
 {
 	struct async_entry *entry;
 	unsigned long flags;
 	async_cookie_t newcookie;
-
 
 	/* allow irq-off callers */
 	entry = kzalloc(sizeof(struct async_entry), GFP_ATOMIC);
@@ -186,7 +169,7 @@ static async_cookie_t __async_schedule(async_func_ptr *ptr, void *data, struct l
 	 * If we're out of memory or if there's too much work
 	 * pending already, we execute synchronously.
 	 */
-	if (!async_enabled || !entry || atomic_read(&entry_count) > MAX_WORK) {
+	if (!entry || atomic_read(&entry_count) > MAX_WORK) {
 		kfree(entry);
 		spin_lock_irqsave(&async_lock, flags);
 		newcookie = next_cookie++;
@@ -196,6 +179,7 @@ static async_cookie_t __async_schedule(async_func_ptr *ptr, void *data, struct l
 		ptr(data, newcookie);
 		return newcookie;
 	}
+	INIT_WORK(&entry->work, async_run_entry_fn);
 	entry->func = ptr;
 	entry->data = data;
 	entry->running = running;
@@ -205,7 +189,10 @@ static async_cookie_t __async_schedule(async_func_ptr *ptr, void *data, struct l
 	list_add_tail(&entry->list, &async_pending);
 	atomic_inc(&entry_count);
 	spin_unlock_irqrestore(&async_lock, flags);
-	wake_up(&async_new);
+
+	/* schedule for execution */
+	queue_work(system_unbound_wq, &entry->work);
+
 	return newcookie;
 }
 
@@ -312,87 +299,3 @@ void async_synchronize_cookie(async_cookie_t cookie)
 	async_synchronize_cookie_domain(cookie, &async_running);
 }
 EXPORT_SYMBOL_GPL(async_synchronize_cookie);
-
-
-static int async_thread(void *unused)
-{
-	DECLARE_WAITQUEUE(wq, current);
-	add_wait_queue(&async_new, &wq);
-
-	while (!kthread_should_stop()) {
-		int ret = HZ;
-		set_current_state(TASK_INTERRUPTIBLE);
-		/*
-		 * check the list head without lock.. false positives
-		 * are dealt with inside run_one_entry() while holding
-		 * the lock.
-		 */
-		rmb();
-		if (!list_empty(&async_pending))
-			run_one_entry();
-		else
-			ret = schedule_timeout(HZ);
-
-		if (ret == 0) {
-			/*
-			 * we timed out, this means we as thread are redundant.
-			 * we sign off and die, but we to avoid any races there
-			 * is a last-straw check to see if work snuck in.
-			 */
-			atomic_dec(&thread_count);
-			wmb(); /* manager must see our departure first */
-			if (list_empty(&async_pending))
-				break;
-			/*
-			 * woops work came in between us timing out and us
-			 * signing off; we need to stay alive and keep working.
-			 */
-			atomic_inc(&thread_count);
-		}
-	}
-	remove_wait_queue(&async_new, &wq);
-
-	return 0;
-}
-
-static int async_manager_thread(void *unused)
-{
-	DECLARE_WAITQUEUE(wq, current);
-	add_wait_queue(&async_new, &wq);
-
-	while (!kthread_should_stop()) {
-		int tc, ec;
-
-		set_current_state(TASK_INTERRUPTIBLE);
-
-		tc = atomic_read(&thread_count);
-		rmb();
-		ec = atomic_read(&entry_count);
-
-		while (tc < ec && tc < MAX_THREADS) {
-			if (IS_ERR(kthread_run(async_thread, NULL, "async/%i",
-					       tc))) {
-				msleep(100);
-				continue;
-			}
-			atomic_inc(&thread_count);
-			tc++;
-		}
-
-		schedule();
-	}
-	remove_wait_queue(&async_new, &wq);
-
-	return 0;
-}
-
-static int __init async_init(void)
-{
-	async_enabled =
-		!IS_ERR(kthread_run(async_manager_thread, NULL, "async/mgr"));
-
-	WARN_ON(!async_enabled);
-	return 0;
-}
-
-core_initcall(async_init);
diff --git a/kernel/audit.c b/kernel/audit.c
index c71bd26631a2..d96045789b54 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -56,7 +56,6 @@
 #include <net/netlink.h>
 #include <linux/skbuff.h>
 #include <linux/netlink.h>
-#include <linux/inotify.h>
 #include <linux/freezer.h>
 #include <linux/tty.h>
 
@@ -407,7 +406,7 @@ static void kauditd_send_skb(struct sk_buff *skb)
 		audit_hold_skb(skb);
 	} else
 		/* drop the extra reference if sent ok */
-		kfree_skb(skb);
+		consume_skb(skb);
 }
 
 static int kauditd_thread(void *dummy)
diff --git a/kernel/audit.h b/kernel/audit.h
index 208687be4f30..f7206db4e13d 100644
--- a/kernel/audit.h
+++ b/kernel/audit.h
@@ -103,21 +103,27 @@ extern struct mutex audit_filter_mutex;
 extern void audit_free_rule_rcu(struct rcu_head *);
 extern struct list_head audit_filter_list[];
 
+extern struct audit_entry *audit_dupe_rule(struct audit_krule *old);
+
 /* audit watch functions */
-extern unsigned long audit_watch_inode(struct audit_watch *watch);
-extern dev_t audit_watch_dev(struct audit_watch *watch);
+#ifdef CONFIG_AUDIT_WATCH
 extern void audit_put_watch(struct audit_watch *watch);
 extern void audit_get_watch(struct audit_watch *watch);
 extern int audit_to_watch(struct audit_krule *krule, char *path, int len, u32 op);
-extern int audit_add_watch(struct audit_krule *krule);
-extern void audit_remove_watch(struct audit_watch *watch);
-extern void audit_remove_watch_rule(struct audit_krule *krule, struct list_head *list);
-extern void audit_inotify_unregister(struct list_head *in_list);
+extern int audit_add_watch(struct audit_krule *krule, struct list_head **list);
+extern void audit_remove_watch_rule(struct audit_krule *krule);
 extern char *audit_watch_path(struct audit_watch *watch);
-extern struct list_head *audit_watch_rules(struct audit_watch *watch);
-
-extern struct audit_entry *audit_dupe_rule(struct audit_krule *old,
-					   struct audit_watch *watch);
+extern int audit_watch_compare(struct audit_watch *watch, unsigned long ino, dev_t dev);
+#else
+#define audit_put_watch(w) {}
+#define audit_get_watch(w) {}
+#define audit_to_watch(k, p, l, o) (-EINVAL)
+#define audit_add_watch(k, l) (-EINVAL)
+#define audit_remove_watch_rule(k) BUG()
+#define audit_watch_path(w) ""
+#define audit_watch_compare(w, i, d) 0
+
+#endif /* CONFIG_AUDIT_WATCH */
 
 #ifdef CONFIG_AUDIT_TREE
 extern struct audit_chunk *audit_tree_lookup(const struct inode *);
diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c
index 46a57b57a335..7f18d3a4527e 100644
--- a/kernel/audit_tree.c
+++ b/kernel/audit_tree.c
@@ -1,5 +1,5 @@
 #include "audit.h"
-#include <linux/inotify.h>
+#include <linux/fsnotify_backend.h>
 #include <linux/namei.h>
 #include <linux/mount.h>
 #include <linux/kthread.h>
@@ -22,7 +22,7 @@ struct audit_tree {
 
 struct audit_chunk {
 	struct list_head hash;
-	struct inotify_watch watch;
+	struct fsnotify_mark mark;
 	struct list_head trees; /* with root here */
 	int dead;
 	int count;
@@ -59,7 +59,7 @@ static LIST_HEAD(prune_list);
  * tree is refcounted; one reference for "some rules on rules_list refer to
  * it", one for each chunk with pointer to it.
  *
- * chunk is refcounted by embedded inotify_watch + .refs (non-zero refcount
+ * chunk is refcounted by embedded fsnotify_mark + .refs (non-zero refcount
  * of watch contributes 1 to .refs).
 *
 * node.index allows to get from node.list to containing chunk.
@@ -68,7 +68,7 @@ static LIST_HEAD(prune_list);
 * that makes a difference. Some.
 */
 
-static struct inotify_handle *rtree_ih;
+static struct fsnotify_group *audit_tree_group;
 
 static struct audit_tree *alloc_tree(const char *s)
 {
@@ -111,29 +111,6 @@ const char *audit_tree_path(struct audit_tree *tree)
 	return tree->pathname;
 }
 
-static struct audit_chunk *alloc_chunk(int count)
-{
-	struct audit_chunk *chunk;
-	size_t size;
-	int i;
-
-	size = offsetof(struct audit_chunk, owners) + count * sizeof(struct node);
-	chunk = kzalloc(size, GFP_KERNEL);
-	if (!chunk)
-		return NULL;
-
-	INIT_LIST_HEAD(&chunk->hash);
-	INIT_LIST_HEAD(&chunk->trees);
-	chunk->count = count;
-	atomic_long_set(&chunk->refs, 1);
-	for (i = 0; i < count; i++) {
-		INIT_LIST_HEAD(&chunk->owners[i].list);
-		chunk->owners[i].index = i;
-	}
-	inotify_init_watch(&chunk->watch);
-	return chunk;
-}
-
 static void free_chunk(struct audit_chunk *chunk)
 {
 	int i;
@@ -157,6 +134,35 @@ static void __put_chunk(struct rcu_head *rcu)
 	audit_put_chunk(chunk);
 }
 
+static void audit_tree_destroy_watch(struct fsnotify_mark *entry)
+{
+	struct audit_chunk *chunk = container_of(entry, struct audit_chunk, mark);
+	call_rcu(&chunk->head, __put_chunk);
+}
+
+static struct audit_chunk *alloc_chunk(int count)
+{
+	struct audit_chunk *chunk;
+	size_t size;
+	int i;
+
+	size = offsetof(struct audit_chunk, owners) + count * sizeof(struct node);
+	chunk = kzalloc(size, GFP_KERNEL);
+	if (!chunk)
+		return NULL;
+
+	INIT_LIST_HEAD(&chunk->hash);
+	INIT_LIST_HEAD(&chunk->trees);
+	chunk->count = count;
+	atomic_long_set(&chunk->refs, 1);
+	for (i = 0; i < count; i++) {
+		INIT_LIST_HEAD(&chunk->owners[i].list);
+		chunk->owners[i].index = i;
+	}
+	fsnotify_init_mark(&chunk->mark, audit_tree_destroy_watch);
+	return chunk;
+}
+
 enum {HASH_SIZE = 128};
 static struct list_head chunk_hash_heads[HASH_SIZE];
 static __cacheline_aligned_in_smp DEFINE_SPINLOCK(hash_lock);
@@ -167,10 +173,15 @@ static inline struct list_head *chunk_hash(const struct inode *inode)
 	return chunk_hash_heads + n % HASH_SIZE;
 }
 
-/* hash_lock is held by caller */
+/* hash_lock & entry->lock is held by caller */
 static void insert_hash(struct audit_chunk *chunk)
 {
-	struct list_head *list = chunk_hash(chunk->watch.inode);
+	struct fsnotify_mark *entry = &chunk->mark;
+	struct list_head *list;
+
+	if (!entry->i.inode)
+		return;
+	list = chunk_hash(entry->i.inode);
 	list_add_rcu(&chunk->hash, list);
 }
 
@@ -181,7 +192,8 @@ struct audit_chunk *audit_tree_lookup(const struct inode *inode)
 	struct audit_chunk *p;
 
 	list_for_each_entry_rcu(p, list, hash) {
-		if (p->watch.inode == inode) {
+		/* mark.inode may have gone NULL, but who cares? */
+		if (p->mark.i.inode == inode) {
 			atomic_long_inc(&p->refs);
 			return p;
 		}
@@ -210,38 +222,19 @@ static struct audit_chunk *find_chunk(struct node *p)
 static void untag_chunk(struct node *p)
 {
 	struct audit_chunk *chunk = find_chunk(p);
+	struct fsnotify_mark *entry = &chunk->mark;
 	struct audit_chunk *new;
 	struct audit_tree *owner;
 	int size = chunk->count - 1;
 	int i, j;
 
-	if (!pin_inotify_watch(&chunk->watch)) {
-		/*
-		 * Filesystem is shutting down; all watches are getting
-		 * evicted, just take it off the node list for this
-		 * tree and let the eviction logics take care of the
-		 * rest.
-		 */
-		owner = p->owner;
-		if (owner->root == chunk) {
-			list_del_init(&owner->same_root);
-			owner->root = NULL;
-		}
-		list_del_init(&p->list);
-		p->owner = NULL;
-		put_tree(owner);
-		return;
-	}
+	fsnotify_get_mark(entry);
 
 	spin_unlock(&hash_lock);
 
-	/*
-	 * pin_inotify_watch() succeeded, so the watch won't go away
-	 * from under us.
-	 */
-	mutex_lock(&chunk->watch.inode->inotify_mutex);
-	if (chunk->dead) {
-		mutex_unlock(&chunk->watch.inode->inotify_mutex);
+	spin_lock(&entry->lock);
+	if (chunk->dead || !entry->i.inode) {
+		spin_unlock(&entry->lock);
 		goto out;
 	}
 
@@ -256,16 +249,17 @@ static void untag_chunk(struct node *p)
 		list_del_init(&p->list);
 		list_del_rcu(&chunk->hash);
 		spin_unlock(&hash_lock);
-		inotify_evict_watch(&chunk->watch);
-		mutex_unlock(&chunk->watch.inode->inotify_mutex);
-		put_inotify_watch(&chunk->watch);
+		spin_unlock(&entry->lock);
+		fsnotify_destroy_mark(entry);
+		fsnotify_put_mark(entry);
 		goto out;
 	}
 
 	new = alloc_chunk(size);
 	if (!new)
 		goto Fallback;
-	if (inotify_clone_watch(&chunk->watch, &new->watch) < 0) {
+	fsnotify_duplicate_mark(&new->mark, entry);
+	if (fsnotify_add_mark(&new->mark, new->mark.group, new->mark.i.inode, NULL, 1)) {
 		free_chunk(new);
 		goto Fallback;
 	}
@@ -298,9 +292,9 @@ static void untag_chunk(struct node *p)
 	list_for_each_entry(owner, &new->trees, same_root)
 		owner->root = new;
 	spin_unlock(&hash_lock);
-	inotify_evict_watch(&chunk->watch);
-	mutex_unlock(&chunk->watch.inode->inotify_mutex);
-	put_inotify_watch(&chunk->watch);
+	spin_unlock(&entry->lock);
+	fsnotify_destroy_mark(entry);
+	fsnotify_put_mark(entry);
 	goto out;
 
 Fallback:
@@ -314,31 +308,33 @@ Fallback:
 	p->owner = NULL;
 	put_tree(owner);
 	spin_unlock(&hash_lock);
-	mutex_unlock(&chunk->watch.inode->inotify_mutex);
+	spin_unlock(&entry->lock);
 out:
-	unpin_inotify_watch(&chunk->watch);
+	fsnotify_put_mark(entry);
 	spin_lock(&hash_lock);
 }
 
 static int create_chunk(struct inode *inode, struct audit_tree *tree)
 {
+	struct fsnotify_mark *entry;
 	struct audit_chunk *chunk = alloc_chunk(1);
 	if (!chunk)
 		return -ENOMEM;
 
-	if (inotify_add_watch(rtree_ih, &chunk->watch, inode, IN_IGNORED | IN_DELETE_SELF) < 0) {
+	entry = &chunk->mark;
+	if (fsnotify_add_mark(entry, audit_tree_group, inode, NULL, 0)) {
 		free_chunk(chunk);
 		return -ENOSPC;
 	}
 
-	mutex_lock(&inode->inotify_mutex);
+	spin_lock(&entry->lock);
 	spin_lock(&hash_lock);
 	if (tree->goner) {
 		spin_unlock(&hash_lock);
 		chunk->dead = 1;
-		inotify_evict_watch(&chunk->watch);
-		mutex_unlock(&inode->inotify_mutex);
-		put_inotify_watch(&chunk->watch);
+		spin_unlock(&entry->lock);
+		fsnotify_destroy_mark(entry);
+		fsnotify_put_mark(entry);
 		return 0;
 	}
 	chunk->owners[0].index = (1U << 31);
@@ -351,30 +347,31 @@ static int create_chunk(struct inode *inode, struct audit_tree *tree)
 	}
 	insert_hash(chunk);
 	spin_unlock(&hash_lock);
-	mutex_unlock(&inode->inotify_mutex);
+	spin_unlock(&entry->lock);
 	return 0;
 }
 
 /* the first tagged inode becomes root of tree */
 static int tag_chunk(struct inode *inode, struct audit_tree *tree)
 {
-	struct inotify_watch *watch;
+	struct fsnotify_mark *old_entry, *chunk_entry;
 	struct audit_tree *owner;
 	struct audit_chunk *chunk, *old;
 	struct node *p;
 	int n;
 
-	if (inotify_find_watch(rtree_ih, inode, &watch) < 0)
+	old_entry = fsnotify_find_inode_mark(audit_tree_group, inode);
+	if (!old_entry)
 		return create_chunk(inode, tree);
 
-	old = container_of(watch, struct audit_chunk, watch);
+	old = container_of(old_entry, struct audit_chunk, mark);
 
 	/* are we already there? */
 	spin_lock(&hash_lock);
 	for (n = 0; n < old->count; n++) {
 		if (old->owners[n].owner == tree) {
 			spin_unlock(&hash_lock);
-			put_inotify_watch(&old->watch);
+			fsnotify_put_mark(old_entry);
 			return 0;
 		}
 	}
@@ -382,25 +379,44 @@ static int tag_chunk(struct inode *inode, struct audit_tree *tree)
 
 	chunk = alloc_chunk(old->count + 1);
 	if (!chunk) {
-		put_inotify_watch(&old->watch);
+		fsnotify_put_mark(old_entry);
 		return -ENOMEM;
 	}
 
-	mutex_lock(&inode->inotify_mutex);
-	if (inotify_clone_watch(&old->watch, &chunk->watch) < 0) {
-		mutex_unlock(&inode->inotify_mutex);
-		put_inotify_watch(&old->watch);
+	chunk_entry = &chunk->mark;
+
+	spin_lock(&old_entry->lock);
+	if (!old_entry->i.inode) {
+		/* old_entry is being shot, lets just lie */
+		spin_unlock(&old_entry->lock);
+		fsnotify_put_mark(old_entry);
 		free_chunk(chunk);
+		return -ENOENT;
+	}
+
+	fsnotify_duplicate_mark(chunk_entry, old_entry);
+	if (fsnotify_add_mark(chunk_entry, chunk_entry->group, chunk_entry->i.inode, NULL, 1)) {
+		spin_unlock(&old_entry->lock);
+		free_chunk(chunk);
+		fsnotify_put_mark(old_entry);
 		return -ENOSPC;
 	}
+
+	/* even though we hold old_entry->lock, this is safe since chunk_entry->lock could NEVER have been grabbed before */
+	spin_lock(&chunk_entry->lock);
 	spin_lock(&hash_lock);
+
+	/* we now hold old_entry->lock, chunk_entry->lock, and hash_lock */
 	if (tree->goner) {
 		spin_unlock(&hash_lock);
 		chunk->dead = 1;
-		inotify_evict_watch(&chunk->watch);
-		mutex_unlock(&inode->inotify_mutex);
-		put_inotify_watch(&old->watch);
-		put_inotify_watch(&chunk->watch);
+		spin_unlock(&chunk_entry->lock);
+		spin_unlock(&old_entry->lock);
+
+		fsnotify_destroy_mark(chunk_entry);
+
+		fsnotify_put_mark(chunk_entry);
+		fsnotify_put_mark(old_entry);
 		return 0;
 	}
 	list_replace_init(&old->trees, &chunk->trees);
@@ -426,10 +442,11 @@ static int tag_chunk(struct inode *inode, struct audit_tree *tree)
 		list_add(&tree->same_root, &chunk->trees);
 	}
 	spin_unlock(&hash_lock);
-	inotify_evict_watch(&old->watch);
-	mutex_unlock(&inode->inotify_mutex);
-	put_inotify_watch(&old->watch); /* pair to inotify_find_watch */
-	put_inotify_watch(&old->watch); /* and kill it */
+	spin_unlock(&chunk_entry->lock);
+	spin_unlock(&old_entry->lock);
+	fsnotify_destroy_mark(old_entry);
+	fsnotify_put_mark(old_entry); /* pair to fsnotify_find mark_entry */
+	fsnotify_put_mark(old_entry); /* and kill it */
 	return 0;
 }
 
@@ -584,7 +601,9 @@ void audit_trim_trees(void)
 
 		spin_lock(&hash_lock);
 		list_for_each_entry(node, &tree->chunks, list) {
-			struct inode *inode = find_chunk(node)->watch.inode;
+			struct audit_chunk *chunk = find_chunk(node);
+			/* this could be NULL if the watch is dieing else where... */
+			struct inode *inode = chunk->mark.i.inode;
 			node->index |= 1U<<31;
 			if (iterate_mounts(compare_root, inode, root_mnt))
 				node->index &= ~(1U<<31);
@@ -846,7 +865,6 @@ void audit_kill_trees(struct list_head *list)
 * Here comes the stuff asynchronous to auditctl operations
 */
 
-/* inode->inotify_mutex is locked */
 static void evict_chunk(struct audit_chunk *chunk)
 {
 	struct audit_tree *owner;
@@ -885,35 +903,46 @@ static void evict_chunk(struct audit_chunk *chunk)
 	mutex_unlock(&audit_filter_mutex);
 }
 
-static void handle_event(struct inotify_watch *watch, u32 wd, u32 mask,
-			 u32 cookie, const char *dname, struct inode *inode)
+static int audit_tree_handle_event(struct fsnotify_group *group,
+				   struct fsnotify_mark *inode_mark,
+				   struct fsnotify_mark *vfsmonut_mark,
+				   struct fsnotify_event *event)
+{
+	BUG();
+	return -EOPNOTSUPP;
+}
+
+static void audit_tree_freeing_mark(struct fsnotify_mark *entry, struct fsnotify_group *group)
 {
-	struct audit_chunk *chunk = container_of(watch, struct audit_chunk, watch);
+	struct audit_chunk *chunk = container_of(entry, struct audit_chunk, mark);
 
-	if (mask & IN_IGNORED) {
-		evict_chunk(chunk);
-		put_inotify_watch(watch);
-	}
+	evict_chunk(chunk);
+	fsnotify_put_mark(entry);
 }
 
-static void destroy_watch(struct inotify_watch *watch)
+static bool audit_tree_send_event(struct fsnotify_group *group, struct inode *inode,
+				  struct fsnotify_mark *inode_mark,
+				  struct fsnotify_mark *vfsmount_mark,
+				  __u32 mask, void *data, int data_type)
 {
-	struct audit_chunk *chunk = container_of(watch, struct audit_chunk, watch);
-	call_rcu(&chunk->head, __put_chunk);
+	return false;
 }
 
-static const struct inotify_operations rtree_inotify_ops = {
-	.handle_event = handle_event,
-	.destroy_watch = destroy_watch,
+static const struct fsnotify_ops audit_tree_ops = {
+	.handle_event = audit_tree_handle_event,
+	.should_send_event = audit_tree_send_event,
+	.free_group_priv = NULL,
+	.free_event_priv = NULL,
+	.freeing_mark = audit_tree_freeing_mark,
 };
 
 static int __init audit_tree_init(void)
 {
 	int i;
 
-	rtree_ih = inotify_init(&rtree_inotify_ops);
-	if (IS_ERR(rtree_ih))
-		audit_panic("cannot initialize inotify handle for rectree watches");
+	audit_tree_group = fsnotify_alloc_group(&audit_tree_ops);
+	if (IS_ERR(audit_tree_group))
+		audit_panic("cannot initialize fsnotify group for rectree watches");
 
 	for (i = 0; i < HASH_SIZE; i++)
 		INIT_LIST_HEAD(&chunk_hash_heads[i]);
diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c
index 8df43696f4ba..f0c9b2e7542d 100644
--- a/kernel/audit_watch.c
+++ b/kernel/audit_watch.c
@@ -24,18 +24,18 @@
 #include <linux/kthread.h>
 #include <linux/mutex.h>
 #include <linux/fs.h>
+#include <linux/fsnotify_backend.h>
 #include <linux/namei.h>
 #include <linux/netlink.h>
 #include <linux/sched.h>
 #include <linux/slab.h>
-#include <linux/inotify.h>
 #include <linux/security.h>
 #include "audit.h"
 
 /*
 * Reference counting:
 *
- * audit_parent: lifetime is from audit_init_parent() to receipt of an IN_IGNORED
+ * audit_parent: lifetime is from audit_init_parent() to receipt of an FS_IGNORED
 * event. Each audit_watch holds a reference to its associated parent.
 *
 * audit_watch: if added to lists, lifetime is from audit_init_watch() to
@@ -51,40 +51,61 @@ struct audit_watch {
 	unsigned long ino;	/* associated inode number */
 	struct audit_parent *parent; /* associated parent */
 	struct list_head wlist;	/* entry in parent->watches list */
-	struct list_head rules;	/* associated rules */
+	struct list_head rules;	/* anchor for krule->rlist */
 };
 
 struct audit_parent {
-	struct list_head ilist;	/* entry in inotify registration list */
-	struct list_head watches; /* associated watches */
-	struct inotify_watch wdata; /* inotify watch data */
-	unsigned flags;	/* status flags */
+	struct list_head watches; /* anchor for audit_watch->wlist */
+	struct fsnotify_mark mark; /* fsnotify mark on the inode */
 };
 
-/* Inotify handle. */
-struct inotify_handle *audit_ih;
+/* fsnotify handle. */
+struct fsnotify_group *audit_watch_group;
 
-/*
- * audit_parent status flags:
- *
- * AUDIT_PARENT_INVALID - set anytime rules/watches are auto-removed due to
- * a filesystem event to ensure we're adding audit watches to a valid parent.
- * Technically not needed for IN_DELETE_SELF or IN_UNMOUNT events, as we cannot
- * receive them while we have nameidata, but must be used for IN_MOVE_SELF which
- * we can receive while holding nameidata.
- */
-#define AUDIT_PARENT_INVALID 0x001
+/* fsnotify events we care about. */
+#define AUDIT_FS_WATCH (FS_MOVE | FS_CREATE | FS_DELETE | FS_DELETE_SELF |\
+			FS_MOVE_SELF | FS_EVENT_ON_CHILD)
 
-/* Inotify events we care about. */
-#define AUDIT_IN_WATCH IN_MOVE|IN_CREATE|IN_DELETE|IN_DELETE_SELF|IN_MOVE_SELF
+static void audit_free_parent(struct audit_parent *parent)
+{
+	WARN_ON(!list_empty(&parent->watches));
+	kfree(parent);
+}
 
-static void audit_free_parent(struct inotify_watch *i_watch)
+static void audit_watch_free_mark(struct fsnotify_mark *entry)
 {
 	struct audit_parent *parent;
 
-	parent = container_of(i_watch, struct audit_parent, wdata);
-	WARN_ON(!list_empty(&parent->watches));
-	kfree(parent);
+	parent = container_of(entry, struct audit_parent, mark);
+	audit_free_parent(parent);
+}
+
+static void audit_get_parent(struct audit_parent *parent)
+{
+	if (likely(parent))
+		fsnotify_get_mark(&parent->mark);
+}
+
+static void audit_put_parent(struct audit_parent *parent)
+{
+	if (likely(parent))
+		fsnotify_put_mark(&parent->mark);
+}
+
+/*
+ * Find and return the audit_parent on the given inode.  If found a reference
+ * is taken on this parent.
+ */
+static inline struct audit_parent *audit_find_parent(struct inode *inode)
+{
+	struct audit_parent *parent = NULL;
+	struct fsnotify_mark *entry;
+
+	entry = fsnotify_find_inode_mark(audit_watch_group, inode);
+	if (entry)
+		parent = container_of(entry, struct audit_parent, mark);
+
+	return parent;
 }
 
 void audit_get_watch(struct audit_watch *watch)
@@ -105,7 +126,7 @@ void audit_put_watch(struct audit_watch *watch)
 void audit_remove_watch(struct audit_watch *watch)
 {
 	list_del(&watch->wlist);
-	put_inotify_watch(&watch->parent->wdata);
+	audit_put_parent(watch->parent);
 	watch->parent = NULL;
 	audit_put_watch(watch); /* match initial get */
 }
@@ -115,42 +136,32 @@ char *audit_watch_path(struct audit_watch *watch)
 	return watch->path;
 }
 
-struct list_head *audit_watch_rules(struct audit_watch *watch)
-{
-	return &watch->rules;
-}
-
-unsigned long audit_watch_inode(struct audit_watch *watch)
+int audit_watch_compare(struct audit_watch *watch, unsigned long ino, dev_t dev)
 {
-	return watch->ino;
-}
-
-dev_t audit_watch_dev(struct audit_watch *watch)
-{
-	return watch->dev;
+	return (watch->ino != (unsigned long)-1) &&
+		(watch->ino == ino) &&
+		(watch->dev == dev);
 }
 
 /* Initialize a parent watch entry. */
 static struct audit_parent *audit_init_parent(struct nameidata *ndp)
 {
+	struct inode *inode = ndp->path.dentry->d_inode;
 	struct audit_parent *parent;
-	s32 wd;
+	int ret;
 
 	parent = kzalloc(sizeof(*parent), GFP_KERNEL);
 	if (unlikely(!parent))
 		return ERR_PTR(-ENOMEM);
 
 	INIT_LIST_HEAD(&parent->watches);
-	parent->flags = 0;
-
-	inotify_init_watch(&parent->wdata);
-	/* grab a ref so inotify watch hangs around until we take audit_filter_mutex */
-	get_inotify_watch(&parent->wdata);
-	wd = inotify_add_watch(audit_ih, &parent->wdata,
-			       ndp->path.dentry->d_inode, AUDIT_IN_WATCH);
-	if (wd < 0) {
-		audit_free_parent(&parent->wdata);
-		return ERR_PTR(wd);
+
+	fsnotify_init_mark(&parent->mark, audit_watch_free_mark);
+	parent->mark.mask = AUDIT_FS_WATCH;
+	ret = fsnotify_add_mark(&parent->mark, audit_watch_group, inode, NULL, 0);
+	if (ret < 0) {
+		audit_free_parent(parent);
+		return ERR_PTR(ret);
 	}
 
 	return parent;
@@ -179,7 +190,7 @@ int audit_to_watch(struct audit_krule *krule, char *path, int len, u32 op)
 {
 	struct audit_watch *watch;
 
-	if (!audit_ih)
+	if (!audit_watch_group)
 		return -EOPNOTSUPP;
 
 	if (path[0] != '/' || path[len-1] == '/' ||
@@ -217,7 +228,7 @@ static struct audit_watch *audit_dupe_watch(struct audit_watch *old)
 
 	new->dev = old->dev;
 	new->ino = old->ino;
-	get_inotify_watch(&old->parent->wdata);
+	audit_get_parent(old->parent);
 	new->parent = old->parent;
 
 out:
@@ -251,15 +262,19 @@ static void audit_update_watch(struct audit_parent *parent,
 	struct audit_entry *oentry, *nentry;
 
 	mutex_lock(&audit_filter_mutex);
+	/* Run all of the watches on this parent looking for the one that
+	 * matches the given dname */
 	list_for_each_entry_safe(owatch, nextw, &parent->watches, wlist) {
 		if (audit_compare_dname_path(dname, owatch->path, NULL))
 			continue;
 
 		/* If the update involves invalidating rules, do the inode-based
 		 * filtering now, so we don't omit records. */
-		if (invalidating && current->audit_context)
+		if (invalidating && !audit_dummy_context())
 			audit_filter_inodes(current, current->audit_context);
 
+		/* updating ino will likely change which audit_hash_list we
+		 * are on so we need a new watch for the new list */
 		nwatch = audit_dupe_watch(owatch);
 		if (IS_ERR(nwatch)) {
 			mutex_unlock(&audit_filter_mutex);
@@ -275,12 +290,21 @@ static void audit_update_watch(struct audit_parent *parent,
 		list_del(&oentry->rule.rlist);
 		list_del_rcu(&oentry->list);
 
-		nentry = audit_dupe_rule(&oentry->rule, nwatch);
+		nentry = audit_dupe_rule(&oentry->rule);
 		if (IS_ERR(nentry)) {
 			list_del(&oentry->rule.list);
 			audit_panic("error updating watch, removing");
 		} else {
 			int h = audit_hash_ino((u32)ino);
+
+			/*
+			 * nentry->rule.watch == oentry->rule.watch so
+			 * we must drop that reference and set it to our
+			 * new watch.
+			 */
+			audit_put_watch(nentry->rule.watch);
+			audit_get_watch(nwatch);
+			nentry->rule.watch = nwatch;
 			list_add(&nentry->rule.rlist, &nwatch->rules);
 			list_add_rcu(&nentry->list, &audit_inode_hash[h]);
 			list_replace(&oentry->rule.list,
@@ -312,7 +336,6 @@ static void audit_remove_parent_watches(struct audit_parent *parent)
 	struct audit_entry *e;
 
 	mutex_lock(&audit_filter_mutex);
-	parent->flags |= AUDIT_PARENT_INVALID;
 	list_for_each_entry_safe(w, nextw, &parent->watches, wlist) {
 		list_for_each_entry_safe(r, nextr, &w->rules, rlist) {
 			e = container_of(r, struct audit_entry, rule);
@@ -325,20 +348,8 @@ static void audit_remove_parent_watches(struct audit_parent *parent)
 		audit_remove_watch(w);
 	}
 	mutex_unlock(&audit_filter_mutex);
-}
-
-/* Unregister inotify watches for parents on in_list.
- * Generates an IN_IGNORED event. */
-void audit_inotify_unregister(struct list_head *in_list)
-{
-	struct audit_parent *p, *n;
 
-	list_for_each_entry_safe(p, n, in_list, ilist) {
-		list_del(&p->ilist);
-		inotify_rm_watch(audit_ih, &p->wdata);
-		/* the unpin matching the pin in audit_do_del_rule() */
-		unpin_inotify_watch(&p->wdata);
-	}
+	fsnotify_destroy_mark(&parent->mark);
 }
 
 /* Get path information necessary for adding watches. */
@@ -389,7 +400,7 @@ static void audit_put_nd(struct nameidata *ndp, struct nameidata *ndw)
 	}
 }
 
-/* Associate the given rule with an existing parent inotify_watch.
+/* Associate the given rule with an existing parent.
 * Caller must hold audit_filter_mutex. */
 static void audit_add_to_parent(struct audit_krule *krule,
 				struct audit_parent *parent)
@@ -397,6 +408,8 @@ static void audit_add_to_parent(struct audit_krule *krule,
 	struct audit_watch *w, *watch = krule->watch;
 	int watch_found = 0;
 
+	BUG_ON(!mutex_is_locked(&audit_filter_mutex));
+
 	list_for_each_entry(w, &parent->watches, wlist) {
 		if (strcmp(watch->path, w->path))
 			continue;
@@ -413,7 +426,7 @@ static void audit_add_to_parent(struct audit_krule *krule,
 	}
 
 	if (!watch_found) {
-		get_inotify_watch(&parent->wdata);
+		audit_get_parent(parent);
 		watch->parent = parent;
 
 		list_add(&watch->wlist, &parent->watches);
@@ -423,13 +436,12 @@
 
 /* Find a matching watch entry, or add this one.
 * Caller must hold audit_filter_mutex. */
-int audit_add_watch(struct audit_krule *krule)
+int audit_add_watch(struct audit_krule *krule, struct list_head **list)
 {
 	struct audit_watch *watch = krule->watch;
-	struct inotify_watch *i_watch;
 	struct audit_parent *parent;
 	struct nameidata *ndp = NULL, *ndw = NULL;
-	int ret = 0;
+	int h, ret = 0;
 
 	mutex_unlock(&audit_filter_mutex);
 
@@ -441,47 +453,38 @@ int audit_add_watch(struct audit_krule *krule)
 		goto error;
 	}
 
+	mutex_lock(&audit_filter_mutex);
+
 	/* update watch filter fields */
 	if (ndw) {
 		watch->dev = ndw->path.dentry->d_inode->i_sb->s_dev;
 		watch->ino = ndw->path.dentry->d_inode->i_ino;
 	}
 
-	/* The audit_filter_mutex must not be held during inotify calls because
-	 * we hold it during inotify event callback processing. If an existing
-	 * inotify watch is found, inotify_find_watch() grabs a reference before
-	 * returning.
-	 */
-	if (inotify_find_watch(audit_ih, ndp->path.dentry->d_inode,
-			       &i_watch) < 0) {
+	/* either find an old parent or attach a new one */
+	parent = audit_find_parent(ndp->path.dentry->d_inode);
+	if (!parent) {
 		parent = audit_init_parent(ndp);
 		if (IS_ERR(parent)) {
-			/* caller expects mutex locked */
-			mutex_lock(&audit_filter_mutex);
 			ret = PTR_ERR(parent);
 			goto error;
 		}
-	} else
-		parent = container_of(i_watch, struct audit_parent, wdata);
-
-	mutex_lock(&audit_filter_mutex);
+	}
 
-	/* parent was moved before we took audit_filter_mutex */
-	if (parent->flags & AUDIT_PARENT_INVALID)
-		ret = -ENOENT;
-	else
-		audit_add_to_parent(krule, parent);
+	audit_add_to_parent(krule, parent);
 
-	/* match get in audit_init_parent or inotify_find_watch */
-	put_inotify_watch(&parent->wdata);
+	/* match get in audit_find_parent or audit_init_parent */
+	audit_put_parent(parent);
 
+	h = audit_hash_ino((u32)watch->ino);
+	*list = &audit_inode_hash[h];
 error:
 	audit_put_nd(ndp, ndw); /* NULL args OK */
 	return ret;
 
 }
 
-void audit_remove_watch_rule(struct audit_krule *krule, struct list_head *list)
+void audit_remove_watch_rule(struct audit_krule *krule)
 {
 	struct audit_watch *watch = krule->watch;
 	struct audit_parent *parent = watch->parent;
@@ -492,53 +495,74 @@ void audit_remove_watch_rule(struct audit_krule *krule, struct list_head *list)
 		audit_remove_watch(watch);
 
 		if (list_empty(&parent->watches)) {
-			/* Put parent on the inotify un-registration
-			 * list.  Grab a reference before releasing
-			 * audit_filter_mutex, to be released in
-			 * audit_inotify_unregister().
-			 * If filesystem is going away, just leave
-			 * the sucker alone, eviction will take
-			 * care of it. */
-			if (pin_inotify_watch(&parent->wdata))
-				list_add(&parent->ilist, list);
+			audit_get_parent(parent);
+			fsnotify_destroy_mark(&parent->mark);
+			audit_put_parent(parent);
 		}
 	}
 }
 
-/* Update watch data in audit rules based on inotify events. */
-static void audit_handle_ievent(struct inotify_watch *i_watch, u32 wd, u32 mask,
-				u32 cookie, const char *dname, struct inode *inode)
+static bool audit_watch_should_send_event(struct fsnotify_group *group, struct inode *inode,
+					  struct fsnotify_mark *inode_mark,
+					  struct fsnotify_mark *vfsmount_mark,
+					  __u32 mask, void *data, int data_type)
+{
+	return true;
+}
+
+/* Update watch data in audit rules based on fsnotify events. */
+static int audit_watch_handle_event(struct fsnotify_group *group,
+				    struct fsnotify_mark *inode_mark,
+				    struct fsnotify_mark *vfsmount_mark,
+				    struct fsnotify_event *event)
 {
+	struct inode *inode;
520 | __u32 mask = event->mask; | ||
521 | const char *dname = event->file_name; | ||
512 | struct audit_parent *parent; | 522 | struct audit_parent *parent; |
513 | 523 | ||
514 | parent = container_of(i_watch, struct audit_parent, wdata); | 524 | parent = container_of(inode_mark, struct audit_parent, mark); |
515 | 525 | ||
516 | if (mask & (IN_CREATE|IN_MOVED_TO) && inode) | 526 | BUG_ON(group != audit_watch_group); |
517 | audit_update_watch(parent, dname, inode->i_sb->s_dev, | 527 | |
518 | inode->i_ino, 0); | 528 | switch (event->data_type) { |
519 | else if (mask & (IN_DELETE|IN_MOVED_FROM)) | 529 | case (FSNOTIFY_EVENT_PATH): |
530 | inode = event->path.dentry->d_inode; | ||
531 | break; | ||
532 | case (FSNOTIFY_EVENT_INODE): | ||
533 | inode = event->inode; | ||
534 | break; | ||
535 | default: | ||
536 | BUG(); | ||
537 | inode = NULL; | ||
538 | break; | ||
539 | }; | ||
540 | |||
541 | if (mask & (FS_CREATE|FS_MOVED_TO) && inode) | ||
542 | audit_update_watch(parent, dname, inode->i_sb->s_dev, inode->i_ino, 0); | ||
543 | else if (mask & (FS_DELETE|FS_MOVED_FROM)) | ||
520 | audit_update_watch(parent, dname, (dev_t)-1, (unsigned long)-1, 1); | 544 | audit_update_watch(parent, dname, (dev_t)-1, (unsigned long)-1, 1); |
521 | /* inotify automatically removes the watch and sends IN_IGNORED */ | 545 | else if (mask & (FS_DELETE_SELF|FS_UNMOUNT|FS_MOVE_SELF)) |
522 | else if (mask & (IN_DELETE_SELF|IN_UNMOUNT)) | ||
523 | audit_remove_parent_watches(parent); | ||
524 | /* inotify does not remove the watch, so remove it manually */ | ||
525 | else if(mask & IN_MOVE_SELF) { | ||
526 | audit_remove_parent_watches(parent); | 546 | audit_remove_parent_watches(parent); |
527 | inotify_remove_watch_locked(audit_ih, i_watch); | 547 | |
528 | } else if (mask & IN_IGNORED) | 548 | return 0; |
529 | put_inotify_watch(i_watch); | ||
530 | } | 549 | } |
531 | 550 | ||
532 | static const struct inotify_operations audit_inotify_ops = { | 551 | static const struct fsnotify_ops audit_watch_fsnotify_ops = { |
533 | .handle_event = audit_handle_ievent, | 552 | .should_send_event = audit_watch_should_send_event, |
534 | .destroy_watch = audit_free_parent, | 553 | .handle_event = audit_watch_handle_event, |
554 | .free_group_priv = NULL, | ||
555 | .freeing_mark = NULL, | ||
556 | .free_event_priv = NULL, | ||
535 | }; | 557 | }; |
536 | 558 | ||
537 | static int __init audit_watch_init(void) | 559 | static int __init audit_watch_init(void) |
538 | { | 560 | { |
539 | audit_ih = inotify_init(&audit_inotify_ops); | 561 | audit_watch_group = fsnotify_alloc_group(&audit_watch_fsnotify_ops); |
540 | if (IS_ERR(audit_ih)) | 562 | if (IS_ERR(audit_watch_group)) { |
541 | audit_panic("cannot initialize inotify handle"); | 563 | audit_watch_group = NULL; |
564 | audit_panic("cannot create audit fsnotify group"); | ||
565 | } | ||
542 | return 0; | 566 | return 0; |
543 | } | 567 | } |
544 | subsys_initcall(audit_watch_init); | 568 | device_initcall(audit_watch_init); |
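The audit_watch.c changes above replace the subsystem's private inotify handle with an fsnotify group: parents are reference-counted through audit_get_parent()/audit_put_parent(), events arrive via a handle_event callback keyed on FS_* mask bits, and the group is created once at init with fsnotify_alloc_group(). A minimal sketch of the same registration pattern follows; the "example" names are illustrative and not part of this patch.

static bool example_should_send_event(struct fsnotify_group *group,
                                      struct inode *inode,
                                      struct fsnotify_mark *inode_mark,
                                      struct fsnotify_mark *vfsmount_mark,
                                      __u32 mask, void *data, int data_type)
{
        return true;    /* let every event reach handle_event() */
}

static int example_handle_event(struct fsnotify_group *group,
                                struct fsnotify_mark *inode_mark,
                                struct fsnotify_mark *vfsmount_mark,
                                struct fsnotify_event *event)
{
        /* react to event->mask and event->file_name here */
        return 0;
}

static const struct fsnotify_ops example_fsnotify_ops = {
        .should_send_event = example_should_send_event,
        .handle_event      = example_handle_event,
};

static int __init example_init(void)
{
        struct fsnotify_group *group;

        group = fsnotify_alloc_group(&example_fsnotify_ops);
        if (IS_ERR(group))
                return PTR_ERR(group);
        return 0;
}

The mark a consumer hangs on an inode (a struct fsnotify_mark embedded in its own object, as struct audit_parent now does with ->mark) is what comes back as inode_mark in the callback.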
diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c index ce08041f578d..eb7675499fb5 100644 --- a/kernel/auditfilter.c +++ b/kernel/auditfilter.c | |||
@@ -71,6 +71,7 @@ static inline void audit_free_rule(struct audit_entry *e) | |||
71 | { | 71 | { |
72 | int i; | 72 | int i; |
73 | struct audit_krule *erule = &e->rule; | 73 | struct audit_krule *erule = &e->rule; |
74 | |||
74 | /* some rules don't have associated watches */ | 75 | /* some rules don't have associated watches */ |
75 | if (erule->watch) | 76 | if (erule->watch) |
76 | audit_put_watch(erule->watch); | 77 | audit_put_watch(erule->watch); |
@@ -746,8 +747,7 @@ static inline int audit_dupe_lsm_field(struct audit_field *df, | |||
746 | * rule with the new rule in the filterlist, then free the old rule. | 747 | * rule with the new rule in the filterlist, then free the old rule. |
747 | * The rlist element is undefined; list manipulations are handled apart from | 748 | * The rlist element is undefined; list manipulations are handled apart from |
748 | * the initial copy. */ | 749 | * the initial copy. */ |
749 | struct audit_entry *audit_dupe_rule(struct audit_krule *old, | 750 | struct audit_entry *audit_dupe_rule(struct audit_krule *old) |
750 | struct audit_watch *watch) | ||
751 | { | 751 | { |
752 | u32 fcount = old->field_count; | 752 | u32 fcount = old->field_count; |
753 | struct audit_entry *entry; | 753 | struct audit_entry *entry; |
@@ -769,8 +769,8 @@ struct audit_entry *audit_dupe_rule(struct audit_krule *old, | |||
769 | new->prio = old->prio; | 769 | new->prio = old->prio; |
770 | new->buflen = old->buflen; | 770 | new->buflen = old->buflen; |
771 | new->inode_f = old->inode_f; | 771 | new->inode_f = old->inode_f; |
772 | new->watch = NULL; | ||
773 | new->field_count = old->field_count; | 772 | new->field_count = old->field_count; |
773 | |||
774 | /* | 774 | /* |
775 | * note that we are OK with not refcounting here; audit_match_tree() | 775 | * note that we are OK with not refcounting here; audit_match_tree() |
776 | * never dereferences tree and we can't get false positives there | 776 | * never dereferences tree and we can't get false positives there |
@@ -811,9 +811,9 @@ struct audit_entry *audit_dupe_rule(struct audit_krule *old, | |||
811 | } | 811 | } |
812 | } | 812 | } |
813 | 813 | ||
814 | if (watch) { | 814 | if (old->watch) { |
815 | audit_get_watch(watch); | 815 | audit_get_watch(old->watch); |
816 | new->watch = watch; | 816 | new->watch = old->watch; |
817 | } | 817 | } |
818 | 818 | ||
819 | return entry; | 819 | return entry; |
@@ -866,7 +866,7 @@ static inline int audit_add_rule(struct audit_entry *entry) | |||
866 | struct audit_watch *watch = entry->rule.watch; | 866 | struct audit_watch *watch = entry->rule.watch; |
867 | struct audit_tree *tree = entry->rule.tree; | 867 | struct audit_tree *tree = entry->rule.tree; |
868 | struct list_head *list; | 868 | struct list_head *list; |
869 | int h, err; | 869 | int err; |
870 | #ifdef CONFIG_AUDITSYSCALL | 870 | #ifdef CONFIG_AUDITSYSCALL |
871 | int dont_count = 0; | 871 | int dont_count = 0; |
872 | 872 | ||
@@ -889,15 +889,11 @@ static inline int audit_add_rule(struct audit_entry *entry) | |||
889 | 889 | ||
890 | if (watch) { | 890 | if (watch) { |
891 | /* audit_filter_mutex is dropped and re-taken during this call */ | 891 | /* audit_filter_mutex is dropped and re-taken during this call */ |
892 | err = audit_add_watch(&entry->rule); | 892 | err = audit_add_watch(&entry->rule, &list); |
893 | if (err) { | 893 | if (err) { |
894 | mutex_unlock(&audit_filter_mutex); | 894 | mutex_unlock(&audit_filter_mutex); |
895 | goto error; | 895 | goto error; |
896 | } | 896 | } |
897 | /* entry->rule.watch may have changed during audit_add_watch() */ | ||
898 | watch = entry->rule.watch; | ||
899 | h = audit_hash_ino((u32)audit_watch_inode(watch)); | ||
900 | list = &audit_inode_hash[h]; | ||
901 | } | 897 | } |
902 | if (tree) { | 898 | if (tree) { |
903 | err = audit_add_tree_rule(&entry->rule); | 899 | err = audit_add_tree_rule(&entry->rule); |
@@ -949,7 +945,6 @@ static inline int audit_del_rule(struct audit_entry *entry) | |||
949 | struct audit_watch *watch = entry->rule.watch; | 945 | struct audit_watch *watch = entry->rule.watch; |
950 | struct audit_tree *tree = entry->rule.tree; | 946 | struct audit_tree *tree = entry->rule.tree; |
951 | struct list_head *list; | 947 | struct list_head *list; |
952 | LIST_HEAD(inotify_list); | ||
953 | int ret = 0; | 948 | int ret = 0; |
954 | #ifdef CONFIG_AUDITSYSCALL | 949 | #ifdef CONFIG_AUDITSYSCALL |
955 | int dont_count = 0; | 950 | int dont_count = 0; |
@@ -969,7 +964,7 @@ static inline int audit_del_rule(struct audit_entry *entry) | |||
969 | } | 964 | } |
970 | 965 | ||
971 | if (e->rule.watch) | 966 | if (e->rule.watch) |
972 | audit_remove_watch_rule(&e->rule, &inotify_list); | 967 | audit_remove_watch_rule(&e->rule); |
973 | 968 | ||
974 | if (e->rule.tree) | 969 | if (e->rule.tree) |
975 | audit_remove_tree_rule(&e->rule); | 970 | audit_remove_tree_rule(&e->rule); |
@@ -987,9 +982,6 @@ static inline int audit_del_rule(struct audit_entry *entry) | |||
987 | #endif | 982 | #endif |
988 | mutex_unlock(&audit_filter_mutex); | 983 | mutex_unlock(&audit_filter_mutex); |
989 | 984 | ||
990 | if (!list_empty(&inotify_list)) | ||
991 | audit_inotify_unregister(&inotify_list); | ||
992 | |||
993 | out: | 985 | out: |
994 | if (watch) | 986 | if (watch) |
995 | audit_put_watch(watch); /* match initial get */ | 987 | audit_put_watch(watch); /* match initial get */ |
@@ -1323,30 +1315,23 @@ static int update_lsm_rule(struct audit_krule *r) | |||
1323 | { | 1315 | { |
1324 | struct audit_entry *entry = container_of(r, struct audit_entry, rule); | 1316 | struct audit_entry *entry = container_of(r, struct audit_entry, rule); |
1325 | struct audit_entry *nentry; | 1317 | struct audit_entry *nentry; |
1326 | struct audit_watch *watch; | ||
1327 | struct audit_tree *tree; | ||
1328 | int err = 0; | 1318 | int err = 0; |
1329 | 1319 | ||
1330 | if (!security_audit_rule_known(r)) | 1320 | if (!security_audit_rule_known(r)) |
1331 | return 0; | 1321 | return 0; |
1332 | 1322 | ||
1333 | watch = r->watch; | 1323 | nentry = audit_dupe_rule(r); |
1334 | tree = r->tree; | ||
1335 | nentry = audit_dupe_rule(r, watch); | ||
1336 | if (IS_ERR(nentry)) { | 1324 | if (IS_ERR(nentry)) { |
1337 | /* save the first error encountered for the | 1325 | /* save the first error encountered for the |
1338 | * return value */ | 1326 | * return value */ |
1339 | err = PTR_ERR(nentry); | 1327 | err = PTR_ERR(nentry); |
1340 | audit_panic("error updating LSM filters"); | 1328 | audit_panic("error updating LSM filters"); |
1341 | if (watch) | 1329 | if (r->watch) |
1342 | list_del(&r->rlist); | 1330 | list_del(&r->rlist); |
1343 | list_del_rcu(&entry->list); | 1331 | list_del_rcu(&entry->list); |
1344 | list_del(&r->list); | 1332 | list_del(&r->list); |
1345 | } else { | 1333 | } else { |
1346 | if (watch) { | 1334 | if (r->watch || r->tree) |
1347 | list_add(&nentry->rule.rlist, audit_watch_rules(watch)); | ||
1348 | list_del(&r->rlist); | ||
1349 | } else if (tree) | ||
1350 | list_replace_init(&r->rlist, &nentry->rule.rlist); | 1335 | list_replace_init(&r->rlist, &nentry->rule.rlist); |
1351 | list_replace_rcu(&entry->list, &nentry->list); | 1336 | list_replace_rcu(&entry->list, &nentry->list); |
1352 | list_replace(&r->list, &nentry->rule.list); | 1337 | list_replace(&r->list, &nentry->rule.list); |
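With this change audit_dupe_rule() takes its watch reference straight from the old rule, and audit_add_watch() reports the destination hash bucket through an out-parameter instead of leaving the caller to recompute it from the watch's inode. Condensed from the hunks above, the add path in audit_add_rule() reduces to:

if (watch) {
        /* audit_filter_mutex is dropped and re-taken during this call;
         * on success, *list already points at the audit_inode_hash[]
         * bucket derived from the watch's inode number */
        err = audit_add_watch(&entry->rule, &list);
        if (err) {
                mutex_unlock(&audit_filter_mutex);
                goto error;
        }
}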
diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 3828ad5fb8f1..1b31c130d034 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c | |||
@@ -65,7 +65,6 @@ | |||
65 | #include <linux/binfmts.h> | 65 | #include <linux/binfmts.h> |
66 | #include <linux/highmem.h> | 66 | #include <linux/highmem.h> |
67 | #include <linux/syscalls.h> | 67 | #include <linux/syscalls.h> |
68 | #include <linux/inotify.h> | ||
69 | #include <linux/capability.h> | 68 | #include <linux/capability.h> |
70 | #include <linux/fs_struct.h> | 69 | #include <linux/fs_struct.h> |
71 | 70 | ||
@@ -549,9 +548,8 @@ static int audit_filter_rules(struct task_struct *tsk, | |||
549 | } | 548 | } |
550 | break; | 549 | break; |
551 | case AUDIT_WATCH: | 550 | case AUDIT_WATCH: |
552 | if (name && audit_watch_inode(rule->watch) != (unsigned long)-1) | 551 | if (name) |
553 | result = (name->dev == audit_watch_dev(rule->watch) && | 552 | result = audit_watch_compare(rule->watch, name->ino, name->dev); |
554 | name->ino == audit_watch_inode(rule->watch)); | ||
555 | break; | 553 | break; |
556 | case AUDIT_DIR: | 554 | case AUDIT_DIR: |
557 | if (ctx) | 555 | if (ctx) |
@@ -1726,7 +1724,7 @@ static inline void handle_one(const struct inode *inode) | |||
1726 | struct audit_tree_refs *p; | 1724 | struct audit_tree_refs *p; |
1727 | struct audit_chunk *chunk; | 1725 | struct audit_chunk *chunk; |
1728 | int count; | 1726 | int count; |
1729 | if (likely(list_empty(&inode->inotify_watches))) | 1727 | if (likely(hlist_empty(&inode->i_fsnotify_marks))) |
1730 | return; | 1728 | return; |
1731 | context = current->audit_context; | 1729 | context = current->audit_context; |
1732 | p = context->trees; | 1730 | p = context->trees; |
@@ -1769,7 +1767,7 @@ retry: | |||
1769 | seq = read_seqbegin(&rename_lock); | 1767 | seq = read_seqbegin(&rename_lock); |
1770 | for(;;) { | 1768 | for(;;) { |
1771 | struct inode *inode = d->d_inode; | 1769 | struct inode *inode = d->d_inode; |
1772 | if (inode && unlikely(!list_empty(&inode->inotify_watches))) { | 1770 | if (inode && unlikely(!hlist_empty(&inode->i_fsnotify_marks))) { |
1773 | struct audit_chunk *chunk; | 1771 | struct audit_chunk *chunk; |
1774 | chunk = audit_tree_lookup(inode); | 1772 | chunk = audit_tree_lookup(inode); |
1775 | if (chunk) { | 1773 | if (chunk) { |
@@ -1837,13 +1835,8 @@ void __audit_getname(const char *name) | |||
1837 | context->names[context->name_count].ino = (unsigned long)-1; | 1835 | context->names[context->name_count].ino = (unsigned long)-1; |
1838 | context->names[context->name_count].osid = 0; | 1836 | context->names[context->name_count].osid = 0; |
1839 | ++context->name_count; | 1837 | ++context->name_count; |
1840 | if (!context->pwd.dentry) { | 1838 | if (!context->pwd.dentry) |
1841 | read_lock(¤t->fs->lock); | 1839 | get_fs_pwd(current->fs, &context->pwd); |
1842 | context->pwd = current->fs->pwd; | ||
1843 | path_get(¤t->fs->pwd); | ||
1844 | read_unlock(¤t->fs->lock); | ||
1845 | } | ||
1846 | |||
1847 | } | 1840 | } |
1848 | 1841 | ||
1849 | /* audit_putname - intercept a putname request | 1842 | /* audit_putname - intercept a putname request |
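Three small auditsc.c cleanups ride along: the AUDIT_WATCH comparison is folded into audit_watch_compare(), the tree-walking paths test the generic i_fsnotify_marks hlist instead of the old inotify_watches list, and __audit_getname() switches to the get_fs_pwd() helper. The helper stands in for the removed open-coded block, roughly the following (sketch based on the removed lines; the helper's internal locking follows whatever fs_struct uses at this point):

/* roughly what get_fs_pwd(current->fs, &context->pwd) replaces */
read_lock(&current->fs->lock);
context->pwd = current->fs->pwd;
path_get(&current->fs->pwd);
read_unlock(&current->fs->lock);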
diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 3ac6f5b0a64b..5cf366965d0c 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c | |||
@@ -52,7 +52,6 @@ | |||
52 | #include <linux/cgroupstats.h> | 52 | #include <linux/cgroupstats.h> |
53 | #include <linux/hash.h> | 53 | #include <linux/hash.h> |
54 | #include <linux/namei.h> | 54 | #include <linux/namei.h> |
55 | #include <linux/smp_lock.h> | ||
56 | #include <linux/pid_namespace.h> | 55 | #include <linux/pid_namespace.h> |
57 | #include <linux/idr.h> | 56 | #include <linux/idr.h> |
58 | #include <linux/vmalloc.h> /* TODO: replace with more sophisticated array */ | 57 | #include <linux/vmalloc.h> /* TODO: replace with more sophisticated array */ |
@@ -138,7 +137,7 @@ struct css_id { | |||
138 | * is called after synchronize_rcu(). But for safe use, css_is_removed() | 137 | * is called after synchronize_rcu(). But for safe use, css_is_removed() |
139 | * css_tryget() should be used for avoiding race. | 138 | * css_tryget() should be used for avoiding race. |
140 | */ | 139 | */ |
141 | struct cgroup_subsys_state *css; | 140 | struct cgroup_subsys_state __rcu *css; |
142 | /* | 141 | /* |
143 | * ID of this css. | 142 | * ID of this css. |
144 | */ | 143 | */ |
@@ -244,6 +243,11 @@ static int notify_on_release(const struct cgroup *cgrp) | |||
244 | return test_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags); | 243 | return test_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags); |
245 | } | 244 | } |
246 | 245 | ||
246 | static int clone_children(const struct cgroup *cgrp) | ||
247 | { | ||
248 | return test_bit(CGRP_CLONE_CHILDREN, &cgrp->flags); | ||
249 | } | ||
250 | |||
247 | /* | 251 | /* |
248 | * for_each_subsys() allows you to iterate on each subsystem attached to | 252 | * for_each_subsys() allows you to iterate on each subsystem attached to |
249 | * an active hierarchy | 253 | * an active hierarchy |
@@ -778,6 +782,7 @@ static struct inode *cgroup_new_inode(mode_t mode, struct super_block *sb) | |||
778 | struct inode *inode = new_inode(sb); | 782 | struct inode *inode = new_inode(sb); |
779 | 783 | ||
780 | if (inode) { | 784 | if (inode) { |
785 | inode->i_ino = get_next_ino(); | ||
781 | inode->i_mode = mode; | 786 | inode->i_mode = mode; |
782 | inode->i_uid = current_fsuid(); | 787 | inode->i_uid = current_fsuid(); |
783 | inode->i_gid = current_fsgid(); | 788 | inode->i_gid = current_fsgid(); |
@@ -1040,6 +1045,8 @@ static int cgroup_show_options(struct seq_file *seq, struct vfsmount *vfs) | |||
1040 | seq_puts(seq, ",noprefix"); | 1045 | seq_puts(seq, ",noprefix"); |
1041 | if (strlen(root->release_agent_path)) | 1046 | if (strlen(root->release_agent_path)) |
1042 | seq_printf(seq, ",release_agent=%s", root->release_agent_path); | 1047 | seq_printf(seq, ",release_agent=%s", root->release_agent_path); |
1048 | if (clone_children(&root->top_cgroup)) | ||
1049 | seq_puts(seq, ",clone_children"); | ||
1043 | if (strlen(root->name)) | 1050 | if (strlen(root->name)) |
1044 | seq_printf(seq, ",name=%s", root->name); | 1051 | seq_printf(seq, ",name=%s", root->name); |
1045 | mutex_unlock(&cgroup_mutex); | 1052 | mutex_unlock(&cgroup_mutex); |
@@ -1050,6 +1057,7 @@ struct cgroup_sb_opts { | |||
1050 | unsigned long subsys_bits; | 1057 | unsigned long subsys_bits; |
1051 | unsigned long flags; | 1058 | unsigned long flags; |
1052 | char *release_agent; | 1059 | char *release_agent; |
1060 | bool clone_children; | ||
1053 | char *name; | 1061 | char *name; |
1054 | /* User explicitly requested empty subsystem */ | 1062 | /* User explicitly requested empty subsystem */ |
1055 | bool none; | 1063 | bool none; |
@@ -1066,7 +1074,8 @@ struct cgroup_sb_opts { | |||
1066 | */ | 1074 | */ |
1067 | static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts) | 1075 | static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts) |
1068 | { | 1076 | { |
1069 | char *token, *o = data ?: "all"; | 1077 | char *token, *o = data; |
1078 | bool all_ss = false, one_ss = false; | ||
1070 | unsigned long mask = (unsigned long)-1; | 1079 | unsigned long mask = (unsigned long)-1; |
1071 | int i; | 1080 | int i; |
1072 | bool module_pin_failed = false; | 1081 | bool module_pin_failed = false; |
@@ -1082,30 +1091,37 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts) | |||
1082 | while ((token = strsep(&o, ",")) != NULL) { | 1091 | while ((token = strsep(&o, ",")) != NULL) { |
1083 | if (!*token) | 1092 | if (!*token) |
1084 | return -EINVAL; | 1093 | return -EINVAL; |
1085 | if (!strcmp(token, "all")) { | 1094 | if (!strcmp(token, "none")) { |
1086 | /* Add all non-disabled subsystems */ | ||
1087 | opts->subsys_bits = 0; | ||
1088 | for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { | ||
1089 | struct cgroup_subsys *ss = subsys[i]; | ||
1090 | if (ss == NULL) | ||
1091 | continue; | ||
1092 | if (!ss->disabled) | ||
1093 | opts->subsys_bits |= 1ul << i; | ||
1094 | } | ||
1095 | } else if (!strcmp(token, "none")) { | ||
1096 | /* Explicitly have no subsystems */ | 1095 | /* Explicitly have no subsystems */ |
1097 | opts->none = true; | 1096 | opts->none = true; |
1098 | } else if (!strcmp(token, "noprefix")) { | 1097 | continue; |
1098 | } | ||
1099 | if (!strcmp(token, "all")) { | ||
1100 | /* Mutually exclusive option 'all' + subsystem name */ | ||
1101 | if (one_ss) | ||
1102 | return -EINVAL; | ||
1103 | all_ss = true; | ||
1104 | continue; | ||
1105 | } | ||
1106 | if (!strcmp(token, "noprefix")) { | ||
1099 | set_bit(ROOT_NOPREFIX, &opts->flags); | 1107 | set_bit(ROOT_NOPREFIX, &opts->flags); |
1100 | } else if (!strncmp(token, "release_agent=", 14)) { | 1108 | continue; |
1109 | } | ||
1110 | if (!strcmp(token, "clone_children")) { | ||
1111 | opts->clone_children = true; | ||
1112 | continue; | ||
1113 | } | ||
1114 | if (!strncmp(token, "release_agent=", 14)) { | ||
1101 | /* Specifying two release agents is forbidden */ | 1115 | /* Specifying two release agents is forbidden */ |
1102 | if (opts->release_agent) | 1116 | if (opts->release_agent) |
1103 | return -EINVAL; | 1117 | return -EINVAL; |
1104 | opts->release_agent = | 1118 | opts->release_agent = |
1105 | kstrndup(token + 14, PATH_MAX, GFP_KERNEL); | 1119 | kstrndup(token + 14, PATH_MAX - 1, GFP_KERNEL); |
1106 | if (!opts->release_agent) | 1120 | if (!opts->release_agent) |
1107 | return -ENOMEM; | 1121 | return -ENOMEM; |
1108 | } else if (!strncmp(token, "name=", 5)) { | 1122 | continue; |
1123 | } | ||
1124 | if (!strncmp(token, "name=", 5)) { | ||
1109 | const char *name = token + 5; | 1125 | const char *name = token + 5; |
1110 | /* Can't specify an empty name */ | 1126 | /* Can't specify an empty name */ |
1111 | if (!strlen(name)) | 1127 | if (!strlen(name)) |
@@ -1123,24 +1139,48 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts) | |||
1123 | if (opts->name) | 1139 | if (opts->name) |
1124 | return -EINVAL; | 1140 | return -EINVAL; |
1125 | opts->name = kstrndup(name, | 1141 | opts->name = kstrndup(name, |
1126 | MAX_CGROUP_ROOT_NAMELEN, | 1142 | MAX_CGROUP_ROOT_NAMELEN - 1, |
1127 | GFP_KERNEL); | 1143 | GFP_KERNEL); |
1128 | if (!opts->name) | 1144 | if (!opts->name) |
1129 | return -ENOMEM; | 1145 | return -ENOMEM; |
1130 | } else { | 1146 | |
1131 | struct cgroup_subsys *ss; | 1147 | continue; |
1132 | for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { | 1148 | } |
1133 | ss = subsys[i]; | 1149 | |
1134 | if (ss == NULL) | 1150 | for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { |
1135 | continue; | 1151 | struct cgroup_subsys *ss = subsys[i]; |
1136 | if (!strcmp(token, ss->name)) { | 1152 | if (ss == NULL) |
1137 | if (!ss->disabled) | 1153 | continue; |
1138 | set_bit(i, &opts->subsys_bits); | 1154 | if (strcmp(token, ss->name)) |
1139 | break; | 1155 | continue; |
1140 | } | 1156 | if (ss->disabled) |
1141 | } | 1157 | continue; |
1142 | if (i == CGROUP_SUBSYS_COUNT) | 1158 | |
1143 | return -ENOENT; | 1159 | /* Mutually exclusive option 'all' + subsystem name */ |
1160 | if (all_ss) | ||
1161 | return -EINVAL; | ||
1162 | set_bit(i, &opts->subsys_bits); | ||
1163 | one_ss = true; | ||
1164 | |||
1165 | break; | ||
1166 | } | ||
1167 | if (i == CGROUP_SUBSYS_COUNT) | ||
1168 | return -ENOENT; | ||
1169 | } | ||
1170 | |||
1171 | /* | ||
1172 | * If the 'all' option was specified select all the subsystems, | ||
1173 | * otherwise 'all, 'none' and a subsystem name options were not | ||
1174 | * specified, let's default to 'all' | ||
1175 | */ | ||
1176 | if (all_ss || (!all_ss && !one_ss && !opts->none)) { | ||
1177 | for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { | ||
1178 | struct cgroup_subsys *ss = subsys[i]; | ||
1179 | if (ss == NULL) | ||
1180 | continue; | ||
1181 | if (ss->disabled) | ||
1182 | continue; | ||
1183 | set_bit(i, &opts->subsys_bits); | ||
1144 | } | 1184 | } |
1145 | } | 1185 | } |
1146 | 1186 | ||
@@ -1222,7 +1262,6 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data) | |||
1222 | struct cgroup *cgrp = &root->top_cgroup; | 1262 | struct cgroup *cgrp = &root->top_cgroup; |
1223 | struct cgroup_sb_opts opts; | 1263 | struct cgroup_sb_opts opts; |
1224 | 1264 | ||
1225 | lock_kernel(); | ||
1226 | mutex_lock(&cgrp->dentry->d_inode->i_mutex); | 1265 | mutex_lock(&cgrp->dentry->d_inode->i_mutex); |
1227 | mutex_lock(&cgroup_mutex); | 1266 | mutex_lock(&cgroup_mutex); |
1228 | 1267 | ||
@@ -1255,7 +1294,6 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data) | |||
1255 | kfree(opts.name); | 1294 | kfree(opts.name); |
1256 | mutex_unlock(&cgroup_mutex); | 1295 | mutex_unlock(&cgroup_mutex); |
1257 | mutex_unlock(&cgrp->dentry->d_inode->i_mutex); | 1296 | mutex_unlock(&cgrp->dentry->d_inode->i_mutex); |
1258 | unlock_kernel(); | ||
1259 | return ret; | 1297 | return ret; |
1260 | } | 1298 | } |
1261 | 1299 | ||
@@ -1357,6 +1395,8 @@ static struct cgroupfs_root *cgroup_root_from_opts(struct cgroup_sb_opts *opts) | |||
1357 | strcpy(root->release_agent_path, opts->release_agent); | 1395 | strcpy(root->release_agent_path, opts->release_agent); |
1358 | if (opts->name) | 1396 | if (opts->name) |
1359 | strcpy(root->name, opts->name); | 1397 | strcpy(root->name, opts->name); |
1398 | if (opts->clone_children) | ||
1399 | set_bit(CGRP_CLONE_CHILDREN, &root->top_cgroup.flags); | ||
1360 | return root; | 1400 | return root; |
1361 | } | 1401 | } |
1362 | 1402 | ||
@@ -1568,7 +1608,6 @@ static int cgroup_get_sb(struct file_system_type *fs_type, | |||
1568 | out_err: | 1608 | out_err: |
1569 | kfree(opts.release_agent); | 1609 | kfree(opts.release_agent); |
1570 | kfree(opts.name); | 1610 | kfree(opts.name); |
1571 | |||
1572 | return ret; | 1611 | return ret; |
1573 | } | 1612 | } |
1574 | 1613 | ||
@@ -1623,6 +1662,8 @@ static struct file_system_type cgroup_fs_type = { | |||
1623 | .kill_sb = cgroup_kill_sb, | 1662 | .kill_sb = cgroup_kill_sb, |
1624 | }; | 1663 | }; |
1625 | 1664 | ||
1665 | static struct kobject *cgroup_kobj; | ||
1666 | |||
1626 | static inline struct cgroup *__d_cgrp(struct dentry *dentry) | 1667 | static inline struct cgroup *__d_cgrp(struct dentry *dentry) |
1627 | { | 1668 | { |
1628 | return dentry->d_fsdata; | 1669 | return dentry->d_fsdata; |
@@ -1788,6 +1829,30 @@ out: | |||
1788 | return retval; | 1829 | return retval; |
1789 | } | 1830 | } |
1790 | 1831 | ||
1832 | /** | ||
1833 | * cgroup_attach_task_all - attach task 'tsk' to all cgroups of task 'from' | ||
1834 | * @from: attach to all cgroups of a given task | ||
1835 | * @tsk: the task to be attached | ||
1836 | */ | ||
1837 | int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk) | ||
1838 | { | ||
1839 | struct cgroupfs_root *root; | ||
1840 | int retval = 0; | ||
1841 | |||
1842 | cgroup_lock(); | ||
1843 | for_each_active_root(root) { | ||
1844 | struct cgroup *from_cg = task_cgroup_from_root(from, root); | ||
1845 | |||
1846 | retval = cgroup_attach_task(from_cg, tsk); | ||
1847 | if (retval) | ||
1848 | break; | ||
1849 | } | ||
1850 | cgroup_unlock(); | ||
1851 | |||
1852 | return retval; | ||
1853 | } | ||
1854 | EXPORT_SYMBOL_GPL(cgroup_attach_task_all); | ||
1855 | |||
1791 | /* | 1856 | /* |
1792 | * Attach task with pid 'pid' to cgroup 'cgrp'. Call with cgroup_mutex | 1857 | * Attach task with pid 'pid' to cgroup 'cgrp'. Call with cgroup_mutex |
1793 | * held. May take task_lock of task | 1858 | * held. May take task_lock of task |
@@ -1857,6 +1922,8 @@ static int cgroup_release_agent_write(struct cgroup *cgrp, struct cftype *cft, | |||
1857 | const char *buffer) | 1922 | const char *buffer) |
1858 | { | 1923 | { |
1859 | BUILD_BUG_ON(sizeof(cgrp->root->release_agent_path) < PATH_MAX); | 1924 | BUILD_BUG_ON(sizeof(cgrp->root->release_agent_path) < PATH_MAX); |
1925 | if (strlen(buffer) >= PATH_MAX) | ||
1926 | return -EINVAL; | ||
1860 | if (!cgroup_lock_live_group(cgrp)) | 1927 | if (!cgroup_lock_live_group(cgrp)) |
1861 | return -ENODEV; | 1928 | return -ENODEV; |
1862 | strcpy(cgrp->root->release_agent_path, buffer); | 1929 | strcpy(cgrp->root->release_agent_path, buffer); |
@@ -3150,6 +3217,23 @@ fail: | |||
3150 | return ret; | 3217 | return ret; |
3151 | } | 3218 | } |
3152 | 3219 | ||
3220 | static u64 cgroup_clone_children_read(struct cgroup *cgrp, | ||
3221 | struct cftype *cft) | ||
3222 | { | ||
3223 | return clone_children(cgrp); | ||
3224 | } | ||
3225 | |||
3226 | static int cgroup_clone_children_write(struct cgroup *cgrp, | ||
3227 | struct cftype *cft, | ||
3228 | u64 val) | ||
3229 | { | ||
3230 | if (val) | ||
3231 | set_bit(CGRP_CLONE_CHILDREN, &cgrp->flags); | ||
3232 | else | ||
3233 | clear_bit(CGRP_CLONE_CHILDREN, &cgrp->flags); | ||
3234 | return 0; | ||
3235 | } | ||
3236 | |||
3153 | /* | 3237 | /* |
3154 | * for the common functions, 'private' gives the type of file | 3238 | * for the common functions, 'private' gives the type of file |
3155 | */ | 3239 | */ |
@@ -3180,6 +3264,11 @@ static struct cftype files[] = { | |||
3180 | .write_string = cgroup_write_event_control, | 3264 | .write_string = cgroup_write_event_control, |
3181 | .mode = S_IWUGO, | 3265 | .mode = S_IWUGO, |
3182 | }, | 3266 | }, |
3267 | { | ||
3268 | .name = "cgroup.clone_children", | ||
3269 | .read_u64 = cgroup_clone_children_read, | ||
3270 | .write_u64 = cgroup_clone_children_write, | ||
3271 | }, | ||
3183 | }; | 3272 | }; |
3184 | 3273 | ||
3185 | static struct cftype cft_release_agent = { | 3274 | static struct cftype cft_release_agent = { |
@@ -3309,6 +3398,9 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, | |||
3309 | if (notify_on_release(parent)) | 3398 | if (notify_on_release(parent)) |
3310 | set_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags); | 3399 | set_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags); |
3311 | 3400 | ||
3401 | if (clone_children(parent)) | ||
3402 | set_bit(CGRP_CLONE_CHILDREN, &cgrp->flags); | ||
3403 | |||
3312 | for_each_subsys(root, ss) { | 3404 | for_each_subsys(root, ss) { |
3313 | struct cgroup_subsys_state *css = ss->create(ss, cgrp); | 3405 | struct cgroup_subsys_state *css = ss->create(ss, cgrp); |
3314 | 3406 | ||
@@ -3323,6 +3415,8 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, | |||
3323 | goto err_destroy; | 3415 | goto err_destroy; |
3324 | } | 3416 | } |
3325 | /* At error, ->destroy() callback has to free assigned ID. */ | 3417 | /* At error, ->destroy() callback has to free assigned ID. */ |
3418 | if (clone_children(parent) && ss->post_clone) | ||
3419 | ss->post_clone(ss, cgrp); | ||
3326 | } | 3420 | } |
3327 | 3421 | ||
3328 | cgroup_lock_hierarchy(root); | 3422 | cgroup_lock_hierarchy(root); |
@@ -3871,9 +3965,18 @@ int __init cgroup_init(void) | |||
3871 | hhead = css_set_hash(init_css_set.subsys); | 3965 | hhead = css_set_hash(init_css_set.subsys); |
3872 | hlist_add_head(&init_css_set.hlist, hhead); | 3966 | hlist_add_head(&init_css_set.hlist, hhead); |
3873 | BUG_ON(!init_root_id(&rootnode)); | 3967 | BUG_ON(!init_root_id(&rootnode)); |
3968 | |||
3969 | cgroup_kobj = kobject_create_and_add("cgroup", fs_kobj); | ||
3970 | if (!cgroup_kobj) { | ||
3971 | err = -ENOMEM; | ||
3972 | goto out; | ||
3973 | } | ||
3974 | |||
3874 | err = register_filesystem(&cgroup_fs_type); | 3975 | err = register_filesystem(&cgroup_fs_type); |
3875 | if (err < 0) | 3976 | if (err < 0) { |
3977 | kobject_put(cgroup_kobj); | ||
3876 | goto out; | 3978 | goto out; |
3979 | } | ||
3877 | 3980 | ||
3878 | proc_create("cgroups", 0, NULL, &proc_cgroupstats_operations); | 3981 | proc_create("cgroups", 0, NULL, &proc_cgroupstats_operations); |
3879 | 3982 | ||
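The cgroup.c hunks add a clone_children mount option with a matching per-cgroup cgroup.clone_children file, tighten option parsing so 'all' is mutually exclusive with naming individual subsystems (and becomes the default only when neither 'all', 'none', nor a subsystem name was given), and register a "cgroup" kobject under /sys/fs at init. When a parent carries the flag, cgroup_create() invokes each subsystem's ->post_clone() hook so inherited configuration can be copied into the new child. A hedged sketch of a controller consuming that hook; the names are illustrative, not from this patch:

/* called for a new child whose parent has clone_children set */
static void example_post_clone(struct cgroup_subsys *ss, struct cgroup *cgrp)
{
        /* cgrp->parent holds the configuration worth inheriting; cpuset,
         * for instance, uses its hook to seed the child's masks */
}

/* wired up next to .create/.destroy in the controller's
 * struct cgroup_subsys:  .post_clone = example_post_clone, */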
diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c index ce71ed53e88f..e7bebb7c6c38 100644 --- a/kernel/cgroup_freezer.c +++ b/kernel/cgroup_freezer.c | |||
@@ -48,20 +48,19 @@ static inline struct freezer *task_freezer(struct task_struct *task) | |||
48 | struct freezer, css); | 48 | struct freezer, css); |
49 | } | 49 | } |
50 | 50 | ||
51 | int cgroup_freezing_or_frozen(struct task_struct *task) | 51 | static inline int __cgroup_freezing_or_frozen(struct task_struct *task) |
52 | { | 52 | { |
53 | struct freezer *freezer; | 53 | enum freezer_state state = task_freezer(task)->state; |
54 | enum freezer_state state; | 54 | return (state == CGROUP_FREEZING) || (state == CGROUP_FROZEN); |
55 | } | ||
55 | 56 | ||
57 | int cgroup_freezing_or_frozen(struct task_struct *task) | ||
58 | { | ||
59 | int result; | ||
56 | task_lock(task); | 60 | task_lock(task); |
57 | freezer = task_freezer(task); | 61 | result = __cgroup_freezing_or_frozen(task); |
58 | if (!freezer->css.cgroup->parent) | ||
59 | state = CGROUP_THAWED; /* root cgroup can't be frozen */ | ||
60 | else | ||
61 | state = freezer->state; | ||
62 | task_unlock(task); | 62 | task_unlock(task); |
63 | 63 | return result; | |
64 | return (state == CGROUP_FREEZING) || (state == CGROUP_FROZEN); | ||
65 | } | 64 | } |
66 | 65 | ||
67 | /* | 66 | /* |
@@ -154,13 +153,6 @@ static void freezer_destroy(struct cgroup_subsys *ss, | |||
154 | kfree(cgroup_freezer(cgroup)); | 153 | kfree(cgroup_freezer(cgroup)); |
155 | } | 154 | } |
156 | 155 | ||
157 | /* Task is frozen or will freeze immediately when next it gets woken */ | ||
158 | static bool is_task_frozen_enough(struct task_struct *task) | ||
159 | { | ||
160 | return frozen(task) || | ||
161 | (task_is_stopped_or_traced(task) && freezing(task)); | ||
162 | } | ||
163 | |||
164 | /* | 156 | /* |
165 | * The call to cgroup_lock() in the freezer.state write method prevents | 157 | * The call to cgroup_lock() in the freezer.state write method prevents |
166 | * a write to that file racing against an attach, and hence the | 158 | * a write to that file racing against an attach, and hence the |
@@ -174,24 +166,25 @@ static int freezer_can_attach(struct cgroup_subsys *ss, | |||
174 | 166 | ||
175 | /* | 167 | /* |
176 | * Anything frozen can't move or be moved to/from. | 168 | * Anything frozen can't move or be moved to/from. |
177 | * | ||
178 | * Since orig_freezer->state == FROZEN means that @task has been | ||
179 | * frozen, so it's sufficient to check the latter condition. | ||
180 | */ | 169 | */ |
181 | 170 | ||
182 | if (is_task_frozen_enough(task)) | 171 | freezer = cgroup_freezer(new_cgroup); |
172 | if (freezer->state != CGROUP_THAWED) | ||
183 | return -EBUSY; | 173 | return -EBUSY; |
184 | 174 | ||
185 | freezer = cgroup_freezer(new_cgroup); | 175 | rcu_read_lock(); |
186 | if (freezer->state == CGROUP_FROZEN) | 176 | if (__cgroup_freezing_or_frozen(task)) { |
177 | rcu_read_unlock(); | ||
187 | return -EBUSY; | 178 | return -EBUSY; |
179 | } | ||
180 | rcu_read_unlock(); | ||
188 | 181 | ||
189 | if (threadgroup) { | 182 | if (threadgroup) { |
190 | struct task_struct *c; | 183 | struct task_struct *c; |
191 | 184 | ||
192 | rcu_read_lock(); | 185 | rcu_read_lock(); |
193 | list_for_each_entry_rcu(c, &task->thread_group, thread_group) { | 186 | list_for_each_entry_rcu(c, &task->thread_group, thread_group) { |
194 | if (is_task_frozen_enough(c)) { | 187 | if (__cgroup_freezing_or_frozen(c)) { |
195 | rcu_read_unlock(); | 188 | rcu_read_unlock(); |
196 | return -EBUSY; | 189 | return -EBUSY; |
197 | } | 190 | } |
@@ -236,31 +229,30 @@ static void freezer_fork(struct cgroup_subsys *ss, struct task_struct *task) | |||
236 | /* | 229 | /* |
237 | * caller must hold freezer->lock | 230 | * caller must hold freezer->lock |
238 | */ | 231 | */ |
239 | static void update_freezer_state(struct cgroup *cgroup, | 232 | static void update_if_frozen(struct cgroup *cgroup, |
240 | struct freezer *freezer) | 233 | struct freezer *freezer) |
241 | { | 234 | { |
242 | struct cgroup_iter it; | 235 | struct cgroup_iter it; |
243 | struct task_struct *task; | 236 | struct task_struct *task; |
244 | unsigned int nfrozen = 0, ntotal = 0; | 237 | unsigned int nfrozen = 0, ntotal = 0; |
238 | enum freezer_state old_state = freezer->state; | ||
245 | 239 | ||
246 | cgroup_iter_start(cgroup, &it); | 240 | cgroup_iter_start(cgroup, &it); |
247 | while ((task = cgroup_iter_next(cgroup, &it))) { | 241 | while ((task = cgroup_iter_next(cgroup, &it))) { |
248 | ntotal++; | 242 | ntotal++; |
249 | if (is_task_frozen_enough(task)) | 243 | if (frozen(task)) |
250 | nfrozen++; | 244 | nfrozen++; |
251 | } | 245 | } |
252 | 246 | ||
253 | /* | 247 | if (old_state == CGROUP_THAWED) { |
254 | * Transition to FROZEN when no new tasks can be added ensures | 248 | BUG_ON(nfrozen > 0); |
255 | * that we never exist in the FROZEN state while there are unfrozen | 249 | } else if (old_state == CGROUP_FREEZING) { |
256 | * tasks. | 250 | if (nfrozen == ntotal) |
257 | */ | 251 | freezer->state = CGROUP_FROZEN; |
258 | if (nfrozen == ntotal) | 252 | } else { /* old_state == CGROUP_FROZEN */ |
259 | freezer->state = CGROUP_FROZEN; | 253 | BUG_ON(nfrozen != ntotal); |
260 | else if (nfrozen > 0) | 254 | } |
261 | freezer->state = CGROUP_FREEZING; | 255 | |
262 | else | ||
263 | freezer->state = CGROUP_THAWED; | ||
264 | cgroup_iter_end(cgroup, &it); | 256 | cgroup_iter_end(cgroup, &it); |
265 | } | 257 | } |
266 | 258 | ||
@@ -279,7 +271,7 @@ static int freezer_read(struct cgroup *cgroup, struct cftype *cft, | |||
279 | if (state == CGROUP_FREEZING) { | 271 | if (state == CGROUP_FREEZING) { |
280 | /* We change from FREEZING to FROZEN lazily if the cgroup was | 272 | /* We change from FREEZING to FROZEN lazily if the cgroup was |
281 | * only partially frozen when we exited write. */ | 273 | * only partially frozen when we exited write. */ |
282 | update_freezer_state(cgroup, freezer); | 274 | update_if_frozen(cgroup, freezer); |
283 | state = freezer->state; | 275 | state = freezer->state; |
284 | } | 276 | } |
285 | spin_unlock_irq(&freezer->lock); | 277 | spin_unlock_irq(&freezer->lock); |
@@ -301,7 +293,7 @@ static int try_to_freeze_cgroup(struct cgroup *cgroup, struct freezer *freezer) | |||
301 | while ((task = cgroup_iter_next(cgroup, &it))) { | 293 | while ((task = cgroup_iter_next(cgroup, &it))) { |
302 | if (!freeze_task(task, true)) | 294 | if (!freeze_task(task, true)) |
303 | continue; | 295 | continue; |
304 | if (is_task_frozen_enough(task)) | 296 | if (frozen(task)) |
305 | continue; | 297 | continue; |
306 | if (!freezing(task) && !freezer_should_skip(task)) | 298 | if (!freezing(task) && !freezer_should_skip(task)) |
307 | num_cant_freeze_now++; | 299 | num_cant_freeze_now++; |
@@ -335,7 +327,7 @@ static int freezer_change_state(struct cgroup *cgroup, | |||
335 | 327 | ||
336 | spin_lock_irq(&freezer->lock); | 328 | spin_lock_irq(&freezer->lock); |
337 | 329 | ||
338 | update_freezer_state(cgroup, freezer); | 330 | update_if_frozen(cgroup, freezer); |
339 | if (goal_state == freezer->state) | 331 | if (goal_state == freezer->state) |
340 | goto out; | 332 | goto out; |
341 | 333 | ||
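The freezer drops its ad-hoc is_task_frozen_enough() test: counting now uses plain frozen(), update_if_frozen() only promotes FREEZING to FROZEN (and asserts that a THAWED cgroup holds no frozen tasks while a FROZEN one holds nothing else), and can_attach() refuses a move when either end is involved in freezing. Condensed from the can_attach() hunk above:

freezer = cgroup_freezer(new_cgroup);
if (freezer->state != CGROUP_THAWED)        /* destination must be thawed */
        return -EBUSY;

rcu_read_lock();
if (__cgroup_freezing_or_frozen(task)) {    /* and so must the task itself */
        rcu_read_unlock();
        return -EBUSY;
}
rcu_read_unlock();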
diff --git a/kernel/compat.c b/kernel/compat.c index 5adab05a3172..c9e2ec0b34a8 100644 --- a/kernel/compat.c +++ b/kernel/compat.c | |||
@@ -279,11 +279,6 @@ asmlinkage long compat_sys_setrlimit(unsigned int resource, | |||
279 | struct compat_rlimit __user *rlim) | 279 | struct compat_rlimit __user *rlim) |
280 | { | 280 | { |
281 | struct rlimit r; | 281 | struct rlimit r; |
282 | int ret; | ||
283 | mm_segment_t old_fs = get_fs (); | ||
284 | |||
285 | if (resource >= RLIM_NLIMITS) | ||
286 | return -EINVAL; | ||
287 | 282 | ||
288 | if (!access_ok(VERIFY_READ, rlim, sizeof(*rlim)) || | 283 | if (!access_ok(VERIFY_READ, rlim, sizeof(*rlim)) || |
289 | __get_user(r.rlim_cur, &rlim->rlim_cur) || | 284 | __get_user(r.rlim_cur, &rlim->rlim_cur) || |
@@ -294,10 +289,7 @@ asmlinkage long compat_sys_setrlimit(unsigned int resource, | |||
294 | r.rlim_cur = RLIM_INFINITY; | 289 | r.rlim_cur = RLIM_INFINITY; |
295 | if (r.rlim_max == COMPAT_RLIM_INFINITY) | 290 | if (r.rlim_max == COMPAT_RLIM_INFINITY) |
296 | r.rlim_max = RLIM_INFINITY; | 291 | r.rlim_max = RLIM_INFINITY; |
297 | set_fs(KERNEL_DS); | 292 | return do_prlimit(current, resource, &r, NULL); |
298 | ret = sys_setrlimit(resource, (struct rlimit __user *) &r); | ||
299 | set_fs(old_fs); | ||
300 | return ret; | ||
301 | } | 293 | } |
302 | 294 | ||
303 | #ifdef COMPAT_RLIM_OLD_INFINITY | 295 | #ifdef COMPAT_RLIM_OLD_INFINITY |
@@ -329,16 +321,13 @@ asmlinkage long compat_sys_old_getrlimit(unsigned int resource, | |||
329 | 321 | ||
330 | #endif | 322 | #endif |
331 | 323 | ||
332 | asmlinkage long compat_sys_getrlimit (unsigned int resource, | 324 | asmlinkage long compat_sys_getrlimit(unsigned int resource, |
333 | struct compat_rlimit __user *rlim) | 325 | struct compat_rlimit __user *rlim) |
334 | { | 326 | { |
335 | struct rlimit r; | 327 | struct rlimit r; |
336 | int ret; | 328 | int ret; |
337 | mm_segment_t old_fs = get_fs(); | ||
338 | 329 | ||
339 | set_fs(KERNEL_DS); | 330 | ret = do_prlimit(current, resource, NULL, &r); |
340 | ret = sys_getrlimit(resource, (struct rlimit __user *) &r); | ||
341 | set_fs(old_fs); | ||
342 | if (!ret) { | 331 | if (!ret) { |
343 | if (r.rlim_cur > COMPAT_RLIM_INFINITY) | 332 | if (r.rlim_cur > COMPAT_RLIM_INFINITY) |
344 | r.rlim_cur = COMPAT_RLIM_INFINITY; | 333 | r.rlim_cur = COMPAT_RLIM_INFINITY; |
@@ -1137,3 +1126,24 @@ compat_sys_sysinfo(struct compat_sysinfo __user *info) | |||
1137 | 1126 | ||
1138 | return 0; | 1127 | return 0; |
1139 | } | 1128 | } |
1129 | |||
1130 | /* | ||
1131 | * Allocate user-space memory for the duration of a single system call, | ||
1132 | * in order to marshall parameters inside a compat thunk. | ||
1133 | */ | ||
1134 | void __user *compat_alloc_user_space(unsigned long len) | ||
1135 | { | ||
1136 | void __user *ptr; | ||
1137 | |||
1138 | /* If len would occupy more than half of the entire compat space... */ | ||
1139 | if (unlikely(len > (((compat_uptr_t)~0) >> 1))) | ||
1140 | return NULL; | ||
1141 | |||
1142 | ptr = arch_compat_alloc_user_space(len); | ||
1143 | |||
1144 | if (unlikely(!access_ok(VERIFY_WRITE, ptr, len))) | ||
1145 | return NULL; | ||
1146 | |||
1147 | return ptr; | ||
1148 | } | ||
1149 | EXPORT_SYMBOL_GPL(compat_alloc_user_space); | ||
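compat_sys_setrlimit()/compat_sys_getrlimit() stop round-tripping through set_fs(KERNEL_DS) and call do_prlimit() directly, and a generic compat_alloc_user_space() wrapper now bounds-checks the requested length and verifies the returned pointer with access_ok() before handing it out. The helper exists so a compat thunk can build a native-layout argument block in user space and pass it to the native syscall; a typical caller looks roughly like this (struct foo, struct compat_foo and sys_foo are hypothetical, used only to show the shape):

asmlinkage long compat_sys_foo(struct compat_foo __user *ufoo)
{
        /* native-layout copy of the argument, placed in user space */
        struct foo __user *p = compat_alloc_user_space(sizeof(*p));
        compat_uptr_t uptr;

        if (!p ||
            get_user(uptr, &ufoo->ptr) ||
            put_user(compat_ptr(uptr), &p->ptr))
                return -EFAULT;

        return sys_foo(p);      /* hypothetical native entry point */
}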
diff --git a/kernel/configs.c b/kernel/configs.c index abaee684ecbf..b4066b44a99d 100644 --- a/kernel/configs.c +++ b/kernel/configs.c | |||
@@ -66,6 +66,7 @@ ikconfig_read_current(struct file *file, char __user *buf, | |||
66 | static const struct file_operations ikconfig_file_ops = { | 66 | static const struct file_operations ikconfig_file_ops = { |
67 | .owner = THIS_MODULE, | 67 | .owner = THIS_MODULE, |
68 | .read = ikconfig_read_current, | 68 | .read = ikconfig_read_current, |
69 | .llseek = default_llseek, | ||
69 | }; | 70 | }; |
70 | 71 | ||
71 | static int __init ikconfig_init(void) | 72 | static int __init ikconfig_init(void) |
diff --git a/kernel/cpu.c b/kernel/cpu.c index 97d1b426a4ac..f6e726f18491 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c | |||
@@ -235,11 +235,8 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen) | |||
235 | return -EINVAL; | 235 | return -EINVAL; |
236 | 236 | ||
237 | cpu_hotplug_begin(); | 237 | cpu_hotplug_begin(); |
238 | set_cpu_active(cpu, false); | ||
239 | err = __cpu_notify(CPU_DOWN_PREPARE | mod, hcpu, -1, &nr_calls); | 238 | err = __cpu_notify(CPU_DOWN_PREPARE | mod, hcpu, -1, &nr_calls); |
240 | if (err) { | 239 | if (err) { |
241 | set_cpu_active(cpu, true); | ||
242 | |||
243 | nr_calls--; | 240 | nr_calls--; |
244 | __cpu_notify(CPU_DOWN_FAILED | mod, hcpu, nr_calls, NULL); | 241 | __cpu_notify(CPU_DOWN_FAILED | mod, hcpu, nr_calls, NULL); |
245 | printk("%s: attempt to take down CPU %u failed\n", | 242 | printk("%s: attempt to take down CPU %u failed\n", |
@@ -249,7 +246,6 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen) | |||
249 | 246 | ||
250 | err = __stop_machine(take_cpu_down, &tcd_param, cpumask_of(cpu)); | 247 | err = __stop_machine(take_cpu_down, &tcd_param, cpumask_of(cpu)); |
251 | if (err) { | 248 | if (err) { |
252 | set_cpu_active(cpu, true); | ||
253 | /* CPU didn't die: tell everyone. Can't complain. */ | 249 | /* CPU didn't die: tell everyone. Can't complain. */ |
254 | cpu_notify_nofail(CPU_DOWN_FAILED | mod, hcpu); | 250 | cpu_notify_nofail(CPU_DOWN_FAILED | mod, hcpu); |
255 | 251 | ||
@@ -321,8 +317,6 @@ static int __cpuinit _cpu_up(unsigned int cpu, int tasks_frozen) | |||
321 | goto out_notify; | 317 | goto out_notify; |
322 | BUG_ON(!cpu_online(cpu)); | 318 | BUG_ON(!cpu_online(cpu)); |
323 | 319 | ||
324 | set_cpu_active(cpu, true); | ||
325 | |||
326 | /* Now call notifier in preparation. */ | 320 | /* Now call notifier in preparation. */ |
327 | cpu_notify(CPU_ONLINE | mod, hcpu); | 321 | cpu_notify(CPU_ONLINE | mod, hcpu); |
328 | 322 | ||
diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 02b9611eadde..51b143e2a07a 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c | |||
@@ -105,7 +105,7 @@ struct cpuset { | |||
105 | /* for custom sched domain */ | 105 | /* for custom sched domain */ |
106 | int relax_domain_level; | 106 | int relax_domain_level; |
107 | 107 | ||
108 | /* used for walking a cpuset heirarchy */ | 108 | /* used for walking a cpuset hierarchy */ |
109 | struct list_head stack_list; | 109 | struct list_head stack_list; |
110 | }; | 110 | }; |
111 | 111 | ||
@@ -1397,7 +1397,7 @@ static int cpuset_can_attach(struct cgroup_subsys *ss, struct cgroup *cont, | |||
1397 | if (tsk->flags & PF_THREAD_BOUND) | 1397 | if (tsk->flags & PF_THREAD_BOUND) |
1398 | return -EINVAL; | 1398 | return -EINVAL; |
1399 | 1399 | ||
1400 | ret = security_task_setscheduler(tsk, 0, NULL); | 1400 | ret = security_task_setscheduler(tsk); |
1401 | if (ret) | 1401 | if (ret) |
1402 | return ret; | 1402 | return ret; |
1403 | if (threadgroup) { | 1403 | if (threadgroup) { |
@@ -1405,7 +1405,7 @@ static int cpuset_can_attach(struct cgroup_subsys *ss, struct cgroup *cont, | |||
1405 | 1405 | ||
1406 | rcu_read_lock(); | 1406 | rcu_read_lock(); |
1407 | list_for_each_entry_rcu(c, &tsk->thread_group, thread_group) { | 1407 | list_for_each_entry_rcu(c, &tsk->thread_group, thread_group) { |
1408 | ret = security_task_setscheduler(c, 0, NULL); | 1408 | ret = security_task_setscheduler(c); |
1409 | if (ret) { | 1409 | if (ret) { |
1410 | rcu_read_unlock(); | 1410 | rcu_read_unlock(); |
1411 | return ret; | 1411 | return ret; |
@@ -2113,31 +2113,17 @@ static void scan_for_empty_cpusets(struct cpuset *root) | |||
2113 | * but making no active use of cpusets. | 2113 | * but making no active use of cpusets. |
2114 | * | 2114 | * |
2115 | * This routine ensures that top_cpuset.cpus_allowed tracks | 2115 | * This routine ensures that top_cpuset.cpus_allowed tracks |
2116 | * cpu_online_map on each CPU hotplug (cpuhp) event. | 2116 | * cpu_active_mask on each CPU hotplug (cpuhp) event. |
2117 | * | 2117 | * |
2118 | * Called within get_online_cpus(). Needs to call cgroup_lock() | 2118 | * Called within get_online_cpus(). Needs to call cgroup_lock() |
2119 | * before calling generate_sched_domains(). | 2119 | * before calling generate_sched_domains(). |
2120 | */ | 2120 | */ |
2121 | static int cpuset_track_online_cpus(struct notifier_block *unused_nb, | 2121 | void cpuset_update_active_cpus(void) |
2122 | unsigned long phase, void *unused_cpu) | ||
2123 | { | 2122 | { |
2124 | struct sched_domain_attr *attr; | 2123 | struct sched_domain_attr *attr; |
2125 | cpumask_var_t *doms; | 2124 | cpumask_var_t *doms; |
2126 | int ndoms; | 2125 | int ndoms; |
2127 | 2126 | ||
2128 | switch (phase) { | ||
2129 | case CPU_ONLINE: | ||
2130 | case CPU_ONLINE_FROZEN: | ||
2131 | case CPU_DOWN_PREPARE: | ||
2132 | case CPU_DOWN_PREPARE_FROZEN: | ||
2133 | case CPU_DOWN_FAILED: | ||
2134 | case CPU_DOWN_FAILED_FROZEN: | ||
2135 | break; | ||
2136 | |||
2137 | default: | ||
2138 | return NOTIFY_DONE; | ||
2139 | } | ||
2140 | |||
2141 | cgroup_lock(); | 2127 | cgroup_lock(); |
2142 | mutex_lock(&callback_mutex); | 2128 | mutex_lock(&callback_mutex); |
2143 | cpumask_copy(top_cpuset.cpus_allowed, cpu_active_mask); | 2129 | cpumask_copy(top_cpuset.cpus_allowed, cpu_active_mask); |
@@ -2148,8 +2134,6 @@ static int cpuset_track_online_cpus(struct notifier_block *unused_nb, | |||
2148 | 2134 | ||
2149 | /* Have scheduler rebuild the domains */ | 2135 | /* Have scheduler rebuild the domains */ |
2150 | partition_sched_domains(ndoms, doms, attr); | 2136 | partition_sched_domains(ndoms, doms, attr); |
2151 | |||
2152 | return NOTIFY_OK; | ||
2153 | } | 2137 | } |
2154 | 2138 | ||
2155 | #ifdef CONFIG_MEMORY_HOTPLUG | 2139 | #ifdef CONFIG_MEMORY_HOTPLUG |
@@ -2203,7 +2187,6 @@ void __init cpuset_init_smp(void) | |||
2203 | cpumask_copy(top_cpuset.cpus_allowed, cpu_active_mask); | 2187 | cpumask_copy(top_cpuset.cpus_allowed, cpu_active_mask); |
2204 | top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY]; | 2188 | top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY]; |
2205 | 2189 | ||
2206 | hotcpu_notifier(cpuset_track_online_cpus, 0); | ||
2207 | hotplug_memory_notifier(cpuset_track_online_nodes, 10); | 2190 | hotplug_memory_notifier(cpuset_track_online_nodes, 10); |
2208 | 2191 | ||
2209 | cpuset_wq = create_singlethread_workqueue("cpuset"); | 2192 | cpuset_wq = create_singlethread_workqueue("cpuset"); |
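Paired with the cpu.c hunks above, which remove the set_cpu_active() toggling from _cpu_down()/_cpu_up(), cpuset no longer registers its own hotplug notifier: the scheduler now owns the active-mask transitions and is expected to call cpuset_update_active_cpus() at the same phases the removed cpuset_track_online_cpus() notifier handled. That caller lives outside this diff; a rough, purely illustrative sketch of its shape:

static int example_cpuset_cpu_notify(struct notifier_block *nfb,
                                     unsigned long action, void *hcpu)
{
        switch (action & ~CPU_TASKS_FROZEN) {
        case CPU_ONLINE:
        case CPU_DOWN_PREPARE:
        case CPU_DOWN_FAILED:
                /* rebuild top_cpuset.cpus_allowed and the sched domains */
                cpuset_update_active_cpus();
                return NOTIFY_OK;
        default:
                return NOTIFY_DONE;
        }
}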
diff --git a/kernel/cred.c b/kernel/cred.c index 60bc8b1e32e6..6a1aa004e376 100644 --- a/kernel/cred.c +++ b/kernel/cred.c | |||
@@ -22,10 +22,6 @@ | |||
22 | #define kdebug(FMT, ...) \ | 22 | #define kdebug(FMT, ...) \ |
23 | printk("[%-5.5s%5u] "FMT"\n", current->comm, current->pid ,##__VA_ARGS__) | 23 | printk("[%-5.5s%5u] "FMT"\n", current->comm, current->pid ,##__VA_ARGS__) |
24 | #else | 24 | #else |
25 | static inline __attribute__((format(printf, 1, 2))) | ||
26 | void no_printk(const char *fmt, ...) | ||
27 | { | ||
28 | } | ||
29 | #define kdebug(FMT, ...) \ | 25 | #define kdebug(FMT, ...) \ |
30 | no_printk("[%-5.5s%5u] "FMT"\n", current->comm, current->pid ,##__VA_ARGS__) | 26 | no_printk("[%-5.5s%5u] "FMT"\n", current->comm, current->pid ,##__VA_ARGS__) |
31 | #endif | 27 | #endif |
@@ -329,7 +325,7 @@ EXPORT_SYMBOL(prepare_creds); | |||
329 | 325 | ||
330 | /* | 326 | /* |
331 | * Prepare credentials for current to perform an execve() | 327 | * Prepare credentials for current to perform an execve() |
332 | * - The caller must hold current->cred_guard_mutex | 328 | * - The caller must hold ->cred_guard_mutex |
333 | */ | 329 | */ |
334 | struct cred *prepare_exec_creds(void) | 330 | struct cred *prepare_exec_creds(void) |
335 | { | 331 | { |
@@ -388,8 +384,6 @@ int copy_creds(struct task_struct *p, unsigned long clone_flags) | |||
388 | struct cred *new; | 384 | struct cred *new; |
389 | int ret; | 385 | int ret; |
390 | 386 | ||
391 | mutex_init(&p->cred_guard_mutex); | ||
392 | |||
393 | if ( | 387 | if ( |
394 | #ifdef CONFIG_KEYS | 388 | #ifdef CONFIG_KEYS |
395 | !p->cred->thread_keyring && | 389 | !p->cred->thread_keyring && |
diff --git a/kernel/debug/debug_core.c b/kernel/debug/debug_core.c index 8bc5eeffec8a..fec596da9bd0 100644 --- a/kernel/debug/debug_core.c +++ b/kernel/debug/debug_core.c | |||
@@ -6,7 +6,7 @@ | |||
6 | * Copyright (C) 2000-2001 VERITAS Software Corporation. | 6 | * Copyright (C) 2000-2001 VERITAS Software Corporation. |
7 | * Copyright (C) 2002-2004 Timesys Corporation | 7 | * Copyright (C) 2002-2004 Timesys Corporation |
8 | * Copyright (C) 2003-2004 Amit S. Kale <amitkale@linsyssoft.com> | 8 | * Copyright (C) 2003-2004 Amit S. Kale <amitkale@linsyssoft.com> |
9 | * Copyright (C) 2004 Pavel Machek <pavel@suse.cz> | 9 | * Copyright (C) 2004 Pavel Machek <pavel@ucw.cz> |
10 | * Copyright (C) 2004-2006 Tom Rini <trini@kernel.crashing.org> | 10 | * Copyright (C) 2004-2006 Tom Rini <trini@kernel.crashing.org> |
11 | * Copyright (C) 2004-2006 LinSysSoft Technologies Pvt. Ltd. | 11 | * Copyright (C) 2004-2006 LinSysSoft Technologies Pvt. Ltd. |
12 | * Copyright (C) 2005-2009 Wind River Systems, Inc. | 12 | * Copyright (C) 2005-2009 Wind River Systems, Inc. |
@@ -47,6 +47,7 @@ | |||
47 | #include <linux/pid.h> | 47 | #include <linux/pid.h> |
48 | #include <linux/smp.h> | 48 | #include <linux/smp.h> |
49 | #include <linux/mm.h> | 49 | #include <linux/mm.h> |
50 | #include <linux/rcupdate.h> | ||
50 | 51 | ||
51 | #include <asm/cacheflush.h> | 52 | #include <asm/cacheflush.h> |
52 | #include <asm/byteorder.h> | 53 | #include <asm/byteorder.h> |
@@ -109,13 +110,15 @@ static struct kgdb_bkpt kgdb_break[KGDB_MAX_BREAKPOINTS] = { | |||
109 | */ | 110 | */ |
110 | atomic_t kgdb_active = ATOMIC_INIT(-1); | 111 | atomic_t kgdb_active = ATOMIC_INIT(-1); |
111 | EXPORT_SYMBOL_GPL(kgdb_active); | 112 | EXPORT_SYMBOL_GPL(kgdb_active); |
113 | static DEFINE_RAW_SPINLOCK(dbg_master_lock); | ||
114 | static DEFINE_RAW_SPINLOCK(dbg_slave_lock); | ||
112 | 115 | ||
113 | /* | 116 | /* |
114 | * We use NR_CPUs not PERCPU, in case kgdb is used to debug early | 117 | * We use NR_CPUs not PERCPU, in case kgdb is used to debug early |
115 | * bootup code (which might not have percpu set up yet): | 118 | * bootup code (which might not have percpu set up yet): |
116 | */ | 119 | */ |
117 | static atomic_t passive_cpu_wait[NR_CPUS]; | 120 | static atomic_t masters_in_kgdb; |
118 | static atomic_t cpu_in_kgdb[NR_CPUS]; | 121 | static atomic_t slaves_in_kgdb; |
119 | static atomic_t kgdb_break_tasklet_var; | 122 | static atomic_t kgdb_break_tasklet_var; |
120 | atomic_t kgdb_setting_breakpoint; | 123 | atomic_t kgdb_setting_breakpoint; |
121 | 124 | ||
@@ -457,26 +460,32 @@ static int kgdb_reenter_check(struct kgdb_state *ks) | |||
457 | return 1; | 460 | return 1; |
458 | } | 461 | } |
459 | 462 | ||
460 | static void dbg_cpu_switch(int cpu, int next_cpu) | 463 | static void dbg_touch_watchdogs(void) |
461 | { | 464 | { |
462 | /* Mark the cpu we are switching away from as a slave when it | 465 | touch_softlockup_watchdog_sync(); |
463 | * holds the kgdb_active token. This must be done so that the | 466 | clocksource_touch_watchdog(); |
464 | * that all the cpus wait in for the debug core will not enter | 467 | rcu_cpu_stall_reset(); |
465 | * again as the master. */ | ||
466 | if (cpu == atomic_read(&kgdb_active)) { | ||
467 | kgdb_info[cpu].exception_state |= DCPU_IS_SLAVE; | ||
468 | kgdb_info[cpu].exception_state &= ~DCPU_WANT_MASTER; | ||
469 | } | ||
470 | kgdb_info[next_cpu].exception_state |= DCPU_NEXT_MASTER; | ||
471 | } | 468 | } |
472 | 469 | ||
473 | static int kgdb_cpu_enter(struct kgdb_state *ks, struct pt_regs *regs) | 470 | static int kgdb_cpu_enter(struct kgdb_state *ks, struct pt_regs *regs, |
471 | int exception_state) | ||
474 | { | 472 | { |
475 | unsigned long flags; | 473 | unsigned long flags; |
476 | int sstep_tries = 100; | 474 | int sstep_tries = 100; |
477 | int error; | 475 | int error; |
478 | int i, cpu; | 476 | int cpu; |
479 | int trace_on = 0; | 477 | int trace_on = 0; |
478 | int online_cpus = num_online_cpus(); | ||
479 | |||
480 | kgdb_info[ks->cpu].enter_kgdb++; | ||
481 | kgdb_info[ks->cpu].exception_state |= exception_state; | ||
482 | |||
483 | if (exception_state == DCPU_WANT_MASTER) | ||
484 | atomic_inc(&masters_in_kgdb); | ||
485 | else | ||
486 | atomic_inc(&slaves_in_kgdb); | ||
487 | kgdb_disable_hw_debug(ks->linux_regs); | ||
488 | |||
480 | acquirelock: | 489 | acquirelock: |
481 | /* | 490 | /* |
482 | * Interrupts will be restored by the 'trap return' code, except when | 491 | * Interrupts will be restored by the 'trap return' code, except when |
@@ -489,14 +498,15 @@ acquirelock: | |||
489 | kgdb_info[cpu].task = current; | 498 | kgdb_info[cpu].task = current; |
490 | kgdb_info[cpu].ret_state = 0; | 499 | kgdb_info[cpu].ret_state = 0; |
491 | kgdb_info[cpu].irq_depth = hardirq_count() >> HARDIRQ_SHIFT; | 500 | kgdb_info[cpu].irq_depth = hardirq_count() >> HARDIRQ_SHIFT; |
492 | /* | ||
493 | * Make sure the above info reaches the primary CPU before | ||
494 | * our cpu_in_kgdb[] flag setting does: | ||
495 | */ | ||
496 | atomic_inc(&cpu_in_kgdb[cpu]); | ||
497 | 501 | ||
498 | if (exception_level == 1) | 502 | /* Make sure the above info reaches the primary CPU */ |
503 | smp_mb(); | ||
504 | |||
505 | if (exception_level == 1) { | ||
506 | if (raw_spin_trylock(&dbg_master_lock)) | ||
507 | atomic_xchg(&kgdb_active, cpu); | ||
499 | goto cpu_master_loop; | 508 | goto cpu_master_loop; |
509 | } | ||
500 | 510 | ||
501 | /* | 511 | /* |
502 | * CPU will loop if it is a slave or request to become a kgdb | 512 | * CPU will loop if it is a slave or request to become a kgdb |
@@ -508,10 +518,12 @@ cpu_loop: | |||
508 | kgdb_info[cpu].exception_state &= ~DCPU_NEXT_MASTER; | 518 | kgdb_info[cpu].exception_state &= ~DCPU_NEXT_MASTER; |
509 | goto cpu_master_loop; | 519 | goto cpu_master_loop; |
510 | } else if (kgdb_info[cpu].exception_state & DCPU_WANT_MASTER) { | 520 | } else if (kgdb_info[cpu].exception_state & DCPU_WANT_MASTER) { |
511 | if (atomic_cmpxchg(&kgdb_active, -1, cpu) == cpu) | 521 | if (raw_spin_trylock(&dbg_master_lock)) { |
522 | atomic_xchg(&kgdb_active, cpu); | ||
512 | break; | 523 | break; |
524 | } | ||
513 | } else if (kgdb_info[cpu].exception_state & DCPU_IS_SLAVE) { | 525 | } else if (kgdb_info[cpu].exception_state & DCPU_IS_SLAVE) { |
514 | if (!atomic_read(&passive_cpu_wait[cpu])) | 526 | if (!raw_spin_is_locked(&dbg_slave_lock)) |
515 | goto return_normal; | 527 | goto return_normal; |
516 | } else { | 528 | } else { |
517 | return_normal: | 529 | return_normal: |
@@ -522,9 +534,12 @@ return_normal: | |||
522 | arch_kgdb_ops.correct_hw_break(); | 534 | arch_kgdb_ops.correct_hw_break(); |
523 | if (trace_on) | 535 | if (trace_on) |
524 | tracing_on(); | 536 | tracing_on(); |
525 | atomic_dec(&cpu_in_kgdb[cpu]); | 537 | kgdb_info[cpu].exception_state &= |
526 | touch_softlockup_watchdog_sync(); | 538 | ~(DCPU_WANT_MASTER | DCPU_IS_SLAVE); |
527 | clocksource_touch_watchdog(); | 539 | kgdb_info[cpu].enter_kgdb--; |
540 | smp_mb__before_atomic_dec(); | ||
541 | atomic_dec(&slaves_in_kgdb); | ||
542 | dbg_touch_watchdogs(); | ||
528 | local_irq_restore(flags); | 543 | local_irq_restore(flags); |
529 | return 0; | 544 | return 0; |
530 | } | 545 | } |
@@ -541,8 +556,8 @@ return_normal: | |||
541 | (kgdb_info[cpu].task && | 556 | (kgdb_info[cpu].task && |
542 | kgdb_info[cpu].task->pid != kgdb_sstep_pid) && --sstep_tries) { | 557 | kgdb_info[cpu].task->pid != kgdb_sstep_pid) && --sstep_tries) { |
543 | atomic_set(&kgdb_active, -1); | 558 | atomic_set(&kgdb_active, -1); |
544 | touch_softlockup_watchdog_sync(); | 559 | raw_spin_unlock(&dbg_master_lock); |
545 | clocksource_touch_watchdog(); | 560 | dbg_touch_watchdogs(); |
546 | local_irq_restore(flags); | 561 | local_irq_restore(flags); |
547 | 562 | ||
548 | goto acquirelock; | 563 | goto acquirelock; |
@@ -563,16 +578,12 @@ return_normal: | |||
563 | if (dbg_io_ops->pre_exception) | 578 | if (dbg_io_ops->pre_exception) |
564 | dbg_io_ops->pre_exception(); | 579 | dbg_io_ops->pre_exception(); |
565 | 580 | ||
566 | kgdb_disable_hw_debug(ks->linux_regs); | ||
567 | |||
568 | /* | 581 | /* |
569 | * Get the passive CPU lock which will hold all the non-primary | 582 | * Get the passive CPU lock which will hold all the non-primary |
570 | * CPU in a spin state while the debugger is active | 583 | * CPU in a spin state while the debugger is active |
571 | */ | 584 | */ |
572 | if (!kgdb_single_step) { | 585 | if (!kgdb_single_step) |
573 | for (i = 0; i < NR_CPUS; i++) | 586 | raw_spin_lock(&dbg_slave_lock); |
574 | atomic_inc(&passive_cpu_wait[i]); | ||
575 | } | ||
576 | 587 | ||
577 | #ifdef CONFIG_SMP | 588 | #ifdef CONFIG_SMP |
578 | /* Signal the other CPUs to enter kgdb_wait() */ | 589 | /* Signal the other CPUs to enter kgdb_wait() */ |
@@ -583,10 +594,9 @@ return_normal: | |||
583 | /* | 594 | /* |
584 | * Wait for the other CPUs to be notified and be waiting for us: | 595 | * Wait for the other CPUs to be notified and be waiting for us: |
585 | */ | 596 | */ |
586 | for_each_online_cpu(i) { | 597 | while (kgdb_do_roundup && (atomic_read(&masters_in_kgdb) + |
587 | while (kgdb_do_roundup && !atomic_read(&cpu_in_kgdb[i])) | 598 | atomic_read(&slaves_in_kgdb)) != online_cpus) |
588 | cpu_relax(); | 599 | cpu_relax(); |
589 | } | ||
590 | 600 | ||
591 | /* | 601 | /* |
592 | * At this point the primary processor is completely | 602 | * At this point the primary processor is completely |
@@ -605,6 +615,8 @@ cpu_master_loop: | |||
605 | if (dbg_kdb_mode) { | 615 | if (dbg_kdb_mode) { |
606 | kgdb_connected = 1; | 616 | kgdb_connected = 1; |
607 | error = kdb_stub(ks); | 617 | error = kdb_stub(ks); |
618 | if (error == -1) | ||
619 | continue; | ||
608 | kgdb_connected = 0; | 620 | kgdb_connected = 0; |
609 | } else { | 621 | } else { |
610 | error = gdb_serial_stub(ks); | 622 | error = gdb_serial_stub(ks); |
@@ -613,7 +625,8 @@ cpu_master_loop: | |||
613 | if (error == DBG_PASS_EVENT) { | 625 | if (error == DBG_PASS_EVENT) { |
614 | dbg_kdb_mode = !dbg_kdb_mode; | 626 | dbg_kdb_mode = !dbg_kdb_mode; |
615 | } else if (error == DBG_SWITCH_CPU_EVENT) { | 627 | } else if (error == DBG_SWITCH_CPU_EVENT) { |
616 | dbg_cpu_switch(cpu, dbg_switch_cpu); | 628 | kgdb_info[dbg_switch_cpu].exception_state |= |
629 | DCPU_NEXT_MASTER; | ||
617 | goto cpu_loop; | 630 | goto cpu_loop; |
618 | } else { | 631 | } else { |
619 | kgdb_info[cpu].ret_state = error; | 632 | kgdb_info[cpu].ret_state = error; |
@@ -625,24 +638,11 @@ cpu_master_loop: | |||
625 | if (dbg_io_ops->post_exception) | 638 | if (dbg_io_ops->post_exception) |
626 | dbg_io_ops->post_exception(); | 639 | dbg_io_ops->post_exception(); |
627 | 640 | ||
628 | atomic_dec(&cpu_in_kgdb[ks->cpu]); | ||
629 | |||
630 | if (!kgdb_single_step) { | 641 | if (!kgdb_single_step) { |
631 | for (i = NR_CPUS-1; i >= 0; i--) | 642 | raw_spin_unlock(&dbg_slave_lock); |
632 | atomic_dec(&passive_cpu_wait[i]); | 643 | /* Wait till all the CPUs have quit from the debugger. */ |
633 | /* | 644 | while (kgdb_do_roundup && atomic_read(&slaves_in_kgdb)) |
634 | * Wait till all the CPUs have quit from the debugger, | 645 | cpu_relax(); |
635 | * but allow a CPU that hit an exception and is | ||
636 | * waiting to become the master to remain in the debug | ||
637 | * core. | ||
638 | */ | ||
639 | for_each_online_cpu(i) { | ||
640 | while (kgdb_do_roundup && | ||
641 | atomic_read(&cpu_in_kgdb[i]) && | ||
642 | !(kgdb_info[i].exception_state & | ||
643 | DCPU_WANT_MASTER)) | ||
644 | cpu_relax(); | ||
645 | } | ||
646 | } | 646 | } |
647 | 647 | ||
648 | kgdb_restore: | 648 | kgdb_restore: |
@@ -653,12 +653,20 @@ kgdb_restore: | |||
653 | else | 653 | else |
654 | kgdb_sstep_pid = 0; | 654 | kgdb_sstep_pid = 0; |
655 | } | 655 | } |
656 | if (arch_kgdb_ops.correct_hw_break) | ||
657 | arch_kgdb_ops.correct_hw_break(); | ||
656 | if (trace_on) | 658 | if (trace_on) |
657 | tracing_on(); | 659 | tracing_on(); |
660 | |||
661 | kgdb_info[cpu].exception_state &= | ||
662 | ~(DCPU_WANT_MASTER | DCPU_IS_SLAVE); | ||
663 | kgdb_info[cpu].enter_kgdb--; | ||
664 | smp_mb__before_atomic_dec(); | ||
665 | atomic_dec(&masters_in_kgdb); | ||
658 | /* Free kgdb_active */ | 666 | /* Free kgdb_active */ |
659 | atomic_set(&kgdb_active, -1); | 667 | atomic_set(&kgdb_active, -1); |
660 | touch_softlockup_watchdog_sync(); | 668 | raw_spin_unlock(&dbg_master_lock); |
661 | clocksource_touch_watchdog(); | 669 | dbg_touch_watchdogs(); |
662 | local_irq_restore(flags); | 670 | local_irq_restore(flags); |
663 | 671 | ||
664 | return kgdb_info[cpu].ret_state; | 672 | return kgdb_info[cpu].ret_state; |
@@ -676,7 +684,6 @@ kgdb_handle_exception(int evector, int signo, int ecode, struct pt_regs *regs) | |||
676 | { | 684 | { |
677 | struct kgdb_state kgdb_var; | 685 | struct kgdb_state kgdb_var; |
678 | struct kgdb_state *ks = &kgdb_var; | 686 | struct kgdb_state *ks = &kgdb_var; |
679 | int ret; | ||
680 | 687 | ||
681 | ks->cpu = raw_smp_processor_id(); | 688 | ks->cpu = raw_smp_processor_id(); |
682 | ks->ex_vector = evector; | 689 | ks->ex_vector = evector; |
@@ -687,11 +694,10 @@ kgdb_handle_exception(int evector, int signo, int ecode, struct pt_regs *regs) | |||
687 | 694 | ||
688 | if (kgdb_reenter_check(ks)) | 695 | if (kgdb_reenter_check(ks)) |
689 | return 0; /* Ouch, double exception ! */ | 696 | return 0; /* Ouch, double exception ! */ |
690 | kgdb_info[ks->cpu].exception_state |= DCPU_WANT_MASTER; | 697 | if (kgdb_info[ks->cpu].enter_kgdb != 0) |
691 | ret = kgdb_cpu_enter(ks, regs); | 698 | return 0; |
692 | kgdb_info[ks->cpu].exception_state &= ~(DCPU_WANT_MASTER | | 699 | |
693 | DCPU_IS_SLAVE); | 700 | return kgdb_cpu_enter(ks, regs, DCPU_WANT_MASTER); |
694 | return ret; | ||
695 | } | 701 | } |
696 | 702 | ||
697 | int kgdb_nmicallback(int cpu, void *regs) | 703 | int kgdb_nmicallback(int cpu, void *regs) |
@@ -704,12 +710,9 @@ int kgdb_nmicallback(int cpu, void *regs) | |||
704 | ks->cpu = cpu; | 710 | ks->cpu = cpu; |
705 | ks->linux_regs = regs; | 711 | ks->linux_regs = regs; |
706 | 712 | ||
707 | if (!atomic_read(&cpu_in_kgdb[cpu]) && | 713 | if (kgdb_info[ks->cpu].enter_kgdb == 0 && |
708 | atomic_read(&kgdb_active) != -1 && | 714 | raw_spin_is_locked(&dbg_master_lock)) { |
709 | atomic_read(&kgdb_active) != cpu) { | 715 | kgdb_cpu_enter(ks, regs, DCPU_IS_SLAVE); |
710 | kgdb_info[cpu].exception_state |= DCPU_IS_SLAVE; | ||
711 | kgdb_cpu_enter(ks, regs); | ||
712 | kgdb_info[cpu].exception_state &= ~DCPU_IS_SLAVE; | ||
713 | return 0; | 716 | return 0; |
714 | } | 717 | } |
715 | #endif | 718 | #endif |
@@ -739,7 +742,7 @@ static struct console kgdbcons = { | |||
739 | }; | 742 | }; |
740 | 743 | ||
741 | #ifdef CONFIG_MAGIC_SYSRQ | 744 | #ifdef CONFIG_MAGIC_SYSRQ |
742 | static void sysrq_handle_dbg(int key, struct tty_struct *tty) | 745 | static void sysrq_handle_dbg(int key) |
743 | { | 746 | { |
744 | if (!dbg_io_ops) { | 747 | if (!dbg_io_ops) { |
745 | printk(KERN_CRIT "ERROR: No KGDB I/O module available\n"); | 748 | printk(KERN_CRIT "ERROR: No KGDB I/O module available\n"); |
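Editor's note: the per-CPU atomic arrays (passive_cpu_wait[NR_CPUS], cpu_in_kgdb[NR_CPUS]) are replaced above by two raw spinlocks plus two global counters. A rough sketch of the resulting control flow, simplified for illustration (the DCPU_NEXT_MASTER hand-off, single-step retry, and error paths are left out):

    /* Master path (sketch): win dbg_master_lock, publish ourselves via
     * kgdb_active, then hold dbg_slave_lock for the whole debug session. */
    if (raw_spin_trylock(&dbg_master_lock)) {
            atomic_xchg(&kgdb_active, cpu);
            raw_spin_lock(&dbg_slave_lock);
            /* wait for every online CPU to check in before debugging */
            while (kgdb_do_roundup && (atomic_read(&masters_in_kgdb) +
                   atomic_read(&slaves_in_kgdb)) != num_online_cpus())
                    cpu_relax();
            /* ... kdb_stub() or gdb_serial_stub() runs here ... */
            raw_spin_unlock(&dbg_slave_lock);
            atomic_set(&kgdb_active, -1);
            raw_spin_unlock(&dbg_master_lock);
    }

    /* Slave path (sketch): spin until the master drops dbg_slave_lock. */
    while (raw_spin_is_locked(&dbg_slave_lock))
            cpu_relax();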
diff --git a/kernel/debug/debug_core.h b/kernel/debug/debug_core.h index c5d753d80f67..3494c28a7e7a 100644 --- a/kernel/debug/debug_core.h +++ b/kernel/debug/debug_core.h | |||
@@ -40,6 +40,7 @@ struct debuggerinfo_struct { | |||
40 | int exception_state; | 40 | int exception_state; |
41 | int ret_state; | 41 | int ret_state; |
42 | int irq_depth; | 42 | int irq_depth; |
43 | int enter_kgdb; | ||
43 | }; | 44 | }; |
44 | 45 | ||
45 | extern struct debuggerinfo_struct kgdb_info[]; | 46 | extern struct debuggerinfo_struct kgdb_info[]; |
diff --git a/kernel/debug/gdbstub.c b/kernel/debug/gdbstub.c index e8fd6868682d..481a7bd2dfe7 100644 --- a/kernel/debug/gdbstub.c +++ b/kernel/debug/gdbstub.c | |||
@@ -6,7 +6,7 @@ | |||
6 | * Copyright (C) 2000-2001 VERITAS Software Corporation. | 6 | * Copyright (C) 2000-2001 VERITAS Software Corporation. |
7 | * Copyright (C) 2002-2004 Timesys Corporation | 7 | * Copyright (C) 2002-2004 Timesys Corporation |
8 | * Copyright (C) 2003-2004 Amit S. Kale <amitkale@linsyssoft.com> | 8 | * Copyright (C) 2003-2004 Amit S. Kale <amitkale@linsyssoft.com> |
9 | * Copyright (C) 2004 Pavel Machek <pavel@suse.cz> | 9 | * Copyright (C) 2004 Pavel Machek <pavel@ucw.cz> |
10 | * Copyright (C) 2004-2006 Tom Rini <trini@kernel.crashing.org> | 10 | * Copyright (C) 2004-2006 Tom Rini <trini@kernel.crashing.org> |
11 | * Copyright (C) 2004-2006 LinSysSoft Technologies Pvt. Ltd. | 11 | * Copyright (C) 2004-2006 LinSysSoft Technologies Pvt. Ltd. |
12 | * Copyright (C) 2005-2009 Wind River Systems, Inc. | 12 | * Copyright (C) 2005-2009 Wind River Systems, Inc. |
@@ -52,17 +52,6 @@ static unsigned long gdb_regs[(NUMREGBYTES + | |||
52 | * GDB remote protocol parser: | 52 | * GDB remote protocol parser: |
53 | */ | 53 | */ |
54 | 54 | ||
55 | static int hex(char ch) | ||
56 | { | ||
57 | if ((ch >= 'a') && (ch <= 'f')) | ||
58 | return ch - 'a' + 10; | ||
59 | if ((ch >= '0') && (ch <= '9')) | ||
60 | return ch - '0'; | ||
61 | if ((ch >= 'A') && (ch <= 'F')) | ||
62 | return ch - 'A' + 10; | ||
63 | return -1; | ||
64 | } | ||
65 | |||
66 | #ifdef CONFIG_KGDB_KDB | 55 | #ifdef CONFIG_KGDB_KDB |
67 | static int gdbstub_read_wait(void) | 56 | static int gdbstub_read_wait(void) |
68 | { | 57 | { |
@@ -123,8 +112,8 @@ static void get_packet(char *buffer) | |||
123 | buffer[count] = 0; | 112 | buffer[count] = 0; |
124 | 113 | ||
125 | if (ch == '#') { | 114 | if (ch == '#') { |
126 | xmitcsum = hex(gdbstub_read_wait()) << 4; | 115 | xmitcsum = hex_to_bin(gdbstub_read_wait()) << 4; |
127 | xmitcsum += hex(gdbstub_read_wait()); | 116 | xmitcsum += hex_to_bin(gdbstub_read_wait()); |
128 | 117 | ||
129 | if (checksum != xmitcsum) | 118 | if (checksum != xmitcsum) |
130 | /* failed checksum */ | 119 | /* failed checksum */ |
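Editor's note: the open-coded hex() helper removed above is replaced by the generic hex_to_bin() from lib/hexdump.c, which keeps the same contract: the value 0..15 for a valid hex digit, a negative value otherwise. For example:

    hex_to_bin('7');   /* ->  7 */
    hex_to_bin('f');   /* -> 15 */
    hex_to_bin('F');   /* -> 15 */
    hex_to_bin('g');   /* -> -1, not a hex digit */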
@@ -236,7 +225,7 @@ void gdbstub_msg_write(const char *s, int len) | |||
236 | * buf. Return a pointer to the last char put in buf (null). May | 225 | * buf. Return a pointer to the last char put in buf (null). May |
237 | * return an error. | 226 | * return an error. |
238 | */ | 227 | */ |
239 | int kgdb_mem2hex(char *mem, char *buf, int count) | 228 | char *kgdb_mem2hex(char *mem, char *buf, int count) |
240 | { | 229 | { |
241 | char *tmp; | 230 | char *tmp; |
242 | int err; | 231 | int err; |
@@ -248,17 +237,16 @@ int kgdb_mem2hex(char *mem, char *buf, int count) | |||
248 | tmp = buf + count; | 237 | tmp = buf + count; |
249 | 238 | ||
250 | err = probe_kernel_read(tmp, mem, count); | 239 | err = probe_kernel_read(tmp, mem, count); |
251 | if (!err) { | 240 | if (err) |
252 | while (count > 0) { | 241 | return NULL; |
253 | buf = pack_hex_byte(buf, *tmp); | 242 | while (count > 0) { |
254 | tmp++; | 243 | buf = pack_hex_byte(buf, *tmp); |
255 | count--; | 244 | tmp++; |
256 | } | 245 | count--; |
257 | |||
258 | *buf = 0; | ||
259 | } | 246 | } |
247 | *buf = 0; | ||
260 | 248 | ||
261 | return err; | 249 | return buf; |
262 | } | 250 | } |
263 | 251 | ||
264 | /* | 252 | /* |
@@ -280,8 +268,8 @@ int kgdb_hex2mem(char *buf, char *mem, int count) | |||
280 | tmp_hex = tmp_raw - 1; | 268 | tmp_hex = tmp_raw - 1; |
281 | while (tmp_hex >= buf) { | 269 | while (tmp_hex >= buf) { |
282 | tmp_raw--; | 270 | tmp_raw--; |
283 | *tmp_raw = hex(*tmp_hex--); | 271 | *tmp_raw = hex_to_bin(*tmp_hex--); |
284 | *tmp_raw |= hex(*tmp_hex--) << 4; | 272 | *tmp_raw |= hex_to_bin(*tmp_hex--) << 4; |
285 | } | 273 | } |
286 | 274 | ||
287 | return probe_kernel_write(mem, tmp_raw, count); | 275 | return probe_kernel_write(mem, tmp_raw, count); |
@@ -304,7 +292,7 @@ int kgdb_hex2long(char **ptr, unsigned long *long_val) | |||
304 | (*ptr)++; | 292 | (*ptr)++; |
305 | } | 293 | } |
306 | while (**ptr) { | 294 | while (**ptr) { |
307 | hex_val = hex(**ptr); | 295 | hex_val = hex_to_bin(**ptr); |
308 | if (hex_val < 0) | 296 | if (hex_val < 0) |
309 | break; | 297 | break; |
310 | 298 | ||
@@ -339,6 +327,32 @@ static int kgdb_ebin2mem(char *buf, char *mem, int count) | |||
339 | return probe_kernel_write(mem, c, size); | 327 | return probe_kernel_write(mem, c, size); |
340 | } | 328 | } |
341 | 329 | ||
330 | #if DBG_MAX_REG_NUM > 0 | ||
331 | void pt_regs_to_gdb_regs(unsigned long *gdb_regs, struct pt_regs *regs) | ||
332 | { | ||
333 | int i; | ||
334 | int idx = 0; | ||
335 | char *ptr = (char *)gdb_regs; | ||
336 | |||
337 | for (i = 0; i < DBG_MAX_REG_NUM; i++) { | ||
338 | dbg_get_reg(i, ptr + idx, regs); | ||
339 | idx += dbg_reg_def[i].size; | ||
340 | } | ||
341 | } | ||
342 | |||
343 | void gdb_regs_to_pt_regs(unsigned long *gdb_regs, struct pt_regs *regs) | ||
344 | { | ||
345 | int i; | ||
346 | int idx = 0; | ||
347 | char *ptr = (char *)gdb_regs; | ||
348 | |||
349 | for (i = 0; i < DBG_MAX_REG_NUM; i++) { | ||
350 | dbg_set_reg(i, ptr + idx, regs); | ||
351 | idx += dbg_reg_def[i].size; | ||
352 | } | ||
353 | } | ||
354 | #endif /* DBG_MAX_REG_NUM > 0 */ | ||
355 | |||
342 | /* Write memory due to an 'M' or 'X' packet. */ | 356 | /* Write memory due to an 'M' or 'X' packet. */ |
343 | static int write_mem_msg(int binary) | 357 | static int write_mem_msg(int binary) |
344 | { | 358 | { |
@@ -378,28 +392,31 @@ static void error_packet(char *pkt, int error) | |||
378 | * remapped to negative TIDs. | 392 | * remapped to negative TIDs. |
379 | */ | 393 | */ |
380 | 394 | ||
381 | #define BUF_THREAD_ID_SIZE 16 | 395 | #define BUF_THREAD_ID_SIZE 8 |
382 | 396 | ||
383 | static char *pack_threadid(char *pkt, unsigned char *id) | 397 | static char *pack_threadid(char *pkt, unsigned char *id) |
384 | { | 398 | { |
385 | char *limit; | 399 | unsigned char *limit; |
400 | int lzero = 1; | ||
401 | |||
402 | limit = id + (BUF_THREAD_ID_SIZE / 2); | ||
403 | while (id < limit) { | ||
404 | if (!lzero || *id != 0) { | ||
405 | pkt = pack_hex_byte(pkt, *id); | ||
406 | lzero = 0; | ||
407 | } | ||
408 | id++; | ||
409 | } | ||
386 | 410 | ||
387 | limit = pkt + BUF_THREAD_ID_SIZE; | 411 | if (lzero) |
388 | while (pkt < limit) | 412 | pkt = pack_hex_byte(pkt, 0); |
389 | pkt = pack_hex_byte(pkt, *id++); | ||
390 | 413 | ||
391 | return pkt; | 414 | return pkt; |
392 | } | 415 | } |
393 | 416 | ||
394 | static void int_to_threadref(unsigned char *id, int value) | 417 | static void int_to_threadref(unsigned char *id, int value) |
395 | { | 418 | { |
396 | unsigned char *scan; | 419 | put_unaligned_be32(value, id); |
397 | int i = 4; | ||
398 | |||
399 | scan = (unsigned char *)id; | ||
400 | while (i--) | ||
401 | *scan++ = 0; | ||
402 | put_unaligned_be32(value, scan); | ||
403 | } | 420 | } |
404 | 421 | ||
405 | static struct task_struct *getthread(struct pt_regs *regs, int tid) | 422 | static struct task_struct *getthread(struct pt_regs *regs, int tid) |
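Editor's note: with int_to_threadref() reduced to a put_unaligned_be32() and pack_threadid() skipping leading zero bytes, thread IDs now go out in their shortest even-length hex form. A small illustration using the names from the hunk above:

    unsigned char id[BUF_THREAD_ID_SIZE];
    char pkt[BUF_THREAD_ID_SIZE + 1];

    int_to_threadref(id, 5);          /* id = 00 00 00 05               */
    *pack_threadid(pkt, id) = '\0';   /* pkt = "05", not "00000005"     */

    int_to_threadref(id, 0x1234);     /* id = 00 00 12 34               */
    *pack_threadid(pkt, id) = '\0';   /* pkt = "1234"                   */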
@@ -463,8 +480,7 @@ static void gdb_cmd_status(struct kgdb_state *ks) | |||
463 | pack_hex_byte(&remcom_out_buffer[1], ks->signo); | 480 | pack_hex_byte(&remcom_out_buffer[1], ks->signo); |
464 | } | 481 | } |
465 | 482 | ||
466 | /* Handle the 'g' get registers request */ | 483 | static void gdb_get_regs_helper(struct kgdb_state *ks) |
467 | static void gdb_cmd_getregs(struct kgdb_state *ks) | ||
468 | { | 484 | { |
469 | struct task_struct *thread; | 485 | struct task_struct *thread; |
470 | void *local_debuggerinfo; | 486 | void *local_debuggerinfo; |
@@ -505,6 +521,12 @@ static void gdb_cmd_getregs(struct kgdb_state *ks) | |||
505 | */ | 521 | */ |
506 | sleeping_thread_to_gdb_regs(gdb_regs, thread); | 522 | sleeping_thread_to_gdb_regs(gdb_regs, thread); |
507 | } | 523 | } |
524 | } | ||
525 | |||
526 | /* Handle the 'g' get registers request */ | ||
527 | static void gdb_cmd_getregs(struct kgdb_state *ks) | ||
528 | { | ||
529 | gdb_get_regs_helper(ks); | ||
508 | kgdb_mem2hex((char *)gdb_regs, remcom_out_buffer, NUMREGBYTES); | 530 | kgdb_mem2hex((char *)gdb_regs, remcom_out_buffer, NUMREGBYTES); |
509 | } | 531 | } |
510 | 532 | ||
@@ -527,13 +549,13 @@ static void gdb_cmd_memread(struct kgdb_state *ks) | |||
527 | char *ptr = &remcom_in_buffer[1]; | 549 | char *ptr = &remcom_in_buffer[1]; |
528 | unsigned long length; | 550 | unsigned long length; |
529 | unsigned long addr; | 551 | unsigned long addr; |
530 | int err; | 552 | char *err; |
531 | 553 | ||
532 | if (kgdb_hex2long(&ptr, &addr) > 0 && *ptr++ == ',' && | 554 | if (kgdb_hex2long(&ptr, &addr) > 0 && *ptr++ == ',' && |
533 | kgdb_hex2long(&ptr, &length) > 0) { | 555 | kgdb_hex2long(&ptr, &length) > 0) { |
534 | err = kgdb_mem2hex((char *)addr, remcom_out_buffer, length); | 556 | err = kgdb_mem2hex((char *)addr, remcom_out_buffer, length); |
535 | if (err) | 557 | if (!err) |
536 | error_packet(remcom_out_buffer, err); | 558 | error_packet(remcom_out_buffer, -EINVAL); |
537 | } else { | 559 | } else { |
538 | error_packet(remcom_out_buffer, -EINVAL); | 560 | error_packet(remcom_out_buffer, -EINVAL); |
539 | } | 561 | } |
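Editor's note: kgdb_mem2hex() now returns a pointer past the hex string on success and NULL when probe_kernel_read() faults, so callers test the pointer instead of an errno. A minimal sketch of the new calling convention:

    char *end;

    end = kgdb_mem2hex((char *)addr, remcom_out_buffer, length);
    if (!end)                               /* probe_kernel_read() faulted */
            error_packet(remcom_out_buffer, -EINVAL);
    /* otherwise remcom_out_buffer holds the NUL-terminated hex dump */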
@@ -550,6 +572,60 @@ static void gdb_cmd_memwrite(struct kgdb_state *ks) | |||
550 | strcpy(remcom_out_buffer, "OK"); | 572 | strcpy(remcom_out_buffer, "OK"); |
551 | } | 573 | } |
552 | 574 | ||
575 | #if DBG_MAX_REG_NUM > 0 | ||
576 | static char *gdb_hex_reg_helper(int regnum, char *out) | ||
577 | { | ||
578 | int i; | ||
579 | int offset = 0; | ||
580 | |||
581 | for (i = 0; i < regnum; i++) | ||
582 | offset += dbg_reg_def[i].size; | ||
583 | return kgdb_mem2hex((char *)gdb_regs + offset, out, | ||
584 | dbg_reg_def[i].size); | ||
585 | } | ||
586 | |||
587 | /* Handle the 'p' individual register get */ | ||
588 | static void gdb_cmd_reg_get(struct kgdb_state *ks) | ||
589 | { | ||
590 | unsigned long regnum; | ||
591 | char *ptr = &remcom_in_buffer[1]; | ||
592 | |||
593 | kgdb_hex2long(&ptr, ®num); | ||
594 | if (regnum >= DBG_MAX_REG_NUM) { | ||
595 | error_packet(remcom_out_buffer, -EINVAL); | ||
596 | return; | ||
597 | } | ||
598 | gdb_get_regs_helper(ks); | ||
599 | gdb_hex_reg_helper(regnum, remcom_out_buffer); | ||
600 | } | ||
601 | |||
602 | /* Handle the 'P' individual register set */ | ||
603 | static void gdb_cmd_reg_set(struct kgdb_state *ks) | ||
604 | { | ||
605 | unsigned long regnum; | ||
606 | char *ptr = &remcom_in_buffer[1]; | ||
607 | int i = 0; | ||
608 | |||
609 | kgdb_hex2long(&ptr, ®num); | ||
610 | if (*ptr++ != '=' || | ||
611 | !(!kgdb_usethread || kgdb_usethread == current) || | ||
612 | !dbg_get_reg(regnum, gdb_regs, ks->linux_regs)) { | ||
613 | error_packet(remcom_out_buffer, -EINVAL); | ||
614 | return; | ||
615 | } | ||
616 | memset(gdb_regs, 0, sizeof(gdb_regs)); | ||
617 | while (i < sizeof(gdb_regs) * 2) | ||
618 | if (hex_to_bin(ptr[i]) >= 0) | ||
619 | i++; | ||
620 | else | ||
621 | break; | ||
622 | i = i / 2; | ||
623 | kgdb_hex2mem(ptr, (char *)gdb_regs, i); | ||
624 | dbg_set_reg(regnum, gdb_regs, ks->linux_regs); | ||
625 | strcpy(remcom_out_buffer, "OK"); | ||
626 | } | ||
627 | #endif /* DBG_MAX_REG_NUM > 0 */ | ||
628 | |||
553 | /* Handle the 'X' memory binary write bytes */ | 629 | /* Handle the 'X' memory binary write bytes */ |
554 | static void gdb_cmd_binwrite(struct kgdb_state *ks) | 630 | static void gdb_cmd_binwrite(struct kgdb_state *ks) |
555 | { | 631 | { |
@@ -612,7 +688,7 @@ static void gdb_cmd_query(struct kgdb_state *ks) | |||
612 | { | 688 | { |
613 | struct task_struct *g; | 689 | struct task_struct *g; |
614 | struct task_struct *p; | 690 | struct task_struct *p; |
615 | unsigned char thref[8]; | 691 | unsigned char thref[BUF_THREAD_ID_SIZE]; |
616 | char *ptr; | 692 | char *ptr; |
617 | int i; | 693 | int i; |
618 | int cpu; | 694 | int cpu; |
@@ -632,8 +708,7 @@ static void gdb_cmd_query(struct kgdb_state *ks) | |||
632 | for_each_online_cpu(cpu) { | 708 | for_each_online_cpu(cpu) { |
633 | ks->thr_query = 0; | 709 | ks->thr_query = 0; |
634 | int_to_threadref(thref, -cpu - 2); | 710 | int_to_threadref(thref, -cpu - 2); |
635 | pack_threadid(ptr, thref); | 711 | ptr = pack_threadid(ptr, thref); |
636 | ptr += BUF_THREAD_ID_SIZE; | ||
637 | *(ptr++) = ','; | 712 | *(ptr++) = ','; |
638 | i++; | 713 | i++; |
639 | } | 714 | } |
@@ -642,8 +717,7 @@ static void gdb_cmd_query(struct kgdb_state *ks) | |||
642 | do_each_thread(g, p) { | 717 | do_each_thread(g, p) { |
643 | if (i >= ks->thr_query && !finished) { | 718 | if (i >= ks->thr_query && !finished) { |
644 | int_to_threadref(thref, p->pid); | 719 | int_to_threadref(thref, p->pid); |
645 | pack_threadid(ptr, thref); | 720 | ptr = pack_threadid(ptr, thref); |
646 | ptr += BUF_THREAD_ID_SIZE; | ||
647 | *(ptr++) = ','; | 721 | *(ptr++) = ','; |
648 | ks->thr_query++; | 722 | ks->thr_query++; |
649 | if (ks->thr_query % KGDB_MAX_THREAD_QUERY == 0) | 723 | if (ks->thr_query % KGDB_MAX_THREAD_QUERY == 0) |
@@ -858,11 +932,14 @@ int gdb_serial_stub(struct kgdb_state *ks) | |||
858 | int error = 0; | 932 | int error = 0; |
859 | int tmp; | 933 | int tmp; |
860 | 934 | ||
861 | /* Clear the out buffer. */ | 935 | /* Initialize comm buffer and globals. */ |
862 | memset(remcom_out_buffer, 0, sizeof(remcom_out_buffer)); | 936 | memset(remcom_out_buffer, 0, sizeof(remcom_out_buffer)); |
937 | kgdb_usethread = kgdb_info[ks->cpu].task; | ||
938 | ks->kgdb_usethreadid = shadow_pid(kgdb_info[ks->cpu].task->pid); | ||
939 | ks->pass_exception = 0; | ||
863 | 940 | ||
864 | if (kgdb_connected) { | 941 | if (kgdb_connected) { |
865 | unsigned char thref[8]; | 942 | unsigned char thref[BUF_THREAD_ID_SIZE]; |
866 | char *ptr; | 943 | char *ptr; |
867 | 944 | ||
868 | /* Reply to host that an exception has occurred */ | 945 | /* Reply to host that an exception has occurred */ |
@@ -876,10 +953,6 @@ int gdb_serial_stub(struct kgdb_state *ks) | |||
876 | put_packet(remcom_out_buffer); | 953 | put_packet(remcom_out_buffer); |
877 | } | 954 | } |
878 | 955 | ||
879 | kgdb_usethread = kgdb_info[ks->cpu].task; | ||
880 | ks->kgdb_usethreadid = shadow_pid(kgdb_info[ks->cpu].task->pid); | ||
881 | ks->pass_exception = 0; | ||
882 | |||
883 | while (1) { | 956 | while (1) { |
884 | error = 0; | 957 | error = 0; |
885 | 958 | ||
@@ -904,6 +977,14 @@ int gdb_serial_stub(struct kgdb_state *ks) | |||
904 | case 'M': /* MAA..AA,LLLL: Write LLLL bytes at address AA..AA */ | 977 | case 'M': /* MAA..AA,LLLL: Write LLLL bytes at address AA..AA */ |
905 | gdb_cmd_memwrite(ks); | 978 | gdb_cmd_memwrite(ks); |
906 | break; | 979 | break; |
980 | #if DBG_MAX_REG_NUM > 0 | ||
981 | case 'p': /* pXX Return gdb register XX (in hex) */ | ||
982 | gdb_cmd_reg_get(ks); | ||
983 | break; | ||
984 | case 'P': /* PXX=aaaa Set gdb register XX to aaaa (in hex) */ | ||
985 | gdb_cmd_reg_set(ks); | ||
986 | break; | ||
987 | #endif /* DBG_MAX_REG_NUM > 0 */ | ||
907 | case 'X': /* XAA..AA,LLLL: Write LLLL bytes at address AA..AA */ | 988 | case 'X': /* XAA..AA,LLLL: Write LLLL bytes at address AA..AA */ |
908 | gdb_cmd_binwrite(ks); | 989 | gdb_cmd_binwrite(ks); |
909 | break; | 990 | break; |
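Editor's note: the new 'p'/'P' packets (and the kdb 'rd'/'rm' rework further down) only compile in when the architecture defines DBG_MAX_REG_NUM together with a dbg_reg_def[] table and dbg_get_reg()/dbg_set_reg() accessors. A hypothetical arch-side definition, assuming the dbg_reg_def_t layout from include/linux/kgdb.h and a toy pt_regs, might look like:

    /* Hypothetical example for a toy architecture -- not part of this patch. */
    struct dbg_reg_def_t dbg_reg_def[DBG_MAX_REG_NUM] = {
            { "ax", 4, offsetof(struct pt_regs, ax) },
            { "bx", 4, offsetof(struct pt_regs, bx) },
            { "pc", 4, offsetof(struct pt_regs, pc) },
    };

    char *dbg_get_reg(int regno, void *mem, struct pt_regs *regs)
    {
            if (regno < 0 || regno >= DBG_MAX_REG_NUM)
                    return NULL;
            memcpy(mem, (void *)regs + dbg_reg_def[regno].offset,
                   dbg_reg_def[regno].size);
            return dbg_reg_def[regno].name;
    }

    int dbg_set_reg(int regno, void *mem, struct pt_regs *regs)
    {
            if (regno < 0 || regno >= DBG_MAX_REG_NUM)
                    return -EINVAL;
            memcpy((void *)regs + dbg_reg_def[regno].offset, mem,
                   dbg_reg_def[regno].size);
            return 0;
    }

With such a table in place, gdb keeps using 'g'/'G' for full register sets, while a packet like "p0a" fetches only register 0x0a and "P0a=0000beef" writes it.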
diff --git a/kernel/debug/kdb/kdb_bp.c b/kernel/debug/kdb/kdb_bp.c index 75bd9b3ebbb7..20059ef4459a 100644 --- a/kernel/debug/kdb/kdb_bp.c +++ b/kernel/debug/kdb/kdb_bp.c | |||
@@ -274,7 +274,6 @@ static int kdb_bp(int argc, const char **argv) | |||
274 | int i, bpno; | 274 | int i, bpno; |
275 | kdb_bp_t *bp, *bp_check; | 275 | kdb_bp_t *bp, *bp_check; |
276 | int diag; | 276 | int diag; |
277 | int free; | ||
278 | char *symname = NULL; | 277 | char *symname = NULL; |
279 | long offset = 0ul; | 278 | long offset = 0ul; |
280 | int nextarg; | 279 | int nextarg; |
@@ -305,7 +304,6 @@ static int kdb_bp(int argc, const char **argv) | |||
305 | /* | 304 | /* |
306 | * Find an empty bp structure to allocate | 305 | * Find an empty bp structure to allocate |
307 | */ | 306 | */ |
308 | free = KDB_MAXBPT; | ||
309 | for (bpno = 0, bp = kdb_breakpoints; bpno < KDB_MAXBPT; bpno++, bp++) { | 307 | for (bpno = 0, bp = kdb_breakpoints; bpno < KDB_MAXBPT; bpno++, bp++) { |
310 | if (bp->bp_free) | 308 | if (bp->bp_free) |
311 | break; | 309 | break; |
diff --git a/kernel/debug/kdb/kdb_debugger.c b/kernel/debug/kdb/kdb_debugger.c index bf6e8270e957..dd0b1b7dd02c 100644 --- a/kernel/debug/kdb/kdb_debugger.c +++ b/kernel/debug/kdb/kdb_debugger.c | |||
@@ -86,7 +86,7 @@ int kdb_stub(struct kgdb_state *ks) | |||
86 | } | 86 | } |
87 | /* Set initial kdb state variables */ | 87 | /* Set initial kdb state variables */ |
88 | KDB_STATE_CLEAR(KGDB_TRANS); | 88 | KDB_STATE_CLEAR(KGDB_TRANS); |
89 | kdb_initial_cpu = ks->cpu; | 89 | kdb_initial_cpu = atomic_read(&kgdb_active); |
90 | kdb_current_task = kgdb_info[ks->cpu].task; | 90 | kdb_current_task = kgdb_info[ks->cpu].task; |
91 | kdb_current_regs = kgdb_info[ks->cpu].debuggerinfo; | 91 | kdb_current_regs = kgdb_info[ks->cpu].debuggerinfo; |
92 | /* Remove any breakpoints as needed by kdb and clear single step */ | 92 | /* Remove any breakpoints as needed by kdb and clear single step */ |
@@ -105,7 +105,6 @@ int kdb_stub(struct kgdb_state *ks) | |||
105 | ks->pass_exception = 1; | 105 | ks->pass_exception = 1; |
106 | KDB_FLAG_SET(CATASTROPHIC); | 106 | KDB_FLAG_SET(CATASTROPHIC); |
107 | } | 107 | } |
108 | kdb_initial_cpu = ks->cpu; | ||
109 | if (KDB_STATE(SSBPT) && reason == KDB_REASON_SSTEP) { | 108 | if (KDB_STATE(SSBPT) && reason == KDB_REASON_SSTEP) { |
110 | KDB_STATE_CLEAR(SSBPT); | 109 | KDB_STATE_CLEAR(SSBPT); |
111 | KDB_STATE_CLEAR(DOING_SS); | 110 | KDB_STATE_CLEAR(DOING_SS); |
diff --git a/kernel/debug/kdb/kdb_io.c b/kernel/debug/kdb/kdb_io.c index c9b7f4f90bba..96fdaac46a80 100644 --- a/kernel/debug/kdb/kdb_io.c +++ b/kernel/debug/kdb/kdb_io.c | |||
@@ -823,4 +823,4 @@ int kdb_printf(const char *fmt, ...) | |||
823 | 823 | ||
824 | return r; | 824 | return r; |
825 | } | 825 | } |
826 | 826 | EXPORT_SYMBOL_GPL(kdb_printf); | |
diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c index ebe4a287419e..d7bda21a106b 100644 --- a/kernel/debug/kdb/kdb_main.c +++ b/kernel/debug/kdb/kdb_main.c | |||
@@ -312,7 +312,7 @@ int kdbgetularg(const char *arg, unsigned long *value) | |||
312 | 312 | ||
313 | if (endp == arg) { | 313 | if (endp == arg) { |
314 | /* | 314 | /* |
315 | * Try base 16, for us folks too lazy to type the | 315 | * Also try base 16, for us folks too lazy to type the |
316 | * leading 0x... | 316 | * leading 0x... |
317 | */ | 317 | */ |
318 | val = simple_strtoul(arg, &endp, 16); | 318 | val = simple_strtoul(arg, &endp, 16); |
@@ -325,6 +325,25 @@ int kdbgetularg(const char *arg, unsigned long *value) | |||
325 | return 0; | 325 | return 0; |
326 | } | 326 | } |
327 | 327 | ||
328 | int kdbgetu64arg(const char *arg, u64 *value) | ||
329 | { | ||
330 | char *endp; | ||
331 | u64 val; | ||
332 | |||
333 | val = simple_strtoull(arg, &endp, 0); | ||
334 | |||
335 | if (endp == arg) { | ||
336 | |||
337 | val = simple_strtoull(arg, &endp, 16); | ||
338 | if (endp == arg) | ||
339 | return KDB_BADINT; | ||
340 | } | ||
341 | |||
342 | *value = val; | ||
343 | |||
344 | return 0; | ||
345 | } | ||
346 | |||
328 | /* | 347 | /* |
329 | * kdb_set - This function implements the 'set' command. Alter an | 348 | * kdb_set - This function implements the 'set' command. Alter an |
330 | * existing environment variable or create a new one. | 349 | * existing environment variable or create a new one. |
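Editor's note: kdbgetu64arg() mirrors kdbgetularg(): it first parses with base auto-detection (simple_strtoull with base 0) and, if nothing was consumed, retries as bare hex so a missing 0x prefix still works. Illustrative calls:

    u64 v;

    kdbgetu64arg("0x1234", &v);      /* base-0 path, v = 0x1234            */
    kdbgetu64arg("deadbeef", &v);    /* falls back to base 16, v = 0xdeadbeef */
    kdbgetu64arg("zzz", &v);         /* returns KDB_BADINT, v untouched    */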
@@ -1730,13 +1749,13 @@ static int kdb_go(int argc, const char **argv) | |||
1730 | int nextarg; | 1749 | int nextarg; |
1731 | long offset; | 1750 | long offset; |
1732 | 1751 | ||
1752 | if (raw_smp_processor_id() != kdb_initial_cpu) { | ||
1753 | kdb_printf("go must execute on the entry cpu, " | ||
1754 | "please use \"cpu %d\" and then execute go\n", | ||
1755 | kdb_initial_cpu); | ||
1756 | return KDB_BADCPUNUM; | ||
1757 | } | ||
1733 | if (argc == 1) { | 1758 | if (argc == 1) { |
1734 | if (raw_smp_processor_id() != kdb_initial_cpu) { | ||
1735 | kdb_printf("go <address> must be issued from the " | ||
1736 | "initial cpu, do cpu %d first\n", | ||
1737 | kdb_initial_cpu); | ||
1738 | return KDB_ARGCOUNT; | ||
1739 | } | ||
1740 | nextarg = 1; | 1759 | nextarg = 1; |
1741 | diag = kdbgetaddrarg(argc, argv, &nextarg, | 1760 | diag = kdbgetaddrarg(argc, argv, &nextarg, |
1742 | &addr, &offset, NULL); | 1761 | &addr, &offset, NULL); |
@@ -1770,11 +1789,65 @@ static int kdb_go(int argc, const char **argv) | |||
1770 | */ | 1789 | */ |
1771 | static int kdb_rd(int argc, const char **argv) | 1790 | static int kdb_rd(int argc, const char **argv) |
1772 | { | 1791 | { |
1773 | int diag = kdb_check_regs(); | 1792 | int len = kdb_check_regs(); |
1774 | if (diag) | 1793 | #if DBG_MAX_REG_NUM > 0 |
1775 | return diag; | 1794 | int i; |
1795 | char *rname; | ||
1796 | int rsize; | ||
1797 | u64 reg64; | ||
1798 | u32 reg32; | ||
1799 | u16 reg16; | ||
1800 | u8 reg8; | ||
1801 | |||
1802 | if (len) | ||
1803 | return len; | ||
1804 | |||
1805 | for (i = 0; i < DBG_MAX_REG_NUM; i++) { | ||
1806 | rsize = dbg_reg_def[i].size * 2; | ||
1807 | if (rsize > 16) | ||
1808 | rsize = 2; | ||
1809 | if (len + strlen(dbg_reg_def[i].name) + 4 + rsize > 80) { | ||
1810 | len = 0; | ||
1811 | kdb_printf("\n"); | ||
1812 | } | ||
1813 | if (len) | ||
1814 | len += kdb_printf(" "); | ||
1815 | switch(dbg_reg_def[i].size * 8) { | ||
1816 | case 8: | ||
1817 | rname = dbg_get_reg(i, ®8, kdb_current_regs); | ||
1818 | if (!rname) | ||
1819 | break; | ||
1820 | len += kdb_printf("%s: %02x", rname, reg8); | ||
1821 | break; | ||
1822 | case 16: | ||
1823 | rname = dbg_get_reg(i, ®16, kdb_current_regs); | ||
1824 | if (!rname) | ||
1825 | break; | ||
1826 | len += kdb_printf("%s: %04x", rname, reg16); | ||
1827 | break; | ||
1828 | case 32: | ||
1829 | rname = dbg_get_reg(i, ®32, kdb_current_regs); | ||
1830 | if (!rname) | ||
1831 | break; | ||
1832 | len += kdb_printf("%s: %08x", rname, reg32); | ||
1833 | break; | ||
1834 | case 64: | ||
1835 | rname = dbg_get_reg(i, ®64, kdb_current_regs); | ||
1836 | if (!rname) | ||
1837 | break; | ||
1838 | len += kdb_printf("%s: %016llx", rname, reg64); | ||
1839 | break; | ||
1840 | default: | ||
1841 | len += kdb_printf("%s: ??", dbg_reg_def[i].name); | ||
1842 | } | ||
1843 | } | ||
1844 | kdb_printf("\n"); | ||
1845 | #else | ||
1846 | if (len) | ||
1847 | return len; | ||
1776 | 1848 | ||
1777 | kdb_dumpregs(kdb_current_regs); | 1849 | kdb_dumpregs(kdb_current_regs); |
1850 | #endif | ||
1778 | return 0; | 1851 | return 0; |
1779 | } | 1852 | } |
1780 | 1853 | ||
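Editor's note: the rewritten 'rd' walks dbg_reg_def[] and prints each register at its declared width, wrapping before the 80th column. On a 64-bit target the output would look roughly like this (register names and values are purely illustrative):

    kdb> rd
    ax: 0000000000000000 bx: ffff880037a0c000 cx: 00000000000000a0
    dx: 0000000000000001 si: ffffffff81a8d1a0 di: 0000000000000246
    ...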
@@ -1782,32 +1855,67 @@ static int kdb_rd(int argc, const char **argv) | |||
1782 | * kdb_rm - This function implements the 'rm' (register modify) command. | 1855 | * kdb_rm - This function implements the 'rm' (register modify) command. |
1783 | * rm register-name new-contents | 1856 | * rm register-name new-contents |
1784 | * Remarks: | 1857 | * Remarks: |
1785 | * Currently doesn't allow modification of control or | 1858 | * Allows register modification with the same restrictions as gdb |
1786 | * debug registers. | ||
1787 | */ | 1859 | */ |
1788 | static int kdb_rm(int argc, const char **argv) | 1860 | static int kdb_rm(int argc, const char **argv) |
1789 | { | 1861 | { |
1862 | #if DBG_MAX_REG_NUM > 0 | ||
1790 | int diag; | 1863 | int diag; |
1791 | int ind = 0; | 1864 | const char *rname; |
1792 | unsigned long contents; | 1865 | int i; |
1866 | u64 reg64; | ||
1867 | u32 reg32; | ||
1868 | u16 reg16; | ||
1869 | u8 reg8; | ||
1793 | 1870 | ||
1794 | if (argc != 2) | 1871 | if (argc != 2) |
1795 | return KDB_ARGCOUNT; | 1872 | return KDB_ARGCOUNT; |
1796 | /* | 1873 | /* |
1797 | * Allow presence or absence of leading '%' symbol. | 1874 | * Allow presence or absence of leading '%' symbol. |
1798 | */ | 1875 | */ |
1799 | if (argv[1][0] == '%') | 1876 | rname = argv[1]; |
1800 | ind = 1; | 1877 | if (*rname == '%') |
1878 | rname++; | ||
1801 | 1879 | ||
1802 | diag = kdbgetularg(argv[2], &contents); | 1880 | diag = kdbgetu64arg(argv[2], ®64); |
1803 | if (diag) | 1881 | if (diag) |
1804 | return diag; | 1882 | return diag; |
1805 | 1883 | ||
1806 | diag = kdb_check_regs(); | 1884 | diag = kdb_check_regs(); |
1807 | if (diag) | 1885 | if (diag) |
1808 | return diag; | 1886 | return diag; |
1887 | |||
1888 | diag = KDB_BADREG; | ||
1889 | for (i = 0; i < DBG_MAX_REG_NUM; i++) { | ||
1890 | if (strcmp(rname, dbg_reg_def[i].name) == 0) { | ||
1891 | diag = 0; | ||
1892 | break; | ||
1893 | } | ||
1894 | } | ||
1895 | if (!diag) { | ||
1896 | switch(dbg_reg_def[i].size * 8) { | ||
1897 | case 8: | ||
1898 | reg8 = reg64; | ||
1899 | dbg_set_reg(i, ®8, kdb_current_regs); | ||
1900 | break; | ||
1901 | case 16: | ||
1902 | reg16 = reg64; | ||
1903 | dbg_set_reg(i, ®16, kdb_current_regs); | ||
1904 | break; | ||
1905 | case 32: | ||
1906 | reg32 = reg64; | ||
1907 | dbg_set_reg(i, ®32, kdb_current_regs); | ||
1908 | break; | ||
1909 | case 64: | ||
1910 | dbg_set_reg(i, ®64, kdb_current_regs); | ||
1911 | break; | ||
1912 | } | ||
1913 | } | ||
1914 | return diag; | ||
1915 | #else | ||
1809 | kdb_printf("ERROR: Register set currently not implemented\n"); | 1916 | kdb_printf("ERROR: Register set currently not implemented\n"); |
1810 | return 0; | 1917 | return 0; |
1918 | #endif | ||
1811 | } | 1919 | } |
1812 | 1920 | ||
1813 | #if defined(CONFIG_MAGIC_SYSRQ) | 1921 | #if defined(CONFIG_MAGIC_SYSRQ) |
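Editor's note: kdb_rm() now resolves the register name through the same dbg_reg_def[] table and narrows the parsed 64-bit value to the register's declared size before calling dbg_set_reg(). An illustrative session on an architecture that provides the table (names are hypothetical):

    kdb> rm %ax 0x12345678      parsed as u64, stored at the register's real width
    kdb> rm flags 0x246         the leading '%' is optional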
@@ -1821,7 +1929,7 @@ static int kdb_sr(int argc, const char **argv) | |||
1821 | if (argc != 1) | 1929 | if (argc != 1) |
1822 | return KDB_ARGCOUNT; | 1930 | return KDB_ARGCOUNT; |
1823 | kdb_trap_printk++; | 1931 | kdb_trap_printk++; |
1824 | __handle_sysrq(*argv[1], NULL, 0); | 1932 | __handle_sysrq(*argv[1], false); |
1825 | kdb_trap_printk--; | 1933 | kdb_trap_printk--; |
1826 | 1934 | ||
1827 | return 0; | 1935 | return 0; |
@@ -2440,6 +2548,7 @@ static void kdb_sysinfo(struct sysinfo *val) | |||
2440 | */ | 2548 | */ |
2441 | static int kdb_summary(int argc, const char **argv) | 2549 | static int kdb_summary(int argc, const char **argv) |
2442 | { | 2550 | { |
2551 | struct timespec now; | ||
2443 | struct kdb_tm tm; | 2552 | struct kdb_tm tm; |
2444 | struct sysinfo val; | 2553 | struct sysinfo val; |
2445 | 2554 | ||
@@ -2454,7 +2563,8 @@ static int kdb_summary(int argc, const char **argv) | |||
2454 | kdb_printf("domainname %s\n", init_uts_ns.name.domainname); | 2563 | kdb_printf("domainname %s\n", init_uts_ns.name.domainname); |
2455 | kdb_printf("ccversion %s\n", __stringify(CCVERSION)); | 2564 | kdb_printf("ccversion %s\n", __stringify(CCVERSION)); |
2456 | 2565 | ||
2457 | kdb_gmtime(&xtime, &tm); | 2566 | now = __current_kernel_time(); |
2567 | kdb_gmtime(&now, &tm); | ||
2458 | kdb_printf("date %04d-%02d-%02d %02d:%02d:%02d " | 2568 | kdb_printf("date %04d-%02d-%02d %02d:%02d:%02d " |
2459 | "tz_minuteswest %d\n", | 2569 | "tz_minuteswest %d\n", |
2460 | 1900+tm.tm_year, tm.tm_mon+1, tm.tm_mday, | 2570 | 1900+tm.tm_year, tm.tm_mon+1, tm.tm_mday, |
@@ -2673,6 +2783,8 @@ int kdb_register_repeat(char *cmd, | |||
2673 | 2783 | ||
2674 | return 0; | 2784 | return 0; |
2675 | } | 2785 | } |
2786 | EXPORT_SYMBOL_GPL(kdb_register_repeat); | ||
2787 | |||
2676 | 2788 | ||
2677 | /* | 2789 | /* |
2678 | * kdb_register - Compatibility register function for commands that do | 2790 | * kdb_register - Compatibility register function for commands that do |
@@ -2695,6 +2807,7 @@ int kdb_register(char *cmd, | |||
2695 | return kdb_register_repeat(cmd, func, usage, help, minlen, | 2807 | return kdb_register_repeat(cmd, func, usage, help, minlen, |
2696 | KDB_REPEAT_NONE); | 2808 | KDB_REPEAT_NONE); |
2697 | } | 2809 | } |
2810 | EXPORT_SYMBOL_GPL(kdb_register); | ||
2698 | 2811 | ||
2699 | /* | 2812 | /* |
2700 | * kdb_unregister - This function is used to unregister a kernel | 2813 | * kdb_unregister - This function is used to unregister a kernel |
@@ -2713,7 +2826,7 @@ int kdb_unregister(char *cmd) | |||
2713 | /* | 2826 | /* |
2714 | * find the command. | 2827 | * find the command. |
2715 | */ | 2828 | */ |
2716 | for (i = 0, kp = kdb_commands; i < kdb_max_commands; i++, kp++) { | 2829 | for_each_kdbcmd(kp, i) { |
2717 | if (kp->cmd_name && (strcmp(kp->cmd_name, cmd) == 0)) { | 2830 | if (kp->cmd_name && (strcmp(kp->cmd_name, cmd) == 0)) { |
2718 | kp->cmd_name = NULL; | 2831 | kp->cmd_name = NULL; |
2719 | return 0; | 2832 | return 0; |
@@ -2723,6 +2836,7 @@ int kdb_unregister(char *cmd) | |||
2723 | /* Couldn't find it. */ | 2836 | /* Couldn't find it. */ |
2724 | return 1; | 2837 | return 1; |
2725 | } | 2838 | } |
2839 | EXPORT_SYMBOL_GPL(kdb_unregister); | ||
2726 | 2840 | ||
2727 | /* Initialize the kdb command table. */ | 2841 | /* Initialize the kdb command table. */ |
2728 | static void __init kdb_inittab(void) | 2842 | static void __init kdb_inittab(void) |
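Editor's note: exporting kdb_register()/kdb_register_repeat()/kdb_unregister() (and kdb_printf() above) lets a loadable module add its own kdb command now that the prototypes have moved out of kdb_private.h. A minimal, hypothetical module sketch, assuming the declarations are reachable via <linux/kdb.h>:

    #include <linux/module.h>
    #include <linux/kdb.h>

    /* Hypothetical demo command: "hello" just reports its argument count. */
    static int kdb_hello(int argc, const char **argv)
    {
            kdb_printf("hello from a module, argc=%d\n", argc);
            return 0;
    }

    static int __init hello_init(void)
    {
            return kdb_register("hello", kdb_hello, "", "demo command", 0);
    }

    static void __exit hello_exit(void)
    {
            kdb_unregister("hello");
    }

    module_init(hello_init);
    module_exit(hello_exit);
    MODULE_LICENSE("GPL");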
diff --git a/kernel/debug/kdb/kdb_private.h b/kernel/debug/kdb/kdb_private.h index 97d3ba69775d..35d69ed1dfb5 100644 --- a/kernel/debug/kdb/kdb_private.h +++ b/kernel/debug/kdb/kdb_private.h | |||
@@ -15,29 +15,6 @@ | |||
15 | #include <linux/kgdb.h> | 15 | #include <linux/kgdb.h> |
16 | #include "../debug_core.h" | 16 | #include "../debug_core.h" |
17 | 17 | ||
18 | /* Kernel Debugger Error codes. Must not overlap with command codes. */ | ||
19 | #define KDB_NOTFOUND (-1) | ||
20 | #define KDB_ARGCOUNT (-2) | ||
21 | #define KDB_BADWIDTH (-3) | ||
22 | #define KDB_BADRADIX (-4) | ||
23 | #define KDB_NOTENV (-5) | ||
24 | #define KDB_NOENVVALUE (-6) | ||
25 | #define KDB_NOTIMP (-7) | ||
26 | #define KDB_ENVFULL (-8) | ||
27 | #define KDB_ENVBUFFULL (-9) | ||
28 | #define KDB_TOOMANYBPT (-10) | ||
29 | #define KDB_TOOMANYDBREGS (-11) | ||
30 | #define KDB_DUPBPT (-12) | ||
31 | #define KDB_BPTNOTFOUND (-13) | ||
32 | #define KDB_BADMODE (-14) | ||
33 | #define KDB_BADINT (-15) | ||
34 | #define KDB_INVADDRFMT (-16) | ||
35 | #define KDB_BADREG (-17) | ||
36 | #define KDB_BADCPUNUM (-18) | ||
37 | #define KDB_BADLENGTH (-19) | ||
38 | #define KDB_NOBP (-20) | ||
39 | #define KDB_BADADDR (-21) | ||
40 | |||
41 | /* Kernel Debugger Command codes. Must not overlap with error codes. */ | 18 | /* Kernel Debugger Command codes. Must not overlap with error codes. */ |
42 | #define KDB_CMD_GO (-1001) | 19 | #define KDB_CMD_GO (-1001) |
43 | #define KDB_CMD_CPU (-1002) | 20 | #define KDB_CMD_CPU (-1002) |
@@ -93,17 +70,6 @@ | |||
93 | */ | 70 | */ |
94 | #define KDB_MAXBPT 16 | 71 | #define KDB_MAXBPT 16 |
95 | 72 | ||
96 | /* Maximum number of arguments to a function */ | ||
97 | #define KDB_MAXARGS 16 | ||
98 | |||
99 | typedef enum { | ||
100 | KDB_REPEAT_NONE = 0, /* Do not repeat this command */ | ||
101 | KDB_REPEAT_NO_ARGS, /* Repeat the command without arguments */ | ||
102 | KDB_REPEAT_WITH_ARGS, /* Repeat the command including its arguments */ | ||
103 | } kdb_repeat_t; | ||
104 | |||
105 | typedef int (*kdb_func_t)(int, const char **); | ||
106 | |||
107 | /* Symbol table format returned by kallsyms. */ | 73 | /* Symbol table format returned by kallsyms. */ |
108 | typedef struct __ksymtab { | 74 | typedef struct __ksymtab { |
109 | unsigned long value; /* Address of symbol */ | 75 | unsigned long value; /* Address of symbol */ |
@@ -123,11 +89,6 @@ extern int kallsyms_symbol_next(char *prefix_name, int flag); | |||
123 | extern int kallsyms_symbol_complete(char *prefix_name, int max_len); | 89 | extern int kallsyms_symbol_complete(char *prefix_name, int max_len); |
124 | 90 | ||
125 | /* Exported Symbols for kernel loadable modules to use. */ | 91 | /* Exported Symbols for kernel loadable modules to use. */ |
126 | extern int kdb_register(char *, kdb_func_t, char *, char *, short); | ||
127 | extern int kdb_register_repeat(char *, kdb_func_t, char *, char *, | ||
128 | short, kdb_repeat_t); | ||
129 | extern int kdb_unregister(char *); | ||
130 | |||
131 | extern int kdb_getarea_size(void *, unsigned long, size_t); | 92 | extern int kdb_getarea_size(void *, unsigned long, size_t); |
132 | extern int kdb_putarea_size(unsigned long, void *, size_t); | 93 | extern int kdb_putarea_size(unsigned long, void *, size_t); |
133 | 94 | ||
@@ -144,9 +105,8 @@ extern int kdb_getword(unsigned long *, unsigned long, size_t); | |||
144 | extern int kdb_putword(unsigned long, unsigned long, size_t); | 105 | extern int kdb_putword(unsigned long, unsigned long, size_t); |
145 | 106 | ||
146 | extern int kdbgetularg(const char *, unsigned long *); | 107 | extern int kdbgetularg(const char *, unsigned long *); |
147 | extern int kdb_set(int, const char **); | 108 | extern int kdbgetu64arg(const char *, u64 *); |
148 | extern char *kdbgetenv(const char *); | 109 | extern char *kdbgetenv(const char *); |
149 | extern int kdbgetintenv(const char *, int *); | ||
150 | extern int kdbgetaddrarg(int, const char **, int*, unsigned long *, | 110 | extern int kdbgetaddrarg(int, const char **, int*, unsigned long *, |
151 | long *, char **); | 111 | long *, char **); |
152 | extern int kdbgetsymval(const char *, kdb_symtab_t *); | 112 | extern int kdbgetsymval(const char *, kdb_symtab_t *); |
@@ -257,7 +217,6 @@ extern void kdb_ps1(const struct task_struct *p); | |||
257 | extern void kdb_print_nameval(const char *name, unsigned long val); | 217 | extern void kdb_print_nameval(const char *name, unsigned long val); |
258 | extern void kdb_send_sig_info(struct task_struct *p, struct siginfo *info); | 218 | extern void kdb_send_sig_info(struct task_struct *p, struct siginfo *info); |
259 | extern void kdb_meminfo_proc_show(void); | 219 | extern void kdb_meminfo_proc_show(void); |
260 | extern const char *kdb_walk_kallsyms(loff_t *pos); | ||
261 | extern char *kdb_getstr(char *, size_t, char *); | 220 | extern char *kdb_getstr(char *, size_t, char *); |
262 | 221 | ||
263 | /* Defines for kdb_symbol_print */ | 222 | /* Defines for kdb_symbol_print */ |
diff --git a/kernel/debug/kdb/kdb_support.c b/kernel/debug/kdb/kdb_support.c index 45344d5c53dd..6b2485dcb050 100644 --- a/kernel/debug/kdb/kdb_support.c +++ b/kernel/debug/kdb/kdb_support.c | |||
@@ -82,8 +82,8 @@ static char *kdb_name_table[100]; /* arbitrary size */ | |||
82 | int kdbnearsym(unsigned long addr, kdb_symtab_t *symtab) | 82 | int kdbnearsym(unsigned long addr, kdb_symtab_t *symtab) |
83 | { | 83 | { |
84 | int ret = 0; | 84 | int ret = 0; |
85 | unsigned long symbolsize; | 85 | unsigned long symbolsize = 0; |
86 | unsigned long offset; | 86 | unsigned long offset = 0; |
87 | #define knt1_size 128 /* must be >= kallsyms table size */ | 87 | #define knt1_size 128 /* must be >= kallsyms table size */ |
88 | char *knt1 = NULL; | 88 | char *knt1 = NULL; |
89 | 89 | ||
diff --git a/kernel/early_res.c b/kernel/early_res.c deleted file mode 100644 index 7bfae887f211..000000000000 --- a/kernel/early_res.c +++ /dev/null | |||
@@ -1,590 +0,0 @@ | |||
1 | /* | ||
2 | * early_res, could be used to replace bootmem | ||
3 | */ | ||
4 | #include <linux/kernel.h> | ||
5 | #include <linux/types.h> | ||
6 | #include <linux/init.h> | ||
7 | #include <linux/bootmem.h> | ||
8 | #include <linux/mm.h> | ||
9 | #include <linux/early_res.h> | ||
10 | #include <linux/slab.h> | ||
11 | #include <linux/kmemleak.h> | ||
12 | |||
13 | /* | ||
14 | * Early reserved memory areas. | ||
15 | */ | ||
16 | /* | ||
17 | * need to make sure this one is big enough before | ||
18 | * find_fw_memmap_area can be used | ||
19 | */ | ||
20 | #define MAX_EARLY_RES_X 32 | ||
21 | |||
22 | struct early_res { | ||
23 | u64 start, end; | ||
24 | char name[15]; | ||
25 | char overlap_ok; | ||
26 | }; | ||
27 | static struct early_res early_res_x[MAX_EARLY_RES_X] __initdata; | ||
28 | |||
29 | static int max_early_res __initdata = MAX_EARLY_RES_X; | ||
30 | static struct early_res *early_res __initdata = &early_res_x[0]; | ||
31 | static int early_res_count __initdata; | ||
32 | |||
33 | static int __init find_overlapped_early(u64 start, u64 end) | ||
34 | { | ||
35 | int i; | ||
36 | struct early_res *r; | ||
37 | |||
38 | for (i = 0; i < max_early_res && early_res[i].end; i++) { | ||
39 | r = &early_res[i]; | ||
40 | if (end > r->start && start < r->end) | ||
41 | break; | ||
42 | } | ||
43 | |||
44 | return i; | ||
45 | } | ||
46 | |||
47 | /* | ||
48 | * Drop the i-th range from the early reservation map, | ||
49 | * by copying any higher ranges down one over it, and | ||
50 | * clearing what had been the last slot. | ||
51 | */ | ||
52 | static void __init drop_range(int i) | ||
53 | { | ||
54 | int j; | ||
55 | |||
56 | for (j = i + 1; j < max_early_res && early_res[j].end; j++) | ||
57 | ; | ||
58 | |||
59 | memmove(&early_res[i], &early_res[i + 1], | ||
60 | (j - 1 - i) * sizeof(struct early_res)); | ||
61 | |||
62 | early_res[j - 1].end = 0; | ||
63 | early_res_count--; | ||
64 | } | ||
65 | |||
66 | static void __init drop_range_partial(int i, u64 start, u64 end) | ||
67 | { | ||
68 | u64 common_start, common_end; | ||
69 | u64 old_start, old_end; | ||
70 | |||
71 | old_start = early_res[i].start; | ||
72 | old_end = early_res[i].end; | ||
73 | common_start = max(old_start, start); | ||
74 | common_end = min(old_end, end); | ||
75 | |||
76 | /* no overlap ? */ | ||
77 | if (common_start >= common_end) | ||
78 | return; | ||
79 | |||
80 | if (old_start < common_start) { | ||
81 | /* make head segment */ | ||
82 | early_res[i].end = common_start; | ||
83 | if (old_end > common_end) { | ||
84 | char name[15]; | ||
85 | |||
86 | /* | ||
87 | * Save a local copy of the name, since the | ||
88 | * early_res array could get resized inside | ||
89 | * reserve_early_without_check() -> | ||
90 | * __check_and_double_early_res(), which would | ||
91 | * make the current name pointer invalid. | ||
92 | */ | ||
93 | strncpy(name, early_res[i].name, | ||
94 | sizeof(early_res[i].name) - 1); | ||
95 | /* add another for left over on tail */ | ||
96 | reserve_early_without_check(common_end, old_end, name); | ||
97 | } | ||
98 | return; | ||
99 | } else { | ||
100 | if (old_end > common_end) { | ||
101 | /* reuse the entry for tail left */ | ||
102 | early_res[i].start = common_end; | ||
103 | return; | ||
104 | } | ||
105 | /* all covered */ | ||
106 | drop_range(i); | ||
107 | } | ||
108 | } | ||
109 | |||
110 | /* | ||
111 | * Split any existing ranges that: | ||
112 | * 1) are marked 'overlap_ok', and | ||
113 | * 2) overlap with the stated range [start, end) | ||
114 | * into whatever portion (if any) of the existing range is entirely | ||
115 | * below or entirely above the stated range. Drop the portion | ||
116 | * of the existing range that overlaps with the stated range, | ||
117 | * which will allow the caller of this routine to then add that | ||
118 | * stated range without conflicting with any existing range. | ||
119 | */ | ||
120 | static void __init drop_overlaps_that_are_ok(u64 start, u64 end) | ||
121 | { | ||
122 | int i; | ||
123 | struct early_res *r; | ||
124 | u64 lower_start, lower_end; | ||
125 | u64 upper_start, upper_end; | ||
126 | char name[15]; | ||
127 | |||
128 | for (i = 0; i < max_early_res && early_res[i].end; i++) { | ||
129 | r = &early_res[i]; | ||
130 | |||
131 | /* Continue past non-overlapping ranges */ | ||
132 | if (end <= r->start || start >= r->end) | ||
133 | continue; | ||
134 | |||
135 | /* | ||
136 | * Leave non-ok overlaps as is; let caller | ||
137 | * panic "Overlapping early reservations" | ||
138 | * when it hits this overlap. | ||
139 | */ | ||
140 | if (!r->overlap_ok) | ||
141 | return; | ||
142 | |||
143 | /* | ||
144 | * We have an ok overlap. We will drop it from the early | ||
145 | * reservation map, and add back in any non-overlapping | ||
146 | * portions (lower or upper) as separate, overlap_ok, | ||
147 | * non-overlapping ranges. | ||
148 | */ | ||
149 | |||
150 | /* 1. Note any non-overlapping (lower or upper) ranges. */ | ||
151 | strncpy(name, r->name, sizeof(name) - 1); | ||
152 | |||
153 | lower_start = lower_end = 0; | ||
154 | upper_start = upper_end = 0; | ||
155 | if (r->start < start) { | ||
156 | lower_start = r->start; | ||
157 | lower_end = start; | ||
158 | } | ||
159 | if (r->end > end) { | ||
160 | upper_start = end; | ||
161 | upper_end = r->end; | ||
162 | } | ||
163 | |||
164 | /* 2. Drop the original ok overlapping range */ | ||
165 | drop_range(i); | ||
166 | |||
167 | i--; /* resume for-loop on copied down entry */ | ||
168 | |||
169 | /* 3. Add back in any non-overlapping ranges. */ | ||
170 | if (lower_end) | ||
171 | reserve_early_overlap_ok(lower_start, lower_end, name); | ||
172 | if (upper_end) | ||
173 | reserve_early_overlap_ok(upper_start, upper_end, name); | ||
174 | } | ||
175 | } | ||
176 | |||
177 | static void __init __reserve_early(u64 start, u64 end, char *name, | ||
178 | int overlap_ok) | ||
179 | { | ||
180 | int i; | ||
181 | struct early_res *r; | ||
182 | |||
183 | i = find_overlapped_early(start, end); | ||
184 | if (i >= max_early_res) | ||
185 | panic("Too many early reservations"); | ||
186 | r = &early_res[i]; | ||
187 | if (r->end) | ||
188 | panic("Overlapping early reservations " | ||
189 | "%llx-%llx %s to %llx-%llx %s\n", | ||
190 | start, end - 1, name ? name : "", r->start, | ||
191 | r->end - 1, r->name); | ||
192 | r->start = start; | ||
193 | r->end = end; | ||
194 | r->overlap_ok = overlap_ok; | ||
195 | if (name) | ||
196 | strncpy(r->name, name, sizeof(r->name) - 1); | ||
197 | early_res_count++; | ||
198 | } | ||
199 | |||
200 | /* | ||
202 | * A few early reservations come here. | ||
202 | * | ||
203 | * The 'overlap_ok' in the name of this routine does -not- mean it | ||
204 | * is ok for these reservations to overlap an earlier reservation. | ||
205 | * Rather it means that it is ok for subsequent reservations to | ||
206 | * overlap this one. | ||
207 | * | ||
208 | * Use this entry point to reserve early ranges when you are doing | ||
209 | * so out of "Paranoia", reserving perhaps more memory than you need, | ||
210 | * just in case, and don't mind a subsequent overlapping reservation | ||
211 | * that is known to be needed. | ||
212 | * | ||
213 | * The drop_overlaps_that_are_ok() call here isn't really needed. | ||
214 | * It would be needed if we had two colliding 'overlap_ok' | ||
215 | * reservations, so that the second such would not panic on the | ||
216 | * overlap with the first. We don't have any such as of this | ||
217 | * writing, but might as well tolerate such if it happens in | ||
218 | * the future. | ||
219 | */ | ||
220 | void __init reserve_early_overlap_ok(u64 start, u64 end, char *name) | ||
221 | { | ||
222 | drop_overlaps_that_are_ok(start, end); | ||
223 | __reserve_early(start, end, name, 1); | ||
224 | } | ||
225 | |||
226 | static void __init __check_and_double_early_res(u64 ex_start, u64 ex_end) | ||
227 | { | ||
228 | u64 start, end, size, mem; | ||
229 | struct early_res *new; | ||
230 | |||
231 | /* do we have enough slots left ? */ | ||
232 | if ((max_early_res - early_res_count) > max(max_early_res/8, 2)) | ||
233 | return; | ||
234 | |||
235 | /* double it */ | ||
236 | mem = -1ULL; | ||
237 | size = sizeof(struct early_res) * max_early_res * 2; | ||
238 | if (early_res == early_res_x) | ||
239 | start = 0; | ||
240 | else | ||
241 | start = early_res[0].end; | ||
242 | end = ex_start; | ||
243 | if (start + size < end) | ||
244 | mem = find_fw_memmap_area(start, end, size, | ||
245 | sizeof(struct early_res)); | ||
246 | if (mem == -1ULL) { | ||
247 | start = ex_end; | ||
248 | end = get_max_mapped(); | ||
249 | if (start + size < end) | ||
250 | mem = find_fw_memmap_area(start, end, size, | ||
251 | sizeof(struct early_res)); | ||
252 | } | ||
253 | if (mem == -1ULL) | ||
254 | panic("can not find more space for early_res array"); | ||
255 | |||
256 | new = __va(mem); | ||
257 | /* save the first one for own */ | ||
258 | new[0].start = mem; | ||
259 | new[0].end = mem + size; | ||
260 | new[0].overlap_ok = 0; | ||
261 | /* copy old to new */ | ||
262 | if (early_res == early_res_x) { | ||
263 | memcpy(&new[1], &early_res[0], | ||
264 | sizeof(struct early_res) * max_early_res); | ||
265 | memset(&new[max_early_res+1], 0, | ||
266 | sizeof(struct early_res) * (max_early_res - 1)); | ||
267 | early_res_count++; | ||
268 | } else { | ||
269 | memcpy(&new[1], &early_res[1], | ||
270 | sizeof(struct early_res) * (max_early_res - 1)); | ||
271 | memset(&new[max_early_res], 0, | ||
272 | sizeof(struct early_res) * max_early_res); | ||
273 | } | ||
274 | memset(&early_res[0], 0, sizeof(struct early_res) * max_early_res); | ||
275 | early_res = new; | ||
276 | max_early_res *= 2; | ||
277 | printk(KERN_DEBUG "early_res array is doubled to %d at [%llx - %llx]\n", | ||
278 | max_early_res, mem, mem + size - 1); | ||
279 | } | ||
280 | |||
281 | /* | ||
282 | * Most early reservations come here. | ||
283 | * | ||
284 | * We first have drop_overlaps_that_are_ok() drop any pre-existing | ||
285 | * 'overlap_ok' ranges, so that we can then reserve this memory | ||
286 | * range without risk of panic'ing on an overlapping overlap_ok | ||
287 | * early reservation. | ||
288 | */ | ||
289 | void __init reserve_early(u64 start, u64 end, char *name) | ||
290 | { | ||
291 | if (start >= end) | ||
292 | return; | ||
293 | |||
294 | __check_and_double_early_res(start, end); | ||
295 | |||
296 | drop_overlaps_that_are_ok(start, end); | ||
297 | __reserve_early(start, end, name, 0); | ||
298 | } | ||
299 | |||
300 | void __init reserve_early_without_check(u64 start, u64 end, char *name) | ||
301 | { | ||
302 | struct early_res *r; | ||
303 | |||
304 | if (start >= end) | ||
305 | return; | ||
306 | |||
307 | __check_and_double_early_res(start, end); | ||
308 | |||
309 | r = &early_res[early_res_count]; | ||
310 | |||
311 | r->start = start; | ||
312 | r->end = end; | ||
313 | r->overlap_ok = 0; | ||
314 | if (name) | ||
315 | strncpy(r->name, name, sizeof(r->name) - 1); | ||
316 | early_res_count++; | ||
317 | } | ||
318 | |||
319 | void __init free_early(u64 start, u64 end) | ||
320 | { | ||
321 | struct early_res *r; | ||
322 | int i; | ||
323 | |||
324 | kmemleak_free_part(__va(start), end - start); | ||
325 | |||
326 | i = find_overlapped_early(start, end); | ||
327 | r = &early_res[i]; | ||
328 | if (i >= max_early_res || r->end != end || r->start != start) | ||
329 | panic("free_early on not reserved area: %llx-%llx!", | ||
330 | start, end - 1); | ||
331 | |||
332 | drop_range(i); | ||
333 | } | ||
334 | |||
335 | void __init free_early_partial(u64 start, u64 end) | ||
336 | { | ||
337 | struct early_res *r; | ||
338 | int i; | ||
339 | |||
340 | kmemleak_free_part(__va(start), end - start); | ||
341 | |||
342 | if (start == end) | ||
343 | return; | ||
344 | |||
345 | if (WARN_ONCE(start > end, " wrong range [%#llx, %#llx]\n", start, end)) | ||
346 | return; | ||
347 | |||
348 | try_next: | ||
349 | i = find_overlapped_early(start, end); | ||
350 | if (i >= max_early_res) | ||
351 | return; | ||
352 | |||
353 | r = &early_res[i]; | ||
354 | /* hole ? */ | ||
355 | if (r->end >= end && r->start <= start) { | ||
356 | drop_range_partial(i, start, end); | ||
357 | return; | ||
358 | } | ||
359 | |||
360 | drop_range_partial(i, start, end); | ||
361 | goto try_next; | ||
362 | } | ||
363 | |||
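free_early_partial() above loops because the freed window may intersect several reservations; each drop_range_partial() call (not shown in this hunk) trims one of them. Trimming a single reservation [rs, re) by a freed window [fs, fe) can leave at most two pieces. A small illustrative helper, with names of my own choosing rather than the kernel's, makes that explicit:

    /* Return how many pieces of [rs, re) survive after freeing [fs, fe). */
    int trim_reservation(unsigned long long rs, unsigned long long re,
                         unsigned long long fs, unsigned long long fe,
                         unsigned long long out[2][2])
    {
        int n = 0;

        if (fe <= rs || fs >= re) {           /* no overlap: keep the reservation as-is */
            out[n][0] = rs; out[n][1] = re;
            return 1;
        }
        if (fs > rs) {                        /* piece left of the hole */
            out[n][0] = rs; out[n][1] = fs; n++;
        }
        if (fe < re) {                        /* piece right of the hole */
            out[n][0] = fe; out[n][1] = re; n++;
        }
        return n;                             /* 0 means the reservation was fully freed */
    }

For example, trimming [0x1000, 0x4000) by [0x2000, 0x3000) leaves two pieces, [0x1000, 0x2000) and [0x3000, 0x4000).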
364 | #ifdef CONFIG_NO_BOOTMEM | ||
365 | static void __init subtract_early_res(struct range *range, int az) | ||
366 | { | ||
367 | int i, count; | ||
368 | u64 final_start, final_end; | ||
369 | int idx = 0; | ||
370 | |||
371 | count = 0; | ||
372 | for (i = 0; i < max_early_res && early_res[i].end; i++) | ||
373 | count++; | ||
374 | |||
375 | /* need to skip the first one? */ | ||
376 | if (early_res != early_res_x) | ||
377 | idx = 1; | ||
378 | |||
379 | #define DEBUG_PRINT_EARLY_RES 1 | ||
380 | |||
381 | #if DEBUG_PRINT_EARLY_RES | ||
382 | printk(KERN_INFO "Subtract (%d early reservations)\n", count); | ||
383 | #endif | ||
384 | for (i = idx; i < count; i++) { | ||
385 | struct early_res *r = &early_res[i]; | ||
386 | #if DEBUG_PRINT_EARLY_RES | ||
387 | printk(KERN_INFO " #%d [%010llx - %010llx] %15s\n", i, | ||
388 | r->start, r->end, r->name); | ||
389 | #endif | ||
390 | final_start = PFN_DOWN(r->start); | ||
391 | final_end = PFN_UP(r->end); | ||
392 | if (final_start >= final_end) | ||
393 | continue; | ||
394 | subtract_range(range, az, final_start, final_end); | ||
395 | } | ||
396 | |||
397 | } | ||
398 | |||
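subtract_early_res() rounds every reservation out to whole page frames (PFN_DOWN on the start, PFN_UP on the end) and punches it out of the candidate free-range array. A sketch of such a subtract over a small fixed range table might look like the following; it splits a range in place when the hole falls in its middle and is only meant to illustrate the idea, not to mirror kernel/range.c:

    #include <stdio.h>

    struct range { unsigned long long start, end; };   /* end exclusive, start == end means unused */

    /* Remove [s, e) from every range in r[0..az); a split consumes one spare slot. */
    void punch_hole(struct range *r, int az, unsigned long long s, unsigned long long e)
    {
        for (int i = 0; i < az; i++) {
            if (r[i].start == r[i].end || r[i].end <= s || r[i].start >= e)
                continue;                               /* unused or no overlap */
            if (s <= r[i].start && e >= r[i].end) {     /* fully covered: drop it */
                r[i].start = r[i].end = 0;
            } else if (s <= r[i].start) {               /* clip the front */
                r[i].start = e;
            } else if (e >= r[i].end) {                 /* clip the tail */
                r[i].end = s;
            } else {                                    /* split: park the right half in a free slot */
                for (int j = 0; j < az; j++) {
                    if (r[j].start == 0 && r[j].end == 0) {
                        r[j].start = e;
                        r[j].end = r[i].end;
                        break;
                    }
                }
                r[i].end = s;
            }
        }
    }

    int main(void)
    {
        struct range r[4] = { { 0x0, 0x10000 } };
        punch_hole(r, 4, 0x4000, 0x8000);               /* subtract one reservation */
        for (int i = 0; i < 4; i++)
            if (r[i].start != r[i].end)
                printf("[%llx - %llx)\n", r[i].start, r[i].end);
        return 0;
    }

Running this prints the two surviving pieces, [0 - 4000) and [8000 - 10000), which is exactly the shape get_free_all_memory_range() hands to the no-bootmem page initialisation.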
399 | int __init get_free_all_memory_range(struct range **rangep, int nodeid) | ||
400 | { | ||
401 | int i, count; | ||
402 | u64 start = 0, end; | ||
403 | u64 size; | ||
404 | u64 mem; | ||
405 | struct range *range; | ||
406 | int nr_range; | ||
407 | |||
408 | count = 0; | ||
409 | for (i = 0; i < max_early_res && early_res[i].end; i++) | ||
410 | count++; | ||
411 | |||
412 | count *= 2; | ||
413 | |||
414 | size = sizeof(struct range) * count; | ||
415 | end = get_max_mapped(); | ||
416 | #ifdef MAX_DMA32_PFN | ||
417 | if (end > (MAX_DMA32_PFN << PAGE_SHIFT)) | ||
418 | start = MAX_DMA32_PFN << PAGE_SHIFT; | ||
419 | #endif | ||
420 | mem = find_fw_memmap_area(start, end, size, sizeof(struct range)); | ||
421 | if (mem == -1ULL) | ||
422 | panic("can not find more space for range free"); | ||
423 | |||
424 | range = __va(mem); | ||
425 | /* use early_node_map[] and early_res to get range array at first */ | ||
426 | memset(range, 0, size); | ||
427 | nr_range = 0; | ||
428 | |||
429 | /* need to go over early_node_map to find out good range for node */ | ||
430 | nr_range = add_from_early_node_map(range, count, nr_range, nodeid); | ||
431 | #ifdef CONFIG_X86_32 | ||
432 | subtract_range(range, count, max_low_pfn, -1ULL); | ||
433 | #endif | ||
434 | subtract_early_res(range, count); | ||
435 | nr_range = clean_sort_range(range, count); | ||
436 | |||
437 | /* need to clear it ? */ | ||
438 | if (nodeid == MAX_NUMNODES) { | ||
439 | memset(&early_res[0], 0, | ||
440 | sizeof(struct early_res) * max_early_res); | ||
441 | early_res = NULL; | ||
442 | max_early_res = 0; | ||
443 | } | ||
444 | |||
445 | *rangep = range; | ||
446 | return nr_range; | ||
447 | } | ||
448 | #else | ||
449 | void __init early_res_to_bootmem(u64 start, u64 end) | ||
450 | { | ||
451 | int i, count; | ||
452 | u64 final_start, final_end; | ||
453 | int idx = 0; | ||
454 | |||
455 | count = 0; | ||
456 | for (i = 0; i < max_early_res && early_res[i].end; i++) | ||
457 | count++; | ||
458 | |||
459 | /* need to skip the first one? */ | ||
460 | if (early_res != early_res_x) | ||
461 | idx = 1; | ||
462 | |||
463 | printk(KERN_INFO "(%d/%d early reservations) ==> bootmem [%010llx - %010llx]\n", | ||
464 | count - idx, max_early_res, start, end); | ||
465 | for (i = idx; i < count; i++) { | ||
466 | struct early_res *r = &early_res[i]; | ||
467 | printk(KERN_INFO " #%d [%010llx - %010llx] %16s", i, | ||
468 | r->start, r->end, r->name); | ||
469 | final_start = max(start, r->start); | ||
470 | final_end = min(end, r->end); | ||
471 | if (final_start >= final_end) { | ||
472 | printk(KERN_CONT "\n"); | ||
473 | continue; | ||
474 | } | ||
475 | printk(KERN_CONT " ==> [%010llx - %010llx]\n", | ||
476 | final_start, final_end); | ||
477 | reserve_bootmem_generic(final_start, final_end - final_start, | ||
478 | BOOTMEM_DEFAULT); | ||
479 | } | ||
480 | /* clear them */ | ||
481 | memset(&early_res[0], 0, sizeof(struct early_res) * max_early_res); | ||
482 | early_res = NULL; | ||
483 | max_early_res = 0; | ||
484 | early_res_count = 0; | ||
485 | } | ||
486 | #endif | ||
487 | |||
488 | /* Check for already reserved areas */ | ||
489 | static inline int __init bad_addr(u64 *addrp, u64 size, u64 align) | ||
490 | { | ||
491 | int i; | ||
492 | u64 addr = *addrp; | ||
493 | int changed = 0; | ||
494 | struct early_res *r; | ||
495 | again: | ||
496 | i = find_overlapped_early(addr, addr + size); | ||
497 | r = &early_res[i]; | ||
498 | if (i < max_early_res && r->end) { | ||
499 | *addrp = addr = round_up(r->end, align); | ||
500 | changed = 1; | ||
501 | goto again; | ||
502 | } | ||
503 | return changed; | ||
504 | } | ||
505 | |||
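bad_addr() above is the classic "bump past the collision and retry" loop: whenever the candidate window [addr, addr + size) overlaps a reservation, the candidate jumps to the reservation's aligned end and the scan restarts. A compact stand-alone rendering of that control flow, with an explicit table in place of find_overlapped_early() and illustrative names, could read:

    struct ereserve { unsigned long long start, end; };

    static unsigned long long align_up(unsigned long long x, unsigned long long a)
    {
        return (x + a - 1) & ~(a - 1);               /* a must be a power of two */
    }

    /* Advance *addr until [*addr, *addr + size) overlaps none of res[0..n). */
    int skip_reserved(unsigned long long *addr, unsigned long long size,
                      unsigned long long align,
                      const struct ereserve *res, int n)
    {
        int changed = 0, i;

    again:
        for (i = 0; i < n; i++) {
            if (*addr < res[i].end && *addr + size > res[i].start) {
                *addr = align_up(res[i].end, align); /* jump past the colliding reservation */
                changed = 1;
                goto again;                          /* earlier entries may now collide */
            }
        }
        return changed;
    }

Restarting from the first entry matters because the table is not sorted: jumping past one reservation can land the window inside another entry that was already checked.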
506 | /* Check for already reserved areas */ | ||
507 | static inline int __init bad_addr_size(u64 *addrp, u64 *sizep, u64 align) | ||
508 | { | ||
509 | int i; | ||
510 | u64 addr = *addrp, last; | ||
511 | u64 size = *sizep; | ||
512 | int changed = 0; | ||
513 | again: | ||
514 | last = addr + size; | ||
515 | for (i = 0; i < max_early_res && early_res[i].end; i++) { | ||
516 | struct early_res *r = &early_res[i]; | ||
517 | if (last > r->start && addr < r->start) { | ||
518 | size = r->start - addr; | ||
519 | changed = 1; | ||
520 | goto again; | ||
521 | } | ||
522 | if (last > r->end && addr < r->end) { | ||
523 | addr = round_up(r->end, align); | ||
524 | size = last - addr; | ||
525 | changed = 1; | ||
526 | goto again; | ||
527 | } | ||
528 | if (last <= r->end && addr >= r->start) { | ||
529 | (*sizep)++; | ||
530 | return 0; | ||
531 | } | ||
532 | } | ||
533 | if (changed) { | ||
534 | *addrp = addr; | ||
535 | *sizep = size; | ||
536 | } | ||
537 | return changed; | ||
538 | } | ||
539 | |||
540 | /* | ||
541 | * Find a free area with specified alignment in a specific range. | ||
542 | * Only the area between start and end is an active range from early_node_map, | ||
543 | * so it is usable as RAM. | ||
544 | */ | ||
545 | u64 __init find_early_area(u64 ei_start, u64 ei_last, u64 start, u64 end, | ||
546 | u64 size, u64 align) | ||
547 | { | ||
548 | u64 addr, last; | ||
549 | |||
550 | addr = round_up(ei_start, align); | ||
551 | if (addr < start) | ||
552 | addr = round_up(start, align); | ||
553 | if (addr >= ei_last) | ||
554 | goto out; | ||
555 | while (bad_addr(&addr, size, align) && addr+size <= ei_last) | ||
556 | ; | ||
557 | last = addr + size; | ||
558 | if (last > ei_last) | ||
559 | goto out; | ||
560 | if (last > end) | ||
561 | goto out; | ||
562 | |||
563 | return addr; | ||
564 | |||
565 | out: | ||
566 | return -1ULL; | ||
567 | } | ||
568 | |||
569 | u64 __init find_early_area_size(u64 ei_start, u64 ei_last, u64 start, | ||
570 | u64 *sizep, u64 align) | ||
571 | { | ||
572 | u64 addr, last; | ||
573 | |||
574 | addr = round_up(ei_start, align); | ||
575 | if (addr < start) | ||
576 | addr = round_up(start, align); | ||
577 | if (addr >= ei_last) | ||
578 | goto out; | ||
579 | *sizep = ei_last - addr; | ||
580 | while (bad_addr_size(&addr, sizep, align) && addr + *sizep <= ei_last) | ||
581 | ; | ||
582 | last = addr + *sizep; | ||
583 | if (last > ei_last) | ||
584 | goto out; | ||
585 | |||
586 | return addr; | ||
587 | |||
588 | out: | ||
589 | return -1ULL; | ||
590 | } | ||
diff --git a/kernel/exec_domain.c b/kernel/exec_domain.c index dd62f8e714ca..0dbeae374225 100644 --- a/kernel/exec_domain.c +++ b/kernel/exec_domain.c | |||
@@ -134,23 +134,14 @@ unregister: | |||
134 | return 0; | 134 | return 0; |
135 | } | 135 | } |
136 | 136 | ||
137 | int | 137 | int __set_personality(unsigned int personality) |
138 | __set_personality(unsigned int personality) | ||
139 | { | 138 | { |
140 | struct exec_domain *ep, *oep; | 139 | struct exec_domain *oep = current_thread_info()->exec_domain; |
141 | |||
142 | ep = lookup_exec_domain(personality); | ||
143 | if (ep == current_thread_info()->exec_domain) { | ||
144 | current->personality = personality; | ||
145 | module_put(ep->module); | ||
146 | return 0; | ||
147 | } | ||
148 | 140 | ||
141 | current_thread_info()->exec_domain = lookup_exec_domain(personality); | ||
149 | current->personality = personality; | 142 | current->personality = personality; |
150 | oep = current_thread_info()->exec_domain; | ||
151 | current_thread_info()->exec_domain = ep; | ||
152 | |||
153 | module_put(oep->module); | 143 | module_put(oep->module); |
144 | |||
154 | return 0; | 145 | return 0; |
155 | } | 146 | } |
156 | 147 | ||
@@ -192,11 +183,8 @@ SYSCALL_DEFINE1(personality, unsigned int, personality) | |||
192 | { | 183 | { |
193 | unsigned int old = current->personality; | 184 | unsigned int old = current->personality; |
194 | 185 | ||
195 | if (personality != 0xffffffff) { | 186 | if (personality != 0xffffffff) |
196 | set_personality(personality); | 187 | set_personality(personality); |
197 | if (current->personality != personality) | ||
198 | return -EINVAL; | ||
199 | } | ||
200 | 188 | ||
201 | return old; | 189 | return old; |
202 | } | 190 | } |
diff --git a/kernel/exit.c b/kernel/exit.c index ceffc67b564a..b194febf5799 100644 --- a/kernel/exit.c +++ b/kernel/exit.c | |||
@@ -50,6 +50,7 @@ | |||
50 | #include <linux/perf_event.h> | 50 | #include <linux/perf_event.h> |
51 | #include <trace/events/sched.h> | 51 | #include <trace/events/sched.h> |
52 | #include <linux/hw_breakpoint.h> | 52 | #include <linux/hw_breakpoint.h> |
53 | #include <linux/oom.h> | ||
53 | 54 | ||
54 | #include <asm/uaccess.h> | 55 | #include <asm/uaccess.h> |
55 | #include <asm/unistd.h> | 56 | #include <asm/unistd.h> |
@@ -149,9 +150,7 @@ static void delayed_put_task_struct(struct rcu_head *rhp) | |||
149 | { | 150 | { |
150 | struct task_struct *tsk = container_of(rhp, struct task_struct, rcu); | 151 | struct task_struct *tsk = container_of(rhp, struct task_struct, rcu); |
151 | 152 | ||
152 | #ifdef CONFIG_PERF_EVENTS | 153 | perf_event_delayed_put(tsk); |
153 | WARN_ON_ONCE(tsk->perf_event_ctxp); | ||
154 | #endif | ||
155 | trace_sched_process_free(tsk); | 154 | trace_sched_process_free(tsk); |
156 | put_task_struct(tsk); | 155 | put_task_struct(tsk); |
157 | } | 156 | } |
@@ -689,6 +688,8 @@ static void exit_mm(struct task_struct * tsk) | |||
689 | enter_lazy_tlb(mm, current); | 688 | enter_lazy_tlb(mm, current); |
690 | /* We don't want this task to be frozen prematurely */ | 689 | /* We don't want this task to be frozen prematurely */ |
691 | clear_freeze_flag(tsk); | 690 | clear_freeze_flag(tsk); |
691 | if (tsk->signal->oom_score_adj == OOM_SCORE_ADJ_MIN) | ||
692 | atomic_dec(&mm->oom_disable_count); | ||
692 | task_unlock(tsk); | 693 | task_unlock(tsk); |
693 | mm_update_next_owner(mm); | 694 | mm_update_next_owner(mm); |
694 | mmput(mm); | 695 | mmput(mm); |
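The oom_disable_count changes in this series keep a per-mm count of users whose oom_score_adj is OOM_SCORE_ADJ_MIN, so the OOM killer can skip an mm with one atomic read instead of walking every thread. The invariant is simply "increment when such a task acquires the mm, decrement when it lets go", as in this illustrative pairing; the helper names and reduced structs are mine, only the counter idea comes from the patch:

    #include <stdatomic.h>
    #include <stdbool.h>

    #define OOM_SCORE_ADJ_MIN (-1000)

    struct mm_demo   { atomic_int oom_disable_count; };
    struct task_demo { int oom_score_adj; struct mm_demo *mm; };

    /* Called when @t starts using @mm (fork, exec, unshare, ...). */
    void attach_mm(struct task_demo *t, struct mm_demo *mm)
    {
        t->mm = mm;
        if (t->oom_score_adj == OOM_SCORE_ADJ_MIN)
            atomic_fetch_add(&mm->oom_disable_count, 1);
    }

    /* Called when @t drops @mm (exit_mm, failed fork cleanup, ...). */
    void detach_mm(struct task_demo *t)
    {
        if (t->oom_score_adj == OOM_SCORE_ADJ_MIN)
            atomic_fetch_sub(&t->mm->oom_disable_count, 1);
        t->mm = NULL;
    }

    /* The OOM killer can then skip the whole mm cheaply. */
    bool mm_is_oom_immune(struct mm_demo *mm)
    {
        return atomic_load(&mm->oom_disable_count) > 0;
    }

The hunks in exit.c, fork.c and the unshare path below are exactly these attach/detach sites, taken under task_lock so the score cannot change underneath the update.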
@@ -702,6 +703,8 @@ static void exit_mm(struct task_struct * tsk) | |||
702 | * space. | 703 | * space. |
703 | */ | 704 | */ |
704 | static struct task_struct *find_new_reaper(struct task_struct *father) | 705 | static struct task_struct *find_new_reaper(struct task_struct *father) |
706 | __releases(&tasklist_lock) | ||
707 | __acquires(&tasklist_lock) | ||
705 | { | 708 | { |
706 | struct pid_namespace *pid_ns = task_active_pid_ns(father); | 709 | struct pid_namespace *pid_ns = task_active_pid_ns(father); |
707 | struct task_struct *thread; | 710 | struct task_struct *thread; |
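The __releases()/__acquires() pair added to find_new_reaper() is for sparse: it documents that the function is entered with tasklist_lock held, may drop it, and re-takes it before returning, so sparse's lock-context checking does not report an imbalance. A reduced illustration of how such annotations are used follows; the empty macro definitions are stand-ins for the kernel's real ones in <linux/compiler.h>, which expand to sparse context attributes when __CHECKER__ is set:

    /* Illustrative stand-ins: sparse maps these to context-tracking attributes,
     * an ordinary compiler sees empty macros. */
    #define __acquires(x)
    #define __releases(x)

    #include <pthread.h>

    static pthread_mutex_t tasklist_demo_lock = PTHREAD_MUTEX_INITIALIZER;

    /* Called with the lock held; the annotations record that it drops and
     * re-takes it, which is exactly what the kernel hunk above declares. */
    static void reap_under_lock(void)
        __releases(&tasklist_demo_lock)
        __acquires(&tasklist_demo_lock)
    {
        pthread_mutex_unlock(&tasklist_demo_lock);  /* drop for a step that may block */
        /* ... work done without the lock ... */
        pthread_mutex_lock(&tasklist_demo_lock);    /* held again on return, as promised */
    }

The comment added to forget_original_parent() a few lines below serves the human reader the same way the annotations serve sparse.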
@@ -771,9 +774,12 @@ static void forget_original_parent(struct task_struct *father) | |||
771 | struct task_struct *p, *n, *reaper; | 774 | struct task_struct *p, *n, *reaper; |
772 | LIST_HEAD(dead_children); | 775 | LIST_HEAD(dead_children); |
773 | 776 | ||
774 | exit_ptrace(father); | ||
775 | |||
776 | write_lock_irq(&tasklist_lock); | 777 | write_lock_irq(&tasklist_lock); |
778 | /* | ||
779 | * Note that exit_ptrace() and find_new_reaper() might | ||
780 | * drop tasklist_lock and reacquire it. | ||
781 | */ | ||
782 | exit_ptrace(father); | ||
777 | reaper = find_new_reaper(father); | 783 | reaper = find_new_reaper(father); |
778 | 784 | ||
779 | list_for_each_entry_safe(p, n, &father->children, sibling) { | 785 | list_for_each_entry_safe(p, n, &father->children, sibling) { |
@@ -1383,8 +1389,7 @@ static int wait_task_stopped(struct wait_opts *wo, | |||
1383 | if (!unlikely(wo->wo_flags & WNOWAIT)) | 1389 | if (!unlikely(wo->wo_flags & WNOWAIT)) |
1384 | *p_code = 0; | 1390 | *p_code = 0; |
1385 | 1391 | ||
1386 | /* don't need the RCU readlock here as we're holding a spinlock */ | 1392 | uid = task_uid(p); |
1387 | uid = __task_cred(p)->uid; | ||
1388 | unlock_sig: | 1393 | unlock_sig: |
1389 | spin_unlock_irq(&p->sighand->siglock); | 1394 | spin_unlock_irq(&p->sighand->siglock); |
1390 | if (!exit_code) | 1395 | if (!exit_code) |
@@ -1457,7 +1462,7 @@ static int wait_task_continued(struct wait_opts *wo, struct task_struct *p) | |||
1457 | } | 1462 | } |
1458 | if (!unlikely(wo->wo_flags & WNOWAIT)) | 1463 | if (!unlikely(wo->wo_flags & WNOWAIT)) |
1459 | p->signal->flags &= ~SIGNAL_STOP_CONTINUED; | 1464 | p->signal->flags &= ~SIGNAL_STOP_CONTINUED; |
1460 | uid = __task_cred(p)->uid; | 1465 | uid = task_uid(p); |
1461 | spin_unlock_irq(&p->sighand->siglock); | 1466 | spin_unlock_irq(&p->sighand->siglock); |
1462 | 1467 | ||
1463 | pid = task_pid_vnr(p); | 1468 | pid = task_pid_vnr(p); |
diff --git a/kernel/fork.c b/kernel/fork.c index b6cce14ba047..3b159c5991b7 100644 --- a/kernel/fork.c +++ b/kernel/fork.c | |||
@@ -65,6 +65,7 @@ | |||
65 | #include <linux/perf_event.h> | 65 | #include <linux/perf_event.h> |
66 | #include <linux/posix-timers.h> | 66 | #include <linux/posix-timers.h> |
67 | #include <linux/user-return-notifier.h> | 67 | #include <linux/user-return-notifier.h> |
68 | #include <linux/oom.h> | ||
68 | 69 | ||
69 | #include <asm/pgtable.h> | 70 | #include <asm/pgtable.h> |
70 | #include <asm/pgalloc.h> | 71 | #include <asm/pgalloc.h> |
@@ -300,7 +301,7 @@ out: | |||
300 | #ifdef CONFIG_MMU | 301 | #ifdef CONFIG_MMU |
301 | static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) | 302 | static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) |
302 | { | 303 | { |
303 | struct vm_area_struct *mpnt, *tmp, **pprev; | 304 | struct vm_area_struct *mpnt, *tmp, *prev, **pprev; |
304 | struct rb_node **rb_link, *rb_parent; | 305 | struct rb_node **rb_link, *rb_parent; |
305 | int retval; | 306 | int retval; |
306 | unsigned long charge; | 307 | unsigned long charge; |
@@ -328,6 +329,7 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) | |||
328 | if (retval) | 329 | if (retval) |
329 | goto out; | 330 | goto out; |
330 | 331 | ||
332 | prev = NULL; | ||
331 | for (mpnt = oldmm->mmap; mpnt; mpnt = mpnt->vm_next) { | 333 | for (mpnt = oldmm->mmap; mpnt; mpnt = mpnt->vm_next) { |
332 | struct file *file; | 334 | struct file *file; |
333 | 335 | ||
@@ -355,11 +357,11 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) | |||
355 | if (IS_ERR(pol)) | 357 | if (IS_ERR(pol)) |
356 | goto fail_nomem_policy; | 358 | goto fail_nomem_policy; |
357 | vma_set_policy(tmp, pol); | 359 | vma_set_policy(tmp, pol); |
360 | tmp->vm_mm = mm; | ||
358 | if (anon_vma_fork(tmp, mpnt)) | 361 | if (anon_vma_fork(tmp, mpnt)) |
359 | goto fail_nomem_anon_vma_fork; | 362 | goto fail_nomem_anon_vma_fork; |
360 | tmp->vm_flags &= ~VM_LOCKED; | 363 | tmp->vm_flags &= ~VM_LOCKED; |
361 | tmp->vm_mm = mm; | 364 | tmp->vm_next = tmp->vm_prev = NULL; |
362 | tmp->vm_next = NULL; | ||
363 | file = tmp->vm_file; | 365 | file = tmp->vm_file; |
364 | if (file) { | 366 | if (file) { |
365 | struct inode *inode = file->f_path.dentry->d_inode; | 367 | struct inode *inode = file->f_path.dentry->d_inode; |
@@ -392,6 +394,8 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) | |||
392 | */ | 394 | */ |
393 | *pprev = tmp; | 395 | *pprev = tmp; |
394 | pprev = &tmp->vm_next; | 396 | pprev = &tmp->vm_next; |
397 | tmp->vm_prev = prev; | ||
398 | prev = tmp; | ||
395 | 399 | ||
396 | __vma_link_rb(mm, tmp, rb_link, rb_parent); | 400 | __vma_link_rb(mm, tmp, rb_link, rb_parent); |
397 | rb_link = &tmp->vm_rb.rb_right; | 401 | rb_link = &tmp->vm_rb.rb_right; |
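The dup_mmap() change threads a vm_prev pointer through the copy loop so the child's VMA list comes out doubly linked, which later code relies on for cheap "previous VMA" lookups. The copying pattern is the usual "carry prev through the loop"; a reduced sketch with a plain list node (nothing here is the kernel's struct vm_area_struct) shows the shape:

    #include <stdlib.h>
    #include <stdio.h>

    struct node { int val; struct node *next, *prev; };

    /* Duplicate @src into a new doubly linked list and return its head. */
    struct node *dup_list(const struct node *src)
    {
        struct node *head = NULL, *prev = NULL, **pprev = &head;

        for (; src; src = src->next) {
            struct node *tmp = malloc(sizeof(*tmp));
            if (!tmp)
                abort();
            tmp->val = src->val;
            tmp->next = NULL;
            tmp->prev = prev;   /* conceptually the one extra assignment the patch adds */
            *pprev = tmp;       /* link the previous node (or the head) forward to us */
            pprev = &tmp->next;
            prev = tmp;         /* remember ourselves for the next iteration */
        }
        return head;
    }

    int main(void)
    {
        struct node c = { 3, NULL, NULL }, b = { 2, &c, NULL }, a = { 1, &b, NULL };
        for (struct node *n = dup_list(&a); n; n = n->next)
            printf("%d (prev=%d)\n", n->val, n->prev ? n->prev->val : 0);
        return 0;
    }

Note how the patch also moved the tmp->vm_mm assignment before anon_vma_fork() and clears vm_next and vm_prev together, so a partially copied VMA never carries stale parent links.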
@@ -485,6 +489,7 @@ static struct mm_struct * mm_init(struct mm_struct * mm, struct task_struct *p) | |||
485 | mm->cached_hole_size = ~0UL; | 489 | mm->cached_hole_size = ~0UL; |
486 | mm_init_aio(mm); | 490 | mm_init_aio(mm); |
487 | mm_init_owner(mm, p); | 491 | mm_init_owner(mm, p); |
492 | atomic_set(&mm->oom_disable_count, 0); | ||
488 | 493 | ||
489 | if (likely(!mm_alloc_pgd(mm))) { | 494 | if (likely(!mm_alloc_pgd(mm))) { |
490 | mm->def_flags = 0; | 495 | mm->def_flags = 0; |
@@ -738,6 +743,8 @@ good_mm: | |||
738 | /* Initializing for Swap token stuff */ | 743 | /* Initializing for Swap token stuff */ |
739 | mm->token_priority = 0; | 744 | mm->token_priority = 0; |
740 | mm->last_interval = 0; | 745 | mm->last_interval = 0; |
746 | if (tsk->signal->oom_score_adj == OOM_SCORE_ADJ_MIN) | ||
747 | atomic_inc(&mm->oom_disable_count); | ||
741 | 748 | ||
742 | tsk->mm = mm; | 749 | tsk->mm = mm; |
743 | tsk->active_mm = mm; | 750 | tsk->active_mm = mm; |
@@ -752,13 +759,13 @@ static int copy_fs(unsigned long clone_flags, struct task_struct *tsk) | |||
752 | struct fs_struct *fs = current->fs; | 759 | struct fs_struct *fs = current->fs; |
753 | if (clone_flags & CLONE_FS) { | 760 | if (clone_flags & CLONE_FS) { |
754 | /* tsk->fs is already what we want */ | 761 | /* tsk->fs is already what we want */ |
755 | write_lock(&fs->lock); | 762 | spin_lock(&fs->lock); |
756 | if (fs->in_exec) { | 763 | if (fs->in_exec) { |
757 | write_unlock(&fs->lock); | 764 | spin_unlock(&fs->lock); |
758 | return -EAGAIN; | 765 | return -EAGAIN; |
759 | } | 766 | } |
760 | fs->users++; | 767 | fs->users++; |
761 | write_unlock(&fs->lock); | 768 | spin_unlock(&fs->lock); |
762 | return 0; | 769 | return 0; |
763 | } | 770 | } |
764 | tsk->fs = copy_fs_struct(fs); | 771 | tsk->fs = copy_fs_struct(fs); |
@@ -899,6 +906,9 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk) | |||
899 | tty_audit_fork(sig); | 906 | tty_audit_fork(sig); |
900 | 907 | ||
901 | sig->oom_adj = current->signal->oom_adj; | 908 | sig->oom_adj = current->signal->oom_adj; |
909 | sig->oom_score_adj = current->signal->oom_score_adj; | ||
910 | |||
911 | mutex_init(&sig->cred_guard_mutex); | ||
902 | 912 | ||
903 | return 0; | 913 | return 0; |
904 | } | 914 | } |
@@ -907,7 +917,7 @@ static void copy_flags(unsigned long clone_flags, struct task_struct *p) | |||
907 | { | 917 | { |
908 | unsigned long new_flags = p->flags; | 918 | unsigned long new_flags = p->flags; |
909 | 919 | ||
910 | new_flags &= ~PF_SUPERPRIV; | 920 | new_flags &= ~(PF_SUPERPRIV | PF_WQ_WORKER); |
911 | new_flags |= PF_FORKNOEXEC; | 921 | new_flags |= PF_FORKNOEXEC; |
912 | new_flags |= PF_STARTING; | 922 | new_flags |= PF_STARTING; |
913 | p->flags = new_flags; | 923 | p->flags = new_flags; |
@@ -1295,8 +1305,13 @@ bad_fork_cleanup_io: | |||
1295 | bad_fork_cleanup_namespaces: | 1305 | bad_fork_cleanup_namespaces: |
1296 | exit_task_namespaces(p); | 1306 | exit_task_namespaces(p); |
1297 | bad_fork_cleanup_mm: | 1307 | bad_fork_cleanup_mm: |
1298 | if (p->mm) | 1308 | if (p->mm) { |
1309 | task_lock(p); | ||
1310 | if (p->signal->oom_score_adj == OOM_SCORE_ADJ_MIN) | ||
1311 | atomic_dec(&p->mm->oom_disable_count); | ||
1312 | task_unlock(p); | ||
1299 | mmput(p->mm); | 1313 | mmput(p->mm); |
1314 | } | ||
1300 | bad_fork_cleanup_signal: | 1315 | bad_fork_cleanup_signal: |
1301 | if (!(clone_flags & CLONE_THREAD)) | 1316 | if (!(clone_flags & CLONE_THREAD)) |
1302 | free_signal_struct(p->signal); | 1317 | free_signal_struct(p->signal); |
@@ -1675,13 +1690,13 @@ SYSCALL_DEFINE1(unshare, unsigned long, unshare_flags) | |||
1675 | 1690 | ||
1676 | if (new_fs) { | 1691 | if (new_fs) { |
1677 | fs = current->fs; | 1692 | fs = current->fs; |
1678 | write_lock(&fs->lock); | 1693 | spin_lock(&fs->lock); |
1679 | current->fs = new_fs; | 1694 | current->fs = new_fs; |
1680 | if (--fs->users) | 1695 | if (--fs->users) |
1681 | new_fs = NULL; | 1696 | new_fs = NULL; |
1682 | else | 1697 | else |
1683 | new_fs = fs; | 1698 | new_fs = fs; |
1684 | write_unlock(&fs->lock); | 1699 | spin_unlock(&fs->lock); |
1685 | } | 1700 | } |
1686 | 1701 | ||
1687 | if (new_mm) { | 1702 | if (new_mm) { |
@@ -1689,6 +1704,10 @@ SYSCALL_DEFINE1(unshare, unsigned long, unshare_flags) | |||
1689 | active_mm = current->active_mm; | 1704 | active_mm = current->active_mm; |
1690 | current->mm = new_mm; | 1705 | current->mm = new_mm; |
1691 | current->active_mm = new_mm; | 1706 | current->active_mm = new_mm; |
1707 | if (current->signal->oom_score_adj == OOM_SCORE_ADJ_MIN) { | ||
1708 | atomic_dec(&mm->oom_disable_count); | ||
1709 | atomic_inc(&new_mm->oom_disable_count); | ||
1710 | } | ||
1692 | activate_mm(active_mm, new_mm); | 1711 | activate_mm(active_mm, new_mm); |
1693 | new_mm = mm; | 1712 | new_mm = mm; |
1694 | } | 1713 | } |
diff --git a/kernel/futex.c b/kernel/futex.c index 6a3a5fa1526d..6c683b37f2ce 100644 --- a/kernel/futex.c +++ b/kernel/futex.c | |||
@@ -91,6 +91,7 @@ struct futex_pi_state { | |||
91 | 91 | ||
92 | /** | 92 | /** |
93 | * struct futex_q - The hashed futex queue entry, one per waiting task | 93 | * struct futex_q - The hashed futex queue entry, one per waiting task |
94 | * @list: priority-sorted list of tasks waiting on this futex | ||
94 | * @task: the task waiting on the futex | 95 | * @task: the task waiting on the futex |
95 | * @lock_ptr: the hash bucket lock | 96 | * @lock_ptr: the hash bucket lock |
96 | * @key: the key the futex is hashed on | 97 | * @key: the key the futex is hashed on |
@@ -104,7 +105,7 @@ struct futex_pi_state { | |||
104 | * | 105 | * |
105 | * A futex_q has a woken state, just like tasks have TASK_RUNNING. | 106 | * A futex_q has a woken state, just like tasks have TASK_RUNNING. |
106 | * It is considered woken when plist_node_empty(&q->list) || q->lock_ptr == 0. | 107 | * It is considered woken when plist_node_empty(&q->list) || q->lock_ptr == 0. |
107 | * The order of wakup is always to make the first condition true, then | 108 | * The order of wakeup is always to make the first condition true, then |
108 | * the second. | 109 | * the second. |
109 | * | 110 | * |
110 | * PI futexes are typically woken before they are removed from the hash list via | 111 | * PI futexes are typically woken before they are removed from the hash list via |
@@ -168,7 +169,7 @@ static void get_futex_key_refs(union futex_key *key) | |||
168 | 169 | ||
169 | switch (key->both.offset & (FUT_OFF_INODE|FUT_OFF_MMSHARED)) { | 170 | switch (key->both.offset & (FUT_OFF_INODE|FUT_OFF_MMSHARED)) { |
170 | case FUT_OFF_INODE: | 171 | case FUT_OFF_INODE: |
171 | atomic_inc(&key->shared.inode->i_count); | 172 | ihold(key->shared.inode); |
172 | break; | 173 | break; |
173 | case FUT_OFF_MMSHARED: | 174 | case FUT_OFF_MMSHARED: |
174 | atomic_inc(&key->private.mm->mm_count); | 175 | atomic_inc(&key->private.mm->mm_count); |
@@ -295,7 +296,7 @@ void put_futex_key(int fshared, union futex_key *key) | |||
295 | * Slow path to fixup the fault we just took in the atomic write | 296 | * Slow path to fixup the fault we just took in the atomic write |
296 | * access to @uaddr. | 297 | * access to @uaddr. |
297 | * | 298 | * |
298 | * We have no generic implementation of a non destructive write to the | 299 | * We have no generic implementation of a non-destructive write to the |
299 | * user address. We know that we faulted in the atomic pagefault | 300 | * user address. We know that we faulted in the atomic pagefault |
300 | * disabled section so we can as well avoid the #PF overhead by | 301 | * disabled section so we can as well avoid the #PF overhead by |
301 | * calling get_user_pages() right away. | 302 | * calling get_user_pages() right away. |
@@ -515,7 +516,7 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb, | |||
515 | */ | 516 | */ |
516 | pi_state = this->pi_state; | 517 | pi_state = this->pi_state; |
517 | /* | 518 | /* |
518 | * Userspace might have messed up non PI and PI futexes | 519 | * Userspace might have messed up non-PI and PI futexes |
519 | */ | 520 | */ |
520 | if (unlikely(!pi_state)) | 521 | if (unlikely(!pi_state)) |
521 | return -EINVAL; | 522 | return -EINVAL; |
@@ -736,8 +737,8 @@ static void wake_futex(struct futex_q *q) | |||
736 | 737 | ||
737 | /* | 738 | /* |
738 | * We set q->lock_ptr = NULL _before_ we wake up the task. If | 739 | * We set q->lock_ptr = NULL _before_ we wake up the task. If |
739 | * a non futex wake up happens on another CPU then the task | 740 | * a non-futex wake up happens on another CPU then the task |
740 | * might exit and p would dereference a non existing task | 741 | * might exit and p would dereference a non-existing task |
741 | * struct. Prevent this by holding a reference on p across the | 742 | * struct. Prevent this by holding a reference on p across the |
742 | * wake up. | 743 | * wake up. |
743 | */ | 744 | */ |
@@ -1131,11 +1132,13 @@ static int futex_proxy_trylock_atomic(u32 __user *pifutex, | |||
1131 | 1132 | ||
1132 | /** | 1133 | /** |
1133 | * futex_requeue() - Requeue waiters from uaddr1 to uaddr2 | 1134 | * futex_requeue() - Requeue waiters from uaddr1 to uaddr2 |
1134 | * uaddr1: source futex user address | 1135 | * @uaddr1: source futex user address |
1135 | * uaddr2: target futex user address | 1136 | * @fshared: 0 for a PROCESS_PRIVATE futex, 1 for PROCESS_SHARED |
1136 | * nr_wake: number of waiters to wake (must be 1 for requeue_pi) | 1137 | * @uaddr2: target futex user address |
1137 | * nr_requeue: number of waiters to requeue (0-INT_MAX) | 1138 | * @nr_wake: number of waiters to wake (must be 1 for requeue_pi) |
1138 | * requeue_pi: if we are attempting to requeue from a non-pi futex to a | 1139 | * @nr_requeue: number of waiters to requeue (0-INT_MAX) |
1140 | * @cmpval: @uaddr1 expected value (or %NULL) | ||
1141 | * @requeue_pi: if we are attempting to requeue from a non-pi futex to a | ||
1139 | * pi futex (pi to pi requeue is not supported) | 1142 | * pi futex (pi to pi requeue is not supported) |
1140 | * | 1143 | * |
1141 | * Requeue waiters on uaddr1 to uaddr2. In the requeue_pi case, try to acquire | 1144 | * Requeue waiters on uaddr1 to uaddr2. In the requeue_pi case, try to acquire |
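The comment fix above matters because kernel-doc only recognises parameters written as "@name:" lines directly after the summary; the older bare "uaddr1:" style is silently ignored by scripts/kernel-doc, so the generated documentation was missing the arguments. The expected shape of such a block is roughly the following generic template (the helper and its parameters are invented for illustration, this is not a real futex comment):

    /**
     * my_requeue_helper() - one-line summary on the first line
     * @src:     every parameter gets an "@name:" line, in declaration order
     * @dst:     continuation text is simply indented under the tag
     * @nr_wake: number of waiters to wake
     *
     * Free-form description follows after a blank "*" line.
     *
     * Return: what the function hands back, also picked up by kernel-doc.
     */
    static int my_requeue_helper(void *src, void *dst, int nr_wake);

The added @fshared and @cmpval lines in the hunk bring the comment back in sync with the actual futex_requeue() argument list.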
@@ -1360,10 +1363,10 @@ out: | |||
1360 | 1363 | ||
1361 | /* The key must be already stored in q->key. */ | 1364 | /* The key must be already stored in q->key. */ |
1362 | static inline struct futex_hash_bucket *queue_lock(struct futex_q *q) | 1365 | static inline struct futex_hash_bucket *queue_lock(struct futex_q *q) |
1366 | __acquires(&hb->lock) | ||
1363 | { | 1367 | { |
1364 | struct futex_hash_bucket *hb; | 1368 | struct futex_hash_bucket *hb; |
1365 | 1369 | ||
1366 | get_futex_key_refs(&q->key); | ||
1367 | hb = hash_futex(&q->key); | 1370 | hb = hash_futex(&q->key); |
1368 | q->lock_ptr = &hb->lock; | 1371 | q->lock_ptr = &hb->lock; |
1369 | 1372 | ||
@@ -1373,9 +1376,9 @@ static inline struct futex_hash_bucket *queue_lock(struct futex_q *q) | |||
1373 | 1376 | ||
1374 | static inline void | 1377 | static inline void |
1375 | queue_unlock(struct futex_q *q, struct futex_hash_bucket *hb) | 1378 | queue_unlock(struct futex_q *q, struct futex_hash_bucket *hb) |
1379 | __releases(&hb->lock) | ||
1376 | { | 1380 | { |
1377 | spin_unlock(&hb->lock); | 1381 | spin_unlock(&hb->lock); |
1378 | drop_futex_key_refs(&q->key); | ||
1379 | } | 1382 | } |
1380 | 1383 | ||
1381 | /** | 1384 | /** |
@@ -1391,6 +1394,7 @@ queue_unlock(struct futex_q *q, struct futex_hash_bucket *hb) | |||
1391 | * an example). | 1394 | * an example). |
1392 | */ | 1395 | */ |
1393 | static inline void queue_me(struct futex_q *q, struct futex_hash_bucket *hb) | 1396 | static inline void queue_me(struct futex_q *q, struct futex_hash_bucket *hb) |
1397 | __releases(&hb->lock) | ||
1394 | { | 1398 | { |
1395 | int prio; | 1399 | int prio; |
1396 | 1400 | ||
@@ -1471,6 +1475,7 @@ retry: | |||
1471 | * and dropped here. | 1475 | * and dropped here. |
1472 | */ | 1476 | */ |
1473 | static void unqueue_me_pi(struct futex_q *q) | 1477 | static void unqueue_me_pi(struct futex_q *q) |
1478 | __releases(q->lock_ptr) | ||
1474 | { | 1479 | { |
1475 | WARN_ON(plist_node_empty(&q->list)); | 1480 | WARN_ON(plist_node_empty(&q->list)); |
1476 | plist_del(&q->list, &q->list.plist); | 1481 | plist_del(&q->list, &q->list.plist); |
@@ -1480,8 +1485,6 @@ static void unqueue_me_pi(struct futex_q *q) | |||
1480 | q->pi_state = NULL; | 1485 | q->pi_state = NULL; |
1481 | 1486 | ||
1482 | spin_unlock(q->lock_ptr); | 1487 | spin_unlock(q->lock_ptr); |
1483 | |||
1484 | drop_futex_key_refs(&q->key); | ||
1485 | } | 1488 | } |
1486 | 1489 | ||
1487 | /* | 1490 | /* |
@@ -1812,7 +1815,10 @@ static int futex_wait(u32 __user *uaddr, int fshared, | |||
1812 | } | 1815 | } |
1813 | 1816 | ||
1814 | retry: | 1817 | retry: |
1815 | /* Prepare to wait on uaddr. */ | 1818 | /* |
1819 | * Prepare to wait on uaddr. On success, holds hb lock and increments | ||
1820 | * q.key refs. | ||
1821 | */ | ||
1816 | ret = futex_wait_setup(uaddr, val, fshared, &q, &hb); | 1822 | ret = futex_wait_setup(uaddr, val, fshared, &q, &hb); |
1817 | if (ret) | 1823 | if (ret) |
1818 | goto out; | 1824 | goto out; |
@@ -1822,28 +1828,27 @@ retry: | |||
1822 | 1828 | ||
1823 | /* If we were woken (and unqueued), we succeeded, whatever. */ | 1829 | /* If we were woken (and unqueued), we succeeded, whatever. */ |
1824 | ret = 0; | 1830 | ret = 0; |
1831 | /* unqueue_me() drops q.key ref */ | ||
1825 | if (!unqueue_me(&q)) | 1832 | if (!unqueue_me(&q)) |
1826 | goto out_put_key; | 1833 | goto out; |
1827 | ret = -ETIMEDOUT; | 1834 | ret = -ETIMEDOUT; |
1828 | if (to && !to->task) | 1835 | if (to && !to->task) |
1829 | goto out_put_key; | 1836 | goto out; |
1830 | 1837 | ||
1831 | /* | 1838 | /* |
1832 | * We expect signal_pending(current), but we might be the | 1839 | * We expect signal_pending(current), but we might be the |
1833 | * victim of a spurious wakeup as well. | 1840 | * victim of a spurious wakeup as well. |
1834 | */ | 1841 | */ |
1835 | if (!signal_pending(current)) { | 1842 | if (!signal_pending(current)) |
1836 | put_futex_key(fshared, &q.key); | ||
1837 | goto retry; | 1843 | goto retry; |
1838 | } | ||
1839 | 1844 | ||
1840 | ret = -ERESTARTSYS; | 1845 | ret = -ERESTARTSYS; |
1841 | if (!abs_time) | 1846 | if (!abs_time) |
1842 | goto out_put_key; | 1847 | goto out; |
1843 | 1848 | ||
1844 | restart = &current_thread_info()->restart_block; | 1849 | restart = &current_thread_info()->restart_block; |
1845 | restart->fn = futex_wait_restart; | 1850 | restart->fn = futex_wait_restart; |
1846 | restart->futex.uaddr = (u32 *)uaddr; | 1851 | restart->futex.uaddr = uaddr; |
1847 | restart->futex.val = val; | 1852 | restart->futex.val = val; |
1848 | restart->futex.time = abs_time->tv64; | 1853 | restart->futex.time = abs_time->tv64; |
1849 | restart->futex.bitset = bitset; | 1854 | restart->futex.bitset = bitset; |
@@ -1856,8 +1861,6 @@ retry: | |||
1856 | 1861 | ||
1857 | ret = -ERESTART_RESTARTBLOCK; | 1862 | ret = -ERESTART_RESTARTBLOCK; |
1858 | 1863 | ||
1859 | out_put_key: | ||
1860 | put_futex_key(fshared, &q.key); | ||
1861 | out: | 1864 | out: |
1862 | if (to) { | 1865 | if (to) { |
1863 | hrtimer_cancel(&to->timer); | 1866 | hrtimer_cancel(&to->timer); |
@@ -1869,7 +1872,7 @@ out: | |||
1869 | 1872 | ||
1870 | static long futex_wait_restart(struct restart_block *restart) | 1873 | static long futex_wait_restart(struct restart_block *restart) |
1871 | { | 1874 | { |
1872 | u32 __user *uaddr = (u32 __user *)restart->futex.uaddr; | 1875 | u32 __user *uaddr = restart->futex.uaddr; |
1873 | int fshared = 0; | 1876 | int fshared = 0; |
1874 | ktime_t t, *tp = NULL; | 1877 | ktime_t t, *tp = NULL; |
1875 | 1878 | ||
@@ -2236,7 +2239,10 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, int fshared, | |||
2236 | q.rt_waiter = &rt_waiter; | 2239 | q.rt_waiter = &rt_waiter; |
2237 | q.requeue_pi_key = &key2; | 2240 | q.requeue_pi_key = &key2; |
2238 | 2241 | ||
2239 | /* Prepare to wait on uaddr. */ | 2242 | /* |
2243 | * Prepare to wait on uaddr. On success, increments q.key (key1) ref | ||
2244 | * count. | ||
2245 | */ | ||
2240 | ret = futex_wait_setup(uaddr, val, fshared, &q, &hb); | 2246 | ret = futex_wait_setup(uaddr, val, fshared, &q, &hb); |
2241 | if (ret) | 2247 | if (ret) |
2242 | goto out_key2; | 2248 | goto out_key2; |
@@ -2254,7 +2260,9 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, int fshared, | |||
2254 | * In order for us to be here, we know our q.key == key2, and since | 2260 | * In order for us to be here, we know our q.key == key2, and since |
2255 | * we took the hb->lock above, we also know that futex_requeue() has | 2261 | * we took the hb->lock above, we also know that futex_requeue() has |
2256 | * completed and we no longer have to concern ourselves with a wakeup | 2262 | * completed and we no longer have to concern ourselves with a wakeup |
2257 | * race with the atomic proxy lock acquition by the requeue code. | 2263 | * race with the atomic proxy lock acquisition by the requeue code. The |
2264 | * futex_requeue dropped our key1 reference and incremented our key2 | ||
2265 | * reference count. | ||
2258 | */ | 2266 | */ |
2259 | 2267 | ||
2260 | /* Check if the requeue code acquired the second futex for us. */ | 2268 | /* Check if the requeue code acquired the second futex for us. */ |
@@ -2458,7 +2466,7 @@ retry: | |||
2458 | */ | 2466 | */ |
2459 | static inline int fetch_robust_entry(struct robust_list __user **entry, | 2467 | static inline int fetch_robust_entry(struct robust_list __user **entry, |
2460 | struct robust_list __user * __user *head, | 2468 | struct robust_list __user * __user *head, |
2461 | int *pi) | 2469 | unsigned int *pi) |
2462 | { | 2470 | { |
2463 | unsigned long uentry; | 2471 | unsigned long uentry; |
2464 | 2472 | ||
@@ -2647,7 +2655,7 @@ static int __init futex_init(void) | |||
2647 | * of the complex code paths. Also we want to prevent | 2655 | * of the complex code paths. Also we want to prevent |
2648 | * registration of robust lists in that case. NULL is | 2656 | * registration of robust lists in that case. NULL is |
2649 | * guaranteed to fault and we get -EFAULT on functional | 2657 | * guaranteed to fault and we get -EFAULT on functional |
2650 | * implementation, the non functional ones will return | 2658 | * implementation, the non-functional ones will return |
2651 | * -ENOSYS. | 2659 | * -ENOSYS. |
2652 | */ | 2660 | */ |
2653 | curval = cmpxchg_futex_value_locked(NULL, 0, 0); | 2661 | curval = cmpxchg_futex_value_locked(NULL, 0, 0); |
diff --git a/kernel/futex_compat.c b/kernel/futex_compat.c index d49afb2395e5..06da4dfc339b 100644 --- a/kernel/futex_compat.c +++ b/kernel/futex_compat.c | |||
@@ -19,7 +19,7 @@ | |||
19 | */ | 19 | */ |
20 | static inline int | 20 | static inline int |
21 | fetch_robust_entry(compat_uptr_t *uentry, struct robust_list __user **entry, | 21 | fetch_robust_entry(compat_uptr_t *uentry, struct robust_list __user **entry, |
22 | compat_uptr_t __user *head, int *pi) | 22 | compat_uptr_t __user *head, unsigned int *pi) |
23 | { | 23 | { |
24 | if (get_user(*uentry, head)) | 24 | if (get_user(*uentry, head)) |
25 | return -EFAULT; | 25 | return -EFAULT; |
diff --git a/kernel/gcov/fs.c b/kernel/gcov/fs.c index ef3c3f88a7a3..9bd0934f6c33 100644 --- a/kernel/gcov/fs.c +++ b/kernel/gcov/fs.c | |||
@@ -33,10 +33,11 @@ | |||
33 | * @children: child nodes | 33 | * @children: child nodes |
34 | * @all: list head for list of all nodes | 34 | * @all: list head for list of all nodes |
35 | * @parent: parent node | 35 | * @parent: parent node |
36 | * @info: associated profiling data structure if not a directory | 36 | * @loaded_info: array of pointers to profiling data sets for loaded object |
37 | * @ghost: when an object file containing profiling data is unloaded we keep a | 37 | * files. |
38 | * copy of the profiling data here to allow collecting coverage data | 38 | * @num_loaded: number of profiling data sets for loaded object files. |
39 | * for cleanup code. Such a node is called a "ghost". | 39 | * @unloaded_info: accumulated copy of profiling data sets for unloaded |
40 | * object files. Used only when gcov_persist=1. | ||
40 | * @dentry: main debugfs entry, either a directory or data file | 41 | * @dentry: main debugfs entry, either a directory or data file |
41 | * @links: associated symbolic links | 42 | * @links: associated symbolic links |
42 | * @name: data file basename | 43 | * @name: data file basename |
@@ -51,10 +52,11 @@ struct gcov_node { | |||
51 | struct list_head children; | 52 | struct list_head children; |
52 | struct list_head all; | 53 | struct list_head all; |
53 | struct gcov_node *parent; | 54 | struct gcov_node *parent; |
54 | struct gcov_info *info; | 55 | struct gcov_info **loaded_info; |
55 | struct gcov_info *ghost; | 56 | struct gcov_info *unloaded_info; |
56 | struct dentry *dentry; | 57 | struct dentry *dentry; |
57 | struct dentry **links; | 58 | struct dentry **links; |
59 | int num_loaded; | ||
58 | char name[0]; | 60 | char name[0]; |
59 | }; | 61 | }; |
60 | 62 | ||
@@ -136,16 +138,37 @@ static const struct seq_operations gcov_seq_ops = { | |||
136 | }; | 138 | }; |
137 | 139 | ||
138 | /* | 140 | /* |
139 | * Return the profiling data set for a given node. This can either be the | 141 | * Return a profiling data set associated with the given node. This is |
140 | * original profiling data structure or a duplicate (also called "ghost") | 142 | * either a data set for a loaded object file or a data set copy in case |
141 | * in case the associated object file has been unloaded. | 143 | * all associated object files have been unloaded. |
142 | */ | 144 | */ |
143 | static struct gcov_info *get_node_info(struct gcov_node *node) | 145 | static struct gcov_info *get_node_info(struct gcov_node *node) |
144 | { | 146 | { |
145 | if (node->info) | 147 | if (node->num_loaded > 0) |
146 | return node->info; | 148 | return node->loaded_info[0]; |
147 | 149 | ||
148 | return node->ghost; | 150 | return node->unloaded_info; |
151 | } | ||
152 | |||
153 | /* | ||
154 | * Return a newly allocated profiling data set which contains the sum of | ||
155 | * all profiling data associated with the given node. | ||
156 | */ | ||
157 | static struct gcov_info *get_accumulated_info(struct gcov_node *node) | ||
158 | { | ||
159 | struct gcov_info *info; | ||
160 | int i = 0; | ||
161 | |||
162 | if (node->unloaded_info) | ||
163 | info = gcov_info_dup(node->unloaded_info); | ||
164 | else | ||
165 | info = gcov_info_dup(node->loaded_info[i++]); | ||
166 | if (!info) | ||
167 | return NULL; | ||
168 | for (; i < node->num_loaded; i++) | ||
169 | gcov_info_add(info, node->loaded_info[i]); | ||
170 | |||
171 | return info; | ||
149 | } | 172 | } |
150 | 173 | ||
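get_accumulated_info() above produces one merged view per node: duplicate whichever data set comes first (the saved copy for unloaded code, otherwise the first loaded one), then fold every remaining loaded set into it with gcov_info_add(). The same accumulate shape in miniature, over plain counter arrays rather than real gcov_info structures, looks like this (all names are illustrative):

    #include <stdlib.h>
    #include <string.h>

    #define NCTR 8

    struct counters { unsigned long long c[NCTR]; };

    static struct counters *ctr_dup(const struct counters *src)
    {
        struct counters *d = malloc(sizeof(*d));
        if (d)
            memcpy(d, src, sizeof(*d));
        return d;
    }

    static void ctr_add(struct counters *dst, const struct counters *src)
    {
        for (int i = 0; i < NCTR; i++)
            dst->c[i] += src->c[i];
    }

    /* Merge @unloaded (may be NULL) and @loaded[0..num) into one fresh copy.
     * Like the gcov node, the caller guarantees at least one data set exists. */
    struct counters *accumulate(const struct counters *unloaded,
                                const struct counters *const *loaded, int num)
    {
        struct counters *sum;
        int i = 0;

        sum = unloaded ? ctr_dup(unloaded) : ctr_dup(loaded[i++]);
        if (!sum)
            return NULL;
        for (; i < num; i++)
            ctr_add(sum, loaded[i]);
        return sum;
    }

Reading from this throwaway copy is what lets gcov_seq_open() avoid holding node_lock, or caring about further load/unload events, for the whole duration of a userspace read.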
151 | /* | 174 | /* |
@@ -163,9 +186,10 @@ static int gcov_seq_open(struct inode *inode, struct file *file) | |||
163 | mutex_lock(&node_lock); | 186 | mutex_lock(&node_lock); |
164 | /* | 187 | /* |
165 | * Read from a profiling data copy to minimize reference tracking | 188 | * Read from a profiling data copy to minimize reference tracking |
166 | * complexity and concurrent access. | 189 | * complexity and concurrent access and to keep accumulating multiple |
190 | * profiling data sets associated with one node simple. | ||
167 | */ | 191 | */ |
168 | info = gcov_info_dup(get_node_info(node)); | 192 | info = get_accumulated_info(node); |
169 | if (!info) | 193 | if (!info) |
170 | goto out_unlock; | 194 | goto out_unlock; |
171 | iter = gcov_iter_new(info); | 195 | iter = gcov_iter_new(info); |
@@ -225,12 +249,25 @@ static struct gcov_node *get_node_by_name(const char *name) | |||
225 | return NULL; | 249 | return NULL; |
226 | } | 250 | } |
227 | 251 | ||
252 | /* | ||
253 | * Reset all profiling data associated with the specified node. | ||
254 | */ | ||
255 | static void reset_node(struct gcov_node *node) | ||
256 | { | ||
257 | int i; | ||
258 | |||
259 | if (node->unloaded_info) | ||
260 | gcov_info_reset(node->unloaded_info); | ||
261 | for (i = 0; i < node->num_loaded; i++) | ||
262 | gcov_info_reset(node->loaded_info[i]); | ||
263 | } | ||
264 | |||
228 | static void remove_node(struct gcov_node *node); | 265 | static void remove_node(struct gcov_node *node); |
229 | 266 | ||
230 | /* | 267 | /* |
231 | * write() implementation for gcov data files. Reset profiling data for the | 268 | * write() implementation for gcov data files. Reset profiling data for the |
232 | * associated file. If the object file has been unloaded (i.e. this is | 269 | * corresponding file. If all associated object files have been unloaded, |
233 | * a "ghost" node), remove the debug fs node as well. | 270 | * remove the debug fs node as well. |
234 | */ | 271 | */ |
235 | static ssize_t gcov_seq_write(struct file *file, const char __user *addr, | 272 | static ssize_t gcov_seq_write(struct file *file, const char __user *addr, |
236 | size_t len, loff_t *pos) | 273 | size_t len, loff_t *pos) |
@@ -245,10 +282,10 @@ static ssize_t gcov_seq_write(struct file *file, const char __user *addr, | |||
245 | node = get_node_by_name(info->filename); | 282 | node = get_node_by_name(info->filename); |
246 | if (node) { | 283 | if (node) { |
247 | /* Reset counts or remove node for unloaded modules. */ | 284 | /* Reset counts or remove node for unloaded modules. */ |
248 | if (node->ghost) | 285 | if (node->num_loaded == 0) |
249 | remove_node(node); | 286 | remove_node(node); |
250 | else | 287 | else |
251 | gcov_info_reset(node->info); | 288 | reset_node(node); |
252 | } | 289 | } |
253 | /* Reset counts for open file. */ | 290 | /* Reset counts for open file. */ |
254 | gcov_info_reset(info); | 291 | gcov_info_reset(info); |
@@ -378,7 +415,10 @@ static void init_node(struct gcov_node *node, struct gcov_info *info, | |||
378 | INIT_LIST_HEAD(&node->list); | 415 | INIT_LIST_HEAD(&node->list); |
379 | INIT_LIST_HEAD(&node->children); | 416 | INIT_LIST_HEAD(&node->children); |
380 | INIT_LIST_HEAD(&node->all); | 417 | INIT_LIST_HEAD(&node->all); |
381 | node->info = info; | 418 | if (node->loaded_info) { |
419 | node->loaded_info[0] = info; | ||
420 | node->num_loaded = 1; | ||
421 | } | ||
382 | node->parent = parent; | 422 | node->parent = parent; |
383 | if (name) | 423 | if (name) |
384 | strcpy(node->name, name); | 424 | strcpy(node->name, name); |
@@ -394,9 +434,13 @@ static struct gcov_node *new_node(struct gcov_node *parent, | |||
394 | struct gcov_node *node; | 434 | struct gcov_node *node; |
395 | 435 | ||
396 | node = kzalloc(sizeof(struct gcov_node) + strlen(name) + 1, GFP_KERNEL); | 436 | node = kzalloc(sizeof(struct gcov_node) + strlen(name) + 1, GFP_KERNEL); |
397 | if (!node) { | 437 | if (!node) |
398 | pr_warning("out of memory\n"); | 438 | goto err_nomem; |
399 | return NULL; | 439 | if (info) { |
440 | node->loaded_info = kcalloc(1, sizeof(struct gcov_info *), | ||
441 | GFP_KERNEL); | ||
442 | if (!node->loaded_info) | ||
443 | goto err_nomem; | ||
400 | } | 444 | } |
401 | init_node(node, info, name, parent); | 445 | init_node(node, info, name, parent); |
402 | /* Differentiate between gcov data file nodes and directory nodes. */ | 446 | /* Differentiate between gcov data file nodes and directory nodes. */ |
@@ -416,6 +460,11 @@ static struct gcov_node *new_node(struct gcov_node *parent, | |||
416 | list_add(&node->all, &all_head); | 460 | list_add(&node->all, &all_head); |
417 | 461 | ||
418 | return node; | 462 | return node; |
463 | |||
464 | err_nomem: | ||
465 | kfree(node); | ||
466 | pr_warning("out of memory\n"); | ||
467 | return NULL; | ||
419 | } | 468 | } |
420 | 469 | ||
421 | /* Remove symbolic links associated with node. */ | 470 | /* Remove symbolic links associated with node. */ |
@@ -441,8 +490,9 @@ static void release_node(struct gcov_node *node) | |||
441 | list_del(&node->all); | 490 | list_del(&node->all); |
442 | debugfs_remove(node->dentry); | 491 | debugfs_remove(node->dentry); |
443 | remove_links(node); | 492 | remove_links(node); |
444 | if (node->ghost) | 493 | kfree(node->loaded_info); |
445 | gcov_info_free(node->ghost); | 494 | if (node->unloaded_info) |
495 | gcov_info_free(node->unloaded_info); | ||
446 | kfree(node); | 496 | kfree(node); |
447 | } | 497 | } |
448 | 498 | ||
@@ -477,7 +527,7 @@ static struct gcov_node *get_child_by_name(struct gcov_node *parent, | |||
477 | 527 | ||
478 | /* | 528 | /* |
479 | * write() implementation for reset file. Reset all profiling data to zero | 529 | * write() implementation for reset file. Reset all profiling data to zero |
480 | * and remove ghost nodes. | 530 | * and remove nodes for which all associated object files are unloaded. |
481 | */ | 531 | */ |
482 | static ssize_t reset_write(struct file *file, const char __user *addr, | 532 | static ssize_t reset_write(struct file *file, const char __user *addr, |
483 | size_t len, loff_t *pos) | 533 | size_t len, loff_t *pos) |
@@ -487,8 +537,8 @@ static ssize_t reset_write(struct file *file, const char __user *addr, | |||
487 | mutex_lock(&node_lock); | 537 | mutex_lock(&node_lock); |
488 | restart: | 538 | restart: |
489 | list_for_each_entry(node, &all_head, all) { | 539 | list_for_each_entry(node, &all_head, all) { |
490 | if (node->info) | 540 | if (node->num_loaded > 0) |
491 | gcov_info_reset(node->info); | 541 | reset_node(node); |
492 | else if (list_empty(&node->children)) { | 542 | else if (list_empty(&node->children)) { |
493 | remove_node(node); | 543 | remove_node(node); |
494 | /* Several nodes may have gone - restart loop. */ | 544 | /* Several nodes may have gone - restart loop. */ |
@@ -511,6 +561,7 @@ static ssize_t reset_read(struct file *file, char __user *addr, size_t len, | |||
511 | static const struct file_operations gcov_reset_fops = { | 561 | static const struct file_operations gcov_reset_fops = { |
512 | .write = reset_write, | 562 | .write = reset_write, |
513 | .read = reset_read, | 563 | .read = reset_read, |
564 | .llseek = noop_llseek, | ||
514 | }; | 565 | }; |
515 | 566 | ||
516 | /* | 567 | /* |
@@ -564,37 +615,115 @@ err_remove: | |||
564 | } | 615 | } |
565 | 616 | ||
566 | /* | 617 | /* |
567 | * The profiling data set associated with this node is being unloaded. Store a | 618 | * Associate a profiling data set with an existing node. Needs to be called |
568 | * copy of the profiling data and turn this node into a "ghost". | 619 | * with node_lock held. |
569 | */ | 620 | */ |
570 | static int ghost_node(struct gcov_node *node) | 621 | static void add_info(struct gcov_node *node, struct gcov_info *info) |
571 | { | 622 | { |
572 | node->ghost = gcov_info_dup(node->info); | 623 | struct gcov_info **loaded_info; |
573 | if (!node->ghost) { | 624 | int num = node->num_loaded; |
574 | pr_warning("could not save data for '%s' (out of memory)\n", | 625 | |
575 | node->info->filename); | 626 | /* |
576 | return -ENOMEM; | 627 | * Prepare new array. This is done first to simplify cleanup in |
628 | * case the new data set is incompatible, the node only contains | ||
629 | * unloaded data sets and there's not enough memory for the array. | ||
630 | */ | ||
631 | loaded_info = kcalloc(num + 1, sizeof(struct gcov_info *), GFP_KERNEL); | ||
632 | if (!loaded_info) { | ||
633 | pr_warning("could not add '%s' (out of memory)\n", | ||
634 | info->filename); | ||
635 | return; | ||
636 | } | ||
637 | memcpy(loaded_info, node->loaded_info, | ||
638 | num * sizeof(struct gcov_info *)); | ||
639 | loaded_info[num] = info; | ||
640 | /* Check if the new data set is compatible. */ | ||
641 | if (num == 0) { | ||
642 | /* | ||
643 | * A module was unloaded, modified and reloaded. The new | ||
644 | * data set replaces the copy of the last one. | ||
645 | */ | ||
646 | if (!gcov_info_is_compatible(node->unloaded_info, info)) { | ||
647 | pr_warning("discarding saved data for %s " | ||
648 | "(incompatible version)\n", info->filename); | ||
649 | gcov_info_free(node->unloaded_info); | ||
650 | node->unloaded_info = NULL; | ||
651 | } | ||
652 | } else { | ||
653 | /* | ||
654 | * Two different versions of the same object file are loaded. | ||
655 | * The initial one takes precedence. | ||
656 | */ | ||
657 | if (!gcov_info_is_compatible(node->loaded_info[0], info)) { | ||
658 | pr_warning("could not add '%s' (incompatible " | ||
659 | "version)\n", info->filename); | ||
660 | kfree(loaded_info); | ||
661 | return; | ||
662 | } | ||
577 | } | 663 | } |
578 | node->info = NULL; | 664 | /* Overwrite previous array. */ |
665 | kfree(node->loaded_info); | ||
666 | node->loaded_info = loaded_info; | ||
667 | node->num_loaded = num + 1; | ||
668 | } | ||
579 | 669 | ||
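add_info() above is a copy-validate-commit sequence: allocate the larger pointer array first, copy and append into it, run the compatibility checks against that candidate, and only when everything passed free the old array and publish the new one. Every failure path just kfree()s the scratch copy and returns, leaving the node untouched. A generic rendering of that append pattern, with error handling trimmed to the essentials and invented names, is:

    #include <stdlib.h>
    #include <string.h>
    #include <stdbool.h>

    struct set { int version; };

    struct holder {
        struct set **items;
        int num;
    };

    /* Append @s to @h only if it is compatible; on any failure @h is unchanged. */
    bool append_validated(struct holder *h, struct set *s)
    {
        struct set **items;

        items = calloc(h->num + 1, sizeof(*items));       /* build the bigger array first */
        if (!items)
            return false;
        memcpy(items, h->items, h->num * sizeof(*items));
        items[h->num] = s;

        if (h->num > 0 && h->items[0]->version != s->version) {
            free(items);                                  /* reject: existing state untouched */
            return false;
        }

        free(h->items);                                   /* commit: swap in the new array */
        h->items = items;
        h->num++;
        return true;
    }

Doing the allocation before the checks is what keeps the "incompatible version" and "out of memory" branches in add_info() free of any partial-update cleanup.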
580 | return 0; | 670 | /* |
671 | * Return the index of a profiling data set associated with a node. | ||
672 | */ | ||
673 | static int get_info_index(struct gcov_node *node, struct gcov_info *info) | ||
674 | { | ||
675 | int i; | ||
676 | |||
677 | for (i = 0; i < node->num_loaded; i++) { | ||
678 | if (node->loaded_info[i] == info) | ||
679 | return i; | ||
680 | } | ||
681 | return -ENOENT; | ||
581 | } | 682 | } |
582 | 683 | ||
583 | /* | 684 | /* |
584 | * Profiling data for this node has been loaded again. Add profiling data | 685 | * Save the data of a profiling data set which is being unloaded. |
585 | * from previous instantiation and turn this node into a regular node. | ||
586 | */ | 686 | */ |
587 | static void revive_node(struct gcov_node *node, struct gcov_info *info) | 687 | static void save_info(struct gcov_node *node, struct gcov_info *info) |
588 | { | 688 | { |
589 | if (gcov_info_is_compatible(node->ghost, info)) | 689 | if (node->unloaded_info) |
590 | gcov_info_add(info, node->ghost); | 690 | gcov_info_add(node->unloaded_info, info); |
591 | else { | 691 | else { |
592 | pr_warning("discarding saved data for '%s' (version changed)\n", | 692 | node->unloaded_info = gcov_info_dup(info); |
693 | if (!node->unloaded_info) { | ||
694 | pr_warning("could not save data for '%s' " | ||
695 | "(out of memory)\n", info->filename); | ||
696 | } | ||
697 | } | ||
698 | } | ||
699 | |||
700 | /* | ||
701 | * Disassociate a profiling data set from a node. Needs to be called with | ||
702 | * node_lock held. | ||
703 | */ | ||
704 | static void remove_info(struct gcov_node *node, struct gcov_info *info) | ||
705 | { | ||
706 | int i; | ||
707 | |||
708 | i = get_info_index(node, info); | ||
709 | if (i < 0) { | ||
710 | pr_warning("could not remove '%s' (not found)\n", | ||
593 | info->filename); | 711 | info->filename); |
712 | return; | ||
594 | } | 713 | } |
595 | gcov_info_free(node->ghost); | 714 | if (gcov_persist) |
596 | node->ghost = NULL; | 715 | save_info(node, info); |
597 | node->info = info; | 716 | /* Shrink array. */ |
717 | node->loaded_info[i] = node->loaded_info[node->num_loaded - 1]; | ||
718 | node->num_loaded--; | ||
719 | if (node->num_loaded > 0) | ||
720 | return; | ||
721 | /* Last loaded data set was removed. */ | ||
722 | kfree(node->loaded_info); | ||
723 | node->loaded_info = NULL; | ||
724 | node->num_loaded = 0; | ||
725 | if (!node->unloaded_info) | ||
726 | remove_node(node); | ||
598 | } | 727 | } |
599 | 728 | ||
600 | /* | 729 | /* |
@@ -609,30 +738,18 @@ void gcov_event(enum gcov_action action, struct gcov_info *info) | |||
609 | node = get_node_by_name(info->filename); | 738 | node = get_node_by_name(info->filename); |
610 | switch (action) { | 739 | switch (action) { |
611 | case GCOV_ADD: | 740 | case GCOV_ADD: |
612 | /* Add new node or revive ghost. */ | 741 | if (node) |
613 | if (!node) { | 742 | add_info(node, info); |
743 | else | ||
614 | add_node(info); | 744 | add_node(info); |
615 | break; | ||
616 | } | ||
617 | if (gcov_persist) | ||
618 | revive_node(node, info); | ||
619 | else { | ||
620 | pr_warning("could not add '%s' (already exists)\n", | ||
621 | info->filename); | ||
622 | } | ||
623 | break; | 745 | break; |
624 | case GCOV_REMOVE: | 746 | case GCOV_REMOVE: |
625 | /* Remove node or turn into ghost. */ | 747 | if (node) |
626 | if (!node) { | 748 | remove_info(node, info); |
749 | else { | ||
627 | pr_warning("could not remove '%s' (not found)\n", | 750 | pr_warning("could not remove '%s' (not found)\n", |
628 | info->filename); | 751 | info->filename); |
629 | break; | ||
630 | } | 752 | } |
631 | if (gcov_persist) { | ||
632 | if (!ghost_node(node)) | ||
633 | break; | ||
634 | } | ||
635 | remove_node(node); | ||
636 | break; | 753 | break; |
637 | } | 754 | } |
638 | mutex_unlock(&node_lock); | 755 | mutex_unlock(&node_lock); |
diff --git a/kernel/groups.c b/kernel/groups.c index 53b1916c9492..253dc0f35cf4 100644 --- a/kernel/groups.c +++ b/kernel/groups.c | |||
@@ -143,10 +143,9 @@ int groups_search(const struct group_info *group_info, gid_t grp) | |||
143 | right = group_info->ngroups; | 143 | right = group_info->ngroups; |
144 | while (left < right) { | 144 | while (left < right) { |
145 | unsigned int mid = (left+right)/2; | 145 | unsigned int mid = (left+right)/2; |
146 | int cmp = grp - GROUP_AT(group_info, mid); | 146 | if (grp > GROUP_AT(group_info, mid)) |
147 | if (cmp > 0) | ||
148 | left = mid + 1; | 147 | left = mid + 1; |
149 | else if (cmp < 0) | 148 | else if (grp < GROUP_AT(group_info, mid)) |
150 | right = mid; | 149 | right = mid; |
151 | else | 150 | else |
152 | return 1; | 151 | return 1; |
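The groups_search() change is more than style: gid_t is unsigned, so the old "grp - GROUP_AT(...)" is computed modulo 2^32 and then squeezed into a signed int, which yields the wrong sign whenever the two gids are more than 2^31 apart, sending the binary search down the wrong half. A short plain-C demonstration of the failure mode with illustrative values:

    #include <stdio.h>

    int main(void)
    {
        unsigned int grp = 0x80000001u;   /* a gid above 2^31 */
        unsigned int mid = 1u;
        int cmp = grp - mid;              /* 0x80000000: implementation-defined conversion,
                                             INT_MIN (negative) on common ABIs */

        printf("subtraction says grp %s mid\n", cmp > 0 ? ">" : "<=");   /* wrong: <= */
        printf("comparison  says grp %s mid\n", grp > mid ? ">" : "<="); /* right: > */
        return 0;
    }

Comparing the values directly, as the new code does, avoids both the wraparound and the implementation-defined narrowing.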
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index 5c69e996bd0f..72206cf5c6cf 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c | |||
@@ -90,7 +90,7 @@ static void hrtimer_get_softirq_time(struct hrtimer_cpu_base *base) | |||
90 | do { | 90 | do { |
91 | seq = read_seqbegin(&xtime_lock); | 91 | seq = read_seqbegin(&xtime_lock); |
92 | xts = __current_kernel_time(); | 92 | xts = __current_kernel_time(); |
93 | tom = wall_to_monotonic; | 93 | tom = __get_wall_to_monotonic(); |
94 | } while (read_seqretry(&xtime_lock, seq)); | 94 | } while (read_seqretry(&xtime_lock, seq)); |
95 | 95 | ||
96 | xtim = timespec_to_ktime(xts); | 96 | xtim = timespec_to_ktime(xts); |
@@ -144,12 +144,8 @@ struct hrtimer_clock_base *lock_hrtimer_base(const struct hrtimer *timer, | |||
144 | static int hrtimer_get_target(int this_cpu, int pinned) | 144 | static int hrtimer_get_target(int this_cpu, int pinned) |
145 | { | 145 | { |
146 | #ifdef CONFIG_NO_HZ | 146 | #ifdef CONFIG_NO_HZ |
147 | if (!pinned && get_sysctl_timer_migration() && idle_cpu(this_cpu)) { | 147 | if (!pinned && get_sysctl_timer_migration() && idle_cpu(this_cpu)) |
148 | int preferred_cpu = get_nohz_load_balancer(); | 148 | return get_nohz_timer_target(); |
149 | |||
150 | if (preferred_cpu >= 0) | ||
151 | return preferred_cpu; | ||
152 | } | ||
153 | #endif | 149 | #endif |
154 | return this_cpu; | 150 | return this_cpu; |
155 | } | 151 | } |
@@ -612,7 +608,7 @@ static int hrtimer_reprogram(struct hrtimer *timer, | |||
612 | static void retrigger_next_event(void *arg) | 608 | static void retrigger_next_event(void *arg) |
613 | { | 609 | { |
614 | struct hrtimer_cpu_base *base; | 610 | struct hrtimer_cpu_base *base; |
615 | struct timespec realtime_offset; | 611 | struct timespec realtime_offset, wtm; |
616 | unsigned long seq; | 612 | unsigned long seq; |
617 | 613 | ||
618 | if (!hrtimer_hres_active()) | 614 | if (!hrtimer_hres_active()) |
@@ -620,10 +616,9 @@ static void retrigger_next_event(void *arg) | |||
620 | 616 | ||
621 | do { | 617 | do { |
622 | seq = read_seqbegin(&xtime_lock); | 618 | seq = read_seqbegin(&xtime_lock); |
623 | set_normalized_timespec(&realtime_offset, | 619 | wtm = __get_wall_to_monotonic(); |
624 | -wall_to_monotonic.tv_sec, | ||
625 | -wall_to_monotonic.tv_nsec); | ||
626 | } while (read_seqretry(&xtime_lock, seq)); | 620 | } while (read_seqretry(&xtime_lock, seq)); |
621 | set_normalized_timespec(&realtime_offset, -wtm.tv_sec, -wtm.tv_nsec); | ||
627 | 622 | ||
628 | base = &__get_cpu_var(hrtimer_bases); | 623 | base = &__get_cpu_var(hrtimer_bases); |
629 | 624 | ||
@@ -936,6 +931,7 @@ static inline int | |||
936 | remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base) | 931 | remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base) |
937 | { | 932 | { |
938 | if (hrtimer_is_queued(timer)) { | 933 | if (hrtimer_is_queued(timer)) { |
934 | unsigned long state; | ||
939 | int reprogram; | 935 | int reprogram; |
940 | 936 | ||
941 | /* | 937 | /* |
@@ -949,8 +945,13 @@ remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base) | |||
949 | debug_deactivate(timer); | 945 | debug_deactivate(timer); |
950 | timer_stats_hrtimer_clear_start_info(timer); | 946 | timer_stats_hrtimer_clear_start_info(timer); |
951 | reprogram = base->cpu_base == &__get_cpu_var(hrtimer_bases); | 947 | reprogram = base->cpu_base == &__get_cpu_var(hrtimer_bases); |
952 | __remove_hrtimer(timer, base, HRTIMER_STATE_INACTIVE, | 948 | /* |
953 | reprogram); | 949 | * We must preserve the CALLBACK state flag here, |
950 | * otherwise we could move the timer base in | ||
951 | * switch_hrtimer_base. | ||
952 | */ | ||
953 | state = timer->state & HRTIMER_STATE_CALLBACK; | ||
954 | __remove_hrtimer(timer, base, state, reprogram); | ||
954 | return 1; | 955 | return 1; |
955 | } | 956 | } |
956 | return 0; | 957 | return 0; |
@@ -1096,11 +1097,10 @@ EXPORT_SYMBOL_GPL(hrtimer_cancel); | |||
1096 | */ | 1097 | */ |
1097 | ktime_t hrtimer_get_remaining(const struct hrtimer *timer) | 1098 | ktime_t hrtimer_get_remaining(const struct hrtimer *timer) |
1098 | { | 1099 | { |
1099 | struct hrtimer_clock_base *base; | ||
1100 | unsigned long flags; | 1100 | unsigned long flags; |
1101 | ktime_t rem; | 1101 | ktime_t rem; |
1102 | 1102 | ||
1103 | base = lock_hrtimer_base(timer, &flags); | 1103 | lock_hrtimer_base(timer, &flags); |
1104 | rem = hrtimer_expires_remaining(timer); | 1104 | rem = hrtimer_expires_remaining(timer); |
1105 | unlock_hrtimer_base(timer, &flags); | 1105 | unlock_hrtimer_base(timer, &flags); |
1106 | 1106 | ||
@@ -1237,6 +1237,9 @@ static void __run_hrtimer(struct hrtimer *timer, ktime_t *now) | |||
1237 | BUG_ON(timer->state != HRTIMER_STATE_CALLBACK); | 1237 | BUG_ON(timer->state != HRTIMER_STATE_CALLBACK); |
1238 | enqueue_hrtimer(timer, base); | 1238 | enqueue_hrtimer(timer, base); |
1239 | } | 1239 | } |
1240 | |||
1241 | WARN_ON_ONCE(!(timer->state & HRTIMER_STATE_CALLBACK)); | ||
1242 | |||
1240 | timer->state &= ~HRTIMER_STATE_CALLBACK; | 1243 | timer->state &= ~HRTIMER_STATE_CALLBACK; |
1241 | } | 1244 | } |
1242 | 1245 | ||
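The remove_hrtimer() change is the subtle part of the hrtimer hunks: when a queued timer is taken off its base, only the HRTIMER_STATE_CALLBACK bit is carried over instead of forcing the state straight to INACTIVE, so a handler that is still running stays visible to switch_hrtimer_base() and the timer cannot be migrated underneath it (the new WARN_ON_ONCE in __run_hrtimer checks the same invariant). A minimal user-space sketch of just that bit manipulation; the flag values mirror the kernel's defines only for illustration:

#include <stdio.h>

#define HRTIMER_STATE_INACTIVE	0x00UL
#define HRTIMER_STATE_ENQUEUED	0x01UL
#define HRTIMER_STATE_CALLBACK	0x02UL

int main(void)
{
	/* A queued timer whose callback happens to be running right now. */
	unsigned long timer_state = HRTIMER_STATE_ENQUEUED | HRTIMER_STATE_CALLBACK;

	/* Before the patch the timer was dequeued straight to INACTIVE,
	 * losing the CALLBACK bit. */
	unsigned long old_state = HRTIMER_STATE_INACTIVE;

	/* After the patch only the CALLBACK bit is preserved, which is what
	 * stops switch_hrtimer_base() from moving the timer while its
	 * handler runs. */
	unsigned long new_state = timer_state & HRTIMER_STATE_CALLBACK;

	printf("old dequeue state: %#lx, new dequeue state: %#lx\n",
	       old_state, new_state);
	return 0;
}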
diff --git a/kernel/hung_task.c b/kernel/hung_task.c index 0c642d51aac2..53ead174da2f 100644 --- a/kernel/hung_task.c +++ b/kernel/hung_task.c | |||
@@ -98,7 +98,7 @@ static void check_hung_task(struct task_struct *t, unsigned long timeout) | |||
98 | printk(KERN_ERR "\"echo 0 > /proc/sys/kernel/hung_task_timeout_secs\"" | 98 | printk(KERN_ERR "\"echo 0 > /proc/sys/kernel/hung_task_timeout_secs\"" |
99 | " disables this message.\n"); | 99 | " disables this message.\n"); |
100 | sched_show_task(t); | 100 | sched_show_task(t); |
101 | __debug_show_held_locks(t); | 101 | debug_show_held_locks(t); |
102 | 102 | ||
103 | touch_nmi_watchdog(); | 103 | touch_nmi_watchdog(); |
104 | 104 | ||
@@ -111,7 +111,7 @@ static void check_hung_task(struct task_struct *t, unsigned long timeout) | |||
111 | * periodically exit the critical section and enter a new one. | 111 | * periodically exit the critical section and enter a new one. |
112 | * | 112 | * |
113 | * For preemptible RCU it is sufficient to call rcu_read_unlock in order | 113 | * For preemptible RCU it is sufficient to call rcu_read_unlock in order |
114 | * exit the grace period. For classic RCU, a reschedule is required. | 114 | * to exit the grace period. For classic RCU, a reschedule is required. |
115 | */ | 115 | */ |
116 | static void rcu_lock_break(struct task_struct *g, struct task_struct *t) | 116 | static void rcu_lock_break(struct task_struct *g, struct task_struct *t) |
117 | { | 117 | { |
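The comment fixed above describes why the hung-task scan periodically leaves its RCU read-side critical section: holding it across the whole task list would stretch the grace period (or, for classic RCU, delay a needed reschedule). The same drop-and-retake pattern in a plain user-space analogue, with a pthread mutex standing in for the read-side critical section purely for illustration (build with -pthread):

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;

int main(void)
{
	pthread_mutex_lock(&lock);
	for (int i = 0; i < 100000; i++) {
		/* ... examine item i under the lock ... */

		/* Every so often, briefly leave the critical section so
		 * other parties can make progress, then continue the scan;
		 * rcu_lock_break() plays this role for the task-list walk. */
		if (i % 1024 == 0) {
			pthread_mutex_unlock(&lock);
			pthread_mutex_lock(&lock);
		}
	}
	pthread_mutex_unlock(&lock);
	printf("scan finished\n");
	return 0;
}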
diff --git a/kernel/hw_breakpoint.c b/kernel/hw_breakpoint.c index 7a56b22e0602..2c9120f0afca 100644 --- a/kernel/hw_breakpoint.c +++ b/kernel/hw_breakpoint.c | |||
@@ -41,6 +41,7 @@ | |||
41 | #include <linux/sched.h> | 41 | #include <linux/sched.h> |
42 | #include <linux/init.h> | 42 | #include <linux/init.h> |
43 | #include <linux/slab.h> | 43 | #include <linux/slab.h> |
44 | #include <linux/list.h> | ||
44 | #include <linux/cpu.h> | 45 | #include <linux/cpu.h> |
45 | #include <linux/smp.h> | 46 | #include <linux/smp.h> |
46 | 47 | ||
@@ -62,6 +63,9 @@ static DEFINE_PER_CPU(unsigned int, nr_bp_flexible[TYPE_MAX]); | |||
62 | 63 | ||
63 | static int nr_slots[TYPE_MAX]; | 64 | static int nr_slots[TYPE_MAX]; |
64 | 65 | ||
66 | /* Keep track of the breakpoints attached to tasks */ | ||
67 | static LIST_HEAD(bp_task_head); | ||
68 | |||
65 | static int constraints_initialized; | 69 | static int constraints_initialized; |
66 | 70 | ||
67 | /* Gather the number of total pinned and un-pinned bp in a cpuset */ | 71 | /* Gather the number of total pinned and un-pinned bp in a cpuset */ |
@@ -103,33 +107,21 @@ static unsigned int max_task_bp_pinned(int cpu, enum bp_type_idx type) | |||
103 | return 0; | 107 | return 0; |
104 | } | 108 | } |
105 | 109 | ||
106 | static int task_bp_pinned(struct task_struct *tsk, enum bp_type_idx type) | 110 | /* |
111 | * Count the number of breakpoints of the same type and same task. | ||
112 | * The given event must not be on the list. | ||
113 | */ | ||
114 | static int task_bp_pinned(struct perf_event *bp, enum bp_type_idx type) | ||
107 | { | 115 | { |
108 | struct perf_event_context *ctx = tsk->perf_event_ctxp; | 116 | struct task_struct *tsk = bp->hw.bp_target; |
109 | struct list_head *list; | 117 | struct perf_event *iter; |
110 | struct perf_event *bp; | ||
111 | unsigned long flags; | ||
112 | int count = 0; | 118 | int count = 0; |
113 | 119 | ||
114 | if (WARN_ONCE(!ctx, "No perf context for this task")) | 120 | list_for_each_entry(iter, &bp_task_head, hw.bp_list) { |
115 | return 0; | 121 | if (iter->hw.bp_target == tsk && find_slot_idx(iter) == type) |
116 | 122 | count += hw_breakpoint_weight(iter); | |
117 | list = &ctx->event_list; | ||
118 | |||
119 | raw_spin_lock_irqsave(&ctx->lock, flags); | ||
120 | |||
121 | /* | ||
122 | * The current breakpoint counter is not included in the list | ||
123 | * at the open() callback time | ||
124 | */ | ||
125 | list_for_each_entry(bp, list, event_entry) { | ||
126 | if (bp->attr.type == PERF_TYPE_BREAKPOINT) | ||
127 | if (find_slot_idx(bp) == type) | ||
128 | count += hw_breakpoint_weight(bp); | ||
129 | } | 123 | } |
130 | 124 | ||
131 | raw_spin_unlock_irqrestore(&ctx->lock, flags); | ||
132 | |||
133 | return count; | 125 | return count; |
134 | } | 126 | } |
135 | 127 | ||
@@ -142,14 +134,14 @@ fetch_bp_busy_slots(struct bp_busy_slots *slots, struct perf_event *bp, | |||
142 | enum bp_type_idx type) | 134 | enum bp_type_idx type) |
143 | { | 135 | { |
144 | int cpu = bp->cpu; | 136 | int cpu = bp->cpu; |
145 | struct task_struct *tsk = bp->ctx->task; | 137 | struct task_struct *tsk = bp->hw.bp_target; |
146 | 138 | ||
147 | if (cpu >= 0) { | 139 | if (cpu >= 0) { |
148 | slots->pinned = per_cpu(nr_cpu_bp_pinned[type], cpu); | 140 | slots->pinned = per_cpu(nr_cpu_bp_pinned[type], cpu); |
149 | if (!tsk) | 141 | if (!tsk) |
150 | slots->pinned += max_task_bp_pinned(cpu, type); | 142 | slots->pinned += max_task_bp_pinned(cpu, type); |
151 | else | 143 | else |
152 | slots->pinned += task_bp_pinned(tsk, type); | 144 | slots->pinned += task_bp_pinned(bp, type); |
153 | slots->flexible = per_cpu(nr_bp_flexible[type], cpu); | 145 | slots->flexible = per_cpu(nr_bp_flexible[type], cpu); |
154 | 146 | ||
155 | return; | 147 | return; |
@@ -162,7 +154,7 @@ fetch_bp_busy_slots(struct bp_busy_slots *slots, struct perf_event *bp, | |||
162 | if (!tsk) | 154 | if (!tsk) |
163 | nr += max_task_bp_pinned(cpu, type); | 155 | nr += max_task_bp_pinned(cpu, type); |
164 | else | 156 | else |
165 | nr += task_bp_pinned(tsk, type); | 157 | nr += task_bp_pinned(bp, type); |
166 | 158 | ||
167 | if (nr > slots->pinned) | 159 | if (nr > slots->pinned) |
168 | slots->pinned = nr; | 160 | slots->pinned = nr; |
@@ -188,7 +180,7 @@ fetch_this_slot(struct bp_busy_slots *slots, int weight) | |||
188 | /* | 180 | /* |
189 | * Add a pinned breakpoint for the given task in our constraint table | 181 | * Add a pinned breakpoint for the given task in our constraint table |
190 | */ | 182 | */ |
191 | static void toggle_bp_task_slot(struct task_struct *tsk, int cpu, bool enable, | 183 | static void toggle_bp_task_slot(struct perf_event *bp, int cpu, bool enable, |
192 | enum bp_type_idx type, int weight) | 184 | enum bp_type_idx type, int weight) |
193 | { | 185 | { |
194 | unsigned int *tsk_pinned; | 186 | unsigned int *tsk_pinned; |
@@ -196,10 +188,11 @@ static void toggle_bp_task_slot(struct task_struct *tsk, int cpu, bool enable, | |||
196 | int old_idx = 0; | 188 | int old_idx = 0; |
197 | int idx = 0; | 189 | int idx = 0; |
198 | 190 | ||
199 | old_count = task_bp_pinned(tsk, type); | 191 | old_count = task_bp_pinned(bp, type); |
200 | old_idx = old_count - 1; | 192 | old_idx = old_count - 1; |
201 | idx = old_idx + weight; | 193 | idx = old_idx + weight; |
202 | 194 | ||
195 | /* tsk_pinned[n] is the number of tasks having n breakpoints */ | ||
203 | tsk_pinned = per_cpu(nr_task_bp_pinned[type], cpu); | 196 | tsk_pinned = per_cpu(nr_task_bp_pinned[type], cpu); |
204 | if (enable) { | 197 | if (enable) { |
205 | tsk_pinned[idx]++; | 198 | tsk_pinned[idx]++; |
@@ -220,25 +213,43 @@ toggle_bp_slot(struct perf_event *bp, bool enable, enum bp_type_idx type, | |||
220 | int weight) | 213 | int weight) |
221 | { | 214 | { |
222 | int cpu = bp->cpu; | 215 | int cpu = bp->cpu; |
223 | struct task_struct *tsk = bp->ctx->task; | 216 | struct task_struct *tsk = bp->hw.bp_target; |
217 | |||
218 | /* Pinned counter cpu profiling */ | ||
219 | if (!tsk) { | ||
220 | |||
221 | if (enable) | ||
222 | per_cpu(nr_cpu_bp_pinned[type], bp->cpu) += weight; | ||
223 | else | ||
224 | per_cpu(nr_cpu_bp_pinned[type], bp->cpu) -= weight; | ||
225 | return; | ||
226 | } | ||
224 | 227 | ||
225 | /* Pinned counter task profiling */ | 228 | /* Pinned counter task profiling */ |
226 | if (tsk) { | ||
227 | if (cpu >= 0) { | ||
228 | toggle_bp_task_slot(tsk, cpu, enable, type, weight); | ||
229 | return; | ||
230 | } | ||
231 | 229 | ||
230 | if (!enable) | ||
231 | list_del(&bp->hw.bp_list); | ||
232 | |||
233 | if (cpu >= 0) { | ||
234 | toggle_bp_task_slot(bp, cpu, enable, type, weight); | ||
235 | } else { | ||
232 | for_each_online_cpu(cpu) | 236 | for_each_online_cpu(cpu) |
233 | toggle_bp_task_slot(tsk, cpu, enable, type, weight); | 237 | toggle_bp_task_slot(bp, cpu, enable, type, weight); |
234 | return; | ||
235 | } | 238 | } |
236 | 239 | ||
237 | /* Pinned counter cpu profiling */ | ||
238 | if (enable) | 240 | if (enable) |
239 | per_cpu(nr_cpu_bp_pinned[type], bp->cpu) += weight; | 241 | list_add_tail(&bp->hw.bp_list, &bp_task_head); |
240 | else | 242 | } |
241 | per_cpu(nr_cpu_bp_pinned[type], bp->cpu) -= weight; | 243 | |
244 | /* | ||
245 | * Function to perform processor-specific cleanup during unregistration | ||
246 | */ | ||
247 | __weak void arch_unregister_hw_breakpoint(struct perf_event *bp) | ||
248 | { | ||
249 | /* | ||
250 | * A weak stub function here for those archs that don't define | ||
251 | * it inside arch/.../kernel/hw_breakpoint.c | ||
252 | */ | ||
242 | } | 253 | } |
243 | 254 | ||
244 | /* | 255 | /* |
@@ -301,6 +312,10 @@ static int __reserve_bp_slot(struct perf_event *bp) | |||
301 | weight = hw_breakpoint_weight(bp); | 312 | weight = hw_breakpoint_weight(bp); |
302 | 313 | ||
303 | fetch_bp_busy_slots(&slots, bp, type); | 314 | fetch_bp_busy_slots(&slots, bp, type); |
315 | /* | ||
316 | * Simulate the addition of this breakpoint to the constraints | ||
317 | * and see the result. | ||
318 | */ | ||
304 | fetch_this_slot(&slots, weight); | 319 | fetch_this_slot(&slots, weight); |
305 | 320 | ||
306 | /* Flexible counters need to keep at least one slot */ | 321 | /* Flexible counters need to keep at least one slot */ |
@@ -339,6 +354,7 @@ void release_bp_slot(struct perf_event *bp) | |||
339 | { | 354 | { |
340 | mutex_lock(&nr_bp_mutex); | 355 | mutex_lock(&nr_bp_mutex); |
341 | 356 | ||
357 | arch_unregister_hw_breakpoint(bp); | ||
342 | __release_bp_slot(bp); | 358 | __release_bp_slot(bp); |
343 | 359 | ||
344 | mutex_unlock(&nr_bp_mutex); | 360 | mutex_unlock(&nr_bp_mutex); |
@@ -417,7 +433,7 @@ register_user_hw_breakpoint(struct perf_event_attr *attr, | |||
417 | perf_overflow_handler_t triggered, | 433 | perf_overflow_handler_t triggered, |
418 | struct task_struct *tsk) | 434 | struct task_struct *tsk) |
419 | { | 435 | { |
420 | return perf_event_create_kernel_counter(attr, -1, tsk->pid, triggered); | 436 | return perf_event_create_kernel_counter(attr, -1, tsk, triggered); |
421 | } | 437 | } |
422 | EXPORT_SYMBOL_GPL(register_user_hw_breakpoint); | 438 | EXPORT_SYMBOL_GPL(register_user_hw_breakpoint); |
423 | 439 | ||
@@ -499,7 +515,7 @@ register_wide_hw_breakpoint(struct perf_event_attr *attr, | |||
499 | get_online_cpus(); | 515 | get_online_cpus(); |
500 | for_each_online_cpu(cpu) { | 516 | for_each_online_cpu(cpu) { |
501 | pevent = per_cpu_ptr(cpu_events, cpu); | 517 | pevent = per_cpu_ptr(cpu_events, cpu); |
502 | bp = perf_event_create_kernel_counter(attr, cpu, -1, triggered); | 518 | bp = perf_event_create_kernel_counter(attr, cpu, NULL, triggered); |
503 | 519 | ||
504 | *pevent = bp; | 520 | *pevent = bp; |
505 | 521 | ||
@@ -549,6 +565,61 @@ static struct notifier_block hw_breakpoint_exceptions_nb = { | |||
549 | .priority = 0x7fffffff | 565 | .priority = 0x7fffffff |
550 | }; | 566 | }; |
551 | 567 | ||
568 | static void bp_perf_event_destroy(struct perf_event *event) | ||
569 | { | ||
570 | release_bp_slot(event); | ||
571 | } | ||
572 | |||
573 | static int hw_breakpoint_event_init(struct perf_event *bp) | ||
574 | { | ||
575 | int err; | ||
576 | |||
577 | if (bp->attr.type != PERF_TYPE_BREAKPOINT) | ||
578 | return -ENOENT; | ||
579 | |||
580 | err = register_perf_hw_breakpoint(bp); | ||
581 | if (err) | ||
582 | return err; | ||
583 | |||
584 | bp->destroy = bp_perf_event_destroy; | ||
585 | |||
586 | return 0; | ||
587 | } | ||
588 | |||
589 | static int hw_breakpoint_add(struct perf_event *bp, int flags) | ||
590 | { | ||
591 | if (!(flags & PERF_EF_START)) | ||
592 | bp->hw.state = PERF_HES_STOPPED; | ||
593 | |||
594 | return arch_install_hw_breakpoint(bp); | ||
595 | } | ||
596 | |||
597 | static void hw_breakpoint_del(struct perf_event *bp, int flags) | ||
598 | { | ||
599 | arch_uninstall_hw_breakpoint(bp); | ||
600 | } | ||
601 | |||
602 | static void hw_breakpoint_start(struct perf_event *bp, int flags) | ||
603 | { | ||
604 | bp->hw.state = 0; | ||
605 | } | ||
606 | |||
607 | static void hw_breakpoint_stop(struct perf_event *bp, int flags) | ||
608 | { | ||
609 | bp->hw.state = PERF_HES_STOPPED; | ||
610 | } | ||
611 | |||
612 | static struct pmu perf_breakpoint = { | ||
613 | .task_ctx_nr = perf_sw_context, /* could eventually get its own */ | ||
614 | |||
615 | .event_init = hw_breakpoint_event_init, | ||
616 | .add = hw_breakpoint_add, | ||
617 | .del = hw_breakpoint_del, | ||
618 | .start = hw_breakpoint_start, | ||
619 | .stop = hw_breakpoint_stop, | ||
620 | .read = hw_breakpoint_pmu_read, | ||
621 | }; | ||
622 | |||
552 | static int __init init_hw_breakpoint(void) | 623 | static int __init init_hw_breakpoint(void) |
553 | { | 624 | { |
554 | unsigned int **task_bp_pinned; | 625 | unsigned int **task_bp_pinned; |
@@ -570,6 +641,8 @@ static int __init init_hw_breakpoint(void) | |||
570 | 641 | ||
571 | constraints_initialized = 1; | 642 | constraints_initialized = 1; |
572 | 643 | ||
644 | perf_pmu_register(&perf_breakpoint); | ||
645 | |||
573 | return register_die_notifier(&hw_breakpoint_exceptions_nb); | 646 | return register_die_notifier(&hw_breakpoint_exceptions_nb); |
574 | 647 | ||
575 | err_alloc: | 648 | err_alloc: |
@@ -585,8 +658,3 @@ static int __init init_hw_breakpoint(void) | |||
585 | core_initcall(init_hw_breakpoint); | 658 | core_initcall(init_hw_breakpoint); |
586 | 659 | ||
587 | 660 | ||
588 | struct pmu perf_ops_bp = { | ||
589 | .enable = arch_install_hw_breakpoint, | ||
590 | .disable = arch_uninstall_hw_breakpoint, | ||
591 | .read = hw_breakpoint_pmu_read, | ||
592 | }; | ||
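Two things change in hw_breakpoint.c: per-task breakpoints now live on the global bp_task_head list, so task_bp_pinned() no longer has to walk the perf context under its lock, and the breakpoint PMU is rewritten around the new add/del/start/stop callbacks instead of the removed perf_ops_bp. The constraint accounting itself still revolves around the per-CPU tsk_pinned[] histogram that toggle_bp_task_slot() updates: adding a breakpoint of a given weight moves the owning task from its old bucket to the bucket for its new breakpoint count. A small user-space sketch of that move, with the bucket layout simplified for illustration:

#include <stdio.h>

#define MAX_SLOTS 4

int main(void)
{
	/* One task currently owns one pinned breakpoint on this CPU. */
	unsigned int tsk_pinned[MAX_SLOTS] = { 1, 0, 0, 0 };
	int old_count = 1;		/* breakpoints the task owned before */
	int weight = 1;			/* weight of the breakpoint being added */
	int old_idx = old_count - 1;
	int idx = old_idx + weight;

	/* Enable path, mirroring toggle_bp_task_slot(): bump the new
	 * bucket, then drop the task from the bucket it occupied. */
	tsk_pinned[idx]++;
	if (old_count > 0)
		tsk_pinned[old_idx]--;

	for (int i = 0; i < MAX_SLOTS; i++)
		printf("tsk_pinned[%d] = %u\n", i, tsk_pinned[i]);
	return 0;
}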
diff --git a/kernel/irq/Kconfig b/kernel/irq/Kconfig new file mode 100644 index 000000000000..31d766bf5d2e --- /dev/null +++ b/kernel/irq/Kconfig | |||
@@ -0,0 +1,53 @@ | |||
1 | config HAVE_GENERIC_HARDIRQS | ||
2 | def_bool n | ||
3 | |||
4 | if HAVE_GENERIC_HARDIRQS | ||
5 | menu "IRQ subsystem" | ||
6 | # | ||
7 | # Interrupt subsystem related configuration options | ||
8 | # | ||
9 | config GENERIC_HARDIRQS | ||
10 | def_bool y | ||
11 | |||
12 | config GENERIC_HARDIRQS_NO__DO_IRQ | ||
13 | def_bool y | ||
14 | |||
15 | # Select this to disable the deprecated stuff | ||
16 | config GENERIC_HARDIRQS_NO_DEPRECATED | ||
17 | def_bool n | ||
18 | |||
19 | # Options selectable by the architecture code | ||
20 | config HAVE_SPARSE_IRQ | ||
21 | def_bool n | ||
22 | |||
23 | config GENERIC_IRQ_PROBE | ||
24 | def_bool n | ||
25 | |||
26 | config GENERIC_PENDING_IRQ | ||
27 | def_bool n | ||
28 | |||
29 | config AUTO_IRQ_AFFINITY | ||
30 | def_bool n | ||
31 | |||
32 | config IRQ_PER_CPU | ||
33 | def_bool n | ||
34 | |||
35 | config HARDIRQS_SW_RESEND | ||
36 | def_bool n | ||
37 | |||
38 | config SPARSE_IRQ | ||
39 | bool "Support sparse irq numbering" | ||
40 | depends on HAVE_SPARSE_IRQ | ||
41 | ---help--- | ||
42 | |||
43 | Sparse irq numbering is useful for distro kernels that want | ||
44 | to define a high CONFIG_NR_CPUS value but still want to have | ||
45 | low kernel memory footprint on smaller machines. | ||
46 | |||
47 | ( Sparse irqs can also be beneficial on NUMA boxes, as they spread | ||
48 | out the interrupt descriptors in a more NUMA-friendly way. ) | ||
49 | |||
50 | If you don't know what to do here, say N. | ||
51 | |||
52 | endmenu | ||
53 | endif | ||
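The SPARSE_IRQ help text above is the whole point of the option: with a large NR_IRQS most descriptors never correspond to a wired-up interrupt, so allocating them on demand keeps the memory cost proportional to the interrupts actually in use. The sketch below contrasts the two strategies in plain user-space C; the descriptor size and the on-demand allocator are stand-ins, not the kernel's radix-tree implementation:

#include <stdio.h>
#include <stdlib.h>

#define NR_IRQS 1024

struct demo_desc {
	unsigned int irq;
	unsigned int count;
	char pad[248];			/* stand-in for the rest of irq_desc */
};

/* Dense variant: every descriptor exists from boot. */
static struct demo_desc dense[NR_IRQS];

/* Sparse variant: descriptors are created the first time they are needed. */
static struct demo_desc *sparse[NR_IRQS];

static struct demo_desc *sparse_desc_alloc(unsigned int irq)
{
	if (irq >= NR_IRQS)
		return NULL;
	if (!sparse[irq]) {
		sparse[irq] = calloc(1, sizeof(*sparse[irq]));
		if (sparse[irq])
			sparse[irq]->irq = irq;
	}
	return sparse[irq];
}

int main(void)
{
	/* Only three interrupt lines are actually wired up on this box. */
	unsigned int used[] = { 1, 12, 300 };
	size_t allocated = 0;

	for (size_t i = 0; i < sizeof(used) / sizeof(used[0]); i++)
		sparse_desc_alloc(used[i]);

	for (unsigned int i = 0; i < NR_IRQS; i++)
		if (sparse[i])
			allocated += sizeof(*sparse[i]);

	printf("dense:  %zu bytes reserved up front\n", sizeof(dense));
	printf("sparse: %zu bytes for the IRQs in use\n", allocated);
	return 0;
}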
diff --git a/kernel/irq/Makefile b/kernel/irq/Makefile index 7d047808419d..54329cd7b3ee 100644 --- a/kernel/irq/Makefile +++ b/kernel/irq/Makefile | |||
@@ -1,7 +1,6 @@ | |||
1 | 1 | ||
2 | obj-y := handle.o manage.o spurious.o resend.o chip.o devres.o | 2 | obj-y := irqdesc.o handle.o manage.o spurious.o resend.o chip.o dummychip.o devres.o |
3 | obj-$(CONFIG_GENERIC_IRQ_PROBE) += autoprobe.o | 3 | obj-$(CONFIG_GENERIC_IRQ_PROBE) += autoprobe.o |
4 | obj-$(CONFIG_PROC_FS) += proc.o | 4 | obj-$(CONFIG_PROC_FS) += proc.o |
5 | obj-$(CONFIG_GENERIC_PENDING_IRQ) += migration.o | 5 | obj-$(CONFIG_GENERIC_PENDING_IRQ) += migration.o |
6 | obj-$(CONFIG_NUMA_IRQ_DESC) += numa_migrate.o | ||
7 | obj-$(CONFIG_PM_SLEEP) += pm.o | 6 | obj-$(CONFIG_PM_SLEEP) += pm.o |
diff --git a/kernel/irq/autoprobe.c b/kernel/irq/autoprobe.c index 2295a31ef110..505798f86c36 100644 --- a/kernel/irq/autoprobe.c +++ b/kernel/irq/autoprobe.c | |||
@@ -57,9 +57,10 @@ unsigned long probe_irq_on(void) | |||
57 | * Some chips need to know about probing in | 57 | * Some chips need to know about probing in |
58 | * progress: | 58 | * progress: |
59 | */ | 59 | */ |
60 | if (desc->chip->set_type) | 60 | if (desc->irq_data.chip->irq_set_type) |
61 | desc->chip->set_type(i, IRQ_TYPE_PROBE); | 61 | desc->irq_data.chip->irq_set_type(&desc->irq_data, |
62 | desc->chip->startup(i); | 62 | IRQ_TYPE_PROBE); |
63 | desc->irq_data.chip->irq_startup(&desc->irq_data); | ||
63 | } | 64 | } |
64 | raw_spin_unlock_irq(&desc->lock); | 65 | raw_spin_unlock_irq(&desc->lock); |
65 | } | 66 | } |
@@ -76,7 +77,7 @@ unsigned long probe_irq_on(void) | |||
76 | raw_spin_lock_irq(&desc->lock); | 77 | raw_spin_lock_irq(&desc->lock); |
77 | if (!desc->action && !(desc->status & IRQ_NOPROBE)) { | 78 | if (!desc->action && !(desc->status & IRQ_NOPROBE)) { |
78 | desc->status |= IRQ_AUTODETECT | IRQ_WAITING; | 79 | desc->status |= IRQ_AUTODETECT | IRQ_WAITING; |
79 | if (desc->chip->startup(i)) | 80 | if (desc->irq_data.chip->irq_startup(&desc->irq_data)) |
80 | desc->status |= IRQ_PENDING; | 81 | desc->status |= IRQ_PENDING; |
81 | } | 82 | } |
82 | raw_spin_unlock_irq(&desc->lock); | 83 | raw_spin_unlock_irq(&desc->lock); |
@@ -98,7 +99,7 @@ unsigned long probe_irq_on(void) | |||
98 | /* It triggered already - consider it spurious. */ | 99 | /* It triggered already - consider it spurious. */ |
99 | if (!(status & IRQ_WAITING)) { | 100 | if (!(status & IRQ_WAITING)) { |
100 | desc->status = status & ~IRQ_AUTODETECT; | 101 | desc->status = status & ~IRQ_AUTODETECT; |
101 | desc->chip->shutdown(i); | 102 | desc->irq_data.chip->irq_shutdown(&desc->irq_data); |
102 | } else | 103 | } else |
103 | if (i < 32) | 104 | if (i < 32) |
104 | mask |= 1 << i; | 105 | mask |= 1 << i; |
@@ -137,7 +138,7 @@ unsigned int probe_irq_mask(unsigned long val) | |||
137 | mask |= 1 << i; | 138 | mask |= 1 << i; |
138 | 139 | ||
139 | desc->status = status & ~IRQ_AUTODETECT; | 140 | desc->status = status & ~IRQ_AUTODETECT; |
140 | desc->chip->shutdown(i); | 141 | desc->irq_data.chip->irq_shutdown(&desc->irq_data); |
141 | } | 142 | } |
142 | raw_spin_unlock_irq(&desc->lock); | 143 | raw_spin_unlock_irq(&desc->lock); |
143 | } | 144 | } |
@@ -181,7 +182,7 @@ int probe_irq_off(unsigned long val) | |||
181 | nr_of_irqs++; | 182 | nr_of_irqs++; |
182 | } | 183 | } |
183 | desc->status = status & ~IRQ_AUTODETECT; | 184 | desc->status = status & ~IRQ_AUTODETECT; |
184 | desc->chip->shutdown(i); | 185 | desc->irq_data.chip->irq_shutdown(&desc->irq_data); |
185 | } | 186 | } |
186 | raw_spin_unlock_irq(&desc->lock); | 187 | raw_spin_unlock_irq(&desc->lock); |
187 | } | 188 | } |
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index b7091d5ca2f8..baa5c4acad83 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c | |||
@@ -18,108 +18,6 @@ | |||
18 | 18 | ||
19 | #include "internals.h" | 19 | #include "internals.h" |
20 | 20 | ||
21 | static void dynamic_irq_init_x(unsigned int irq, bool keep_chip_data) | ||
22 | { | ||
23 | struct irq_desc *desc; | ||
24 | unsigned long flags; | ||
25 | |||
26 | desc = irq_to_desc(irq); | ||
27 | if (!desc) { | ||
28 | WARN(1, KERN_ERR "Trying to initialize invalid IRQ%d\n", irq); | ||
29 | return; | ||
30 | } | ||
31 | |||
32 | /* Ensure we don't have left over values from a previous use of this irq */ | ||
33 | raw_spin_lock_irqsave(&desc->lock, flags); | ||
34 | desc->status = IRQ_DISABLED; | ||
35 | desc->chip = &no_irq_chip; | ||
36 | desc->handle_irq = handle_bad_irq; | ||
37 | desc->depth = 1; | ||
38 | desc->msi_desc = NULL; | ||
39 | desc->handler_data = NULL; | ||
40 | if (!keep_chip_data) | ||
41 | desc->chip_data = NULL; | ||
42 | desc->action = NULL; | ||
43 | desc->irq_count = 0; | ||
44 | desc->irqs_unhandled = 0; | ||
45 | #ifdef CONFIG_SMP | ||
46 | cpumask_setall(desc->affinity); | ||
47 | #ifdef CONFIG_GENERIC_PENDING_IRQ | ||
48 | cpumask_clear(desc->pending_mask); | ||
49 | #endif | ||
50 | #endif | ||
51 | raw_spin_unlock_irqrestore(&desc->lock, flags); | ||
52 | } | ||
53 | |||
54 | /** | ||
55 | * dynamic_irq_init - initialize a dynamically allocated irq | ||
56 | * @irq: irq number to initialize | ||
57 | */ | ||
58 | void dynamic_irq_init(unsigned int irq) | ||
59 | { | ||
60 | dynamic_irq_init_x(irq, false); | ||
61 | } | ||
62 | |||
63 | /** | ||
64 | * dynamic_irq_init_keep_chip_data - initialize a dynamically allocated irq | ||
65 | * @irq: irq number to initialize | ||
66 | * | ||
67 | * does not set irq_to_desc(irq)->chip_data to NULL | ||
68 | */ | ||
69 | void dynamic_irq_init_keep_chip_data(unsigned int irq) | ||
70 | { | ||
71 | dynamic_irq_init_x(irq, true); | ||
72 | } | ||
73 | |||
74 | static void dynamic_irq_cleanup_x(unsigned int irq, bool keep_chip_data) | ||
75 | { | ||
76 | struct irq_desc *desc = irq_to_desc(irq); | ||
77 | unsigned long flags; | ||
78 | |||
79 | if (!desc) { | ||
80 | WARN(1, KERN_ERR "Trying to cleanup invalid IRQ%d\n", irq); | ||
81 | return; | ||
82 | } | ||
83 | |||
84 | raw_spin_lock_irqsave(&desc->lock, flags); | ||
85 | if (desc->action) { | ||
86 | raw_spin_unlock_irqrestore(&desc->lock, flags); | ||
87 | WARN(1, KERN_ERR "Destroying IRQ%d without calling free_irq\n", | ||
88 | irq); | ||
89 | return; | ||
90 | } | ||
91 | desc->msi_desc = NULL; | ||
92 | desc->handler_data = NULL; | ||
93 | if (!keep_chip_data) | ||
94 | desc->chip_data = NULL; | ||
95 | desc->handle_irq = handle_bad_irq; | ||
96 | desc->chip = &no_irq_chip; | ||
97 | desc->name = NULL; | ||
98 | clear_kstat_irqs(desc); | ||
99 | raw_spin_unlock_irqrestore(&desc->lock, flags); | ||
100 | } | ||
101 | |||
102 | /** | ||
103 | * dynamic_irq_cleanup - cleanup a dynamically allocated irq | ||
104 | * @irq: irq number to initialize | ||
105 | */ | ||
106 | void dynamic_irq_cleanup(unsigned int irq) | ||
107 | { | ||
108 | dynamic_irq_cleanup_x(irq, false); | ||
109 | } | ||
110 | |||
111 | /** | ||
112 | * dynamic_irq_cleanup_keep_chip_data - cleanup a dynamically allocated irq | ||
113 | * @irq: irq number to initialize | ||
114 | * | ||
115 | * does not set irq_to_desc(irq)->chip_data to NULL | ||
116 | */ | ||
117 | void dynamic_irq_cleanup_keep_chip_data(unsigned int irq) | ||
118 | { | ||
119 | dynamic_irq_cleanup_x(irq, true); | ||
120 | } | ||
121 | |||
122 | |||
123 | /** | 21 | /** |
124 | * set_irq_chip - set the irq chip for an irq | 22 | * set_irq_chip - set the irq chip for an irq |
125 | * @irq: irq number | 23 | * @irq: irq number |
@@ -140,7 +38,7 @@ int set_irq_chip(unsigned int irq, struct irq_chip *chip) | |||
140 | 38 | ||
141 | raw_spin_lock_irqsave(&desc->lock, flags); | 39 | raw_spin_lock_irqsave(&desc->lock, flags); |
142 | irq_chip_set_defaults(chip); | 40 | irq_chip_set_defaults(chip); |
143 | desc->chip = chip; | 41 | desc->irq_data.chip = chip; |
144 | raw_spin_unlock_irqrestore(&desc->lock, flags); | 42 | raw_spin_unlock_irqrestore(&desc->lock, flags); |
145 | 43 | ||
146 | return 0; | 44 | return 0; |
@@ -193,7 +91,7 @@ int set_irq_data(unsigned int irq, void *data) | |||
193 | } | 91 | } |
194 | 92 | ||
195 | raw_spin_lock_irqsave(&desc->lock, flags); | 93 | raw_spin_lock_irqsave(&desc->lock, flags); |
196 | desc->handler_data = data; | 94 | desc->irq_data.handler_data = data; |
197 | raw_spin_unlock_irqrestore(&desc->lock, flags); | 95 | raw_spin_unlock_irqrestore(&desc->lock, flags); |
198 | return 0; | 96 | return 0; |
199 | } | 97 | } |
@@ -218,7 +116,7 @@ int set_irq_msi(unsigned int irq, struct msi_desc *entry) | |||
218 | } | 116 | } |
219 | 117 | ||
220 | raw_spin_lock_irqsave(&desc->lock, flags); | 118 | raw_spin_lock_irqsave(&desc->lock, flags); |
221 | desc->msi_desc = entry; | 119 | desc->irq_data.msi_desc = entry; |
222 | if (entry) | 120 | if (entry) |
223 | entry->irq = irq; | 121 | entry->irq = irq; |
224 | raw_spin_unlock_irqrestore(&desc->lock, flags); | 122 | raw_spin_unlock_irqrestore(&desc->lock, flags); |
@@ -243,19 +141,27 @@ int set_irq_chip_data(unsigned int irq, void *data) | |||
243 | return -EINVAL; | 141 | return -EINVAL; |
244 | } | 142 | } |
245 | 143 | ||
246 | if (!desc->chip) { | 144 | if (!desc->irq_data.chip) { |
247 | printk(KERN_ERR "BUG: bad set_irq_chip_data(IRQ#%d)\n", irq); | 145 | printk(KERN_ERR "BUG: bad set_irq_chip_data(IRQ#%d)\n", irq); |
248 | return -EINVAL; | 146 | return -EINVAL; |
249 | } | 147 | } |
250 | 148 | ||
251 | raw_spin_lock_irqsave(&desc->lock, flags); | 149 | raw_spin_lock_irqsave(&desc->lock, flags); |
252 | desc->chip_data = data; | 150 | desc->irq_data.chip_data = data; |
253 | raw_spin_unlock_irqrestore(&desc->lock, flags); | 151 | raw_spin_unlock_irqrestore(&desc->lock, flags); |
254 | 152 | ||
255 | return 0; | 153 | return 0; |
256 | } | 154 | } |
257 | EXPORT_SYMBOL(set_irq_chip_data); | 155 | EXPORT_SYMBOL(set_irq_chip_data); |
258 | 156 | ||
157 | struct irq_data *irq_get_irq_data(unsigned int irq) | ||
158 | { | ||
159 | struct irq_desc *desc = irq_to_desc(irq); | ||
160 | |||
161 | return desc ? &desc->irq_data : NULL; | ||
162 | } | ||
163 | EXPORT_SYMBOL_GPL(irq_get_irq_data); | ||
164 | |||
259 | /** | 165 | /** |
260 | * set_irq_nested_thread - Set/Reset the IRQ_NESTED_THREAD flag of an irq | 166 | * set_irq_nested_thread - Set/Reset the IRQ_NESTED_THREAD flag of an irq |
261 | * | 167 | * |
@@ -287,93 +193,216 @@ EXPORT_SYMBOL_GPL(set_irq_nested_thread); | |||
287 | /* | 193 | /* |
288 | * default enable function | 194 | * default enable function |
289 | */ | 195 | */ |
290 | static void default_enable(unsigned int irq) | 196 | static void default_enable(struct irq_data *data) |
291 | { | 197 | { |
292 | struct irq_desc *desc = irq_to_desc(irq); | 198 | struct irq_desc *desc = irq_data_to_desc(data); |
293 | 199 | ||
294 | desc->chip->unmask(irq); | 200 | desc->irq_data.chip->irq_unmask(&desc->irq_data); |
295 | desc->status &= ~IRQ_MASKED; | 201 | desc->status &= ~IRQ_MASKED; |
296 | } | 202 | } |
297 | 203 | ||
298 | /* | 204 | /* |
299 | * default disable function | 205 | * default disable function |
300 | */ | 206 | */ |
301 | static void default_disable(unsigned int irq) | 207 | static void default_disable(struct irq_data *data) |
302 | { | 208 | { |
303 | } | 209 | } |
304 | 210 | ||
305 | /* | 211 | /* |
306 | * default startup function | 212 | * default startup function |
307 | */ | 213 | */ |
308 | static unsigned int default_startup(unsigned int irq) | 214 | static unsigned int default_startup(struct irq_data *data) |
309 | { | 215 | { |
310 | struct irq_desc *desc = irq_to_desc(irq); | 216 | struct irq_desc *desc = irq_data_to_desc(data); |
311 | 217 | ||
312 | desc->chip->enable(irq); | 218 | desc->irq_data.chip->irq_enable(data); |
313 | return 0; | 219 | return 0; |
314 | } | 220 | } |
315 | 221 | ||
316 | /* | 222 | /* |
317 | * default shutdown function | 223 | * default shutdown function |
318 | */ | 224 | */ |
319 | static void default_shutdown(unsigned int irq) | 225 | static void default_shutdown(struct irq_data *data) |
320 | { | 226 | { |
321 | struct irq_desc *desc = irq_to_desc(irq); | 227 | struct irq_desc *desc = irq_data_to_desc(data); |
322 | 228 | ||
323 | desc->chip->mask(irq); | 229 | desc->irq_data.chip->irq_mask(&desc->irq_data); |
324 | desc->status |= IRQ_MASKED; | 230 | desc->status |= IRQ_MASKED; |
325 | } | 231 | } |
326 | 232 | ||
233 | #ifndef CONFIG_GENERIC_HARDIRQS_NO_DEPRECATED | ||
234 | /* Temporary migration helpers */ | ||
235 | static void compat_irq_mask(struct irq_data *data) | ||
236 | { | ||
237 | data->chip->mask(data->irq); | ||
238 | } | ||
239 | |||
240 | static void compat_irq_unmask(struct irq_data *data) | ||
241 | { | ||
242 | data->chip->unmask(data->irq); | ||
243 | } | ||
244 | |||
245 | static void compat_irq_ack(struct irq_data *data) | ||
246 | { | ||
247 | data->chip->ack(data->irq); | ||
248 | } | ||
249 | |||
250 | static void compat_irq_mask_ack(struct irq_data *data) | ||
251 | { | ||
252 | data->chip->mask_ack(data->irq); | ||
253 | } | ||
254 | |||
255 | static void compat_irq_eoi(struct irq_data *data) | ||
256 | { | ||
257 | data->chip->eoi(data->irq); | ||
258 | } | ||
259 | |||
260 | static void compat_irq_enable(struct irq_data *data) | ||
261 | { | ||
262 | data->chip->enable(data->irq); | ||
263 | } | ||
264 | |||
265 | static void compat_irq_disable(struct irq_data *data) | ||
266 | { | ||
267 | data->chip->disable(data->irq); | ||
268 | } | ||
269 | |||
270 | static void compat_irq_shutdown(struct irq_data *data) | ||
271 | { | ||
272 | data->chip->shutdown(data->irq); | ||
273 | } | ||
274 | |||
275 | static unsigned int compat_irq_startup(struct irq_data *data) | ||
276 | { | ||
277 | return data->chip->startup(data->irq); | ||
278 | } | ||
279 | |||
280 | static int compat_irq_set_affinity(struct irq_data *data, | ||
281 | const struct cpumask *dest, bool force) | ||
282 | { | ||
283 | return data->chip->set_affinity(data->irq, dest); | ||
284 | } | ||
285 | |||
286 | static int compat_irq_set_type(struct irq_data *data, unsigned int type) | ||
287 | { | ||
288 | return data->chip->set_type(data->irq, type); | ||
289 | } | ||
290 | |||
291 | static int compat_irq_set_wake(struct irq_data *data, unsigned int on) | ||
292 | { | ||
293 | return data->chip->set_wake(data->irq, on); | ||
294 | } | ||
295 | |||
296 | static int compat_irq_retrigger(struct irq_data *data) | ||
297 | { | ||
298 | return data->chip->retrigger(data->irq); | ||
299 | } | ||
300 | |||
301 | static void compat_bus_lock(struct irq_data *data) | ||
302 | { | ||
303 | data->chip->bus_lock(data->irq); | ||
304 | } | ||
305 | |||
306 | static void compat_bus_sync_unlock(struct irq_data *data) | ||
307 | { | ||
308 | data->chip->bus_sync_unlock(data->irq); | ||
309 | } | ||
310 | #endif | ||
311 | |||
327 | /* | 312 | /* |
328 | * Fixup enable/disable function pointers | 313 | * Fixup enable/disable function pointers |
329 | */ | 314 | */ |
330 | void irq_chip_set_defaults(struct irq_chip *chip) | 315 | void irq_chip_set_defaults(struct irq_chip *chip) |
331 | { | 316 | { |
332 | if (!chip->enable) | 317 | #ifndef CONFIG_GENERIC_HARDIRQS_NO_DEPRECATED |
333 | chip->enable = default_enable; | ||
334 | if (!chip->disable) | ||
335 | chip->disable = default_disable; | ||
336 | if (!chip->startup) | ||
337 | chip->startup = default_startup; | ||
338 | /* | 318 | /* |
339 | * We use chip->disable, when the user provided its own. When | 319 | * Compat fixup functions need to be before we set the |
340 | * we have default_disable set for chip->disable, then we need | 320 | * defaults for enable/disable/startup/shutdown |
321 | */ | ||
322 | if (chip->enable) | ||
323 | chip->irq_enable = compat_irq_enable; | ||
324 | if (chip->disable) | ||
325 | chip->irq_disable = compat_irq_disable; | ||
326 | if (chip->shutdown) | ||
327 | chip->irq_shutdown = compat_irq_shutdown; | ||
328 | if (chip->startup) | ||
329 | chip->irq_startup = compat_irq_startup; | ||
330 | #endif | ||
331 | /* | ||
332 | * The real defaults | ||
333 | */ | ||
334 | if (!chip->irq_enable) | ||
335 | chip->irq_enable = default_enable; | ||
336 | if (!chip->irq_disable) | ||
337 | chip->irq_disable = default_disable; | ||
338 | if (!chip->irq_startup) | ||
339 | chip->irq_startup = default_startup; | ||
340 | /* | ||
341 | * We use chip->irq_disable, when the user provided its own. When | ||
342 | * we have default_disable set for chip->irq_disable, then we need | ||
341 | * to use default_shutdown, otherwise the irq line is not | 343 | * to use default_shutdown, otherwise the irq line is not |
342 | * disabled on free_irq(): | 344 | * disabled on free_irq(): |
343 | */ | 345 | */ |
344 | if (!chip->shutdown) | 346 | if (!chip->irq_shutdown) |
345 | chip->shutdown = chip->disable != default_disable ? | 347 | chip->irq_shutdown = chip->irq_disable != default_disable ? |
346 | chip->disable : default_shutdown; | 348 | chip->irq_disable : default_shutdown; |
347 | if (!chip->name) | 349 | |
348 | chip->name = chip->typename; | 350 | #ifndef CONFIG_GENERIC_HARDIRQS_NO_DEPRECATED |
349 | if (!chip->end) | 351 | if (!chip->end) |
350 | chip->end = dummy_irq_chip.end; | 352 | chip->end = dummy_irq_chip.end; |
353 | |||
354 | /* | ||
355 | * Now fix up the remaining compat handlers | ||
356 | */ | ||
357 | if (chip->bus_lock) | ||
358 | chip->irq_bus_lock = compat_bus_lock; | ||
359 | if (chip->bus_sync_unlock) | ||
360 | chip->irq_bus_sync_unlock = compat_bus_sync_unlock; | ||
361 | if (chip->mask) | ||
362 | chip->irq_mask = compat_irq_mask; | ||
363 | if (chip->unmask) | ||
364 | chip->irq_unmask = compat_irq_unmask; | ||
365 | if (chip->ack) | ||
366 | chip->irq_ack = compat_irq_ack; | ||
367 | if (chip->mask_ack) | ||
368 | chip->irq_mask_ack = compat_irq_mask_ack; | ||
369 | if (chip->eoi) | ||
370 | chip->irq_eoi = compat_irq_eoi; | ||
371 | if (chip->set_affinity) | ||
372 | chip->irq_set_affinity = compat_irq_set_affinity; | ||
373 | if (chip->set_type) | ||
374 | chip->irq_set_type = compat_irq_set_type; | ||
375 | if (chip->set_wake) | ||
376 | chip->irq_set_wake = compat_irq_set_wake; | ||
377 | if (chip->retrigger) | ||
378 | chip->irq_retrigger = compat_irq_retrigger; | ||
379 | #endif | ||
351 | } | 380 | } |
352 | 381 | ||
353 | static inline void mask_ack_irq(struct irq_desc *desc, int irq) | 382 | static inline void mask_ack_irq(struct irq_desc *desc) |
354 | { | 383 | { |
355 | if (desc->chip->mask_ack) | 384 | if (desc->irq_data.chip->irq_mask_ack) |
356 | desc->chip->mask_ack(irq); | 385 | desc->irq_data.chip->irq_mask_ack(&desc->irq_data); |
357 | else { | 386 | else { |
358 | desc->chip->mask(irq); | 387 | desc->irq_data.chip->irq_mask(&desc->irq_data); |
359 | if (desc->chip->ack) | 388 | if (desc->irq_data.chip->irq_ack) |
360 | desc->chip->ack(irq); | 389 | desc->irq_data.chip->irq_ack(&desc->irq_data); |
361 | } | 390 | } |
362 | desc->status |= IRQ_MASKED; | 391 | desc->status |= IRQ_MASKED; |
363 | } | 392 | } |
364 | 393 | ||
365 | static inline void mask_irq(struct irq_desc *desc, int irq) | 394 | static inline void mask_irq(struct irq_desc *desc) |
366 | { | 395 | { |
367 | if (desc->chip->mask) { | 396 | if (desc->irq_data.chip->irq_mask) { |
368 | desc->chip->mask(irq); | 397 | desc->irq_data.chip->irq_mask(&desc->irq_data); |
369 | desc->status |= IRQ_MASKED; | 398 | desc->status |= IRQ_MASKED; |
370 | } | 399 | } |
371 | } | 400 | } |
372 | 401 | ||
373 | static inline void unmask_irq(struct irq_desc *desc, int irq) | 402 | static inline void unmask_irq(struct irq_desc *desc) |
374 | { | 403 | { |
375 | if (desc->chip->unmask) { | 404 | if (desc->irq_data.chip->irq_unmask) { |
376 | desc->chip->unmask(irq); | 405 | desc->irq_data.chip->irq_unmask(&desc->irq_data); |
377 | desc->status &= ~IRQ_MASKED; | 406 | desc->status &= ~IRQ_MASKED; |
378 | } | 407 | } |
379 | } | 408 | } |
@@ -476,7 +505,7 @@ handle_level_irq(unsigned int irq, struct irq_desc *desc) | |||
476 | irqreturn_t action_ret; | 505 | irqreturn_t action_ret; |
477 | 506 | ||
478 | raw_spin_lock(&desc->lock); | 507 | raw_spin_lock(&desc->lock); |
479 | mask_ack_irq(desc, irq); | 508 | mask_ack_irq(desc); |
480 | 509 | ||
481 | if (unlikely(desc->status & IRQ_INPROGRESS)) | 510 | if (unlikely(desc->status & IRQ_INPROGRESS)) |
482 | goto out_unlock; | 511 | goto out_unlock; |
@@ -502,7 +531,7 @@ handle_level_irq(unsigned int irq, struct irq_desc *desc) | |||
502 | desc->status &= ~IRQ_INPROGRESS; | 531 | desc->status &= ~IRQ_INPROGRESS; |
503 | 532 | ||
504 | if (!(desc->status & (IRQ_DISABLED | IRQ_ONESHOT))) | 533 | if (!(desc->status & (IRQ_DISABLED | IRQ_ONESHOT))) |
505 | unmask_irq(desc, irq); | 534 | unmask_irq(desc); |
506 | out_unlock: | 535 | out_unlock: |
507 | raw_spin_unlock(&desc->lock); | 536 | raw_spin_unlock(&desc->lock); |
508 | } | 537 | } |
@@ -539,7 +568,7 @@ handle_fasteoi_irq(unsigned int irq, struct irq_desc *desc) | |||
539 | action = desc->action; | 568 | action = desc->action; |
540 | if (unlikely(!action || (desc->status & IRQ_DISABLED))) { | 569 | if (unlikely(!action || (desc->status & IRQ_DISABLED))) { |
541 | desc->status |= IRQ_PENDING; | 570 | desc->status |= IRQ_PENDING; |
542 | mask_irq(desc, irq); | 571 | mask_irq(desc); |
543 | goto out; | 572 | goto out; |
544 | } | 573 | } |
545 | 574 | ||
@@ -554,7 +583,7 @@ handle_fasteoi_irq(unsigned int irq, struct irq_desc *desc) | |||
554 | raw_spin_lock(&desc->lock); | 583 | raw_spin_lock(&desc->lock); |
555 | desc->status &= ~IRQ_INPROGRESS; | 584 | desc->status &= ~IRQ_INPROGRESS; |
556 | out: | 585 | out: |
557 | desc->chip->eoi(irq); | 586 | desc->irq_data.chip->irq_eoi(&desc->irq_data); |
558 | 587 | ||
559 | raw_spin_unlock(&desc->lock); | 588 | raw_spin_unlock(&desc->lock); |
560 | } | 589 | } |
@@ -590,14 +619,13 @@ handle_edge_irq(unsigned int irq, struct irq_desc *desc) | |||
590 | if (unlikely((desc->status & (IRQ_INPROGRESS | IRQ_DISABLED)) || | 619 | if (unlikely((desc->status & (IRQ_INPROGRESS | IRQ_DISABLED)) || |
591 | !desc->action)) { | 620 | !desc->action)) { |
592 | desc->status |= (IRQ_PENDING | IRQ_MASKED); | 621 | desc->status |= (IRQ_PENDING | IRQ_MASKED); |
593 | mask_ack_irq(desc, irq); | 622 | mask_ack_irq(desc); |
594 | goto out_unlock; | 623 | goto out_unlock; |
595 | } | 624 | } |
596 | kstat_incr_irqs_this_cpu(irq, desc); | 625 | kstat_incr_irqs_this_cpu(irq, desc); |
597 | 626 | ||
598 | /* Start handling the irq */ | 627 | /* Start handling the irq */ |
599 | if (desc->chip->ack) | 628 | desc->irq_data.chip->irq_ack(&desc->irq_data); |
600 | desc->chip->ack(irq); | ||
601 | 629 | ||
602 | /* Mark the IRQ currently in progress.*/ | 630 | /* Mark the IRQ currently in progress.*/ |
603 | desc->status |= IRQ_INPROGRESS; | 631 | desc->status |= IRQ_INPROGRESS; |
@@ -607,7 +635,7 @@ handle_edge_irq(unsigned int irq, struct irq_desc *desc) | |||
607 | irqreturn_t action_ret; | 635 | irqreturn_t action_ret; |
608 | 636 | ||
609 | if (unlikely(!action)) { | 637 | if (unlikely(!action)) { |
610 | mask_irq(desc, irq); | 638 | mask_irq(desc); |
611 | goto out_unlock; | 639 | goto out_unlock; |
612 | } | 640 | } |
613 | 641 | ||
@@ -619,7 +647,7 @@ handle_edge_irq(unsigned int irq, struct irq_desc *desc) | |||
619 | if (unlikely((desc->status & | 647 | if (unlikely((desc->status & |
620 | (IRQ_PENDING | IRQ_MASKED | IRQ_DISABLED)) == | 648 | (IRQ_PENDING | IRQ_MASKED | IRQ_DISABLED)) == |
621 | (IRQ_PENDING | IRQ_MASKED))) { | 649 | (IRQ_PENDING | IRQ_MASKED))) { |
622 | unmask_irq(desc, irq); | 650 | unmask_irq(desc); |
623 | } | 651 | } |
624 | 652 | ||
625 | desc->status &= ~IRQ_PENDING; | 653 | desc->status &= ~IRQ_PENDING; |
@@ -650,15 +678,15 @@ handle_percpu_irq(unsigned int irq, struct irq_desc *desc) | |||
650 | 678 | ||
651 | kstat_incr_irqs_this_cpu(irq, desc); | 679 | kstat_incr_irqs_this_cpu(irq, desc); |
652 | 680 | ||
653 | if (desc->chip->ack) | 681 | if (desc->irq_data.chip->irq_ack) |
654 | desc->chip->ack(irq); | 682 | desc->irq_data.chip->irq_ack(&desc->irq_data); |
655 | 683 | ||
656 | action_ret = handle_IRQ_event(irq, desc->action); | 684 | action_ret = handle_IRQ_event(irq, desc->action); |
657 | if (!noirqdebug) | 685 | if (!noirqdebug) |
658 | note_interrupt(irq, desc, action_ret); | 686 | note_interrupt(irq, desc, action_ret); |
659 | 687 | ||
660 | if (desc->chip->eoi) | 688 | if (desc->irq_data.chip->irq_eoi) |
661 | desc->chip->eoi(irq); | 689 | desc->irq_data.chip->irq_eoi(&desc->irq_data); |
662 | } | 690 | } |
663 | 691 | ||
664 | void | 692 | void |
@@ -676,7 +704,7 @@ __set_irq_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained, | |||
676 | 704 | ||
677 | if (!handle) | 705 | if (!handle) |
678 | handle = handle_bad_irq; | 706 | handle = handle_bad_irq; |
679 | else if (desc->chip == &no_irq_chip) { | 707 | else if (desc->irq_data.chip == &no_irq_chip) { |
680 | printk(KERN_WARNING "Trying to install %sinterrupt handler " | 708 | printk(KERN_WARNING "Trying to install %sinterrupt handler " |
681 | "for IRQ%d\n", is_chained ? "chained " : "", irq); | 709 | "for IRQ%d\n", is_chained ? "chained " : "", irq); |
682 | /* | 710 | /* |
@@ -686,16 +714,16 @@ __set_irq_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained, | |||
686 | * prevent us from setting up the interrupt at all. Switch it to | 714 | * prevent us from setting up the interrupt at all. Switch it to |
687 | * dummy_irq_chip for easy transition. | 715 | * dummy_irq_chip for easy transition. |
688 | */ | 716 | */ |
689 | desc->chip = &dummy_irq_chip; | 717 | desc->irq_data.chip = &dummy_irq_chip; |
690 | } | 718 | } |
691 | 719 | ||
692 | chip_bus_lock(irq, desc); | 720 | chip_bus_lock(desc); |
693 | raw_spin_lock_irqsave(&desc->lock, flags); | 721 | raw_spin_lock_irqsave(&desc->lock, flags); |
694 | 722 | ||
695 | /* Uninstall? */ | 723 | /* Uninstall? */ |
696 | if (handle == handle_bad_irq) { | 724 | if (handle == handle_bad_irq) { |
697 | if (desc->chip != &no_irq_chip) | 725 | if (desc->irq_data.chip != &no_irq_chip) |
698 | mask_ack_irq(desc, irq); | 726 | mask_ack_irq(desc); |
699 | desc->status |= IRQ_DISABLED; | 727 | desc->status |= IRQ_DISABLED; |
700 | desc->depth = 1; | 728 | desc->depth = 1; |
701 | } | 729 | } |
@@ -706,10 +734,10 @@ __set_irq_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained, | |||
706 | desc->status &= ~IRQ_DISABLED; | 734 | desc->status &= ~IRQ_DISABLED; |
707 | desc->status |= IRQ_NOREQUEST | IRQ_NOPROBE; | 735 | desc->status |= IRQ_NOREQUEST | IRQ_NOPROBE; |
708 | desc->depth = 0; | 736 | desc->depth = 0; |
709 | desc->chip->startup(irq); | 737 | desc->irq_data.chip->irq_startup(&desc->irq_data); |
710 | } | 738 | } |
711 | raw_spin_unlock_irqrestore(&desc->lock, flags); | 739 | raw_spin_unlock_irqrestore(&desc->lock, flags); |
712 | chip_bus_sync_unlock(irq, desc); | 740 | chip_bus_sync_unlock(desc); |
713 | } | 741 | } |
714 | EXPORT_SYMBOL_GPL(__set_irq_handler); | 742 | EXPORT_SYMBOL_GPL(__set_irq_handler); |
715 | 743 | ||
@@ -729,32 +757,20 @@ set_irq_chip_and_handler_name(unsigned int irq, struct irq_chip *chip, | |||
729 | __set_irq_handler(irq, handle, 0, name); | 757 | __set_irq_handler(irq, handle, 0, name); |
730 | } | 758 | } |
731 | 759 | ||
732 | void set_irq_noprobe(unsigned int irq) | 760 | void irq_modify_status(unsigned int irq, unsigned long clr, unsigned long set) |
733 | { | 761 | { |
734 | struct irq_desc *desc = irq_to_desc(irq); | 762 | struct irq_desc *desc = irq_to_desc(irq); |
735 | unsigned long flags; | 763 | unsigned long flags; |
736 | 764 | ||
737 | if (!desc) { | 765 | if (!desc) |
738 | printk(KERN_ERR "Trying to mark IRQ%d non-probeable\n", irq); | ||
739 | return; | 766 | return; |
740 | } | ||
741 | |||
742 | raw_spin_lock_irqsave(&desc->lock, flags); | ||
743 | desc->status |= IRQ_NOPROBE; | ||
744 | raw_spin_unlock_irqrestore(&desc->lock, flags); | ||
745 | } | ||
746 | |||
747 | void set_irq_probe(unsigned int irq) | ||
748 | { | ||
749 | struct irq_desc *desc = irq_to_desc(irq); | ||
750 | unsigned long flags; | ||
751 | 767 | ||
752 | if (!desc) { | 768 | /* Sanitize flags */ |
753 | printk(KERN_ERR "Trying to mark IRQ%d probeable\n", irq); | 769 | set &= IRQF_MODIFY_MASK; |
754 | return; | 770 | clr &= IRQF_MODIFY_MASK; |
755 | } | ||
756 | 771 | ||
757 | raw_spin_lock_irqsave(&desc->lock, flags); | 772 | raw_spin_lock_irqsave(&desc->lock, flags); |
758 | desc->status &= ~IRQ_NOPROBE; | 773 | desc->status &= ~clr; |
774 | desc->status |= set; | ||
759 | raw_spin_unlock_irqrestore(&desc->lock, flags); | 775 | raw_spin_unlock_irqrestore(&desc->lock, flags); |
760 | } | 776 | } |
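The block of compat_irq_* helpers above exists purely for the migration period: irq_chip_set_defaults() notices a chip that still fills in the old irq-number-based callbacks and routes the new struct irq_data based hooks through thin wrappers, so the core code can call irq_mask(), irq_unmask() and friends unconditionally. A self-contained sketch of that adapter pattern, using simplified stand-in types rather than the kernel's:

#include <stdio.h>

struct irq_data;

struct irq_chip {
	void (*mask)(unsigned int irq);			/* old-style hook */
	void (*irq_mask)(struct irq_data *data);	/* new-style hook */
};

struct irq_data {
	unsigned int irq;
	struct irq_chip *chip;
};

static void legacy_mask(unsigned int irq)
{
	printf("legacy chip masks irq %u\n", irq);
}

/* Mirrors compat_irq_mask(): translate irq_data back into the number
 * the old callback expects. */
static void compat_irq_mask(struct irq_data *data)
{
	data->chip->mask(data->irq);
}

int main(void)
{
	struct irq_chip chip = { .mask = legacy_mask };
	struct irq_data data = { .irq = 17, .chip = &chip };

	/* What irq_chip_set_defaults() does when it finds an old-style chip. */
	if (chip.mask && !chip.irq_mask)
		chip.irq_mask = compat_irq_mask;

	chip.irq_mask(&data);		/* core code only ever sees the new hook */
	return 0;
}

Once every chip has been converted, the wrappers (and the GENERIC_HARDIRQS_NO_DEPRECATED guard around them) can be dropped without touching the core code again.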
diff --git a/kernel/irq/dummychip.c b/kernel/irq/dummychip.c new file mode 100644 index 000000000000..20dc5474947e --- /dev/null +++ b/kernel/irq/dummychip.c | |||
@@ -0,0 +1,68 @@ | |||
1 | /* | ||
2 | * Copyright (C) 1992, 1998-2006 Linus Torvalds, Ingo Molnar | ||
3 | * Copyright (C) 2005-2006, Thomas Gleixner, Russell King | ||
4 | * | ||
5 | * This file contains the dummy interrupt chip implementation | ||
6 | */ | ||
7 | #include <linux/interrupt.h> | ||
8 | #include <linux/irq.h> | ||
9 | |||
10 | #include "internals.h" | ||
11 | |||
12 | /* | ||
13 | * What should we do if we get a hw irq event on an illegal vector? | ||
14 | * Each architecture has to answer this themself. | ||
15 | */ | ||
16 | static void ack_bad(struct irq_data *data) | ||
17 | { | ||
18 | struct irq_desc *desc = irq_data_to_desc(data); | ||
19 | |||
20 | print_irq_desc(data->irq, desc); | ||
21 | ack_bad_irq(data->irq); | ||
22 | } | ||
23 | |||
24 | /* | ||
25 | * NOP functions | ||
26 | */ | ||
27 | static void noop(struct irq_data *data) { } | ||
28 | |||
29 | static unsigned int noop_ret(struct irq_data *data) | ||
30 | { | ||
31 | return 0; | ||
32 | } | ||
33 | |||
34 | #ifndef CONFIG_GENERIC_HARDIRQS_NO_DEPRECATED | ||
35 | static void compat_noop(unsigned int irq) { } | ||
36 | #define END_INIT .end = compat_noop | ||
37 | #else | ||
38 | #define END_INIT | ||
39 | #endif | ||
40 | |||
41 | /* | ||
42 | * Generic no controller implementation | ||
43 | */ | ||
44 | struct irq_chip no_irq_chip = { | ||
45 | .name = "none", | ||
46 | .irq_startup = noop_ret, | ||
47 | .irq_shutdown = noop, | ||
48 | .irq_enable = noop, | ||
49 | .irq_disable = noop, | ||
50 | .irq_ack = ack_bad, | ||
51 | END_INIT | ||
52 | }; | ||
53 | |||
54 | /* | ||
55 | * Generic dummy implementation which can be used for | ||
56 | * real dumb interrupt sources | ||
57 | */ | ||
58 | struct irq_chip dummy_irq_chip = { | ||
59 | .name = "dummy", | ||
60 | .irq_startup = noop_ret, | ||
61 | .irq_shutdown = noop, | ||
62 | .irq_enable = noop, | ||
63 | .irq_disable = noop, | ||
64 | .irq_ack = noop, | ||
65 | .irq_mask = noop, | ||
66 | .irq_unmask = noop, | ||
67 | END_INIT | ||
68 | }; | ||
diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index 27e5c6911223..e2347eb63306 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c | |||
@@ -11,24 +11,15 @@ | |||
11 | */ | 11 | */ |
12 | 12 | ||
13 | #include <linux/irq.h> | 13 | #include <linux/irq.h> |
14 | #include <linux/sched.h> | ||
15 | #include <linux/slab.h> | ||
16 | #include <linux/module.h> | ||
17 | #include <linux/random.h> | 14 | #include <linux/random.h> |
15 | #include <linux/sched.h> | ||
18 | #include <linux/interrupt.h> | 16 | #include <linux/interrupt.h> |
19 | #include <linux/kernel_stat.h> | 17 | #include <linux/kernel_stat.h> |
20 | #include <linux/rculist.h> | 18 | |
21 | #include <linux/hash.h> | ||
22 | #include <linux/radix-tree.h> | ||
23 | #include <trace/events/irq.h> | 19 | #include <trace/events/irq.h> |
24 | 20 | ||
25 | #include "internals.h" | 21 | #include "internals.h" |
26 | 22 | ||
27 | /* | ||
28 | * lockdep: we want to handle all irq_desc locks as a single lock-class: | ||
29 | */ | ||
30 | struct lock_class_key irq_desc_lock_class; | ||
31 | |||
32 | /** | 23 | /** |
33 | * handle_bad_irq - handle spurious and unhandled irqs | 24 | * handle_bad_irq - handle spurious and unhandled irqs |
34 | * @irq: the interrupt number | 25 | * @irq: the interrupt number |
@@ -43,304 +34,6 @@ void handle_bad_irq(unsigned int irq, struct irq_desc *desc) | |||
43 | ack_bad_irq(irq); | 34 | ack_bad_irq(irq); |
44 | } | 35 | } |
45 | 36 | ||
46 | #if defined(CONFIG_SMP) && defined(CONFIG_GENERIC_HARDIRQS) | ||
47 | static void __init init_irq_default_affinity(void) | ||
48 | { | ||
49 | alloc_cpumask_var(&irq_default_affinity, GFP_NOWAIT); | ||
50 | cpumask_setall(irq_default_affinity); | ||
51 | } | ||
52 | #else | ||
53 | static void __init init_irq_default_affinity(void) | ||
54 | { | ||
55 | } | ||
56 | #endif | ||
57 | |||
58 | /* | ||
59 | * Linux has a controller-independent interrupt architecture. | ||
60 | * Every controller has a 'controller-template', that is used | ||
61 | * by the main code to do the right thing. Each driver-visible | ||
62 | * interrupt source is transparently wired to the appropriate | ||
63 | * controller. Thus drivers need not be aware of the | ||
64 | * interrupt-controller. | ||
65 | * | ||
66 | * The code is designed to be easily extended with new/different | ||
67 | * interrupt controllers, without having to do assembly magic or | ||
68 | * having to touch the generic code. | ||
69 | * | ||
70 | * Controller mappings for all interrupt sources: | ||
71 | */ | ||
72 | int nr_irqs = NR_IRQS; | ||
73 | EXPORT_SYMBOL_GPL(nr_irqs); | ||
74 | |||
75 | #ifdef CONFIG_SPARSE_IRQ | ||
76 | |||
77 | static struct irq_desc irq_desc_init = { | ||
78 | .irq = -1, | ||
79 | .status = IRQ_DISABLED, | ||
80 | .chip = &no_irq_chip, | ||
81 | .handle_irq = handle_bad_irq, | ||
82 | .depth = 1, | ||
83 | .lock = __RAW_SPIN_LOCK_UNLOCKED(irq_desc_init.lock), | ||
84 | }; | ||
85 | |||
86 | void __ref init_kstat_irqs(struct irq_desc *desc, int node, int nr) | ||
87 | { | ||
88 | void *ptr; | ||
89 | |||
90 | ptr = kzalloc_node(nr * sizeof(*desc->kstat_irqs), | ||
91 | GFP_ATOMIC, node); | ||
92 | |||
93 | /* | ||
94 | * don't overwite if can not get new one | ||
95 | * init_copy_kstat_irqs() could still use old one | ||
96 | */ | ||
97 | if (ptr) { | ||
98 | printk(KERN_DEBUG " alloc kstat_irqs on node %d\n", node); | ||
99 | desc->kstat_irqs = ptr; | ||
100 | } | ||
101 | } | ||
102 | |||
103 | static void init_one_irq_desc(int irq, struct irq_desc *desc, int node) | ||
104 | { | ||
105 | memcpy(desc, &irq_desc_init, sizeof(struct irq_desc)); | ||
106 | |||
107 | raw_spin_lock_init(&desc->lock); | ||
108 | desc->irq = irq; | ||
109 | #ifdef CONFIG_SMP | ||
110 | desc->node = node; | ||
111 | #endif | ||
112 | lockdep_set_class(&desc->lock, &irq_desc_lock_class); | ||
113 | init_kstat_irqs(desc, node, nr_cpu_ids); | ||
114 | if (!desc->kstat_irqs) { | ||
115 | printk(KERN_ERR "can not alloc kstat_irqs\n"); | ||
116 | BUG_ON(1); | ||
117 | } | ||
118 | if (!alloc_desc_masks(desc, node, false)) { | ||
119 | printk(KERN_ERR "can not alloc irq_desc cpumasks\n"); | ||
120 | BUG_ON(1); | ||
121 | } | ||
122 | init_desc_masks(desc); | ||
123 | arch_init_chip_data(desc, node); | ||
124 | } | ||
125 | |||
126 | /* | ||
127 | * Protect the sparse_irqs: | ||
128 | */ | ||
129 | DEFINE_RAW_SPINLOCK(sparse_irq_lock); | ||
130 | |||
131 | static RADIX_TREE(irq_desc_tree, GFP_ATOMIC); | ||
132 | |||
133 | static void set_irq_desc(unsigned int irq, struct irq_desc *desc) | ||
134 | { | ||
135 | radix_tree_insert(&irq_desc_tree, irq, desc); | ||
136 | } | ||
137 | |||
138 | struct irq_desc *irq_to_desc(unsigned int irq) | ||
139 | { | ||
140 | return radix_tree_lookup(&irq_desc_tree, irq); | ||
141 | } | ||
142 | |||
143 | void replace_irq_desc(unsigned int irq, struct irq_desc *desc) | ||
144 | { | ||
145 | void **ptr; | ||
146 | |||
147 | ptr = radix_tree_lookup_slot(&irq_desc_tree, irq); | ||
148 | if (ptr) | ||
149 | radix_tree_replace_slot(ptr, desc); | ||
150 | } | ||
151 | |||
152 | static struct irq_desc irq_desc_legacy[NR_IRQS_LEGACY] __cacheline_aligned_in_smp = { | ||
153 | [0 ... NR_IRQS_LEGACY-1] = { | ||
154 | .irq = -1, | ||
155 | .status = IRQ_DISABLED, | ||
156 | .chip = &no_irq_chip, | ||
157 | .handle_irq = handle_bad_irq, | ||
158 | .depth = 1, | ||
159 | .lock = __RAW_SPIN_LOCK_UNLOCKED(irq_desc_init.lock), | ||
160 | } | ||
161 | }; | ||
162 | |||
163 | static unsigned int *kstat_irqs_legacy; | ||
164 | |||
165 | int __init early_irq_init(void) | ||
166 | { | ||
167 | struct irq_desc *desc; | ||
168 | int legacy_count; | ||
169 | int node; | ||
170 | int i; | ||
171 | |||
172 | init_irq_default_affinity(); | ||
173 | |||
174 | /* initialize nr_irqs based on nr_cpu_ids */ | ||
175 | arch_probe_nr_irqs(); | ||
176 | printk(KERN_INFO "NR_IRQS:%d nr_irqs:%d\n", NR_IRQS, nr_irqs); | ||
177 | |||
178 | desc = irq_desc_legacy; | ||
179 | legacy_count = ARRAY_SIZE(irq_desc_legacy); | ||
180 | node = first_online_node; | ||
181 | |||
182 | /* allocate based on nr_cpu_ids */ | ||
183 | kstat_irqs_legacy = kzalloc_node(NR_IRQS_LEGACY * nr_cpu_ids * | ||
184 | sizeof(int), GFP_NOWAIT, node); | ||
185 | |||
186 | for (i = 0; i < legacy_count; i++) { | ||
187 | desc[i].irq = i; | ||
188 | #ifdef CONFIG_SMP | ||
189 | desc[i].node = node; | ||
190 | #endif | ||
191 | desc[i].kstat_irqs = kstat_irqs_legacy + i * nr_cpu_ids; | ||
192 | lockdep_set_class(&desc[i].lock, &irq_desc_lock_class); | ||
193 | alloc_desc_masks(&desc[i], node, true); | ||
194 | init_desc_masks(&desc[i]); | ||
195 | set_irq_desc(i, &desc[i]); | ||
196 | } | ||
197 | |||
198 | return arch_early_irq_init(); | ||
199 | } | ||
200 | |||
201 | struct irq_desc * __ref irq_to_desc_alloc_node(unsigned int irq, int node) | ||
202 | { | ||
203 | struct irq_desc *desc; | ||
204 | unsigned long flags; | ||
205 | |||
206 | if (irq >= nr_irqs) { | ||
207 | WARN(1, "irq (%d) >= nr_irqs (%d) in irq_to_desc_alloc\n", | ||
208 | irq, nr_irqs); | ||
209 | return NULL; | ||
210 | } | ||
211 | |||
212 | desc = irq_to_desc(irq); | ||
213 | if (desc) | ||
214 | return desc; | ||
215 | |||
216 | raw_spin_lock_irqsave(&sparse_irq_lock, flags); | ||
217 | |||
218 | /* We have to check it to avoid races with another CPU */ | ||
219 | desc = irq_to_desc(irq); | ||
220 | if (desc) | ||
221 | goto out_unlock; | ||
222 | |||
223 | desc = kzalloc_node(sizeof(*desc), GFP_ATOMIC, node); | ||
224 | |||
225 | printk(KERN_DEBUG " alloc irq_desc for %d on node %d\n", irq, node); | ||
226 | if (!desc) { | ||
227 | printk(KERN_ERR "can not alloc irq_desc\n"); | ||
228 | BUG_ON(1); | ||
229 | } | ||
230 | init_one_irq_desc(irq, desc, node); | ||
231 | |||
232 | set_irq_desc(irq, desc); | ||
233 | |||
234 | out_unlock: | ||
235 | raw_spin_unlock_irqrestore(&sparse_irq_lock, flags); | ||
236 | |||
237 | return desc; | ||
238 | } | ||
239 | |||
240 | #else /* !CONFIG_SPARSE_IRQ */ | ||
241 | |||
242 | struct irq_desc irq_desc[NR_IRQS] __cacheline_aligned_in_smp = { | ||
243 | [0 ... NR_IRQS-1] = { | ||
244 | .status = IRQ_DISABLED, | ||
245 | .chip = &no_irq_chip, | ||
246 | .handle_irq = handle_bad_irq, | ||
247 | .depth = 1, | ||
248 | .lock = __RAW_SPIN_LOCK_UNLOCKED(irq_desc->lock), | ||
249 | } | ||
250 | }; | ||
251 | |||
252 | static unsigned int kstat_irqs_all[NR_IRQS][NR_CPUS]; | ||
253 | int __init early_irq_init(void) | ||
254 | { | ||
255 | struct irq_desc *desc; | ||
256 | int count; | ||
257 | int i; | ||
258 | |||
259 | init_irq_default_affinity(); | ||
260 | |||
261 | printk(KERN_INFO "NR_IRQS:%d\n", NR_IRQS); | ||
262 | |||
263 | desc = irq_desc; | ||
264 | count = ARRAY_SIZE(irq_desc); | ||
265 | |||
266 | for (i = 0; i < count; i++) { | ||
267 | desc[i].irq = i; | ||
268 | alloc_desc_masks(&desc[i], 0, true); | ||
269 | init_desc_masks(&desc[i]); | ||
270 | desc[i].kstat_irqs = kstat_irqs_all[i]; | ||
271 | } | ||
272 | return arch_early_irq_init(); | ||
273 | } | ||
274 | |||
275 | struct irq_desc *irq_to_desc(unsigned int irq) | ||
276 | { | ||
277 | return (irq < NR_IRQS) ? irq_desc + irq : NULL; | ||
278 | } | ||
279 | |||
280 | struct irq_desc *irq_to_desc_alloc_node(unsigned int irq, int node) | ||
281 | { | ||
282 | return irq_to_desc(irq); | ||
283 | } | ||
284 | #endif /* !CONFIG_SPARSE_IRQ */ | ||
285 | |||
286 | void clear_kstat_irqs(struct irq_desc *desc) | ||
287 | { | ||
288 | memset(desc->kstat_irqs, 0, nr_cpu_ids * sizeof(*(desc->kstat_irqs))); | ||
289 | } | ||
290 | |||
291 | /* | ||
292 | * What should we do if we get a hw irq event on an illegal vector? | ||
293 | * Each architecture has to answer this for itself. | ||
294 | */ | ||
295 | static void ack_bad(unsigned int irq) | ||
296 | { | ||
297 | struct irq_desc *desc = irq_to_desc(irq); | ||
298 | |||
299 | print_irq_desc(irq, desc); | ||
300 | ack_bad_irq(irq); | ||
301 | } | ||
302 | |||
303 | /* | ||
304 | * NOP functions | ||
305 | */ | ||
306 | static void noop(unsigned int irq) | ||
307 | { | ||
308 | } | ||
309 | |||
310 | static unsigned int noop_ret(unsigned int irq) | ||
311 | { | ||
312 | return 0; | ||
313 | } | ||
314 | |||
315 | /* | ||
316 | * Generic no controller implementation | ||
317 | */ | ||
318 | struct irq_chip no_irq_chip = { | ||
319 | .name = "none", | ||
320 | .startup = noop_ret, | ||
321 | .shutdown = noop, | ||
322 | .enable = noop, | ||
323 | .disable = noop, | ||
324 | .ack = ack_bad, | ||
325 | .end = noop, | ||
326 | }; | ||
327 | |||
328 | /* | ||
329 | * Generic dummy implementation which can be used for | ||
330 | * real dumb interrupt sources | ||
331 | */ | ||
332 | struct irq_chip dummy_irq_chip = { | ||
333 | .name = "dummy", | ||
334 | .startup = noop_ret, | ||
335 | .shutdown = noop, | ||
336 | .enable = noop, | ||
337 | .disable = noop, | ||
338 | .ack = noop, | ||
339 | .mask = noop, | ||
340 | .unmask = noop, | ||
341 | .end = noop, | ||
342 | }; | ||
343 | |||
344 | /* | 37 | /* |
345 | * Special, empty irq handler: | 38 | * Special, empty irq handler: |
346 | */ | 39 | */ |
@@ -457,20 +150,20 @@ unsigned int __do_IRQ(unsigned int irq) | |||
457 | /* | 150 | /* |
458 | * No locking required for CPU-local interrupts: | 151 | * No locking required for CPU-local interrupts: |
459 | */ | 152 | */ |
460 | if (desc->chip->ack) | 153 | if (desc->irq_data.chip->ack) |
461 | desc->chip->ack(irq); | 154 | desc->irq_data.chip->ack(irq); |
462 | if (likely(!(desc->status & IRQ_DISABLED))) { | 155 | if (likely(!(desc->status & IRQ_DISABLED))) { |
463 | action_ret = handle_IRQ_event(irq, desc->action); | 156 | action_ret = handle_IRQ_event(irq, desc->action); |
464 | if (!noirqdebug) | 157 | if (!noirqdebug) |
465 | note_interrupt(irq, desc, action_ret); | 158 | note_interrupt(irq, desc, action_ret); |
466 | } | 159 | } |
467 | desc->chip->end(irq); | 160 | desc->irq_data.chip->end(irq); |
468 | return 1; | 161 | return 1; |
469 | } | 162 | } |
470 | 163 | ||
471 | raw_spin_lock(&desc->lock); | 164 | raw_spin_lock(&desc->lock); |
472 | if (desc->chip->ack) | 165 | if (desc->irq_data.chip->ack) |
473 | desc->chip->ack(irq); | 166 | desc->irq_data.chip->ack(irq); |
474 | /* | 167 | /* |
475 | * REPLAY is when Linux resends an IRQ that was dropped earlier | 168 | * REPLAY is when Linux resends an IRQ that was dropped earlier |
476 | * WAITING is used by probe to mark irqs that are being tested | 169 | * WAITING is used by probe to mark irqs that are being tested |
@@ -530,27 +223,9 @@ out: | |||
530 | * The ->end() handler has to deal with interrupts which got | 223 | * The ->end() handler has to deal with interrupts which got |
531 | * disabled while the handler was running. | 224 | * disabled while the handler was running. |
532 | */ | 225 | */ |
533 | desc->chip->end(irq); | 226 | desc->irq_data.chip->end(irq); |
534 | raw_spin_unlock(&desc->lock); | 227 | raw_spin_unlock(&desc->lock); |
535 | 228 | ||
536 | return 1; | 229 | return 1; |
537 | } | 230 | } |
538 | #endif | 231 | #endif |
539 | |||
540 | void early_init_irq_lock_class(void) | ||
541 | { | ||
542 | struct irq_desc *desc; | ||
543 | int i; | ||
544 | |||
545 | for_each_irq_desc(i, desc) { | ||
546 | lockdep_set_class(&desc->lock, &irq_desc_lock_class); | ||
547 | } | ||
548 | } | ||
549 | |||
550 | unsigned int kstat_irqs_cpu(unsigned int irq, int cpu) | ||
551 | { | ||
552 | struct irq_desc *desc = irq_to_desc(irq); | ||
553 | return desc ? desc->kstat_irqs[cpu] : 0; | ||
554 | } | ||
555 | EXPORT_SYMBOL(kstat_irqs_cpu); | ||
556 | |||
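The irq_to_desc_alloc_node() path in the old handle.c above is a textbook check / lock / re-check / publish sequence: the unlocked lookup is only a fast path, and the lookup is repeated under sparse_irq_lock so two CPUs racing for the same irq cannot both insert a descriptor. A minimal, self-contained sketch of the same idiom in plain C (the table, lazy_get_slot() and the pthread locking are invented for this illustration; the kernel code uses the radix tree and a raw spinlock instead):

	#include <pthread.h>
	#include <stdlib.h>

	#define NSLOTS 1024

	static void *table[NSLOTS];
	static pthread_mutex_t table_lock = PTHREAD_MUTEX_INITIALIZER;

	/* Return the object for @idx, allocating it on first use. */
	static void *lazy_get_slot(unsigned int idx)
	{
		void *obj = table[idx];		/* unlocked fast path */

		if (obj)
			return obj;

		pthread_mutex_lock(&table_lock);
		obj = table[idx];		/* re-check: another thread may have won */
		if (!obj) {
			obj = calloc(1, 64);
			table[idx] = obj;	/* publish while still holding the lock */
		}
		pthread_mutex_unlock(&table_lock);
		return obj;
	}

(A strict C11 version would make the unlocked read an atomic load; the kernel side relies on radix_tree_lookup() being safe against concurrent insertion.)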
diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h index c63f3bc88f0b..4571ae7e085a 100644 --- a/kernel/irq/internals.h +++ b/kernel/irq/internals.h | |||
@@ -1,9 +1,12 @@ | |||
1 | /* | 1 | /* |
2 | * IRQ subsystem internal functions and variables: | 2 | * IRQ subsystem internal functions and variables: |
3 | */ | 3 | */ |
4 | #include <linux/irqdesc.h> | ||
4 | 5 | ||
5 | extern int noirqdebug; | 6 | extern int noirqdebug; |
6 | 7 | ||
8 | #define irq_data_to_desc(data) container_of(data, struct irq_desc, irq_data) | ||
9 | |||
7 | /* Set default functions for irq_chip structures: */ | 10 | /* Set default functions for irq_chip structures: */ |
8 | extern void irq_chip_set_defaults(struct irq_chip *chip); | 11 | extern void irq_chip_set_defaults(struct irq_chip *chip); |
9 | 12 | ||
@@ -15,21 +18,19 @@ extern int __irq_set_trigger(struct irq_desc *desc, unsigned int irq, | |||
15 | extern void __disable_irq(struct irq_desc *desc, unsigned int irq, bool susp); | 18 | extern void __disable_irq(struct irq_desc *desc, unsigned int irq, bool susp); |
16 | extern void __enable_irq(struct irq_desc *desc, unsigned int irq, bool resume); | 19 | extern void __enable_irq(struct irq_desc *desc, unsigned int irq, bool resume); |
17 | 20 | ||
18 | extern struct lock_class_key irq_desc_lock_class; | ||
19 | extern void init_kstat_irqs(struct irq_desc *desc, int node, int nr); | 21 | extern void init_kstat_irqs(struct irq_desc *desc, int node, int nr); |
20 | extern void clear_kstat_irqs(struct irq_desc *desc); | ||
21 | extern raw_spinlock_t sparse_irq_lock; | ||
22 | 22 | ||
23 | #ifdef CONFIG_SPARSE_IRQ | 23 | /* Resending of interrupts :*/ |
24 | void replace_irq_desc(unsigned int irq, struct irq_desc *desc); | 24 | void check_irq_resend(struct irq_desc *desc, unsigned int irq); |
25 | #endif | ||
26 | 25 | ||
27 | #ifdef CONFIG_PROC_FS | 26 | #ifdef CONFIG_PROC_FS |
28 | extern void register_irq_proc(unsigned int irq, struct irq_desc *desc); | 27 | extern void register_irq_proc(unsigned int irq, struct irq_desc *desc); |
28 | extern void unregister_irq_proc(unsigned int irq, struct irq_desc *desc); | ||
29 | extern void register_handler_proc(unsigned int irq, struct irqaction *action); | 29 | extern void register_handler_proc(unsigned int irq, struct irqaction *action); |
30 | extern void unregister_handler_proc(unsigned int irq, struct irqaction *action); | 30 | extern void unregister_handler_proc(unsigned int irq, struct irqaction *action); |
31 | #else | 31 | #else |
32 | static inline void register_irq_proc(unsigned int irq, struct irq_desc *desc) { } | 32 | static inline void register_irq_proc(unsigned int irq, struct irq_desc *desc) { } |
33 | static inline void unregister_irq_proc(unsigned int irq, struct irq_desc *desc) { } | ||
33 | static inline void register_handler_proc(unsigned int irq, | 34 | static inline void register_handler_proc(unsigned int irq, |
34 | struct irqaction *action) { } | 35 | struct irqaction *action) { } |
35 | static inline void unregister_handler_proc(unsigned int irq, | 36 | static inline void unregister_handler_proc(unsigned int irq, |
@@ -40,17 +41,27 @@ extern int irq_select_affinity_usr(unsigned int irq); | |||
40 | 41 | ||
41 | extern void irq_set_thread_affinity(struct irq_desc *desc); | 42 | extern void irq_set_thread_affinity(struct irq_desc *desc); |
42 | 43 | ||
44 | #ifndef CONFIG_GENERIC_HARDIRQS_NO_DEPRECATED | ||
45 | static inline void irq_end(unsigned int irq, struct irq_desc *desc) | ||
46 | { | ||
47 | if (desc->irq_data.chip && desc->irq_data.chip->end) | ||
48 | desc->irq_data.chip->end(irq); | ||
49 | } | ||
50 | #else | ||
51 | static inline void irq_end(unsigned int irq, struct irq_desc *desc) { } | ||
52 | #endif | ||
53 | |||
43 | /* Inline functions for support of irq chips on slow busses */ | 54 | /* Inline functions for support of irq chips on slow busses */ |
44 | static inline void chip_bus_lock(unsigned int irq, struct irq_desc *desc) | 55 | static inline void chip_bus_lock(struct irq_desc *desc) |
45 | { | 56 | { |
46 | if (unlikely(desc->chip->bus_lock)) | 57 | if (unlikely(desc->irq_data.chip->irq_bus_lock)) |
47 | desc->chip->bus_lock(irq); | 58 | desc->irq_data.chip->irq_bus_lock(&desc->irq_data); |
48 | } | 59 | } |
49 | 60 | ||
50 | static inline void chip_bus_sync_unlock(unsigned int irq, struct irq_desc *desc) | 61 | static inline void chip_bus_sync_unlock(struct irq_desc *desc) |
51 | { | 62 | { |
52 | if (unlikely(desc->chip->bus_sync_unlock)) | 63 | if (unlikely(desc->irq_data.chip->irq_bus_sync_unlock)) |
53 | desc->chip->bus_sync_unlock(irq); | 64 | desc->irq_data.chip->irq_bus_sync_unlock(&desc->irq_data); |
54 | } | 65 | } |
55 | 66 | ||
56 | /* | 67 | /* |
@@ -67,8 +78,8 @@ static inline void print_irq_desc(unsigned int irq, struct irq_desc *desc) | |||
67 | irq, desc, desc->depth, desc->irq_count, desc->irqs_unhandled); | 78 | irq, desc, desc->depth, desc->irq_count, desc->irqs_unhandled); |
68 | printk("->handle_irq(): %p, ", desc->handle_irq); | 79 | printk("->handle_irq(): %p, ", desc->handle_irq); |
69 | print_symbol("%s\n", (unsigned long)desc->handle_irq); | 80 | print_symbol("%s\n", (unsigned long)desc->handle_irq); |
70 | printk("->chip(): %p, ", desc->chip); | 81 | printk("->irq_data.chip(): %p, ", desc->irq_data.chip); |
71 | print_symbol("%s\n", (unsigned long)desc->chip); | 82 | print_symbol("%s\n", (unsigned long)desc->irq_data.chip); |
72 | printk("->action(): %p\n", desc->action); | 83 | printk("->action(): %p\n", desc->action); |
73 | if (desc->action) { | 84 | if (desc->action) { |
74 | printk("->action->handler(): %p, ", desc->action->handler); | 85 | printk("->action->handler(): %p, ", desc->action->handler); |
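The irq_data_to_desc() macro added to internals.h above is just container_of(): given a pointer to the irq_data member embedded in an irq_desc, it subtracts the member offset to recover the enclosing descriptor. A standalone illustration of the mechanism (the struct names and the simplified container_of are made up for this example; the kernel macro additionally type-checks the pointer):

	#include <stddef.h>
	#include <stdio.h>

	#define container_of(ptr, type, member) \
		((type *)((char *)(ptr) - offsetof(type, member)))

	struct inner { int x; };
	struct outer { long tag; struct inner data; };

	int main(void)
	{
		struct outer o = { .tag = 42, .data = { .x = 7 } };
		struct inner *ip = &o.data;

		/* Walking back by offsetof(struct outer, data) lands on &o. */
		struct outer *op = container_of(ip, struct outer, data);
		printf("%ld %d\n", op->tag, op->data.x);	/* prints "42 7" */
		return 0;
	}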
diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c new file mode 100644 index 000000000000..9988d03797f5 --- /dev/null +++ b/kernel/irq/irqdesc.c | |||
@@ -0,0 +1,410 @@ | |||
1 | /* | ||
2 | * Copyright (C) 1992, 1998-2006 Linus Torvalds, Ingo Molnar | ||
3 | * Copyright (C) 2005-2006, Thomas Gleixner, Russell King | ||
4 | * | ||
5 | * This file contains the interrupt descriptor management code | ||
6 | * | ||
7 | * Detailed information is available in Documentation/DocBook/genericirq | ||
8 | * | ||
9 | */ | ||
10 | #include <linux/irq.h> | ||
11 | #include <linux/slab.h> | ||
12 | #include <linux/module.h> | ||
13 | #include <linux/interrupt.h> | ||
14 | #include <linux/kernel_stat.h> | ||
15 | #include <linux/radix-tree.h> | ||
16 | #include <linux/bitmap.h> | ||
17 | |||
18 | #include "internals.h" | ||
19 | |||
20 | /* | ||
21 | * lockdep: we want to handle all irq_desc locks as a single lock-class: | ||
22 | */ | ||
23 | static struct lock_class_key irq_desc_lock_class; | ||
24 | |||
25 | #if defined(CONFIG_SMP) && defined(CONFIG_GENERIC_HARDIRQS) | ||
26 | static void __init init_irq_default_affinity(void) | ||
27 | { | ||
28 | alloc_cpumask_var(&irq_default_affinity, GFP_NOWAIT); | ||
29 | cpumask_setall(irq_default_affinity); | ||
30 | } | ||
31 | #else | ||
32 | static void __init init_irq_default_affinity(void) | ||
33 | { | ||
34 | } | ||
35 | #endif | ||
36 | |||
37 | #ifdef CONFIG_SMP | ||
38 | static int alloc_masks(struct irq_desc *desc, gfp_t gfp, int node) | ||
39 | { | ||
40 | if (!zalloc_cpumask_var_node(&desc->irq_data.affinity, gfp, node)) | ||
41 | return -ENOMEM; | ||
42 | |||
43 | #ifdef CONFIG_GENERIC_PENDING_IRQ | ||
44 | if (!zalloc_cpumask_var_node(&desc->pending_mask, gfp, node)) { | ||
45 | free_cpumask_var(desc->irq_data.affinity); | ||
46 | return -ENOMEM; | ||
47 | } | ||
48 | #endif | ||
49 | return 0; | ||
50 | } | ||
51 | |||
52 | static void desc_smp_init(struct irq_desc *desc, int node) | ||
53 | { | ||
54 | desc->irq_data.node = node; | ||
55 | cpumask_copy(desc->irq_data.affinity, irq_default_affinity); | ||
56 | #ifdef CONFIG_GENERIC_PENDING_IRQ | ||
57 | cpumask_clear(desc->pending_mask); | ||
58 | #endif | ||
59 | } | ||
60 | |||
61 | static inline int desc_node(struct irq_desc *desc) | ||
62 | { | ||
63 | return desc->irq_data.node; | ||
64 | } | ||
65 | |||
66 | #else | ||
67 | static inline int | ||
68 | alloc_masks(struct irq_desc *desc, gfp_t gfp, int node) { return 0; } | ||
69 | static inline void desc_smp_init(struct irq_desc *desc, int node) { } | ||
70 | static inline int desc_node(struct irq_desc *desc) { return 0; } | ||
71 | #endif | ||
72 | |||
73 | static void desc_set_defaults(unsigned int irq, struct irq_desc *desc, int node) | ||
74 | { | ||
75 | desc->irq_data.irq = irq; | ||
76 | desc->irq_data.chip = &no_irq_chip; | ||
77 | desc->irq_data.chip_data = NULL; | ||
78 | desc->irq_data.handler_data = NULL; | ||
79 | desc->irq_data.msi_desc = NULL; | ||
80 | desc->status = IRQ_DEFAULT_INIT_FLAGS; | ||
81 | desc->handle_irq = handle_bad_irq; | ||
82 | desc->depth = 1; | ||
83 | desc->irq_count = 0; | ||
84 | desc->irqs_unhandled = 0; | ||
85 | desc->name = NULL; | ||
86 | memset(desc->kstat_irqs, 0, nr_cpu_ids * sizeof(*(desc->kstat_irqs))); | ||
87 | desc_smp_init(desc, node); | ||
88 | } | ||
89 | |||
90 | int nr_irqs = NR_IRQS; | ||
91 | EXPORT_SYMBOL_GPL(nr_irqs); | ||
92 | |||
93 | static DEFINE_MUTEX(sparse_irq_lock); | ||
94 | static DECLARE_BITMAP(allocated_irqs, NR_IRQS); | ||
95 | |||
96 | #ifdef CONFIG_SPARSE_IRQ | ||
97 | |||
98 | static RADIX_TREE(irq_desc_tree, GFP_KERNEL); | ||
99 | |||
100 | static void irq_insert_desc(unsigned int irq, struct irq_desc *desc) | ||
101 | { | ||
102 | radix_tree_insert(&irq_desc_tree, irq, desc); | ||
103 | } | ||
104 | |||
105 | struct irq_desc *irq_to_desc(unsigned int irq) | ||
106 | { | ||
107 | return radix_tree_lookup(&irq_desc_tree, irq); | ||
108 | } | ||
109 | |||
110 | static void delete_irq_desc(unsigned int irq) | ||
111 | { | ||
112 | radix_tree_delete(&irq_desc_tree, irq); | ||
113 | } | ||
114 | |||
115 | #ifdef CONFIG_SMP | ||
116 | static void free_masks(struct irq_desc *desc) | ||
117 | { | ||
118 | #ifdef CONFIG_GENERIC_PENDING_IRQ | ||
119 | free_cpumask_var(desc->pending_mask); | ||
120 | #endif | ||
121 | free_cpumask_var(desc->irq_data.affinity); | ||
122 | } | ||
123 | #else | ||
124 | static inline void free_masks(struct irq_desc *desc) { } | ||
125 | #endif | ||
126 | |||
127 | static struct irq_desc *alloc_desc(int irq, int node) | ||
128 | { | ||
129 | struct irq_desc *desc; | ||
130 | gfp_t gfp = GFP_KERNEL; | ||
131 | |||
132 | desc = kzalloc_node(sizeof(*desc), gfp, node); | ||
133 | if (!desc) | ||
134 | return NULL; | ||
135 | /* allocate based on nr_cpu_ids */ | ||
136 | desc->kstat_irqs = kzalloc_node(nr_cpu_ids * sizeof(*desc->kstat_irqs), | ||
137 | gfp, node); | ||
138 | if (!desc->kstat_irqs) | ||
139 | goto err_desc; | ||
140 | |||
141 | if (alloc_masks(desc, gfp, node)) | ||
142 | goto err_kstat; | ||
143 | |||
144 | raw_spin_lock_init(&desc->lock); | ||
145 | lockdep_set_class(&desc->lock, &irq_desc_lock_class); | ||
146 | |||
147 | desc_set_defaults(irq, desc, node); | ||
148 | |||
149 | return desc; | ||
150 | |||
151 | err_kstat: | ||
152 | kfree(desc->kstat_irqs); | ||
153 | err_desc: | ||
154 | kfree(desc); | ||
155 | return NULL; | ||
156 | } | ||
157 | |||
158 | static void free_desc(unsigned int irq) | ||
159 | { | ||
160 | struct irq_desc *desc = irq_to_desc(irq); | ||
161 | |||
162 | unregister_irq_proc(irq, desc); | ||
163 | |||
164 | mutex_lock(&sparse_irq_lock); | ||
165 | delete_irq_desc(irq); | ||
166 | mutex_unlock(&sparse_irq_lock); | ||
167 | |||
168 | free_masks(desc); | ||
169 | kfree(desc->kstat_irqs); | ||
170 | kfree(desc); | ||
171 | } | ||
172 | |||
173 | static int alloc_descs(unsigned int start, unsigned int cnt, int node) | ||
174 | { | ||
175 | struct irq_desc *desc; | ||
176 | int i; | ||
177 | |||
178 | for (i = 0; i < cnt; i++) { | ||
179 | desc = alloc_desc(start + i, node); | ||
180 | if (!desc) | ||
181 | goto err; | ||
182 | mutex_lock(&sparse_irq_lock); | ||
183 | irq_insert_desc(start + i, desc); | ||
184 | mutex_unlock(&sparse_irq_lock); | ||
185 | } | ||
186 | return start; | ||
187 | |||
188 | err: | ||
189 | for (i--; i >= 0; i--) | ||
190 | free_desc(start + i); | ||
191 | |||
192 | mutex_lock(&sparse_irq_lock); | ||
193 | bitmap_clear(allocated_irqs, start, cnt); | ||
194 | mutex_unlock(&sparse_irq_lock); | ||
195 | return -ENOMEM; | ||
196 | } | ||
197 | |||
198 | struct irq_desc * __ref irq_to_desc_alloc_node(unsigned int irq, int node) | ||
199 | { | ||
200 | int res = irq_alloc_descs(irq, irq, 1, node); | ||
201 | |||
202 | if (res == -EEXIST || res == irq) | ||
203 | return irq_to_desc(irq); | ||
204 | return NULL; | ||
205 | } | ||
206 | |||
207 | int __init early_irq_init(void) | ||
208 | { | ||
209 | int i, initcnt, node = first_online_node; | ||
210 | struct irq_desc *desc; | ||
211 | |||
212 | init_irq_default_affinity(); | ||
213 | |||
214 | /* Let arch update nr_irqs and return the nr of preallocated irqs */ | ||
215 | initcnt = arch_probe_nr_irqs(); | ||
216 | printk(KERN_INFO "NR_IRQS:%d nr_irqs:%d %d\n", NR_IRQS, nr_irqs, initcnt); | ||
217 | |||
218 | for (i = 0; i < initcnt; i++) { | ||
219 | desc = alloc_desc(i, node); | ||
220 | set_bit(i, allocated_irqs); | ||
221 | irq_insert_desc(i, desc); | ||
222 | } | ||
223 | return arch_early_irq_init(); | ||
224 | } | ||
225 | |||
226 | #else /* !CONFIG_SPARSE_IRQ */ | ||
227 | |||
228 | struct irq_desc irq_desc[NR_IRQS] __cacheline_aligned_in_smp = { | ||
229 | [0 ... NR_IRQS-1] = { | ||
230 | .status = IRQ_DEFAULT_INIT_FLAGS, | ||
231 | .handle_irq = handle_bad_irq, | ||
232 | .depth = 1, | ||
233 | .lock = __RAW_SPIN_LOCK_UNLOCKED(irq_desc->lock), | ||
234 | } | ||
235 | }; | ||
236 | |||
237 | static unsigned int kstat_irqs_all[NR_IRQS][NR_CPUS]; | ||
238 | int __init early_irq_init(void) | ||
239 | { | ||
240 | int count, i, node = first_online_node; | ||
241 | struct irq_desc *desc; | ||
242 | |||
243 | init_irq_default_affinity(); | ||
244 | |||
245 | printk(KERN_INFO "NR_IRQS:%d\n", NR_IRQS); | ||
246 | |||
247 | desc = irq_desc; | ||
248 | count = ARRAY_SIZE(irq_desc); | ||
249 | |||
250 | for (i = 0; i < count; i++) { | ||
251 | desc[i].irq_data.irq = i; | ||
252 | desc[i].irq_data.chip = &no_irq_chip; | ||
253 | desc[i].kstat_irqs = kstat_irqs_all[i]; | ||
254 | alloc_masks(desc + i, GFP_KERNEL, node); | ||
255 | desc_smp_init(desc + i, node); | ||
256 | lockdep_set_class(&desc[i].lock, &irq_desc_lock_class); | ||
257 | } | ||
258 | return arch_early_irq_init(); | ||
259 | } | ||
260 | |||
261 | struct irq_desc *irq_to_desc(unsigned int irq) | ||
262 | { | ||
263 | return (irq < NR_IRQS) ? irq_desc + irq : NULL; | ||
264 | } | ||
265 | |||
266 | struct irq_desc *irq_to_desc_alloc_node(unsigned int irq, int node) | ||
267 | { | ||
268 | return irq_to_desc(irq); | ||
269 | } | ||
270 | |||
271 | static void free_desc(unsigned int irq) | ||
272 | { | ||
273 | dynamic_irq_cleanup(irq); | ||
274 | } | ||
275 | |||
276 | static inline int alloc_descs(unsigned int start, unsigned int cnt, int node) | ||
277 | { | ||
278 | return start; | ||
279 | } | ||
280 | #endif /* !CONFIG_SPARSE_IRQ */ | ||
281 | |||
282 | /* Dynamic interrupt handling */ | ||
283 | |||
284 | /** | ||
285 | * irq_free_descs - free irq descriptors | ||
286 | * @from: Start of descriptor range | ||
287 | * @cnt: Number of consecutive irqs to free | ||
288 | */ | ||
289 | void irq_free_descs(unsigned int from, unsigned int cnt) | ||
290 | { | ||
291 | int i; | ||
292 | |||
293 | if (from >= nr_irqs || (from + cnt) > nr_irqs) | ||
294 | return; | ||
295 | |||
296 | for (i = 0; i < cnt; i++) | ||
297 | free_desc(from + i); | ||
298 | |||
299 | mutex_lock(&sparse_irq_lock); | ||
300 | bitmap_clear(allocated_irqs, from, cnt); | ||
301 | mutex_unlock(&sparse_irq_lock); | ||
302 | } | ||
303 | |||
304 | /** | ||
305 | * irq_alloc_descs - allocate and initialize a range of irq descriptors | ||
306 | * @irq: Allocate for specific irq number if irq >= 0 | ||
307 | * @from: Start the search from this irq number | ||
308 | * @cnt: Number of consecutive irqs to allocate. | ||
309 | * @node: Preferred node on which the irq descriptor should be allocated | ||
310 | * | ||
311 | * Returns the first irq number or error code | ||
312 | */ | ||
313 | int __ref | ||
314 | irq_alloc_descs(int irq, unsigned int from, unsigned int cnt, int node) | ||
315 | { | ||
316 | int start, ret; | ||
317 | |||
318 | if (!cnt) | ||
319 | return -EINVAL; | ||
320 | |||
321 | mutex_lock(&sparse_irq_lock); | ||
322 | |||
323 | start = bitmap_find_next_zero_area(allocated_irqs, nr_irqs, from, cnt, 0); | ||
324 | ret = -EEXIST; | ||
325 | if (irq >=0 && start != irq) | ||
326 | goto err; | ||
327 | |||
328 | ret = -ENOMEM; | ||
329 | if (start >= nr_irqs) | ||
330 | goto err; | ||
331 | |||
332 | bitmap_set(allocated_irqs, start, cnt); | ||
333 | mutex_unlock(&sparse_irq_lock); | ||
334 | return alloc_descs(start, cnt, node); | ||
335 | |||
336 | err: | ||
337 | mutex_unlock(&sparse_irq_lock); | ||
338 | return ret; | ||
339 | } | ||
340 | |||
341 | /** | ||
342 | * irq_reserve_irqs - mark irqs allocated | ||
343 | * @from: mark from irq number | ||
344 | * @cnt: number of irqs to mark | ||
345 | * | ||
346 | * Returns 0 on success or an appropriate error code | ||
347 | */ | ||
348 | int irq_reserve_irqs(unsigned int from, unsigned int cnt) | ||
349 | { | ||
350 | unsigned int start; | ||
351 | int ret = 0; | ||
352 | |||
353 | if (!cnt || (from + cnt) > nr_irqs) | ||
354 | return -EINVAL; | ||
355 | |||
356 | mutex_lock(&sparse_irq_lock); | ||
357 | start = bitmap_find_next_zero_area(allocated_irqs, nr_irqs, from, cnt, 0); | ||
358 | if (start == from) | ||
359 | bitmap_set(allocated_irqs, start, cnt); | ||
360 | else | ||
361 | ret = -EEXIST; | ||
362 | mutex_unlock(&sparse_irq_lock); | ||
363 | return ret; | ||
364 | } | ||
365 | |||
366 | /** | ||
367 | * irq_get_next_irq - get next allocated irq number | ||
368 | * @offset: where to start the search | ||
369 | * | ||
370 | * Returns next irq number after offset or nr_irqs if none is found. | ||
371 | */ | ||
372 | unsigned int irq_get_next_irq(unsigned int offset) | ||
373 | { | ||
374 | return find_next_bit(allocated_irqs, nr_irqs, offset); | ||
375 | } | ||
376 | |||
377 | /** | ||
378 | * dynamic_irq_cleanup - cleanup a dynamically allocated irq | ||
379 | * @irq: irq number to initialize | ||
380 | */ | ||
381 | void dynamic_irq_cleanup(unsigned int irq) | ||
382 | { | ||
383 | struct irq_desc *desc = irq_to_desc(irq); | ||
384 | unsigned long flags; | ||
385 | |||
386 | raw_spin_lock_irqsave(&desc->lock, flags); | ||
387 | desc_set_defaults(irq, desc, desc_node(desc)); | ||
388 | raw_spin_unlock_irqrestore(&desc->lock, flags); | ||
389 | } | ||
390 | |||
391 | unsigned int kstat_irqs_cpu(unsigned int irq, int cpu) | ||
392 | { | ||
393 | struct irq_desc *desc = irq_to_desc(irq); | ||
394 | return desc ? desc->kstat_irqs[cpu] : 0; | ||
395 | } | ||
396 | |||
397 | #ifdef CONFIG_GENERIC_HARDIRQS | ||
398 | unsigned int kstat_irqs(unsigned int irq) | ||
399 | { | ||
400 | struct irq_desc *desc = irq_to_desc(irq); | ||
401 | int cpu; | ||
402 | int sum = 0; | ||
403 | |||
404 | if (!desc) | ||
405 | return 0; | ||
406 | for_each_possible_cpu(cpu) | ||
407 | sum += desc->kstat_irqs[cpu]; | ||
408 | return sum; | ||
409 | } | ||
410 | #endif /* CONFIG_GENERIC_HARDIRQS */ | ||
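The descriptor allocator in the new irqdesc.c is consumed through irq_alloc_descs()/irq_free_descs(). A hedged usage sketch — my_chip_probe(), my_chip_remove() and the count of four are invented; only the two allocator calls and their semantics come from the code above:

	/*
	 * Ask for four consecutive interrupt numbers anywhere in the space
	 * (irq < 0 means "no particular number required"), preferring @node
	 * for the descriptor allocations.
	 */
	static int my_chip_probe(int node)
	{
		int base = irq_alloc_descs(-1, 0, 4, node);

		if (base < 0)
			return base;	/* negative errno, e.g. -ENOMEM */

		/* ... set up chip, handlers and data for base .. base + 3 ... */
		return base;
	}

	static void my_chip_remove(int base)
	{
		irq_free_descs(base, 4);	/* descriptors freed, bitmap cleared */
	}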
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index e1497481fe8a..644e8d5fa367 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c | |||
@@ -73,8 +73,8 @@ int irq_can_set_affinity(unsigned int irq) | |||
73 | { | 73 | { |
74 | struct irq_desc *desc = irq_to_desc(irq); | 74 | struct irq_desc *desc = irq_to_desc(irq); |
75 | 75 | ||
76 | if (CHECK_IRQ_PER_CPU(desc->status) || !desc->chip || | 76 | if (CHECK_IRQ_PER_CPU(desc->status) || !desc->irq_data.chip || |
77 | !desc->chip->set_affinity) | 77 | !desc->irq_data.chip->irq_set_affinity) |
78 | return 0; | 78 | return 0; |
79 | 79 | ||
80 | return 1; | 80 | return 1; |
@@ -109,17 +109,18 @@ void irq_set_thread_affinity(struct irq_desc *desc) | |||
109 | int irq_set_affinity(unsigned int irq, const struct cpumask *cpumask) | 109 | int irq_set_affinity(unsigned int irq, const struct cpumask *cpumask) |
110 | { | 110 | { |
111 | struct irq_desc *desc = irq_to_desc(irq); | 111 | struct irq_desc *desc = irq_to_desc(irq); |
112 | struct irq_chip *chip = desc->irq_data.chip; | ||
112 | unsigned long flags; | 113 | unsigned long flags; |
113 | 114 | ||
114 | if (!desc->chip->set_affinity) | 115 | if (!chip->irq_set_affinity) |
115 | return -EINVAL; | 116 | return -EINVAL; |
116 | 117 | ||
117 | raw_spin_lock_irqsave(&desc->lock, flags); | 118 | raw_spin_lock_irqsave(&desc->lock, flags); |
118 | 119 | ||
119 | #ifdef CONFIG_GENERIC_PENDING_IRQ | 120 | #ifdef CONFIG_GENERIC_PENDING_IRQ |
120 | if (desc->status & IRQ_MOVE_PCNTXT) { | 121 | if (desc->status & IRQ_MOVE_PCNTXT) { |
121 | if (!desc->chip->set_affinity(irq, cpumask)) { | 122 | if (!chip->irq_set_affinity(&desc->irq_data, cpumask, false)) { |
122 | cpumask_copy(desc->affinity, cpumask); | 123 | cpumask_copy(desc->irq_data.affinity, cpumask); |
123 | irq_set_thread_affinity(desc); | 124 | irq_set_thread_affinity(desc); |
124 | } | 125 | } |
125 | } | 126 | } |
@@ -128,8 +129,8 @@ int irq_set_affinity(unsigned int irq, const struct cpumask *cpumask) | |||
128 | cpumask_copy(desc->pending_mask, cpumask); | 129 | cpumask_copy(desc->pending_mask, cpumask); |
129 | } | 130 | } |
130 | #else | 131 | #else |
131 | if (!desc->chip->set_affinity(irq, cpumask)) { | 132 | if (!chip->irq_set_affinity(&desc->irq_data, cpumask, false)) { |
132 | cpumask_copy(desc->affinity, cpumask); | 133 | cpumask_copy(desc->irq_data.affinity, cpumask); |
133 | irq_set_thread_affinity(desc); | 134 | irq_set_thread_affinity(desc); |
134 | } | 135 | } |
135 | #endif | 136 | #endif |
@@ -168,16 +169,16 @@ static int setup_affinity(unsigned int irq, struct irq_desc *desc) | |||
168 | * one of the targets is online. | 169 | * one of the targets is online. |
169 | */ | 170 | */ |
170 | if (desc->status & (IRQ_AFFINITY_SET | IRQ_NO_BALANCING)) { | 171 | if (desc->status & (IRQ_AFFINITY_SET | IRQ_NO_BALANCING)) { |
171 | if (cpumask_any_and(desc->affinity, cpu_online_mask) | 172 | if (cpumask_any_and(desc->irq_data.affinity, cpu_online_mask) |
172 | < nr_cpu_ids) | 173 | < nr_cpu_ids) |
173 | goto set_affinity; | 174 | goto set_affinity; |
174 | else | 175 | else |
175 | desc->status &= ~IRQ_AFFINITY_SET; | 176 | desc->status &= ~IRQ_AFFINITY_SET; |
176 | } | 177 | } |
177 | 178 | ||
178 | cpumask_and(desc->affinity, cpu_online_mask, irq_default_affinity); | 179 | cpumask_and(desc->irq_data.affinity, cpu_online_mask, irq_default_affinity); |
179 | set_affinity: | 180 | set_affinity: |
180 | desc->chip->set_affinity(irq, desc->affinity); | 181 | desc->irq_data.chip->irq_set_affinity(&desc->irq_data, desc->irq_data.affinity, false); |
181 | 182 | ||
182 | return 0; | 183 | return 0; |
183 | } | 184 | } |
@@ -216,14 +217,14 @@ static inline int setup_affinity(unsigned int irq, struct irq_desc *desc) | |||
216 | void __disable_irq(struct irq_desc *desc, unsigned int irq, bool suspend) | 217 | void __disable_irq(struct irq_desc *desc, unsigned int irq, bool suspend) |
217 | { | 218 | { |
218 | if (suspend) { | 219 | if (suspend) { |
219 | if (!desc->action || (desc->action->flags & IRQF_TIMER)) | 220 | if (!desc->action || (desc->action->flags & IRQF_NO_SUSPEND)) |
220 | return; | 221 | return; |
221 | desc->status |= IRQ_SUSPENDED; | 222 | desc->status |= IRQ_SUSPENDED; |
222 | } | 223 | } |
223 | 224 | ||
224 | if (!desc->depth++) { | 225 | if (!desc->depth++) { |
225 | desc->status |= IRQ_DISABLED; | 226 | desc->status |= IRQ_DISABLED; |
226 | desc->chip->disable(irq); | 227 | desc->irq_data.chip->irq_disable(&desc->irq_data); |
227 | } | 228 | } |
228 | } | 229 | } |
229 | 230 | ||
@@ -246,11 +247,11 @@ void disable_irq_nosync(unsigned int irq) | |||
246 | if (!desc) | 247 | if (!desc) |
247 | return; | 248 | return; |
248 | 249 | ||
249 | chip_bus_lock(irq, desc); | 250 | chip_bus_lock(desc); |
250 | raw_spin_lock_irqsave(&desc->lock, flags); | 251 | raw_spin_lock_irqsave(&desc->lock, flags); |
251 | __disable_irq(desc, irq, false); | 252 | __disable_irq(desc, irq, false); |
252 | raw_spin_unlock_irqrestore(&desc->lock, flags); | 253 | raw_spin_unlock_irqrestore(&desc->lock, flags); |
253 | chip_bus_sync_unlock(irq, desc); | 254 | chip_bus_sync_unlock(desc); |
254 | } | 255 | } |
255 | EXPORT_SYMBOL(disable_irq_nosync); | 256 | EXPORT_SYMBOL(disable_irq_nosync); |
256 | 257 | ||
@@ -313,7 +314,7 @@ void __enable_irq(struct irq_desc *desc, unsigned int irq, bool resume) | |||
313 | * IRQ line is re-enabled. | 314 | * IRQ line is re-enabled. |
314 | * | 315 | * |
315 | * This function may be called from IRQ context only when | 316 | * This function may be called from IRQ context only when |
316 | * desc->chip->bus_lock and desc->chip->bus_sync_unlock are NULL ! | 317 | * desc->irq_data.chip->bus_lock and desc->chip->bus_sync_unlock are NULL ! |
317 | */ | 318 | */ |
318 | void enable_irq(unsigned int irq) | 319 | void enable_irq(unsigned int irq) |
319 | { | 320 | { |
@@ -323,11 +324,11 @@ void enable_irq(unsigned int irq) | |||
323 | if (!desc) | 324 | if (!desc) |
324 | return; | 325 | return; |
325 | 326 | ||
326 | chip_bus_lock(irq, desc); | 327 | chip_bus_lock(desc); |
327 | raw_spin_lock_irqsave(&desc->lock, flags); | 328 | raw_spin_lock_irqsave(&desc->lock, flags); |
328 | __enable_irq(desc, irq, false); | 329 | __enable_irq(desc, irq, false); |
329 | raw_spin_unlock_irqrestore(&desc->lock, flags); | 330 | raw_spin_unlock_irqrestore(&desc->lock, flags); |
330 | chip_bus_sync_unlock(irq, desc); | 331 | chip_bus_sync_unlock(desc); |
331 | } | 332 | } |
332 | EXPORT_SYMBOL(enable_irq); | 333 | EXPORT_SYMBOL(enable_irq); |
333 | 334 | ||
@@ -336,8 +337,8 @@ static int set_irq_wake_real(unsigned int irq, unsigned int on) | |||
336 | struct irq_desc *desc = irq_to_desc(irq); | 337 | struct irq_desc *desc = irq_to_desc(irq); |
337 | int ret = -ENXIO; | 338 | int ret = -ENXIO; |
338 | 339 | ||
339 | if (desc->chip->set_wake) | 340 | if (desc->irq_data.chip->irq_set_wake) |
340 | ret = desc->chip->set_wake(irq, on); | 341 | ret = desc->irq_data.chip->irq_set_wake(&desc->irq_data, on); |
341 | 342 | ||
342 | return ret; | 343 | return ret; |
343 | } | 344 | } |
@@ -429,12 +430,12 @@ void compat_irq_chip_set_default_handler(struct irq_desc *desc) | |||
429 | } | 430 | } |
430 | 431 | ||
431 | int __irq_set_trigger(struct irq_desc *desc, unsigned int irq, | 432 | int __irq_set_trigger(struct irq_desc *desc, unsigned int irq, |
432 | unsigned long flags) | 433 | unsigned long flags) |
433 | { | 434 | { |
434 | int ret; | 435 | int ret; |
435 | struct irq_chip *chip = desc->chip; | 436 | struct irq_chip *chip = desc->irq_data.chip; |
436 | 437 | ||
437 | if (!chip || !chip->set_type) { | 438 | if (!chip || !chip->irq_set_type) { |
438 | /* | 439 | /* |
439 | * IRQF_TRIGGER_* but the PIC does not support multiple | 440 | * IRQF_TRIGGER_* but the PIC does not support multiple |
440 | * flow-types? | 441 | * flow-types? |
@@ -445,11 +446,11 @@ int __irq_set_trigger(struct irq_desc *desc, unsigned int irq, | |||
445 | } | 446 | } |
446 | 447 | ||
447 | /* caller masked out all except trigger mode flags */ | 448 | /* caller masked out all except trigger mode flags */ |
448 | ret = chip->set_type(irq, flags); | 449 | ret = chip->irq_set_type(&desc->irq_data, flags); |
449 | 450 | ||
450 | if (ret) | 451 | if (ret) |
451 | pr_err("setting trigger mode %d for irq %u failed (%pF)\n", | 452 | pr_err("setting trigger mode %lu for irq %u failed (%pF)\n", |
452 | (int)flags, irq, chip->set_type); | 453 | flags, irq, chip->irq_set_type); |
453 | else { | 454 | else { |
454 | if (flags & (IRQ_TYPE_LEVEL_LOW | IRQ_TYPE_LEVEL_HIGH)) | 455 | if (flags & (IRQ_TYPE_LEVEL_LOW | IRQ_TYPE_LEVEL_HIGH)) |
455 | flags |= IRQ_LEVEL; | 456 | flags |= IRQ_LEVEL; |
@@ -457,8 +458,8 @@ int __irq_set_trigger(struct irq_desc *desc, unsigned int irq, | |||
457 | desc->status &= ~(IRQ_LEVEL | IRQ_TYPE_SENSE_MASK); | 458 | desc->status &= ~(IRQ_LEVEL | IRQ_TYPE_SENSE_MASK); |
458 | desc->status |= flags; | 459 | desc->status |= flags; |
459 | 460 | ||
460 | if (chip != desc->chip) | 461 | if (chip != desc->irq_data.chip) |
461 | irq_chip_set_defaults(desc->chip); | 462 | irq_chip_set_defaults(desc->irq_data.chip); |
462 | } | 463 | } |
463 | 464 | ||
464 | return ret; | 465 | return ret; |
@@ -507,7 +508,7 @@ static int irq_wait_for_interrupt(struct irqaction *action) | |||
507 | static void irq_finalize_oneshot(unsigned int irq, struct irq_desc *desc) | 508 | static void irq_finalize_oneshot(unsigned int irq, struct irq_desc *desc) |
508 | { | 509 | { |
509 | again: | 510 | again: |
510 | chip_bus_lock(irq, desc); | 511 | chip_bus_lock(desc); |
511 | raw_spin_lock_irq(&desc->lock); | 512 | raw_spin_lock_irq(&desc->lock); |
512 | 513 | ||
513 | /* | 514 | /* |
@@ -521,17 +522,17 @@ again: | |||
521 | */ | 522 | */ |
522 | if (unlikely(desc->status & IRQ_INPROGRESS)) { | 523 | if (unlikely(desc->status & IRQ_INPROGRESS)) { |
523 | raw_spin_unlock_irq(&desc->lock); | 524 | raw_spin_unlock_irq(&desc->lock); |
524 | chip_bus_sync_unlock(irq, desc); | 525 | chip_bus_sync_unlock(desc); |
525 | cpu_relax(); | 526 | cpu_relax(); |
526 | goto again; | 527 | goto again; |
527 | } | 528 | } |
528 | 529 | ||
529 | if (!(desc->status & IRQ_DISABLED) && (desc->status & IRQ_MASKED)) { | 530 | if (!(desc->status & IRQ_DISABLED) && (desc->status & IRQ_MASKED)) { |
530 | desc->status &= ~IRQ_MASKED; | 531 | desc->status &= ~IRQ_MASKED; |
531 | desc->chip->unmask(irq); | 532 | desc->irq_data.chip->irq_unmask(&desc->irq_data); |
532 | } | 533 | } |
533 | raw_spin_unlock_irq(&desc->lock); | 534 | raw_spin_unlock_irq(&desc->lock); |
534 | chip_bus_sync_unlock(irq, desc); | 535 | chip_bus_sync_unlock(desc); |
535 | } | 536 | } |
536 | 537 | ||
537 | #ifdef CONFIG_SMP | 538 | #ifdef CONFIG_SMP |
@@ -556,7 +557,7 @@ irq_thread_check_affinity(struct irq_desc *desc, struct irqaction *action) | |||
556 | } | 557 | } |
557 | 558 | ||
558 | raw_spin_lock_irq(&desc->lock); | 559 | raw_spin_lock_irq(&desc->lock); |
559 | cpumask_copy(mask, desc->affinity); | 560 | cpumask_copy(mask, desc->irq_data.affinity); |
560 | raw_spin_unlock_irq(&desc->lock); | 561 | raw_spin_unlock_irq(&desc->lock); |
561 | 562 | ||
562 | set_cpus_allowed_ptr(current, mask); | 563 | set_cpus_allowed_ptr(current, mask); |
@@ -657,7 +658,7 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) | |||
657 | if (!desc) | 658 | if (!desc) |
658 | return -EINVAL; | 659 | return -EINVAL; |
659 | 660 | ||
660 | if (desc->chip == &no_irq_chip) | 661 | if (desc->irq_data.chip == &no_irq_chip) |
661 | return -ENOSYS; | 662 | return -ENOSYS; |
662 | /* | 663 | /* |
663 | * Some drivers like serial.c use request_irq() heavily, | 664 | * Some drivers like serial.c use request_irq() heavily, |
@@ -752,7 +753,7 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) | |||
752 | } | 753 | } |
753 | 754 | ||
754 | if (!shared) { | 755 | if (!shared) { |
755 | irq_chip_set_defaults(desc->chip); | 756 | irq_chip_set_defaults(desc->irq_data.chip); |
756 | 757 | ||
757 | init_waitqueue_head(&desc->wait_for_threads); | 758 | init_waitqueue_head(&desc->wait_for_threads); |
758 | 759 | ||
@@ -779,7 +780,7 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) | |||
779 | if (!(desc->status & IRQ_NOAUTOEN)) { | 780 | if (!(desc->status & IRQ_NOAUTOEN)) { |
780 | desc->depth = 0; | 781 | desc->depth = 0; |
781 | desc->status &= ~IRQ_DISABLED; | 782 | desc->status &= ~IRQ_DISABLED; |
782 | desc->chip->startup(irq); | 783 | desc->irq_data.chip->irq_startup(&desc->irq_data); |
783 | } else | 784 | } else |
784 | /* Undo nested disables: */ | 785 | /* Undo nested disables: */ |
785 | desc->depth = 1; | 786 | desc->depth = 1; |
@@ -912,17 +913,17 @@ static struct irqaction *__free_irq(unsigned int irq, void *dev_id) | |||
912 | 913 | ||
913 | /* Currently used only by UML, might disappear one day: */ | 914 | /* Currently used only by UML, might disappear one day: */ |
914 | #ifdef CONFIG_IRQ_RELEASE_METHOD | 915 | #ifdef CONFIG_IRQ_RELEASE_METHOD |
915 | if (desc->chip->release) | 916 | if (desc->irq_data.chip->release) |
916 | desc->chip->release(irq, dev_id); | 917 | desc->irq_data.chip->release(irq, dev_id); |
917 | #endif | 918 | #endif |
918 | 919 | ||
919 | /* If this was the last handler, shut down the IRQ line: */ | 920 | /* If this was the last handler, shut down the IRQ line: */ |
920 | if (!desc->action) { | 921 | if (!desc->action) { |
921 | desc->status |= IRQ_DISABLED; | 922 | desc->status |= IRQ_DISABLED; |
922 | if (desc->chip->shutdown) | 923 | if (desc->irq_data.chip->irq_shutdown) |
923 | desc->chip->shutdown(irq); | 924 | desc->irq_data.chip->irq_shutdown(&desc->irq_data); |
924 | else | 925 | else |
925 | desc->chip->disable(irq); | 926 | desc->irq_data.chip->irq_disable(&desc->irq_data); |
926 | } | 927 | } |
927 | 928 | ||
928 | #ifdef CONFIG_SMP | 929 | #ifdef CONFIG_SMP |
@@ -997,9 +998,9 @@ void free_irq(unsigned int irq, void *dev_id) | |||
997 | if (!desc) | 998 | if (!desc) |
998 | return; | 999 | return; |
999 | 1000 | ||
1000 | chip_bus_lock(irq, desc); | 1001 | chip_bus_lock(desc); |
1001 | kfree(__free_irq(irq, dev_id)); | 1002 | kfree(__free_irq(irq, dev_id)); |
1002 | chip_bus_sync_unlock(irq, desc); | 1003 | chip_bus_sync_unlock(desc); |
1003 | } | 1004 | } |
1004 | EXPORT_SYMBOL(free_irq); | 1005 | EXPORT_SYMBOL(free_irq); |
1005 | 1006 | ||
@@ -1086,9 +1087,9 @@ int request_threaded_irq(unsigned int irq, irq_handler_t handler, | |||
1086 | action->name = devname; | 1087 | action->name = devname; |
1087 | action->dev_id = dev_id; | 1088 | action->dev_id = dev_id; |
1088 | 1089 | ||
1089 | chip_bus_lock(irq, desc); | 1090 | chip_bus_lock(desc); |
1090 | retval = __setup_irq(irq, desc, action); | 1091 | retval = __setup_irq(irq, desc, action); |
1091 | chip_bus_sync_unlock(irq, desc); | 1092 | chip_bus_sync_unlock(desc); |
1092 | 1093 | ||
1093 | if (retval) | 1094 | if (retval) |
1094 | kfree(action); | 1095 | kfree(action); |
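From here on manage.c reaches every chip method through desc->irq_data and the irq_*-prefixed callbacks instead of the old unsigned-int-irq variants. For orientation, a hedged sketch of a minimal chip written against the new signatures; struct mychip, the register offsets and the BIT() arithmetic are invented, only the callback names and the struct irq_data * parameter reflect the interface used above:

	#include <linux/irq.h>
	#include <linux/io.h>
	#include <linux/bitops.h>

	#define MYCHIP_MASK_SET	0x10	/* hypothetical register offsets */
	#define MYCHIP_MASK_CLR	0x14

	struct mychip {
		void __iomem	*regs;
		unsigned int	irq_base;
	};

	static void mychip_irq_mask(struct irq_data *d)
	{
		struct mychip *chip = d->chip_data;	/* set when the irq was wired up */

		writel(BIT(d->irq - chip->irq_base), chip->regs + MYCHIP_MASK_SET);
	}

	static void mychip_irq_unmask(struct irq_data *d)
	{
		struct mychip *chip = d->chip_data;

		writel(BIT(d->irq - chip->irq_base), chip->regs + MYCHIP_MASK_CLR);
	}

	static struct irq_chip mychip_irq_chip = {
		.name		= "mychip",
		.irq_mask	= mychip_irq_mask,
		.irq_unmask	= mychip_irq_unmask,
	};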
diff --git a/kernel/irq/migration.c b/kernel/irq/migration.c index 241962280836..1d2541940480 100644 --- a/kernel/irq/migration.c +++ b/kernel/irq/migration.c | |||
@@ -7,6 +7,7 @@ | |||
7 | void move_masked_irq(int irq) | 7 | void move_masked_irq(int irq) |
8 | { | 8 | { |
9 | struct irq_desc *desc = irq_to_desc(irq); | 9 | struct irq_desc *desc = irq_to_desc(irq); |
10 | struct irq_chip *chip = desc->irq_data.chip; | ||
10 | 11 | ||
11 | if (likely(!(desc->status & IRQ_MOVE_PENDING))) | 12 | if (likely(!(desc->status & IRQ_MOVE_PENDING))) |
12 | return; | 13 | return; |
@@ -24,7 +25,7 @@ void move_masked_irq(int irq) | |||
24 | if (unlikely(cpumask_empty(desc->pending_mask))) | 25 | if (unlikely(cpumask_empty(desc->pending_mask))) |
25 | return; | 26 | return; |
26 | 27 | ||
27 | if (!desc->chip->set_affinity) | 28 | if (!chip->irq_set_affinity) |
28 | return; | 29 | return; |
29 | 30 | ||
30 | assert_raw_spin_locked(&desc->lock); | 31 | assert_raw_spin_locked(&desc->lock); |
@@ -43,8 +44,9 @@ void move_masked_irq(int irq) | |||
43 | */ | 44 | */ |
44 | if (likely(cpumask_any_and(desc->pending_mask, cpu_online_mask) | 45 | if (likely(cpumask_any_and(desc->pending_mask, cpu_online_mask) |
45 | < nr_cpu_ids)) | 46 | < nr_cpu_ids)) |
46 | if (!desc->chip->set_affinity(irq, desc->pending_mask)) { | 47 | if (!chip->irq_set_affinity(&desc->irq_data, |
47 | cpumask_copy(desc->affinity, desc->pending_mask); | 48 | desc->pending_mask, false)) { |
49 | cpumask_copy(desc->irq_data.affinity, desc->pending_mask); | ||
48 | irq_set_thread_affinity(desc); | 50 | irq_set_thread_affinity(desc); |
49 | } | 51 | } |
50 | 52 | ||
@@ -61,8 +63,8 @@ void move_native_irq(int irq) | |||
61 | if (unlikely(desc->status & IRQ_DISABLED)) | 63 | if (unlikely(desc->status & IRQ_DISABLED)) |
62 | return; | 64 | return; |
63 | 65 | ||
64 | desc->chip->mask(irq); | 66 | desc->irq_data.chip->irq_mask(&desc->irq_data); |
65 | move_masked_irq(irq); | 67 | move_masked_irq(irq); |
66 | desc->chip->unmask(irq); | 68 | desc->irq_data.chip->irq_unmask(&desc->irq_data); |
67 | } | 69 | } |
68 | 70 | ||
diff --git a/kernel/irq/numa_migrate.c b/kernel/irq/numa_migrate.c deleted file mode 100644 index 65d3845665ac..000000000000 --- a/kernel/irq/numa_migrate.c +++ /dev/null | |||
@@ -1,120 +0,0 @@ | |||
1 | /* | ||
2 | * NUMA irq-desc migration code | ||
3 | * | ||
4 | * Migrate IRQ data structures (irq_desc, chip_data, etc.) over to | ||
5 | * the new "home node" of the IRQ. | ||
6 | */ | ||
7 | |||
8 | #include <linux/irq.h> | ||
9 | #include <linux/slab.h> | ||
10 | #include <linux/module.h> | ||
11 | #include <linux/random.h> | ||
12 | #include <linux/interrupt.h> | ||
13 | #include <linux/kernel_stat.h> | ||
14 | |||
15 | #include "internals.h" | ||
16 | |||
17 | static void init_copy_kstat_irqs(struct irq_desc *old_desc, | ||
18 | struct irq_desc *desc, | ||
19 | int node, int nr) | ||
20 | { | ||
21 | init_kstat_irqs(desc, node, nr); | ||
22 | |||
23 | if (desc->kstat_irqs != old_desc->kstat_irqs) | ||
24 | memcpy(desc->kstat_irqs, old_desc->kstat_irqs, | ||
25 | nr * sizeof(*desc->kstat_irqs)); | ||
26 | } | ||
27 | |||
28 | static void free_kstat_irqs(struct irq_desc *old_desc, struct irq_desc *desc) | ||
29 | { | ||
30 | if (old_desc->kstat_irqs == desc->kstat_irqs) | ||
31 | return; | ||
32 | |||
33 | kfree(old_desc->kstat_irqs); | ||
34 | old_desc->kstat_irqs = NULL; | ||
35 | } | ||
36 | |||
37 | static bool init_copy_one_irq_desc(int irq, struct irq_desc *old_desc, | ||
38 | struct irq_desc *desc, int node) | ||
39 | { | ||
40 | memcpy(desc, old_desc, sizeof(struct irq_desc)); | ||
41 | if (!alloc_desc_masks(desc, node, false)) { | ||
42 | printk(KERN_ERR "irq %d: can not get new irq_desc cpumask " | ||
43 | "for migration.\n", irq); | ||
44 | return false; | ||
45 | } | ||
46 | raw_spin_lock_init(&desc->lock); | ||
47 | desc->node = node; | ||
48 | lockdep_set_class(&desc->lock, &irq_desc_lock_class); | ||
49 | init_copy_kstat_irqs(old_desc, desc, node, nr_cpu_ids); | ||
50 | init_copy_desc_masks(old_desc, desc); | ||
51 | arch_init_copy_chip_data(old_desc, desc, node); | ||
52 | return true; | ||
53 | } | ||
54 | |||
55 | static void free_one_irq_desc(struct irq_desc *old_desc, struct irq_desc *desc) | ||
56 | { | ||
57 | free_kstat_irqs(old_desc, desc); | ||
58 | free_desc_masks(old_desc, desc); | ||
59 | arch_free_chip_data(old_desc, desc); | ||
60 | } | ||
61 | |||
62 | static struct irq_desc *__real_move_irq_desc(struct irq_desc *old_desc, | ||
63 | int node) | ||
64 | { | ||
65 | struct irq_desc *desc; | ||
66 | unsigned int irq; | ||
67 | unsigned long flags; | ||
68 | |||
69 | irq = old_desc->irq; | ||
70 | |||
71 | raw_spin_lock_irqsave(&sparse_irq_lock, flags); | ||
72 | |||
73 | /* We have to check it to avoid races with another CPU */ | ||
74 | desc = irq_to_desc(irq); | ||
75 | |||
76 | if (desc && old_desc != desc) | ||
77 | goto out_unlock; | ||
78 | |||
79 | desc = kzalloc_node(sizeof(*desc), GFP_ATOMIC, node); | ||
80 | if (!desc) { | ||
81 | printk(KERN_ERR "irq %d: can not get new irq_desc " | ||
82 | "for migration.\n", irq); | ||
83 | /* still use old one */ | ||
84 | desc = old_desc; | ||
85 | goto out_unlock; | ||
86 | } | ||
87 | if (!init_copy_one_irq_desc(irq, old_desc, desc, node)) { | ||
88 | /* still use old one */ | ||
89 | kfree(desc); | ||
90 | desc = old_desc; | ||
91 | goto out_unlock; | ||
92 | } | ||
93 | |||
94 | replace_irq_desc(irq, desc); | ||
95 | raw_spin_unlock_irqrestore(&sparse_irq_lock, flags); | ||
96 | |||
97 | /* free the old one */ | ||
98 | free_one_irq_desc(old_desc, desc); | ||
99 | kfree(old_desc); | ||
100 | |||
101 | return desc; | ||
102 | |||
103 | out_unlock: | ||
104 | raw_spin_unlock_irqrestore(&sparse_irq_lock, flags); | ||
105 | |||
106 | return desc; | ||
107 | } | ||
108 | |||
109 | struct irq_desc *move_irq_desc(struct irq_desc *desc, int node) | ||
110 | { | ||
111 | /* those static or target node is -1, do not move them */ | ||
112 | if (desc->irq < NR_IRQS_LEGACY || node == -1) | ||
113 | return desc; | ||
114 | |||
115 | if (desc->node != node) | ||
116 | desc = __real_move_irq_desc(desc, node); | ||
117 | |||
118 | return desc; | ||
119 | } | ||
120 | |||
diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c index 09a2ee540bd2..01b1d3a88983 100644 --- a/kernel/irq/proc.c +++ b/kernel/irq/proc.c | |||
@@ -21,7 +21,7 @@ static struct proc_dir_entry *root_irq_dir; | |||
21 | static int irq_affinity_proc_show(struct seq_file *m, void *v) | 21 | static int irq_affinity_proc_show(struct seq_file *m, void *v) |
22 | { | 22 | { |
23 | struct irq_desc *desc = irq_to_desc((long)m->private); | 23 | struct irq_desc *desc = irq_to_desc((long)m->private); |
24 | const struct cpumask *mask = desc->affinity; | 24 | const struct cpumask *mask = desc->irq_data.affinity; |
25 | 25 | ||
26 | #ifdef CONFIG_GENERIC_PENDING_IRQ | 26 | #ifdef CONFIG_GENERIC_PENDING_IRQ |
27 | if (desc->status & IRQ_MOVE_PENDING) | 27 | if (desc->status & IRQ_MOVE_PENDING) |
@@ -65,7 +65,7 @@ static ssize_t irq_affinity_proc_write(struct file *file, | |||
65 | cpumask_var_t new_value; | 65 | cpumask_var_t new_value; |
66 | int err; | 66 | int err; |
67 | 67 | ||
68 | if (!irq_to_desc(irq)->chip->set_affinity || no_irq_affinity || | 68 | if (!irq_to_desc(irq)->irq_data.chip->irq_set_affinity || no_irq_affinity || |
69 | irq_balancing_disabled(irq)) | 69 | irq_balancing_disabled(irq)) |
70 | return -EIO; | 70 | return -EIO; |
71 | 71 | ||
@@ -185,7 +185,7 @@ static int irq_node_proc_show(struct seq_file *m, void *v) | |||
185 | { | 185 | { |
186 | struct irq_desc *desc = irq_to_desc((long) m->private); | 186 | struct irq_desc *desc = irq_to_desc((long) m->private); |
187 | 187 | ||
188 | seq_printf(m, "%d\n", desc->node); | 188 | seq_printf(m, "%d\n", desc->irq_data.node); |
189 | return 0; | 189 | return 0; |
190 | } | 190 | } |
191 | 191 | ||
@@ -269,7 +269,7 @@ void register_irq_proc(unsigned int irq, struct irq_desc *desc) | |||
269 | { | 269 | { |
270 | char name [MAX_NAMELEN]; | 270 | char name [MAX_NAMELEN]; |
271 | 271 | ||
272 | if (!root_irq_dir || (desc->chip == &no_irq_chip) || desc->dir) | 272 | if (!root_irq_dir || (desc->irq_data.chip == &no_irq_chip) || desc->dir) |
273 | return; | 273 | return; |
274 | 274 | ||
275 | memset(name, 0, MAX_NAMELEN); | 275 | memset(name, 0, MAX_NAMELEN); |
@@ -297,6 +297,24 @@ void register_irq_proc(unsigned int irq, struct irq_desc *desc) | |||
297 | &irq_spurious_proc_fops, (void *)(long)irq); | 297 | &irq_spurious_proc_fops, (void *)(long)irq); |
298 | } | 298 | } |
299 | 299 | ||
300 | void unregister_irq_proc(unsigned int irq, struct irq_desc *desc) | ||
301 | { | ||
302 | char name [MAX_NAMELEN]; | ||
303 | |||
304 | if (!root_irq_dir || !desc->dir) | ||
305 | return; | ||
306 | #ifdef CONFIG_SMP | ||
307 | remove_proc_entry("smp_affinity", desc->dir); | ||
308 | remove_proc_entry("affinity_hint", desc->dir); | ||
309 | remove_proc_entry("node", desc->dir); | ||
310 | #endif | ||
311 | remove_proc_entry("spurious", desc->dir); | ||
312 | |||
313 | memset(name, 0, MAX_NAMELEN); | ||
314 | sprintf(name, "%u", irq); | ||
315 | remove_proc_entry(name, root_irq_dir); | ||
316 | } | ||
317 | |||
300 | #undef MAX_NAMELEN | 318 | #undef MAX_NAMELEN |
301 | 319 | ||
302 | void unregister_handler_proc(unsigned int irq, struct irqaction *action) | 320 | void unregister_handler_proc(unsigned int irq, struct irqaction *action) |
diff --git a/kernel/irq/resend.c b/kernel/irq/resend.c index 090c3763f3a2..891115a929aa 100644 --- a/kernel/irq/resend.c +++ b/kernel/irq/resend.c | |||
@@ -60,7 +60,7 @@ void check_irq_resend(struct irq_desc *desc, unsigned int irq) | |||
60 | /* | 60 | /* |
61 | * Make sure the interrupt is enabled, before resending it: | 61 | * Make sure the interrupt is enabled, before resending it: |
62 | */ | 62 | */ |
63 | desc->chip->enable(irq); | 63 | desc->irq_data.chip->irq_enable(&desc->irq_data); |
64 | 64 | ||
65 | /* | 65 | /* |
66 | * We do not resend level type interrupts. Level type | 66 | * We do not resend level type interrupts. Level type |
@@ -70,7 +70,8 @@ void check_irq_resend(struct irq_desc *desc, unsigned int irq) | |||
70 | if ((status & (IRQ_LEVEL | IRQ_PENDING | IRQ_REPLAY)) == IRQ_PENDING) { | 70 | if ((status & (IRQ_LEVEL | IRQ_PENDING | IRQ_REPLAY)) == IRQ_PENDING) { |
71 | desc->status = (status & ~IRQ_PENDING) | IRQ_REPLAY; | 71 | desc->status = (status & ~IRQ_PENDING) | IRQ_REPLAY; |
72 | 72 | ||
73 | if (!desc->chip->retrigger || !desc->chip->retrigger(irq)) { | 73 | if (!desc->irq_data.chip->irq_retrigger || |
74 | !desc->irq_data.chip->irq_retrigger(&desc->irq_data)) { | ||
74 | #ifdef CONFIG_HARDIRQS_SW_RESEND | 75 | #ifdef CONFIG_HARDIRQS_SW_RESEND |
75 | /* Set it pending and activate the softirq: */ | 76 | /* Set it pending and activate the softirq: */ |
76 | set_bit(irq, irqs_resend); | 77 | set_bit(irq, irqs_resend); |
diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c index 89fb90ae534f..3089d3b9d5f3 100644 --- a/kernel/irq/spurious.c +++ b/kernel/irq/spurious.c | |||
@@ -14,6 +14,8 @@ | |||
14 | #include <linux/moduleparam.h> | 14 | #include <linux/moduleparam.h> |
15 | #include <linux/timer.h> | 15 | #include <linux/timer.h> |
16 | 16 | ||
17 | #include "internals.h" | ||
18 | |||
17 | static int irqfixup __read_mostly; | 19 | static int irqfixup __read_mostly; |
18 | 20 | ||
19 | #define POLL_SPURIOUS_IRQ_INTERVAL (HZ/10) | 21 | #define POLL_SPURIOUS_IRQ_INTERVAL (HZ/10) |
@@ -78,8 +80,8 @@ static int try_one_irq(int irq, struct irq_desc *desc) | |||
78 | * If we did actual work for the real IRQ line we must let the | 80 | * If we did actual work for the real IRQ line we must let the |
79 | * IRQ controller clean up too | 81 | * IRQ controller clean up too |
80 | */ | 82 | */ |
81 | if (work && desc->chip && desc->chip->end) | 83 | if (work) |
82 | desc->chip->end(irq); | 84 | irq_end(irq, desc); |
83 | raw_spin_unlock(&desc->lock); | 85 | raw_spin_unlock(&desc->lock); |
84 | 86 | ||
85 | return ok; | 87 | return ok; |
@@ -254,7 +256,7 @@ void note_interrupt(unsigned int irq, struct irq_desc *desc, | |||
254 | printk(KERN_EMERG "Disabling IRQ #%d\n", irq); | 256 | printk(KERN_EMERG "Disabling IRQ #%d\n", irq); |
255 | desc->status |= IRQ_DISABLED | IRQ_SPURIOUS_DISABLED; | 257 | desc->status |= IRQ_DISABLED | IRQ_SPURIOUS_DISABLED; |
256 | desc->depth++; | 258 | desc->depth++; |
257 | desc->chip->disable(irq); | 259 | desc->irq_data.chip->irq_disable(&desc->irq_data); |
258 | 260 | ||
259 | mod_timer(&poll_spurious_irq_timer, | 261 | mod_timer(&poll_spurious_irq_timer, |
260 | jiffies + POLL_SPURIOUS_IRQ_INTERVAL); | 262 | jiffies + POLL_SPURIOUS_IRQ_INTERVAL); |
diff --git a/kernel/irq_work.c b/kernel/irq_work.c new file mode 100644 index 000000000000..f16763ff8481 --- /dev/null +++ b/kernel/irq_work.c | |||
@@ -0,0 +1,164 @@ | |||
1 | /* | ||
2 | * Copyright (C) 2010 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com> | ||
3 | * | ||
4 | * Provides a framework for enqueueing and running callbacks from hardirq | ||
5 | * context. The enqueueing is NMI-safe. | ||
6 | */ | ||
7 | |||
8 | #include <linux/kernel.h> | ||
9 | #include <linux/module.h> | ||
10 | #include <linux/irq_work.h> | ||
11 | #include <linux/hardirq.h> | ||
12 | |||
13 | /* | ||
14 | * An entry can be in one of four states: | ||
15 | * | ||
16 | * free NULL, 0 -> {claimed} : free to be used | ||
17 | * claimed NULL, 3 -> {pending} : claimed to be enqueued | ||
18 | * pending next, 3 -> {busy} : queued, pending callback | ||
19 | * busy NULL, 2 -> {free, claimed} : callback in progress, can be claimed | ||
20 | * | ||
21 | * We use the lower two bits of the next pointer to keep PENDING and BUSY | ||
22 | * flags. | ||
23 | */ | ||
24 | |||
25 | #define IRQ_WORK_PENDING 1UL | ||
26 | #define IRQ_WORK_BUSY 2UL | ||
27 | #define IRQ_WORK_FLAGS 3UL | ||
28 | |||
29 | static inline bool irq_work_is_set(struct irq_work *entry, int flags) | ||
30 | { | ||
31 | return (unsigned long)entry->next & flags; | ||
32 | } | ||
33 | |||
34 | static inline struct irq_work *irq_work_next(struct irq_work *entry) | ||
35 | { | ||
36 | unsigned long next = (unsigned long)entry->next; | ||
37 | next &= ~IRQ_WORK_FLAGS; | ||
38 | return (struct irq_work *)next; | ||
39 | } | ||
40 | |||
41 | static inline struct irq_work *next_flags(struct irq_work *entry, int flags) | ||
42 | { | ||
43 | unsigned long next = (unsigned long)entry; | ||
44 | next |= flags; | ||
45 | return (struct irq_work *)next; | ||
46 | } | ||
47 | |||
48 | static DEFINE_PER_CPU(struct irq_work *, irq_work_list); | ||
49 | |||
50 | /* | ||
51 | * Claim the entry so that no one else will poke at it. | ||
52 | */ | ||
53 | static bool irq_work_claim(struct irq_work *entry) | ||
54 | { | ||
55 | struct irq_work *next, *nflags; | ||
56 | |||
57 | do { | ||
58 | next = entry->next; | ||
59 | if ((unsigned long)next & IRQ_WORK_PENDING) | ||
60 | return false; | ||
61 | nflags = next_flags(next, IRQ_WORK_FLAGS); | ||
62 | } while (cmpxchg(&entry->next, next, nflags) != next); | ||
63 | |||
64 | return true; | ||
65 | } | ||
66 | |||
67 | |||
68 | void __weak arch_irq_work_raise(void) | ||
69 | { | ||
70 | /* | ||
71 | * Lame architectures will get the timer tick callback | ||
72 | */ | ||
73 | } | ||
74 | |||
75 | /* | ||
76 | * Queue the entry and raise the IPI if needed. | ||
77 | */ | ||
78 | static void __irq_work_queue(struct irq_work *entry) | ||
79 | { | ||
80 | struct irq_work **head, *next; | ||
81 | |||
82 | head = &get_cpu_var(irq_work_list); | ||
83 | |||
84 | do { | ||
85 | next = *head; | ||
86 | /* Can assign non-atomic because we keep the flags set. */ | ||
87 | entry->next = next_flags(next, IRQ_WORK_FLAGS); | ||
88 | } while (cmpxchg(head, next, entry) != next); | ||
89 | |||
90 | /* The list was empty, raise self-interrupt to start processing. */ | ||
91 | if (!irq_work_next(entry)) | ||
92 | arch_irq_work_raise(); | ||
93 | |||
94 | put_cpu_var(irq_work_list); | ||
95 | } | ||
96 | |||
97 | /* | ||
98 | * Enqueue the irq_work @entry; returns true on success, false when the | ||
99 | * @entry was already enqueued by someone else. | ||
100 | * | ||
101 | * Can be re-enqueued while the callback is still in progress. | ||
102 | */ | ||
103 | bool irq_work_queue(struct irq_work *entry) | ||
104 | { | ||
105 | if (!irq_work_claim(entry)) { | ||
106 | /* | ||
107 | * Already enqueued, can't do! | ||
108 | */ | ||
109 | return false; | ||
110 | } | ||
111 | |||
112 | __irq_work_queue(entry); | ||
113 | return true; | ||
114 | } | ||
115 | EXPORT_SYMBOL_GPL(irq_work_queue); | ||
116 | |||
117 | /* | ||
118 | * Run the irq_work entries on this cpu. Must be run from hardirq | ||
119 | * context with local IRQs disabled. | ||
120 | */ | ||
121 | void irq_work_run(void) | ||
122 | { | ||
123 | struct irq_work *list, **head; | ||
124 | |||
125 | head = &__get_cpu_var(irq_work_list); | ||
126 | if (*head == NULL) | ||
127 | return; | ||
128 | |||
129 | BUG_ON(!in_irq()); | ||
130 | BUG_ON(!irqs_disabled()); | ||
131 | |||
132 | list = xchg(head, NULL); | ||
133 | while (list != NULL) { | ||
134 | struct irq_work *entry = list; | ||
135 | |||
136 | list = irq_work_next(list); | ||
137 | |||
138 | /* | ||
139 | * Clear the PENDING bit, after this point the @entry | ||
140 | * can be re-used. | ||
141 | */ | ||
142 | entry->next = next_flags(NULL, IRQ_WORK_BUSY); | ||
143 | entry->func(entry); | ||
144 | /* | ||
145 | * Clear the BUSY bit and return to the free state if | ||
146 | * no-one else claimed it meanwhile. | ||
147 | */ | ||
148 | cmpxchg(&entry->next, next_flags(NULL, IRQ_WORK_BUSY), NULL); | ||
149 | } | ||
150 | } | ||
151 | EXPORT_SYMBOL_GPL(irq_work_run); | ||
152 | |||
153 | /* | ||
154 | * Synchronize against the irq_work @entry, ensures the entry is not | ||
155 | * currently in use. | ||
156 | */ | ||
157 | void irq_work_sync(struct irq_work *entry) | ||
158 | { | ||
159 | WARN_ON_ONCE(irqs_disabled()); | ||
160 | |||
161 | while (irq_work_is_set(entry, IRQ_WORK_BUSY)) | ||
162 | cpu_relax(); | ||
163 | } | ||
164 | EXPORT_SYMBOL_GPL(irq_work_sync); | ||
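For orientation, a minimal usage sketch of the irq_work API added above. Only irq_work_queue(), irq_work_run() and irq_work_sync() are defined in this file; the struct initialization and the pr_info() call are assumptions based on the accompanying include/linux/irq_work.h and may differ in detail.

	/* Sketch only: the callback is deferred to hardirq context. */
	#include <linux/kernel.h>
	#include <linux/irq_work.h>

	static void my_work_func(struct irq_work *work)
	{
		/* runs from the self-interrupt raised via arch_irq_work_raise() */
		pr_info("irq_work callback ran\n");
	}

	/* .next left NULL == the "free" state from the table above */
	static struct irq_work my_work = { .func = my_work_func };

	static void poke_from_nmi(void)
	{
		/* NMI-safe: claiming and queueing only use cmpxchg() */
		irq_work_queue(&my_work);
	}

	static void teardown(void)
	{
		/* busy-wait until a callback in progress has finished */
		irq_work_sync(&my_work);
	}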
diff --git a/kernel/jump_label.c b/kernel/jump_label.c new file mode 100644 index 000000000000..7be868bf25c6 --- /dev/null +++ b/kernel/jump_label.c | |||
@@ -0,0 +1,429 @@ | |||
1 | /* | ||
2 | * jump label support | ||
3 | * | ||
4 | * Copyright (C) 2009 Jason Baron <jbaron@redhat.com> | ||
5 | * | ||
6 | */ | ||
7 | #include <linux/jump_label.h> | ||
8 | #include <linux/memory.h> | ||
9 | #include <linux/uaccess.h> | ||
10 | #include <linux/module.h> | ||
11 | #include <linux/list.h> | ||
12 | #include <linux/jhash.h> | ||
13 | #include <linux/slab.h> | ||
14 | #include <linux/sort.h> | ||
15 | #include <linux/err.h> | ||
16 | |||
17 | #ifdef HAVE_JUMP_LABEL | ||
18 | |||
19 | #define JUMP_LABEL_HASH_BITS 6 | ||
20 | #define JUMP_LABEL_TABLE_SIZE (1 << JUMP_LABEL_HASH_BITS) | ||
21 | static struct hlist_head jump_label_table[JUMP_LABEL_TABLE_SIZE]; | ||
22 | |||
23 | /* mutex to protect coming/going of the jump_label table */ | ||
24 | static DEFINE_MUTEX(jump_label_mutex); | ||
25 | |||
26 | struct jump_label_entry { | ||
27 | struct hlist_node hlist; | ||
28 | struct jump_entry *table; | ||
29 | int nr_entries; | ||
30 | /* hang modules off here */ | ||
31 | struct hlist_head modules; | ||
32 | unsigned long key; | ||
33 | }; | ||
34 | |||
35 | struct jump_label_module_entry { | ||
36 | struct hlist_node hlist; | ||
37 | struct jump_entry *table; | ||
38 | int nr_entries; | ||
39 | struct module *mod; | ||
40 | }; | ||
41 | |||
42 | static int jump_label_cmp(const void *a, const void *b) | ||
43 | { | ||
44 | const struct jump_entry *jea = a; | ||
45 | const struct jump_entry *jeb = b; | ||
46 | |||
47 | if (jea->key < jeb->key) | ||
48 | return -1; | ||
49 | |||
50 | if (jea->key > jeb->key) | ||
51 | return 1; | ||
52 | |||
53 | return 0; | ||
54 | } | ||
55 | |||
56 | static void | ||
57 | sort_jump_label_entries(struct jump_entry *start, struct jump_entry *stop) | ||
58 | { | ||
59 | unsigned long size; | ||
60 | |||
61 | size = (((unsigned long)stop - (unsigned long)start) | ||
62 | / sizeof(struct jump_entry)); | ||
63 | sort(start, size, sizeof(struct jump_entry), jump_label_cmp, NULL); | ||
64 | } | ||
65 | |||
66 | static struct jump_label_entry *get_jump_label_entry(jump_label_t key) | ||
67 | { | ||
68 | struct hlist_head *head; | ||
69 | struct hlist_node *node; | ||
70 | struct jump_label_entry *e; | ||
71 | u32 hash = jhash((void *)&key, sizeof(jump_label_t), 0); | ||
72 | |||
73 | head = &jump_label_table[hash & (JUMP_LABEL_TABLE_SIZE - 1)]; | ||
74 | hlist_for_each_entry(e, node, head, hlist) { | ||
75 | if (key == e->key) | ||
76 | return e; | ||
77 | } | ||
78 | return NULL; | ||
79 | } | ||
80 | |||
81 | static struct jump_label_entry * | ||
82 | add_jump_label_entry(jump_label_t key, int nr_entries, struct jump_entry *table) | ||
83 | { | ||
84 | struct hlist_head *head; | ||
85 | struct jump_label_entry *e; | ||
86 | u32 hash; | ||
87 | |||
88 | e = get_jump_label_entry(key); | ||
89 | if (e) | ||
90 | return ERR_PTR(-EEXIST); | ||
91 | |||
92 | e = kmalloc(sizeof(struct jump_label_entry), GFP_KERNEL); | ||
93 | if (!e) | ||
94 | return ERR_PTR(-ENOMEM); | ||
95 | |||
96 | hash = jhash((void *)&key, sizeof(jump_label_t), 0); | ||
97 | head = &jump_label_table[hash & (JUMP_LABEL_TABLE_SIZE - 1)]; | ||
98 | e->key = key; | ||
99 | e->table = table; | ||
100 | e->nr_entries = nr_entries; | ||
101 | INIT_HLIST_HEAD(&(e->modules)); | ||
102 | hlist_add_head(&e->hlist, head); | ||
103 | return e; | ||
104 | } | ||
105 | |||
106 | static int | ||
107 | build_jump_label_hashtable(struct jump_entry *start, struct jump_entry *stop) | ||
108 | { | ||
109 | struct jump_entry *iter, *iter_begin; | ||
110 | struct jump_label_entry *entry; | ||
111 | int count; | ||
112 | |||
113 | sort_jump_label_entries(start, stop); | ||
114 | iter = start; | ||
115 | while (iter < stop) { | ||
116 | entry = get_jump_label_entry(iter->key); | ||
117 | if (!entry) { | ||
118 | iter_begin = iter; | ||
119 | count = 0; | ||
120 | while ((iter < stop) && | ||
121 | (iter->key == iter_begin->key)) { | ||
122 | iter++; | ||
123 | count++; | ||
124 | } | ||
125 | entry = add_jump_label_entry(iter_begin->key, | ||
126 | count, iter_begin); | ||
127 | if (IS_ERR(entry)) | ||
128 | return PTR_ERR(entry); | ||
129 | } else { | ||
130 | WARN_ONCE(1, KERN_ERR "build_jump_hashtable: unexpected entry!\n"); | ||
131 | return -1; | ||
132 | } | ||
133 | } | ||
134 | return 0; | ||
135 | } | ||
136 | |||
137 | /*** | ||
138 | * jump_label_update - update jump label text | ||
139 | * @key - key value associated with a jump label | ||
140 | * @type - enum set to JUMP_LABEL_ENABLE or JUMP_LABEL_DISABLE | ||
141 | * | ||
142 | * Will enable/disable the jump for jump label @key, depending on the | ||
143 | * value of @type. | ||
144 | * | ||
145 | */ | ||
146 | |||
147 | void jump_label_update(unsigned long key, enum jump_label_type type) | ||
148 | { | ||
149 | struct jump_entry *iter; | ||
150 | struct jump_label_entry *entry; | ||
151 | struct hlist_node *module_node; | ||
152 | struct jump_label_module_entry *e_module; | ||
153 | int count; | ||
154 | |||
155 | mutex_lock(&jump_label_mutex); | ||
156 | entry = get_jump_label_entry((jump_label_t)key); | ||
157 | if (entry) { | ||
158 | count = entry->nr_entries; | ||
159 | iter = entry->table; | ||
160 | while (count--) { | ||
161 | if (kernel_text_address(iter->code)) | ||
162 | arch_jump_label_transform(iter, type); | ||
163 | iter++; | ||
164 | } | ||
165 | /* enable/disable jump labels in modules */ | ||
166 | hlist_for_each_entry(e_module, module_node, &(entry->modules), | ||
167 | hlist) { | ||
168 | count = e_module->nr_entries; | ||
169 | iter = e_module->table; | ||
170 | while (count--) { | ||
171 | if (kernel_text_address(iter->code)) | ||
172 | arch_jump_label_transform(iter, type); | ||
173 | iter++; | ||
174 | } | ||
175 | } | ||
176 | } | ||
177 | mutex_unlock(&jump_label_mutex); | ||
178 | } | ||
179 | |||
180 | static int addr_conflict(struct jump_entry *entry, void *start, void *end) | ||
181 | { | ||
182 | if (entry->code <= (unsigned long)end && | ||
183 | entry->code + JUMP_LABEL_NOP_SIZE > (unsigned long)start) | ||
184 | return 1; | ||
185 | |||
186 | return 0; | ||
187 | } | ||
188 | |||
189 | #ifdef CONFIG_MODULES | ||
190 | |||
191 | static int module_conflict(void *start, void *end) | ||
192 | { | ||
193 | struct hlist_head *head; | ||
194 | struct hlist_node *node, *node_next, *module_node, *module_node_next; | ||
195 | struct jump_label_entry *e; | ||
196 | struct jump_label_module_entry *e_module; | ||
197 | struct jump_entry *iter; | ||
198 | int i, count; | ||
199 | int conflict = 0; | ||
200 | |||
201 | for (i = 0; i < JUMP_LABEL_TABLE_SIZE; i++) { | ||
202 | head = &jump_label_table[i]; | ||
203 | hlist_for_each_entry_safe(e, node, node_next, head, hlist) { | ||
204 | hlist_for_each_entry_safe(e_module, module_node, | ||
205 | module_node_next, | ||
206 | &(e->modules), hlist) { | ||
207 | count = e_module->nr_entries; | ||
208 | iter = e_module->table; | ||
209 | while (count--) { | ||
210 | if (addr_conflict(iter, start, end)) { | ||
211 | conflict = 1; | ||
212 | goto out; | ||
213 | } | ||
214 | iter++; | ||
215 | } | ||
216 | } | ||
217 | } | ||
218 | } | ||
219 | out: | ||
220 | return conflict; | ||
221 | } | ||
222 | |||
223 | #endif | ||
224 | |||
225 | /*** | ||
226 | * jump_label_text_reserved - check if addr range is reserved | ||
227 | * @start: start text addr | ||
228 | * @end: end text addr | ||
229 | * | ||
230 | * checks if the text addr located between @start and @end | ||
231 | * overlaps with any of the jump label patch addresses. Code | ||
232 | * that wants to modify kernel text should first verify that | ||
233 | * it does not overlap with any of the jump label addresses. | ||
234 | * | ||
235 | * returns 1 if there is an overlap, 0 otherwise | ||
236 | */ | ||
237 | int jump_label_text_reserved(void *start, void *end) | ||
238 | { | ||
239 | struct jump_entry *iter; | ||
240 | struct jump_entry *iter_start = __start___jump_table; | ||
241 | struct jump_entry *iter_stop = __stop___jump_table; | ||
242 | int conflict = 0; | ||
243 | |||
244 | mutex_lock(&jump_label_mutex); | ||
245 | iter = iter_start; | ||
246 | while (iter < iter_stop) { | ||
247 | if (addr_conflict(iter, start, end)) { | ||
248 | conflict = 1; | ||
249 | goto out; | ||
250 | } | ||
251 | iter++; | ||
252 | } | ||
253 | |||
254 | /* now check modules */ | ||
255 | #ifdef CONFIG_MODULES | ||
256 | conflict = module_conflict(start, end); | ||
257 | #endif | ||
258 | out: | ||
259 | mutex_unlock(&jump_label_mutex); | ||
260 | return conflict; | ||
261 | } | ||
262 | |||
263 | static __init int init_jump_label(void) | ||
264 | { | ||
265 | int ret; | ||
266 | struct jump_entry *iter_start = __start___jump_table; | ||
267 | struct jump_entry *iter_stop = __stop___jump_table; | ||
268 | struct jump_entry *iter; | ||
269 | |||
270 | mutex_lock(&jump_label_mutex); | ||
271 | ret = build_jump_label_hashtable(__start___jump_table, | ||
272 | __stop___jump_table); | ||
273 | iter = iter_start; | ||
274 | while (iter < iter_stop) { | ||
275 | arch_jump_label_text_poke_early(iter->code); | ||
276 | iter++; | ||
277 | } | ||
278 | mutex_unlock(&jump_label_mutex); | ||
279 | return ret; | ||
280 | } | ||
281 | early_initcall(init_jump_label); | ||
282 | |||
283 | #ifdef CONFIG_MODULES | ||
284 | |||
285 | static struct jump_label_module_entry * | ||
286 | add_jump_label_module_entry(struct jump_label_entry *entry, | ||
287 | struct jump_entry *iter_begin, | ||
288 | int count, struct module *mod) | ||
289 | { | ||
290 | struct jump_label_module_entry *e; | ||
291 | |||
292 | e = kmalloc(sizeof(struct jump_label_module_entry), GFP_KERNEL); | ||
293 | if (!e) | ||
294 | return ERR_PTR(-ENOMEM); | ||
295 | e->mod = mod; | ||
296 | e->nr_entries = count; | ||
297 | e->table = iter_begin; | ||
298 | hlist_add_head(&e->hlist, &entry->modules); | ||
299 | return e; | ||
300 | } | ||
301 | |||
302 | static int add_jump_label_module(struct module *mod) | ||
303 | { | ||
304 | struct jump_entry *iter, *iter_begin; | ||
305 | struct jump_label_entry *entry; | ||
306 | struct jump_label_module_entry *module_entry; | ||
307 | int count; | ||
308 | |||
309 | /* if the module doesn't have jump label entries, just return */ | ||
310 | if (!mod->num_jump_entries) | ||
311 | return 0; | ||
312 | |||
313 | sort_jump_label_entries(mod->jump_entries, | ||
314 | mod->jump_entries + mod->num_jump_entries); | ||
315 | iter = mod->jump_entries; | ||
316 | while (iter < mod->jump_entries + mod->num_jump_entries) { | ||
317 | entry = get_jump_label_entry(iter->key); | ||
318 | iter_begin = iter; | ||
319 | count = 0; | ||
320 | while ((iter < mod->jump_entries + mod->num_jump_entries) && | ||
321 | (iter->key == iter_begin->key)) { | ||
322 | iter++; | ||
323 | count++; | ||
324 | } | ||
325 | if (!entry) { | ||
326 | entry = add_jump_label_entry(iter_begin->key, 0, NULL); | ||
327 | if (IS_ERR(entry)) | ||
328 | return PTR_ERR(entry); | ||
329 | } | ||
330 | module_entry = add_jump_label_module_entry(entry, iter_begin, | ||
331 | count, mod); | ||
332 | if (IS_ERR(module_entry)) | ||
333 | return PTR_ERR(module_entry); | ||
334 | } | ||
335 | return 0; | ||
336 | } | ||
337 | |||
338 | static void remove_jump_label_module(struct module *mod) | ||
339 | { | ||
340 | struct hlist_head *head; | ||
341 | struct hlist_node *node, *node_next, *module_node, *module_node_next; | ||
342 | struct jump_label_entry *e; | ||
343 | struct jump_label_module_entry *e_module; | ||
344 | int i; | ||
345 | |||
346 | /* if the module doesn't have jump label entries, just return */ | ||
347 | if (!mod->num_jump_entries) | ||
348 | return; | ||
349 | |||
350 | for (i = 0; i < JUMP_LABEL_TABLE_SIZE; i++) { | ||
351 | head = &jump_label_table[i]; | ||
352 | hlist_for_each_entry_safe(e, node, node_next, head, hlist) { | ||
353 | hlist_for_each_entry_safe(e_module, module_node, | ||
354 | module_node_next, | ||
355 | &(e->modules), hlist) { | ||
356 | if (e_module->mod == mod) { | ||
357 | hlist_del(&e_module->hlist); | ||
358 | kfree(e_module); | ||
359 | } | ||
360 | } | ||
361 | if (hlist_empty(&e->modules) && (e->nr_entries == 0)) { | ||
362 | hlist_del(&e->hlist); | ||
363 | kfree(e); | ||
364 | } | ||
365 | } | ||
366 | } | ||
367 | } | ||
368 | |||
369 | static int | ||
370 | jump_label_module_notify(struct notifier_block *self, unsigned long val, | ||
371 | void *data) | ||
372 | { | ||
373 | struct module *mod = data; | ||
374 | int ret = 0; | ||
375 | |||
376 | switch (val) { | ||
377 | case MODULE_STATE_COMING: | ||
378 | mutex_lock(&jump_label_mutex); | ||
379 | ret = add_jump_label_module(mod); | ||
380 | if (ret) | ||
381 | remove_jump_label_module(mod); | ||
382 | mutex_unlock(&jump_label_mutex); | ||
383 | break; | ||
384 | case MODULE_STATE_GOING: | ||
385 | mutex_lock(&jump_label_mutex); | ||
386 | remove_jump_label_module(mod); | ||
387 | mutex_unlock(&jump_label_mutex); | ||
388 | break; | ||
389 | } | ||
390 | return ret; | ||
391 | } | ||
392 | |||
393 | /*** | ||
394 | * jump_label_apply_nops - patch module jump labels with arch_get_jump_label_nop() | ||
395 | * @mod: module to patch | ||
396 | * | ||
397 | * Allow for run-time selection of the optimal nops. Before the module | ||
398 | * loads, patch these with arch_get_jump_label_nop(), which is provided by | ||
399 | * the arch-specific jump label code. | ||
400 | */ | ||
401 | void jump_label_apply_nops(struct module *mod) | ||
402 | { | ||
403 | struct jump_entry *iter; | ||
404 | |||
405 | /* if the module doesn't have jump label entries, just return */ | ||
406 | if (!mod->num_jump_entries) | ||
407 | return; | ||
408 | |||
409 | iter = mod->jump_entries; | ||
410 | while (iter < mod->jump_entries + mod->num_jump_entries) { | ||
411 | arch_jump_label_text_poke_early(iter->code); | ||
412 | iter++; | ||
413 | } | ||
414 | } | ||
415 | |||
416 | struct notifier_block jump_label_module_nb = { | ||
417 | .notifier_call = jump_label_module_notify, | ||
418 | .priority = 0, | ||
419 | }; | ||
420 | |||
421 | static __init int init_jump_label_module(void) | ||
422 | { | ||
423 | return register_module_notifier(&jump_label_module_nb); | ||
424 | } | ||
425 | early_initcall(init_jump_label_module); | ||
426 | |||
427 | #endif /* CONFIG_MODULES */ | ||
428 | |||
429 | #endif | ||
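As a hedged illustration of how jump_label_text_reserved() is meant to be consumed (the kprobes hunk later in this diff does exactly this before arming a probe), a text-patching path would reject any range that overlaps a jump label site. The helper name below is made up for the example.

	/* Sketch: refuse to patch text that a jump label entry owns. */
	static int check_text_patchable(void *addr, size_t len)
	{
		/* jump_label_text_reserved() returns 1 on overlap, 0 otherwise */
		if (jump_label_text_reserved(addr, addr + len))
			return -EBUSY;
		return 0;
	}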
diff --git a/kernel/kexec.c b/kernel/kexec.c index 131b1703936f..b55045bc7563 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c | |||
@@ -151,8 +151,10 @@ static int do_kimage_alloc(struct kimage **rimage, unsigned long entry, | |||
151 | image->nr_segments = nr_segments; | 151 | image->nr_segments = nr_segments; |
152 | segment_bytes = nr_segments * sizeof(*segments); | 152 | segment_bytes = nr_segments * sizeof(*segments); |
153 | result = copy_from_user(image->segment, segments, segment_bytes); | 153 | result = copy_from_user(image->segment, segments, segment_bytes); |
154 | if (result) | 154 | if (result) { |
155 | result = -EFAULT; | ||
155 | goto out; | 156 | goto out; |
157 | } | ||
156 | 158 | ||
157 | /* | 159 | /* |
158 | * Verify we have good destination addresses. The caller is | 160 | * Verify we have good destination addresses. The caller is |
@@ -814,7 +816,7 @@ static int kimage_load_normal_segment(struct kimage *image, | |||
814 | 816 | ||
815 | ptr = kmap(page); | 817 | ptr = kmap(page); |
816 | /* Start with a clear page */ | 818 | /* Start with a clear page */ |
817 | memset(ptr, 0, PAGE_SIZE); | 819 | clear_page(ptr); |
818 | ptr += maddr & ~PAGE_MASK; | 820 | ptr += maddr & ~PAGE_MASK; |
819 | mchunk = PAGE_SIZE - (maddr & ~PAGE_MASK); | 821 | mchunk = PAGE_SIZE - (maddr & ~PAGE_MASK); |
820 | if (mchunk > mbytes) | 822 | if (mchunk > mbytes) |
@@ -827,7 +829,7 @@ static int kimage_load_normal_segment(struct kimage *image, | |||
827 | result = copy_from_user(ptr, buf, uchunk); | 829 | result = copy_from_user(ptr, buf, uchunk); |
828 | kunmap(page); | 830 | kunmap(page); |
829 | if (result) { | 831 | if (result) { |
830 | result = (result < 0) ? result : -EIO; | 832 | result = -EFAULT; |
831 | goto out; | 833 | goto out; |
832 | } | 834 | } |
833 | ubytes -= uchunk; | 835 | ubytes -= uchunk; |
@@ -882,7 +884,7 @@ static int kimage_load_crash_segment(struct kimage *image, | |||
882 | kexec_flush_icache_page(page); | 884 | kexec_flush_icache_page(page); |
883 | kunmap(page); | 885 | kunmap(page); |
884 | if (result) { | 886 | if (result) { |
885 | result = (result < 0) ? result : -EIO; | 887 | result = -EFAULT; |
886 | goto out; | 888 | goto out; |
887 | } | 889 | } |
888 | ubytes -= uchunk; | 890 | ubytes -= uchunk; |
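The error-handling changes above follow the usual convention: copy_from_user() returns the number of bytes left uncopied, so any nonzero return is mapped to -EFAULT rather than passed up as a byte count. A hedged sketch of that pattern, with a made-up helper name:

	/* Sketch of the error-mapping convention adopted above. */
	static int copy_segment(void *dst, const void __user *src, size_t len)
	{
		if (copy_from_user(dst, src, len))	/* nonzero == bytes NOT copied */
			return -EFAULT;			/* never return the raw count */
		return 0;
	}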
diff --git a/kernel/kfifo.c b/kernel/kfifo.c index 35edbe22e9a9..01a0700e873f 100644 --- a/kernel/kfifo.c +++ b/kernel/kfifo.c | |||
@@ -1,8 +1,7 @@ | |||
1 | /* | 1 | /* |
2 | * A generic kernel FIFO implementation. | 2 | * A generic kernel FIFO implementation |
3 | * | 3 | * |
4 | * Copyright (C) 2009 Stefani Seibold <stefani@seibold.net> | 4 | * Copyright (C) 2009/2010 Stefani Seibold <stefani@seibold.net> |
5 | * Copyright (C) 2004 Stelian Pop <stelian@popies.net> | ||
6 | * | 5 | * |
7 | * This program is free software; you can redistribute it and/or modify | 6 | * This program is free software; you can redistribute it and/or modify |
8 | * it under the terms of the GNU General Public License as published by | 7 | * it under the terms of the GNU General Public License as published by |
@@ -11,7 +10,7 @@ | |||
11 | * | 10 | * |
12 | * This program is distributed in the hope that it will be useful, | 11 | * This program is distributed in the hope that it will be useful, |
13 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | 12 | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
14 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | 13 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
15 | * GNU General Public License for more details. | 14 | * GNU General Public License for more details. |
16 | * | 15 | * |
17 | * You should have received a copy of the GNU General Public License | 16 | * You should have received a copy of the GNU General Public License |
@@ -24,422 +23,586 @@ | |||
24 | #include <linux/module.h> | 23 | #include <linux/module.h> |
25 | #include <linux/slab.h> | 24 | #include <linux/slab.h> |
26 | #include <linux/err.h> | 25 | #include <linux/err.h> |
27 | #include <linux/kfifo.h> | ||
28 | #include <linux/log2.h> | 26 | #include <linux/log2.h> |
29 | #include <linux/uaccess.h> | 27 | #include <linux/uaccess.h> |
28 | #include <linux/kfifo.h> | ||
30 | 29 | ||
31 | static void _kfifo_init(struct kfifo *fifo, void *buffer, | 30 | /* |
32 | unsigned int size) | 31 | * internal helper to calculate the unused elements in a fifo |
33 | { | ||
34 | fifo->buffer = buffer; | ||
35 | fifo->size = size; | ||
36 | |||
37 | kfifo_reset(fifo); | ||
38 | } | ||
39 | |||
40 | /** | ||
41 | * kfifo_init - initialize a FIFO using a preallocated buffer | ||
42 | * @fifo: the fifo to assign the buffer | ||
43 | * @buffer: the preallocated buffer to be used. | ||
44 | * @size: the size of the internal buffer, this has to be a power of 2. | ||
45 | * | ||
46 | */ | 32 | */ |
47 | void kfifo_init(struct kfifo *fifo, void *buffer, unsigned int size) | 33 | static inline unsigned int kfifo_unused(struct __kfifo *fifo) |
48 | { | 34 | { |
49 | /* size must be a power of 2 */ | 35 | return (fifo->mask + 1) - (fifo->in - fifo->out); |
50 | BUG_ON(!is_power_of_2(size)); | ||
51 | |||
52 | _kfifo_init(fifo, buffer, size); | ||
53 | } | 36 | } |
54 | EXPORT_SYMBOL(kfifo_init); | ||
55 | 37 | ||
56 | /** | 38 | int __kfifo_alloc(struct __kfifo *fifo, unsigned int size, |
57 | * kfifo_alloc - allocates a new FIFO internal buffer | 39 | size_t esize, gfp_t gfp_mask) |
58 | * @fifo: the fifo to assign then new buffer | ||
59 | * @size: the size of the buffer to be allocated, this have to be a power of 2. | ||
60 | * @gfp_mask: get_free_pages mask, passed to kmalloc() | ||
61 | * | ||
62 | * This function dynamically allocates a new fifo internal buffer | ||
63 | * | ||
64 | * The size will be rounded-up to a power of 2. | ||
65 | * The buffer will be release with kfifo_free(). | ||
66 | * Return 0 if no error, otherwise the an error code | ||
67 | */ | ||
68 | int kfifo_alloc(struct kfifo *fifo, unsigned int size, gfp_t gfp_mask) | ||
69 | { | 40 | { |
70 | unsigned char *buffer; | ||
71 | |||
72 | /* | 41 | /* |
73 | * round up to the next power of 2, since our 'let the indices | 42 | * round down to the next power of 2, since our 'let the indices |
74 | * wrap' technique works only in this case. | 43 | * wrap' technique works only in this case. |
75 | */ | 44 | */ |
76 | if (!is_power_of_2(size)) { | 45 | if (!is_power_of_2(size)) |
77 | BUG_ON(size > 0x80000000); | 46 | size = rounddown_pow_of_two(size); |
78 | size = roundup_pow_of_two(size); | 47 | |
48 | fifo->in = 0; | ||
49 | fifo->out = 0; | ||
50 | fifo->esize = esize; | ||
51 | |||
52 | if (size < 2) { | ||
53 | fifo->data = NULL; | ||
54 | fifo->mask = 0; | ||
55 | return -EINVAL; | ||
79 | } | 56 | } |
80 | 57 | ||
81 | buffer = kmalloc(size, gfp_mask); | 58 | fifo->data = kmalloc(size * esize, gfp_mask); |
82 | if (!buffer) { | 59 | |
83 | _kfifo_init(fifo, NULL, 0); | 60 | if (!fifo->data) { |
61 | fifo->mask = 0; | ||
84 | return -ENOMEM; | 62 | return -ENOMEM; |
85 | } | 63 | } |
86 | 64 | fifo->mask = size - 1; | |
87 | _kfifo_init(fifo, buffer, size); | ||
88 | 65 | ||
89 | return 0; | 66 | return 0; |
90 | } | 67 | } |
91 | EXPORT_SYMBOL(kfifo_alloc); | 68 | EXPORT_SYMBOL(__kfifo_alloc); |
92 | 69 | ||
93 | /** | 70 | void __kfifo_free(struct __kfifo *fifo) |
94 | * kfifo_free - frees the FIFO internal buffer | ||
95 | * @fifo: the fifo to be freed. | ||
96 | */ | ||
97 | void kfifo_free(struct kfifo *fifo) | ||
98 | { | 71 | { |
99 | kfree(fifo->buffer); | 72 | kfree(fifo->data); |
100 | _kfifo_init(fifo, NULL, 0); | 73 | fifo->in = 0; |
74 | fifo->out = 0; | ||
75 | fifo->esize = 0; | ||
76 | fifo->data = NULL; | ||
77 | fifo->mask = 0; | ||
101 | } | 78 | } |
102 | EXPORT_SYMBOL(kfifo_free); | 79 | EXPORT_SYMBOL(__kfifo_free); |
103 | 80 | ||
104 | /** | 81 | int __kfifo_init(struct __kfifo *fifo, void *buffer, |
105 | * kfifo_skip - skip output data | 82 | unsigned int size, size_t esize) |
106 | * @fifo: the fifo to be used. | ||
107 | * @len: number of bytes to skip | ||
108 | */ | ||
109 | void kfifo_skip(struct kfifo *fifo, unsigned int len) | ||
110 | { | 83 | { |
111 | if (len < kfifo_len(fifo)) { | 84 | size /= esize; |
112 | __kfifo_add_out(fifo, len); | 85 | |
113 | return; | 86 | if (!is_power_of_2(size)) |
87 | size = rounddown_pow_of_two(size); | ||
88 | |||
89 | fifo->in = 0; | ||
90 | fifo->out = 0; | ||
91 | fifo->esize = esize; | ||
92 | fifo->data = buffer; | ||
93 | |||
94 | if (size < 2) { | ||
95 | fifo->mask = 0; | ||
96 | return -EINVAL; | ||
114 | } | 97 | } |
115 | kfifo_reset_out(fifo); | 98 | fifo->mask = size - 1; |
99 | |||
100 | return 0; | ||
116 | } | 101 | } |
117 | EXPORT_SYMBOL(kfifo_skip); | 102 | EXPORT_SYMBOL(__kfifo_init); |
118 | 103 | ||
119 | static inline void __kfifo_in_data(struct kfifo *fifo, | 104 | static void kfifo_copy_in(struct __kfifo *fifo, const void *src, |
120 | const void *from, unsigned int len, unsigned int off) | 105 | unsigned int len, unsigned int off) |
121 | { | 106 | { |
107 | unsigned int size = fifo->mask + 1; | ||
108 | unsigned int esize = fifo->esize; | ||
122 | unsigned int l; | 109 | unsigned int l; |
123 | 110 | ||
111 | off &= fifo->mask; | ||
112 | if (esize != 1) { | ||
113 | off *= esize; | ||
114 | size *= esize; | ||
115 | len *= esize; | ||
116 | } | ||
117 | l = min(len, size - off); | ||
118 | |||
119 | memcpy(fifo->data + off, src, l); | ||
120 | memcpy(fifo->data, src + l, len - l); | ||
124 | /* | 121 | /* |
125 | * Ensure that we sample the fifo->out index -before- we | 122 | * make sure that the data in the fifo is up to date before |
126 | * start putting bytes into the kfifo. | 123 | * incrementing the fifo->in index counter |
127 | */ | 124 | */ |
125 | smp_wmb(); | ||
126 | } | ||
128 | 127 | ||
129 | smp_mb(); | 128 | unsigned int __kfifo_in(struct __kfifo *fifo, |
130 | 129 | const void *buf, unsigned int len) | |
131 | off = __kfifo_off(fifo, fifo->in + off); | 130 | { |
131 | unsigned int l; | ||
132 | 132 | ||
133 | /* first put the data starting from fifo->in to buffer end */ | 133 | l = kfifo_unused(fifo); |
134 | l = min(len, fifo->size - off); | 134 | if (len > l) |
135 | memcpy(fifo->buffer + off, from, l); | 135 | len = l; |
136 | 136 | ||
137 | /* then put the rest (if any) at the beginning of the buffer */ | 137 | kfifo_copy_in(fifo, buf, len, fifo->in); |
138 | memcpy(fifo->buffer, from + l, len - l); | 138 | fifo->in += len; |
139 | return len; | ||
139 | } | 140 | } |
141 | EXPORT_SYMBOL(__kfifo_in); | ||
140 | 142 | ||
141 | static inline void __kfifo_out_data(struct kfifo *fifo, | 143 | static void kfifo_copy_out(struct __kfifo *fifo, void *dst, |
142 | void *to, unsigned int len, unsigned int off) | 144 | unsigned int len, unsigned int off) |
143 | { | 145 | { |
146 | unsigned int size = fifo->mask + 1; | ||
147 | unsigned int esize = fifo->esize; | ||
144 | unsigned int l; | 148 | unsigned int l; |
145 | 149 | ||
150 | off &= fifo->mask; | ||
151 | if (esize != 1) { | ||
152 | off *= esize; | ||
153 | size *= esize; | ||
154 | len *= esize; | ||
155 | } | ||
156 | l = min(len, size - off); | ||
157 | |||
158 | memcpy(dst, fifo->data + off, l); | ||
159 | memcpy(dst + l, fifo->data, len - l); | ||
146 | /* | 160 | /* |
147 | * Ensure that we sample the fifo->in index -before- we | 161 | * make sure that the data is copied before |
148 | * start removing bytes from the kfifo. | 162 | * incrementing the fifo->out index counter |
149 | */ | 163 | */ |
164 | smp_wmb(); | ||
165 | } | ||
150 | 166 | ||
151 | smp_rmb(); | 167 | unsigned int __kfifo_out_peek(struct __kfifo *fifo, |
168 | void *buf, unsigned int len) | ||
169 | { | ||
170 | unsigned int l; | ||
152 | 171 | ||
153 | off = __kfifo_off(fifo, fifo->out + off); | 172 | l = fifo->in - fifo->out; |
173 | if (len > l) | ||
174 | len = l; | ||
154 | 175 | ||
155 | /* first get the data from fifo->out until the end of the buffer */ | 176 | kfifo_copy_out(fifo, buf, len, fifo->out); |
156 | l = min(len, fifo->size - off); | 177 | return len; |
157 | memcpy(to, fifo->buffer + off, l); | 178 | } |
179 | EXPORT_SYMBOL(__kfifo_out_peek); | ||
158 | 180 | ||
159 | /* then get the rest (if any) from the beginning of the buffer */ | 181 | unsigned int __kfifo_out(struct __kfifo *fifo, |
160 | memcpy(to + l, fifo->buffer, len - l); | 182 | void *buf, unsigned int len) |
183 | { | ||
184 | len = __kfifo_out_peek(fifo, buf, len); | ||
185 | fifo->out += len; | ||
186 | return len; | ||
161 | } | 187 | } |
188 | EXPORT_SYMBOL(__kfifo_out); | ||
162 | 189 | ||
163 | static inline int __kfifo_from_user_data(struct kfifo *fifo, | 190 | static unsigned long kfifo_copy_from_user(struct __kfifo *fifo, |
164 | const void __user *from, unsigned int len, unsigned int off, | 191 | const void __user *from, unsigned int len, unsigned int off, |
165 | unsigned *lenout) | 192 | unsigned int *copied) |
166 | { | 193 | { |
194 | unsigned int size = fifo->mask + 1; | ||
195 | unsigned int esize = fifo->esize; | ||
167 | unsigned int l; | 196 | unsigned int l; |
168 | int ret; | 197 | unsigned long ret; |
169 | 198 | ||
199 | off &= fifo->mask; | ||
200 | if (esize != 1) { | ||
201 | off *= esize; | ||
202 | size *= esize; | ||
203 | len *= esize; | ||
204 | } | ||
205 | l = min(len, size - off); | ||
206 | |||
207 | ret = copy_from_user(fifo->data + off, from, l); | ||
208 | if (unlikely(ret)) | ||
209 | ret = DIV_ROUND_UP(ret + len - l, esize); | ||
210 | else { | ||
211 | ret = copy_from_user(fifo->data, from + l, len - l); | ||
212 | if (unlikely(ret)) | ||
213 | ret = DIV_ROUND_UP(ret, esize); | ||
214 | } | ||
170 | /* | 215 | /* |
171 | * Ensure that we sample the fifo->out index -before- we | 216 | * make sure that the data in the fifo is up to date before |
172 | * start putting bytes into the kfifo. | 217 | * incrementing the fifo->in index counter |
173 | */ | 218 | */ |
219 | smp_wmb(); | ||
220 | *copied = len - ret; | ||
221 | /* return the number of elements which are not copied */ | ||
222 | return ret; | ||
223 | } | ||
174 | 224 | ||
175 | smp_mb(); | 225 | int __kfifo_from_user(struct __kfifo *fifo, const void __user *from, |
226 | unsigned long len, unsigned int *copied) | ||
227 | { | ||
228 | unsigned int l; | ||
229 | unsigned long ret; | ||
230 | unsigned int esize = fifo->esize; | ||
231 | int err; | ||
176 | 232 | ||
177 | off = __kfifo_off(fifo, fifo->in + off); | 233 | if (esize != 1) |
234 | len /= esize; | ||
178 | 235 | ||
179 | /* first put the data starting from fifo->in to buffer end */ | 236 | l = kfifo_unused(fifo); |
180 | l = min(len, fifo->size - off); | 237 | if (len > l) |
181 | ret = copy_from_user(fifo->buffer + off, from, l); | 238 | len = l; |
182 | if (unlikely(ret)) { | ||
183 | *lenout = ret; | ||
184 | return -EFAULT; | ||
185 | } | ||
186 | *lenout = l; | ||
187 | 239 | ||
188 | /* then put the rest (if any) at the beginning of the buffer */ | 240 | ret = kfifo_copy_from_user(fifo, from, len, fifo->in, copied); |
189 | ret = copy_from_user(fifo->buffer, from + l, len - l); | 241 | if (unlikely(ret)) { |
190 | *lenout += ret ? ret : len - l; | 242 | len -= ret; |
191 | return ret ? -EFAULT : 0; | 243 | err = -EFAULT; |
244 | } else | ||
245 | err = 0; | ||
246 | fifo->in += len; | ||
247 | return err; | ||
192 | } | 248 | } |
249 | EXPORT_SYMBOL(__kfifo_from_user); | ||
193 | 250 | ||
194 | static inline int __kfifo_to_user_data(struct kfifo *fifo, | 251 | static unsigned long kfifo_copy_to_user(struct __kfifo *fifo, void __user *to, |
195 | void __user *to, unsigned int len, unsigned int off, unsigned *lenout) | 252 | unsigned int len, unsigned int off, unsigned int *copied) |
196 | { | 253 | { |
197 | unsigned int l; | 254 | unsigned int l; |
198 | int ret; | 255 | unsigned long ret; |
199 | 256 | unsigned int size = fifo->mask + 1; | |
257 | unsigned int esize = fifo->esize; | ||
258 | |||
259 | off &= fifo->mask; | ||
260 | if (esize != 1) { | ||
261 | off *= esize; | ||
262 | size *= esize; | ||
263 | len *= esize; | ||
264 | } | ||
265 | l = min(len, size - off); | ||
266 | |||
267 | ret = copy_to_user(to, fifo->data + off, l); | ||
268 | if (unlikely(ret)) | ||
269 | ret = DIV_ROUND_UP(ret + len - l, esize); | ||
270 | else { | ||
271 | ret = copy_to_user(to + l, fifo->data, len - l); | ||
272 | if (unlikely(ret)) | ||
273 | ret = DIV_ROUND_UP(ret, esize); | ||
274 | } | ||
200 | /* | 275 | /* |
201 | * Ensure that we sample the fifo->in index -before- we | 276 | * make sure that the data is copied before |
202 | * start removing bytes from the kfifo. | 277 | * incrementing the fifo->out index counter |
203 | */ | 278 | */ |
279 | smp_wmb(); | ||
280 | *copied = len - ret; | ||
281 | /* return the number of elements which are not copied */ | ||
282 | return ret; | ||
283 | } | ||
204 | 284 | ||
205 | smp_rmb(); | 285 | int __kfifo_to_user(struct __kfifo *fifo, void __user *to, |
286 | unsigned long len, unsigned int *copied) | ||
287 | { | ||
288 | unsigned int l; | ||
289 | unsigned long ret; | ||
290 | unsigned int esize = fifo->esize; | ||
291 | int err; | ||
206 | 292 | ||
207 | off = __kfifo_off(fifo, fifo->out + off); | 293 | if (esize != 1) |
294 | len /= esize; | ||
208 | 295 | ||
209 | /* first get the data from fifo->out until the end of the buffer */ | 296 | l = fifo->in - fifo->out; |
210 | l = min(len, fifo->size - off); | 297 | if (len > l) |
211 | ret = copy_to_user(to, fifo->buffer + off, l); | 298 | len = l; |
212 | *lenout = l; | 299 | ret = kfifo_copy_to_user(fifo, to, len, fifo->out, copied); |
213 | if (unlikely(ret)) { | 300 | if (unlikely(ret)) { |
214 | *lenout -= ret; | 301 | len -= ret; |
215 | return -EFAULT; | 302 | err = -EFAULT; |
216 | } | 303 | } else |
304 | err = 0; | ||
305 | fifo->out += len; | ||
306 | return err; | ||
307 | } | ||
308 | EXPORT_SYMBOL(__kfifo_to_user); | ||
217 | 309 | ||
218 | /* then get the rest (if any) from the beginning of the buffer */ | 310 | static int setup_sgl_buf(struct scatterlist *sgl, void *buf, |
219 | len -= l; | 311 | int nents, unsigned int len) |
220 | ret = copy_to_user(to + l, fifo->buffer, len); | 312 | { |
221 | if (unlikely(ret)) { | 313 | int n; |
222 | *lenout += len - ret; | 314 | unsigned int l; |
223 | return -EFAULT; | 315 | unsigned int off; |
316 | struct page *page; | ||
317 | |||
318 | if (!nents) | ||
319 | return 0; | ||
320 | |||
321 | if (!len) | ||
322 | return 0; | ||
323 | |||
324 | n = 0; | ||
325 | page = virt_to_page(buf); | ||
326 | off = offset_in_page(buf); | ||
327 | l = 0; | ||
328 | |||
329 | while (len >= l + PAGE_SIZE - off) { | ||
330 | struct page *npage; | ||
331 | |||
332 | l += PAGE_SIZE; | ||
333 | buf += PAGE_SIZE; | ||
334 | npage = virt_to_page(buf); | ||
335 | if (page_to_phys(page) != page_to_phys(npage) - l) { | ||
336 | sg_set_page(sgl, page, l - off, off); | ||
337 | sgl = sg_next(sgl); | ||
338 | if (++n == nents || sgl == NULL) | ||
339 | return n; | ||
340 | page = npage; | ||
341 | len -= l - off; | ||
342 | l = off = 0; | ||
343 | } | ||
224 | } | 344 | } |
225 | *lenout += len; | 345 | sg_set_page(sgl, page, len, off); |
226 | return 0; | 346 | return n + 1; |
227 | } | 347 | } |
228 | 348 | ||
229 | unsigned int __kfifo_in_n(struct kfifo *fifo, | 349 | static unsigned int setup_sgl(struct __kfifo *fifo, struct scatterlist *sgl, |
230 | const void *from, unsigned int len, unsigned int recsize) | 350 | int nents, unsigned int len, unsigned int off) |
231 | { | 351 | { |
232 | if (kfifo_avail(fifo) < len + recsize) | 352 | unsigned int size = fifo->mask + 1; |
233 | return len + 1; | 353 | unsigned int esize = fifo->esize; |
354 | unsigned int l; | ||
355 | unsigned int n; | ||
234 | 356 | ||
235 | __kfifo_in_data(fifo, from, len, recsize); | 357 | off &= fifo->mask; |
236 | return 0; | 358 | if (esize != 1) { |
359 | off *= esize; | ||
360 | size *= esize; | ||
361 | len *= esize; | ||
362 | } | ||
363 | l = min(len, size - off); | ||
364 | |||
365 | n = setup_sgl_buf(sgl, fifo->data + off, nents, l); | ||
366 | n += setup_sgl_buf(sgl + n, fifo->data, nents - n, len - l); | ||
367 | |||
368 | return n; | ||
237 | } | 369 | } |
238 | EXPORT_SYMBOL(__kfifo_in_n); | ||
239 | 370 | ||
240 | /** | 371 | unsigned int __kfifo_dma_in_prepare(struct __kfifo *fifo, |
241 | * kfifo_in - puts some data into the FIFO | 372 | struct scatterlist *sgl, int nents, unsigned int len) |
242 | * @fifo: the fifo to be used. | ||
243 | * @from: the data to be added. | ||
244 | * @len: the length of the data to be added. | ||
245 | * | ||
246 | * This function copies at most @len bytes from the @from buffer into | ||
247 | * the FIFO depending on the free space, and returns the number of | ||
248 | * bytes copied. | ||
249 | * | ||
250 | * Note that with only one concurrent reader and one concurrent | ||
251 | * writer, you don't need extra locking to use these functions. | ||
252 | */ | ||
253 | unsigned int kfifo_in(struct kfifo *fifo, const void *from, | ||
254 | unsigned int len) | ||
255 | { | 373 | { |
256 | len = min(kfifo_avail(fifo), len); | 374 | unsigned int l; |
257 | 375 | ||
258 | __kfifo_in_data(fifo, from, len, 0); | 376 | l = kfifo_unused(fifo); |
259 | __kfifo_add_in(fifo, len); | 377 | if (len > l) |
260 | return len; | 378 | len = l; |
379 | |||
380 | return setup_sgl(fifo, sgl, nents, len, fifo->in); | ||
261 | } | 381 | } |
262 | EXPORT_SYMBOL(kfifo_in); | 382 | EXPORT_SYMBOL(__kfifo_dma_in_prepare); |
263 | 383 | ||
264 | unsigned int __kfifo_in_generic(struct kfifo *fifo, | 384 | unsigned int __kfifo_dma_out_prepare(struct __kfifo *fifo, |
265 | const void *from, unsigned int len, unsigned int recsize) | 385 | struct scatterlist *sgl, int nents, unsigned int len) |
266 | { | 386 | { |
267 | return __kfifo_in_rec(fifo, from, len, recsize); | 387 | unsigned int l; |
388 | |||
389 | l = fifo->in - fifo->out; | ||
390 | if (len > l) | ||
391 | len = l; | ||
392 | |||
393 | return setup_sgl(fifo, sgl, nents, len, fifo->out); | ||
268 | } | 394 | } |
269 | EXPORT_SYMBOL(__kfifo_in_generic); | 395 | EXPORT_SYMBOL(__kfifo_dma_out_prepare); |
270 | 396 | ||
271 | unsigned int __kfifo_out_n(struct kfifo *fifo, | 397 | unsigned int __kfifo_max_r(unsigned int len, size_t recsize) |
272 | void *to, unsigned int len, unsigned int recsize) | ||
273 | { | 398 | { |
274 | if (kfifo_len(fifo) < len + recsize) | 399 | unsigned int max = (1 << (recsize << 3)) - 1; |
275 | return len; | ||
276 | 400 | ||
277 | __kfifo_out_data(fifo, to, len, recsize); | 401 | if (len > max) |
278 | __kfifo_add_out(fifo, len + recsize); | 402 | return max; |
279 | return 0; | 403 | return len; |
280 | } | 404 | } |
281 | EXPORT_SYMBOL(__kfifo_out_n); | ||
282 | 405 | ||
283 | /** | 406 | #define __KFIFO_PEEK(data, out, mask) \ |
284 | * kfifo_out - gets some data from the FIFO | 407 | ((data)[(out) & (mask)]) |
285 | * @fifo: the fifo to be used. | 408 | /* |
286 | * @to: where the data must be copied. | 409 | * __kfifo_peek_n internal helper function for determining the length of |
287 | * @len: the size of the destination buffer. | 410 | * the next record in the fifo |
288 | * | ||
289 | * This function copies at most @len bytes from the FIFO into the | ||
290 | * @to buffer and returns the number of copied bytes. | ||
291 | * | ||
292 | * Note that with only one concurrent reader and one concurrent | ||
293 | * writer, you don't need extra locking to use these functions. | ||
294 | */ | 411 | */ |
295 | unsigned int kfifo_out(struct kfifo *fifo, void *to, unsigned int len) | 412 | static unsigned int __kfifo_peek_n(struct __kfifo *fifo, size_t recsize) |
296 | { | 413 | { |
297 | len = min(kfifo_len(fifo), len); | 414 | unsigned int l; |
415 | unsigned int mask = fifo->mask; | ||
416 | unsigned char *data = fifo->data; | ||
298 | 417 | ||
299 | __kfifo_out_data(fifo, to, len, 0); | 418 | l = __KFIFO_PEEK(data, fifo->out, mask); |
300 | __kfifo_add_out(fifo, len); | ||
301 | 419 | ||
302 | return len; | 420 | if (--recsize) |
421 | l |= __KFIFO_PEEK(data, fifo->out + 1, mask) << 8; | ||
422 | |||
423 | return l; | ||
303 | } | 424 | } |
304 | EXPORT_SYMBOL(kfifo_out); | ||
305 | 425 | ||
306 | /** | 426 | #define __KFIFO_POKE(data, in, mask, val) \ |
307 | * kfifo_out_peek - copy some data from the FIFO, but do not remove it | 427 | ( \ |
308 | * @fifo: the fifo to be used. | 428 | (data)[(in) & (mask)] = (unsigned char)(val) \ |
309 | * @to: where the data must be copied. | 429 | ) |
310 | * @len: the size of the destination buffer. | 430 | |
311 | * @offset: offset into the fifo | 431 | /* |
312 | * | 432 | * __kfifo_poke_n internal helper function for storing the length of |
313 | * This function copies at most @len bytes at @offset from the FIFO | 433 | * the record into the fifo |
314 | * into the @to buffer and returns the number of copied bytes. | ||
315 | * The data is not removed from the FIFO. | ||
316 | */ | 434 | */ |
317 | unsigned int kfifo_out_peek(struct kfifo *fifo, void *to, unsigned int len, | 435 | static void __kfifo_poke_n(struct __kfifo *fifo, unsigned int n, size_t recsize) |
318 | unsigned offset) | ||
319 | { | 436 | { |
320 | len = min(kfifo_len(fifo), len + offset); | 437 | unsigned int mask = fifo->mask; |
438 | unsigned char *data = fifo->data; | ||
321 | 439 | ||
322 | __kfifo_out_data(fifo, to, len, offset); | 440 | __KFIFO_POKE(data, fifo->in, mask, n); |
323 | return len; | 441 | |
442 | if (recsize > 1) | ||
443 | __KFIFO_POKE(data, fifo->in + 1, mask, n >> 8); | ||
324 | } | 444 | } |
325 | EXPORT_SYMBOL(kfifo_out_peek); | ||
326 | 445 | ||
327 | unsigned int __kfifo_out_generic(struct kfifo *fifo, | 446 | unsigned int __kfifo_len_r(struct __kfifo *fifo, size_t recsize) |
328 | void *to, unsigned int len, unsigned int recsize, | ||
329 | unsigned int *total) | ||
330 | { | 447 | { |
331 | return __kfifo_out_rec(fifo, to, len, recsize, total); | 448 | return __kfifo_peek_n(fifo, recsize); |
332 | } | 449 | } |
333 | EXPORT_SYMBOL(__kfifo_out_generic); | 450 | EXPORT_SYMBOL(__kfifo_len_r); |
334 | 451 | ||
335 | unsigned int __kfifo_from_user_n(struct kfifo *fifo, | 452 | unsigned int __kfifo_in_r(struct __kfifo *fifo, const void *buf, |
336 | const void __user *from, unsigned int len, unsigned int recsize) | 453 | unsigned int len, size_t recsize) |
337 | { | 454 | { |
338 | unsigned total; | 455 | if (len + recsize > kfifo_unused(fifo)) |
456 | return 0; | ||
339 | 457 | ||
340 | if (kfifo_avail(fifo) < len + recsize) | 458 | __kfifo_poke_n(fifo, len, recsize); |
341 | return len + 1; | ||
342 | 459 | ||
343 | __kfifo_from_user_data(fifo, from, len, recsize, &total); | 460 | kfifo_copy_in(fifo, buf, len, fifo->in + recsize); |
344 | return total; | 461 | fifo->in += len + recsize; |
462 | return len; | ||
345 | } | 463 | } |
346 | EXPORT_SYMBOL(__kfifo_from_user_n); | 464 | EXPORT_SYMBOL(__kfifo_in_r); |
347 | 465 | ||
348 | /** | 466 | static unsigned int kfifo_out_copy_r(struct __kfifo *fifo, |
349 | * kfifo_from_user - puts some data from user space into the FIFO | 467 | void *buf, unsigned int len, size_t recsize, unsigned int *n) |
350 | * @fifo: the fifo to be used. | 468 | { |
351 | * @from: pointer to the data to be added. | 469 | *n = __kfifo_peek_n(fifo, recsize); |
352 | * @len: the length of the data to be added. | 470 | |
353 | * @total: the actual returned data length. | 471 | if (len > *n) |
354 | * | 472 | len = *n; |
355 | * This function copies at most @len bytes from the @from into the | 473 | |
356 | * FIFO depending and returns -EFAULT/0. | 474 | kfifo_copy_out(fifo, buf, len, fifo->out + recsize); |
357 | * | 475 | return len; |
358 | * Note that with only one concurrent reader and one concurrent | ||
359 | * writer, you don't need extra locking to use these functions. | ||
360 | */ | ||
361 | int kfifo_from_user(struct kfifo *fifo, | ||
362 | const void __user *from, unsigned int len, unsigned *total) | ||
363 | { | ||
364 | int ret; | ||
365 | len = min(kfifo_avail(fifo), len); | ||
366 | ret = __kfifo_from_user_data(fifo, from, len, 0, total); | ||
367 | if (ret) | ||
368 | return ret; | ||
369 | __kfifo_add_in(fifo, len); | ||
370 | return 0; | ||
371 | } | 476 | } |
372 | EXPORT_SYMBOL(kfifo_from_user); | ||
373 | 477 | ||
374 | unsigned int __kfifo_from_user_generic(struct kfifo *fifo, | 478 | unsigned int __kfifo_out_peek_r(struct __kfifo *fifo, void *buf, |
375 | const void __user *from, unsigned int len, unsigned int recsize) | 479 | unsigned int len, size_t recsize) |
376 | { | 480 | { |
377 | return __kfifo_from_user_rec(fifo, from, len, recsize); | 481 | unsigned int n; |
482 | |||
483 | if (fifo->in == fifo->out) | ||
484 | return 0; | ||
485 | |||
486 | return kfifo_out_copy_r(fifo, buf, len, recsize, &n); | ||
378 | } | 487 | } |
379 | EXPORT_SYMBOL(__kfifo_from_user_generic); | 488 | EXPORT_SYMBOL(__kfifo_out_peek_r); |
380 | 489 | ||
381 | unsigned int __kfifo_to_user_n(struct kfifo *fifo, | 490 | unsigned int __kfifo_out_r(struct __kfifo *fifo, void *buf, |
382 | void __user *to, unsigned int len, unsigned int reclen, | 491 | unsigned int len, size_t recsize) |
383 | unsigned int recsize) | ||
384 | { | 492 | { |
385 | unsigned int ret, total; | 493 | unsigned int n; |
386 | 494 | ||
387 | if (kfifo_len(fifo) < reclen + recsize) | 495 | if (fifo->in == fifo->out) |
388 | return len; | 496 | return 0; |
389 | 497 | ||
390 | ret = __kfifo_to_user_data(fifo, to, reclen, recsize, &total); | 498 | len = kfifo_out_copy_r(fifo, buf, len, recsize, &n); |
499 | fifo->out += n + recsize; | ||
500 | return len; | ||
501 | } | ||
502 | EXPORT_SYMBOL(__kfifo_out_r); | ||
391 | 503 | ||
392 | if (likely(ret == 0)) | 504 | void __kfifo_skip_r(struct __kfifo *fifo, size_t recsize) |
393 | __kfifo_add_out(fifo, reclen + recsize); | 505 | { |
506 | unsigned int n; | ||
394 | 507 | ||
395 | return total; | 508 | n = __kfifo_peek_n(fifo, recsize); |
509 | fifo->out += n + recsize; | ||
396 | } | 510 | } |
397 | EXPORT_SYMBOL(__kfifo_to_user_n); | 511 | EXPORT_SYMBOL(__kfifo_skip_r); |
398 | 512 | ||
399 | /** | 513 | int __kfifo_from_user_r(struct __kfifo *fifo, const void __user *from, |
400 | * kfifo_to_user - gets data from the FIFO and write it to user space | 514 | unsigned long len, unsigned int *copied, size_t recsize) |
401 | * @fifo: the fifo to be used. | ||
402 | * @to: where the data must be copied. | ||
403 | * @len: the size of the destination buffer. | ||
404 | * @lenout: pointer to output variable with copied data | ||
405 | * | ||
406 | * This function copies at most @len bytes from the FIFO into the | ||
407 | * @to buffer and 0 or -EFAULT. | ||
408 | * | ||
409 | * Note that with only one concurrent reader and one concurrent | ||
410 | * writer, you don't need extra locking to use these functions. | ||
411 | */ | ||
412 | int kfifo_to_user(struct kfifo *fifo, | ||
413 | void __user *to, unsigned int len, unsigned *lenout) | ||
414 | { | 515 | { |
415 | int ret; | 516 | unsigned long ret; |
416 | len = min(kfifo_len(fifo), len); | 517 | |
417 | ret = __kfifo_to_user_data(fifo, to, len, 0, lenout); | 518 | len = __kfifo_max_r(len, recsize); |
418 | __kfifo_add_out(fifo, *lenout); | 519 | |
419 | return ret; | 520 | if (len + recsize > kfifo_unused(fifo)) { |
521 | *copied = 0; | ||
522 | return 0; | ||
523 | } | ||
524 | |||
525 | __kfifo_poke_n(fifo, len, recsize); | ||
526 | |||
527 | ret = kfifo_copy_from_user(fifo, from, len, fifo->in + recsize, copied); | ||
528 | if (unlikely(ret)) { | ||
529 | *copied = 0; | ||
530 | return -EFAULT; | ||
531 | } | ||
532 | fifo->in += len + recsize; | ||
533 | return 0; | ||
420 | } | 534 | } |
421 | EXPORT_SYMBOL(kfifo_to_user); | 535 | EXPORT_SYMBOL(__kfifo_from_user_r); |
422 | 536 | ||
423 | unsigned int __kfifo_to_user_generic(struct kfifo *fifo, | 537 | int __kfifo_to_user_r(struct __kfifo *fifo, void __user *to, |
424 | void __user *to, unsigned int len, unsigned int recsize, | 538 | unsigned long len, unsigned int *copied, size_t recsize) |
425 | unsigned int *total) | ||
426 | { | 539 | { |
427 | return __kfifo_to_user_rec(fifo, to, len, recsize, total); | 540 | unsigned long ret; |
541 | unsigned int n; | ||
542 | |||
543 | if (fifo->in == fifo->out) { | ||
544 | *copied = 0; | ||
545 | return 0; | ||
546 | } | ||
547 | |||
548 | n = __kfifo_peek_n(fifo, recsize); | ||
549 | if (len > n) | ||
550 | len = n; | ||
551 | |||
552 | ret = kfifo_copy_to_user(fifo, to, len, fifo->out + recsize, copied); | ||
553 | if (unlikely(ret)) { | ||
554 | *copied = 0; | ||
555 | return -EFAULT; | ||
556 | } | ||
557 | fifo->out += n + recsize; | ||
558 | return 0; | ||
428 | } | 559 | } |
429 | EXPORT_SYMBOL(__kfifo_to_user_generic); | 560 | EXPORT_SYMBOL(__kfifo_to_user_r); |
430 | 561 | ||
431 | unsigned int __kfifo_peek_generic(struct kfifo *fifo, unsigned int recsize) | 562 | unsigned int __kfifo_dma_in_prepare_r(struct __kfifo *fifo, |
563 | struct scatterlist *sgl, int nents, unsigned int len, size_t recsize) | ||
432 | { | 564 | { |
433 | if (recsize == 0) | 565 | if (!nents) |
434 | return kfifo_avail(fifo); | 566 | BUG(); |
435 | 567 | ||
436 | return __kfifo_peek_n(fifo, recsize); | 568 | len = __kfifo_max_r(len, recsize); |
569 | |||
570 | if (len + recsize > kfifo_unused(fifo)) | ||
571 | return 0; | ||
572 | |||
573 | return setup_sgl(fifo, sgl, nents, len, fifo->in + recsize); | ||
437 | } | 574 | } |
438 | EXPORT_SYMBOL(__kfifo_peek_generic); | 575 | EXPORT_SYMBOL(__kfifo_dma_in_prepare_r); |
439 | 576 | ||
440 | void __kfifo_skip_generic(struct kfifo *fifo, unsigned int recsize) | 577 | void __kfifo_dma_in_finish_r(struct __kfifo *fifo, |
578 | unsigned int len, size_t recsize) | ||
441 | { | 579 | { |
442 | __kfifo_skip_rec(fifo, recsize); | 580 | len = __kfifo_max_r(len, recsize); |
581 | __kfifo_poke_n(fifo, len, recsize); | ||
582 | fifo->in += len + recsize; | ||
443 | } | 583 | } |
444 | EXPORT_SYMBOL(__kfifo_skip_generic); | 584 | EXPORT_SYMBOL(__kfifo_dma_in_finish_r); |
445 | 585 | ||
586 | unsigned int __kfifo_dma_out_prepare_r(struct __kfifo *fifo, | ||
587 | struct scatterlist *sgl, int nents, unsigned int len, size_t recsize) | ||
588 | { | ||
589 | if (!nents) | ||
590 | BUG(); | ||
591 | |||
592 | len = __kfifo_max_r(len, recsize); | ||
593 | |||
594 | if (len + recsize > fifo->in - fifo->out) | ||
595 | return 0; | ||
596 | |||
597 | return setup_sgl(fifo, sgl, nents, len, fifo->out + recsize); | ||
598 | } | ||
599 | EXPORT_SYMBOL(__kfifo_dma_out_prepare_r); | ||
600 | |||
601 | void __kfifo_dma_out_finish_r(struct __kfifo *fifo, size_t recsize) | ||
602 | { | ||
603 | unsigned int len; | ||
604 | |||
605 | len = __kfifo_peek_n(fifo, recsize); | ||
606 | fifo->out += len + recsize; | ||
607 | } | ||
608 | EXPORT_SYMBOL(__kfifo_dma_out_finish_r); | ||
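A rough usage sketch of the reworked kfifo API: kfifo_alloc(), kfifo_in(), kfifo_out() and kfifo_free() are the wrapper macros from include/linux/kfifo.h that end up in the __kfifo_* helpers above, so treat the exact signatures here as assumptions rather than part of this file.

	#include <linux/kfifo.h>
	#include <linux/slab.h>

	static struct kfifo my_fifo;

	static int kfifo_demo(void)
	{
		unsigned char scratch[8];
		unsigned int copied;

		/* the requested size is rounded to a power of two internally */
		if (kfifo_alloc(&my_fifo, 4096, GFP_KERNEL))
			return -ENOMEM;

		/* one writer and one reader need no extra locking */
		kfifo_in(&my_fifo, "hello", 5);
		copied = kfifo_out(&my_fifo, scratch, sizeof(scratch));

		kfifo_free(&my_fifo);
		return copied == 5 ? 0 : -EIO;
	}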
diff --git a/kernel/kmod.c b/kernel/kmod.c index 6e9b19667a8d..9cd0591c96a2 100644 --- a/kernel/kmod.c +++ b/kernel/kmod.c | |||
@@ -153,7 +153,9 @@ static int ____call_usermodehelper(void *data) | |||
153 | goto fail; | 153 | goto fail; |
154 | } | 154 | } |
155 | 155 | ||
156 | retval = kernel_execve(sub_info->path, sub_info->argv, sub_info->envp); | 156 | retval = kernel_execve(sub_info->path, |
157 | (const char *const *)sub_info->argv, | ||
158 | (const char *const *)sub_info->envp); | ||
157 | 159 | ||
158 | /* Exec failed? */ | 160 | /* Exec failed? */ |
159 | fail: | 161 | fail: |
diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 282035f3ae96..99865c33a60d 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c | |||
@@ -47,6 +47,7 @@ | |||
47 | #include <linux/memory.h> | 47 | #include <linux/memory.h> |
48 | #include <linux/ftrace.h> | 48 | #include <linux/ftrace.h> |
49 | #include <linux/cpu.h> | 49 | #include <linux/cpu.h> |
50 | #include <linux/jump_label.h> | ||
50 | 51 | ||
51 | #include <asm-generic/sections.h> | 52 | #include <asm-generic/sections.h> |
52 | #include <asm/cacheflush.h> | 53 | #include <asm/cacheflush.h> |
@@ -73,7 +74,8 @@ static struct hlist_head kretprobe_inst_table[KPROBE_TABLE_SIZE]; | |||
73 | /* NOTE: change this value only with kprobe_mutex held */ | 74 | /* NOTE: change this value only with kprobe_mutex held */ |
74 | static bool kprobes_all_disarmed; | 75 | static bool kprobes_all_disarmed; |
75 | 76 | ||
76 | static DEFINE_MUTEX(kprobe_mutex); /* Protects kprobe_table */ | 77 | /* This protects kprobe_table and optimizing_list */ |
78 | static DEFINE_MUTEX(kprobe_mutex); | ||
77 | static DEFINE_PER_CPU(struct kprobe *, kprobe_instance) = NULL; | 79 | static DEFINE_PER_CPU(struct kprobe *, kprobe_instance) = NULL; |
78 | static struct { | 80 | static struct { |
79 | spinlock_t lock ____cacheline_aligned_in_smp; | 81 | spinlock_t lock ____cacheline_aligned_in_smp; |
@@ -399,7 +401,7 @@ static inline int kprobe_optready(struct kprobe *p) | |||
399 | * Return an optimized kprobe whose optimizing code replaces | 401 | * Return an optimized kprobe whose optimizing code replaces |
400 | * instructions including addr (exclude breakpoint). | 402 | * instructions including addr (exclude breakpoint). |
401 | */ | 403 | */ |
402 | struct kprobe *__kprobes get_optimized_kprobe(unsigned long addr) | 404 | static struct kprobe *__kprobes get_optimized_kprobe(unsigned long addr) |
403 | { | 405 | { |
404 | int i; | 406 | int i; |
405 | struct kprobe *p = NULL; | 407 | struct kprobe *p = NULL; |
@@ -594,6 +596,7 @@ static __kprobes void try_to_optimize_kprobe(struct kprobe *p) | |||
594 | } | 596 | } |
595 | 597 | ||
596 | #ifdef CONFIG_SYSCTL | 598 | #ifdef CONFIG_SYSCTL |
599 | /* This should be called with kprobe_mutex locked */ | ||
597 | static void __kprobes optimize_all_kprobes(void) | 600 | static void __kprobes optimize_all_kprobes(void) |
598 | { | 601 | { |
599 | struct hlist_head *head; | 602 | struct hlist_head *head; |
@@ -606,17 +609,16 @@ static void __kprobes optimize_all_kprobes(void) | |||
606 | return; | 609 | return; |
607 | 610 | ||
608 | kprobes_allow_optimization = true; | 611 | kprobes_allow_optimization = true; |
609 | mutex_lock(&text_mutex); | ||
610 | for (i = 0; i < KPROBE_TABLE_SIZE; i++) { | 612 | for (i = 0; i < KPROBE_TABLE_SIZE; i++) { |
611 | head = &kprobe_table[i]; | 613 | head = &kprobe_table[i]; |
612 | hlist_for_each_entry_rcu(p, node, head, hlist) | 614 | hlist_for_each_entry_rcu(p, node, head, hlist) |
613 | if (!kprobe_disabled(p)) | 615 | if (!kprobe_disabled(p)) |
614 | optimize_kprobe(p); | 616 | optimize_kprobe(p); |
615 | } | 617 | } |
616 | mutex_unlock(&text_mutex); | ||
617 | printk(KERN_INFO "Kprobes globally optimized\n"); | 618 | printk(KERN_INFO "Kprobes globally optimized\n"); |
618 | } | 619 | } |
619 | 620 | ||
621 | /* This should be called with kprobe_mutex locked */ | ||
620 | static void __kprobes unoptimize_all_kprobes(void) | 622 | static void __kprobes unoptimize_all_kprobes(void) |
621 | { | 623 | { |
622 | struct hlist_head *head; | 624 | struct hlist_head *head; |
@@ -831,6 +833,7 @@ void __kprobes recycle_rp_inst(struct kretprobe_instance *ri, | |||
831 | 833 | ||
832 | void __kprobes kretprobe_hash_lock(struct task_struct *tsk, | 834 | void __kprobes kretprobe_hash_lock(struct task_struct *tsk, |
833 | struct hlist_head **head, unsigned long *flags) | 835 | struct hlist_head **head, unsigned long *flags) |
836 | __acquires(hlist_lock) | ||
834 | { | 837 | { |
835 | unsigned long hash = hash_ptr(tsk, KPROBE_HASH_BITS); | 838 | unsigned long hash = hash_ptr(tsk, KPROBE_HASH_BITS); |
836 | spinlock_t *hlist_lock; | 839 | spinlock_t *hlist_lock; |
@@ -842,6 +845,7 @@ void __kprobes kretprobe_hash_lock(struct task_struct *tsk, | |||
842 | 845 | ||
843 | static void __kprobes kretprobe_table_lock(unsigned long hash, | 846 | static void __kprobes kretprobe_table_lock(unsigned long hash, |
844 | unsigned long *flags) | 847 | unsigned long *flags) |
848 | __acquires(hlist_lock) | ||
845 | { | 849 | { |
846 | spinlock_t *hlist_lock = kretprobe_table_lock_ptr(hash); | 850 | spinlock_t *hlist_lock = kretprobe_table_lock_ptr(hash); |
847 | spin_lock_irqsave(hlist_lock, *flags); | 851 | spin_lock_irqsave(hlist_lock, *flags); |
@@ -849,6 +853,7 @@ static void __kprobes kretprobe_table_lock(unsigned long hash, | |||
849 | 853 | ||
850 | void __kprobes kretprobe_hash_unlock(struct task_struct *tsk, | 854 | void __kprobes kretprobe_hash_unlock(struct task_struct *tsk, |
851 | unsigned long *flags) | 855 | unsigned long *flags) |
856 | __releases(hlist_lock) | ||
852 | { | 857 | { |
853 | unsigned long hash = hash_ptr(tsk, KPROBE_HASH_BITS); | 858 | unsigned long hash = hash_ptr(tsk, KPROBE_HASH_BITS); |
854 | spinlock_t *hlist_lock; | 859 | spinlock_t *hlist_lock; |
@@ -857,7 +862,9 @@ void __kprobes kretprobe_hash_unlock(struct task_struct *tsk, | |||
857 | spin_unlock_irqrestore(hlist_lock, *flags); | 862 | spin_unlock_irqrestore(hlist_lock, *flags); |
858 | } | 863 | } |
859 | 864 | ||
860 | void __kprobes kretprobe_table_unlock(unsigned long hash, unsigned long *flags) | 865 | static void __kprobes kretprobe_table_unlock(unsigned long hash, |
866 | unsigned long *flags) | ||
867 | __releases(hlist_lock) | ||
861 | { | 868 | { |
862 | spinlock_t *hlist_lock = kretprobe_table_lock_ptr(hash); | 869 | spinlock_t *hlist_lock = kretprobe_table_lock_ptr(hash); |
863 | spin_unlock_irqrestore(hlist_lock, *flags); | 870 | spin_unlock_irqrestore(hlist_lock, *flags); |
@@ -1141,7 +1148,8 @@ int __kprobes register_kprobe(struct kprobe *p) | |||
1141 | preempt_disable(); | 1148 | preempt_disable(); |
1142 | if (!kernel_text_address((unsigned long) p->addr) || | 1149 | if (!kernel_text_address((unsigned long) p->addr) || |
1143 | in_kprobes_functions((unsigned long) p->addr) || | 1150 | in_kprobes_functions((unsigned long) p->addr) || |
1144 | ftrace_text_reserved(p->addr, p->addr)) { | 1151 | ftrace_text_reserved(p->addr, p->addr) || |
1152 | jump_label_text_reserved(p->addr, p->addr)) { | ||
1145 | preempt_enable(); | 1153 | preempt_enable(); |
1146 | return -EINVAL; | 1154 | return -EINVAL; |
1147 | } | 1155 | } |
@@ -1339,18 +1347,19 @@ int __kprobes register_jprobes(struct jprobe **jps, int num) | |||
1339 | if (num <= 0) | 1347 | if (num <= 0) |
1340 | return -EINVAL; | 1348 | return -EINVAL; |
1341 | for (i = 0; i < num; i++) { | 1349 | for (i = 0; i < num; i++) { |
1342 | unsigned long addr; | 1350 | unsigned long addr, offset; |
1343 | jp = jps[i]; | 1351 | jp = jps[i]; |
1344 | addr = arch_deref_entry_point(jp->entry); | 1352 | addr = arch_deref_entry_point(jp->entry); |
1345 | 1353 | ||
1346 | if (!kernel_text_address(addr)) | 1354 | /* Verify probepoint is a function entry point */ |
1347 | ret = -EINVAL; | 1355 | if (kallsyms_lookup_size_offset(addr, NULL, &offset) && |
1348 | else { | 1356 | offset == 0) { |
1349 | /* Todo: Verify probepoint is a function entry point */ | ||
1350 | jp->kp.pre_handler = setjmp_pre_handler; | 1357 | jp->kp.pre_handler = setjmp_pre_handler; |
1351 | jp->kp.break_handler = longjmp_break_handler; | 1358 | jp->kp.break_handler = longjmp_break_handler; |
1352 | ret = register_kprobe(&jp->kp); | 1359 | ret = register_kprobe(&jp->kp); |
1353 | } | 1360 | } else |
1361 | ret = -EINVAL; | ||
1362 | |||
1354 | if (ret < 0) { | 1363 | if (ret < 0) { |
1355 | if (i > 0) | 1364 | if (i > 0) |
1356 | unregister_jprobes(jps, i); | 1365 | unregister_jprobes(jps, i); |
@@ -1992,6 +2001,7 @@ static ssize_t write_enabled_file_bool(struct file *file, | |||
1992 | static const struct file_operations fops_kp = { | 2001 | static const struct file_operations fops_kp = { |
1993 | .read = read_enabled_file_bool, | 2002 | .read = read_enabled_file_bool, |
1994 | .write = write_enabled_file_bool, | 2003 | .write = write_enabled_file_bool, |
2004 | .llseek = default_llseek, | ||
1995 | }; | 2005 | }; |
1996 | 2006 | ||
1997 | static int __kprobes debugfs_kprobe_init(void) | 2007 | static int __kprobes debugfs_kprobe_init(void) |
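A minimal sketch (not part of the patch) of the function-entry test that the register_jprobes() hunk above now relies on; it assumes only kallsyms_lookup_size_offset() from <linux/kallsyms.h>:

#include <linux/kallsyms.h>
#include <linux/types.h>

/* True when @addr is the first byte of a symbol known to kallsyms,
 * i.e. an acceptable jprobe entry point per the check above. */
static bool addr_is_function_entry(unsigned long addr)
{
	unsigned long offset;

	return kallsyms_lookup_size_offset(addr, NULL, &offset) && offset == 0;
}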
diff --git a/kernel/kthread.c b/kernel/kthread.c index 83911c780175..2dc3786349d1 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c | |||
@@ -14,6 +14,8 @@ | |||
14 | #include <linux/file.h> | 14 | #include <linux/file.h> |
15 | #include <linux/module.h> | 15 | #include <linux/module.h> |
16 | #include <linux/mutex.h> | 16 | #include <linux/mutex.h> |
17 | #include <linux/slab.h> | ||
18 | #include <linux/freezer.h> | ||
17 | #include <trace/events/sched.h> | 19 | #include <trace/events/sched.h> |
18 | 20 | ||
19 | static DEFINE_SPINLOCK(kthread_create_lock); | 21 | static DEFINE_SPINLOCK(kthread_create_lock); |
@@ -35,6 +37,7 @@ struct kthread_create_info | |||
35 | 37 | ||
36 | struct kthread { | 38 | struct kthread { |
37 | int should_stop; | 39 | int should_stop; |
40 | void *data; | ||
38 | struct completion exited; | 41 | struct completion exited; |
39 | }; | 42 | }; |
40 | 43 | ||
@@ -54,6 +57,19 @@ int kthread_should_stop(void) | |||
54 | } | 57 | } |
55 | EXPORT_SYMBOL(kthread_should_stop); | 58 | EXPORT_SYMBOL(kthread_should_stop); |
56 | 59 | ||
60 | /** | ||
61 | * kthread_data - return data value specified on kthread creation | ||
62 | * @task: kthread task in question | ||
63 | * | ||
64 | * Return the data value specified when kthread @task was created. | ||
65 | * The caller is responsible for ensuring the validity of @task when | ||
66 | * calling this function. | ||
67 | */ | ||
68 | void *kthread_data(struct task_struct *task) | ||
69 | { | ||
70 | return to_kthread(task)->data; | ||
71 | } | ||
72 | |||
57 | static int kthread(void *_create) | 73 | static int kthread(void *_create) |
58 | { | 74 | { |
59 | /* Copy data: it's on kthread's stack */ | 75 | /* Copy data: it's on kthread's stack */ |
@@ -64,6 +80,7 @@ static int kthread(void *_create) | |||
64 | int ret; | 80 | int ret; |
65 | 81 | ||
66 | self.should_stop = 0; | 82 | self.should_stop = 0; |
83 | self.data = data; | ||
67 | init_completion(&self.exited); | 84 | init_completion(&self.exited); |
68 | current->vfork_done = &self.exited; | 85 | current->vfork_done = &self.exited; |
69 | 86 | ||
@@ -247,3 +264,150 @@ int kthreadd(void *unused) | |||
247 | 264 | ||
248 | return 0; | 265 | return 0; |
249 | } | 266 | } |
267 | |||
268 | /** | ||
269 | * kthread_worker_fn - kthread function to process kthread_worker | ||
270 | * @worker_ptr: pointer to initialized kthread_worker | ||
271 | * | ||
272 | * This function can be used as @threadfn to kthread_create() or | ||
273 | * kthread_run() with @worker_ptr argument pointing to an initialized | ||
274 | * kthread_worker. The started kthread will process work_list until | ||
275 | * it is stopped with kthread_stop(). A kthread can also call | ||
276 | * this function directly after extra initialization. | ||
277 | * | ||
278 | * Different kthreads can be used for the same kthread_worker as long | ||
279 | * as there's only one kthread attached to it at any given time. A | ||
280 | * kthread_worker without an attached kthread simply collects queued | ||
281 | * kthread_works. | ||
282 | */ | ||
283 | int kthread_worker_fn(void *worker_ptr) | ||
284 | { | ||
285 | struct kthread_worker *worker = worker_ptr; | ||
286 | struct kthread_work *work; | ||
287 | |||
288 | WARN_ON(worker->task); | ||
289 | worker->task = current; | ||
290 | repeat: | ||
291 | set_current_state(TASK_INTERRUPTIBLE); /* mb paired w/ kthread_stop */ | ||
292 | |||
293 | if (kthread_should_stop()) { | ||
294 | __set_current_state(TASK_RUNNING); | ||
295 | spin_lock_irq(&worker->lock); | ||
296 | worker->task = NULL; | ||
297 | spin_unlock_irq(&worker->lock); | ||
298 | return 0; | ||
299 | } | ||
300 | |||
301 | work = NULL; | ||
302 | spin_lock_irq(&worker->lock); | ||
303 | if (!list_empty(&worker->work_list)) { | ||
304 | work = list_first_entry(&worker->work_list, | ||
305 | struct kthread_work, node); | ||
306 | list_del_init(&work->node); | ||
307 | } | ||
308 | spin_unlock_irq(&worker->lock); | ||
309 | |||
310 | if (work) { | ||
311 | __set_current_state(TASK_RUNNING); | ||
312 | work->func(work); | ||
313 | smp_wmb(); /* wmb worker-b0 paired with flush-b1 */ | ||
314 | work->done_seq = work->queue_seq; | ||
315 | smp_mb(); /* mb worker-b1 paired with flush-b0 */ | ||
316 | if (atomic_read(&work->flushing)) | ||
317 | wake_up_all(&work->done); | ||
318 | } else if (!freezing(current)) | ||
319 | schedule(); | ||
320 | |||
321 | try_to_freeze(); | ||
322 | goto repeat; | ||
323 | } | ||
324 | EXPORT_SYMBOL_GPL(kthread_worker_fn); | ||
325 | |||
326 | /** | ||
327 | * queue_kthread_work - queue a kthread_work | ||
328 | * @worker: target kthread_worker | ||
329 | * @work: kthread_work to queue | ||
330 | * | ||
331 | * Queue @work to work processor @worker for async execution. | ||
332 | * Returns %true if @work was successfully queued, %false if it | ||
333 | * was already pending. | ||
334 | */ | ||
335 | bool queue_kthread_work(struct kthread_worker *worker, | ||
336 | struct kthread_work *work) | ||
337 | { | ||
338 | bool ret = false; | ||
339 | unsigned long flags; | ||
340 | |||
341 | spin_lock_irqsave(&worker->lock, flags); | ||
342 | if (list_empty(&work->node)) { | ||
343 | list_add_tail(&work->node, &worker->work_list); | ||
344 | work->queue_seq++; | ||
345 | if (likely(worker->task)) | ||
346 | wake_up_process(worker->task); | ||
347 | ret = true; | ||
348 | } | ||
349 | spin_unlock_irqrestore(&worker->lock, flags); | ||
350 | return ret; | ||
351 | } | ||
352 | EXPORT_SYMBOL_GPL(queue_kthread_work); | ||
353 | |||
354 | /** | ||
355 | * flush_kthread_work - flush a kthread_work | ||
356 | * @work: work to flush | ||
357 | * | ||
358 | * If @work is queued or executing, wait for it to finish execution. | ||
359 | */ | ||
360 | void flush_kthread_work(struct kthread_work *work) | ||
361 | { | ||
362 | int seq = work->queue_seq; | ||
363 | |||
364 | atomic_inc(&work->flushing); | ||
365 | |||
366 | /* | ||
367 | * mb flush-b0 paired with worker-b1, to make sure either | ||
368 | * worker sees the above increment or we see done_seq update. | ||
369 | */ | ||
370 | smp_mb__after_atomic_inc(); | ||
371 | |||
372 | /* A - B <= 0 tests whether B is in front of A regardless of overflow */ | ||
373 | wait_event(work->done, seq - work->done_seq <= 0); | ||
374 | atomic_dec(&work->flushing); | ||
375 | |||
376 | /* | ||
377 | * rmb flush-b1 paired with worker-b0, to make sure our caller | ||
378 | * sees every change made by work->func(). | ||
379 | */ | ||
380 | smp_mb__after_atomic_dec(); | ||
381 | } | ||
382 | EXPORT_SYMBOL_GPL(flush_kthread_work); | ||
383 | |||
384 | struct kthread_flush_work { | ||
385 | struct kthread_work work; | ||
386 | struct completion done; | ||
387 | }; | ||
388 | |||
389 | static void kthread_flush_work_fn(struct kthread_work *work) | ||
390 | { | ||
391 | struct kthread_flush_work *fwork = | ||
392 | container_of(work, struct kthread_flush_work, work); | ||
393 | complete(&fwork->done); | ||
394 | } | ||
395 | |||
396 | /** | ||
397 | * flush_kthread_worker - flush all current works on a kthread_worker | ||
398 | * @worker: worker to flush | ||
399 | * | ||
400 | * Wait until all currently executing or pending works on @worker are | ||
401 | * finished. | ||
402 | */ | ||
403 | void flush_kthread_worker(struct kthread_worker *worker) | ||
404 | { | ||
405 | struct kthread_flush_work fwork = { | ||
406 | KTHREAD_WORK_INIT(fwork.work, kthread_flush_work_fn), | ||
407 | COMPLETION_INITIALIZER_ONSTACK(fwork.done), | ||
408 | }; | ||
409 | |||
410 | queue_kthread_work(worker, &fwork.work); | ||
411 | wait_for_completion(&fwork.done); | ||
412 | } | ||
413 | EXPORT_SYMBOL_GPL(flush_kthread_worker); | ||
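For illustration only, a minimal usage sketch of the kthread_worker API added above. It assumes the init_kthread_worker()/init_kthread_work() helpers that the same series adds to <linux/kthread.h>, and trims error handling:

#include <linux/err.h>
#include <linux/kthread.h>

static struct kthread_worker my_worker;
static struct kthread_work my_work;

static void my_work_fn(struct kthread_work *work)
{
	/* Runs in the worker's kthread, one item at a time. */
}

static int my_setup(void)
{
	struct task_struct *task;

	init_kthread_worker(&my_worker);
	init_kthread_work(&my_work, my_work_fn);

	/* Attach a single kthread; it loops in kthread_worker_fn(). */
	task = kthread_run(kthread_worker_fn, &my_worker, "my_worker");
	if (IS_ERR(task))
		return PTR_ERR(task);

	queue_kthread_work(&my_worker, &my_work);	/* async execution */
	flush_kthread_work(&my_work);			/* wait for completion */
	return 0;
}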
diff --git a/kernel/lockdep.c b/kernel/lockdep.c index 54286798c37b..42ba65dff7d9 100644 --- a/kernel/lockdep.c +++ b/kernel/lockdep.c | |||
@@ -146,7 +146,7 @@ static DEFINE_PER_CPU(struct lock_class_stats[MAX_LOCKDEP_KEYS], | |||
146 | 146 | ||
147 | static inline u64 lockstat_clock(void) | 147 | static inline u64 lockstat_clock(void) |
148 | { | 148 | { |
149 | return cpu_clock(smp_processor_id()); | 149 | return local_clock(); |
150 | } | 150 | } |
151 | 151 | ||
152 | static int lock_point(unsigned long points[], unsigned long ip) | 152 | static int lock_point(unsigned long points[], unsigned long ip) |
@@ -639,6 +639,16 @@ look_up_lock_class(struct lockdep_map *lock, unsigned int subclass) | |||
639 | } | 639 | } |
640 | #endif | 640 | #endif |
641 | 641 | ||
642 | if (unlikely(subclass >= MAX_LOCKDEP_SUBCLASSES)) { | ||
643 | debug_locks_off(); | ||
644 | printk(KERN_ERR | ||
645 | "BUG: looking up invalid subclass: %u\n", subclass); | ||
646 | printk(KERN_ERR | ||
647 | "turning off the locking correctness validator.\n"); | ||
648 | dump_stack(); | ||
649 | return NULL; | ||
650 | } | ||
651 | |||
642 | /* | 652 | /* |
643 | * Static locks do not have their class-keys yet - for them the key | 653 | * Static locks do not have their class-keys yet - for them the key |
644 | * is the lock object itself: | 654 | * is the lock object itself: |
@@ -774,7 +784,9 @@ out_unlock_set: | |||
774 | raw_local_irq_restore(flags); | 784 | raw_local_irq_restore(flags); |
775 | 785 | ||
776 | if (!subclass || force) | 786 | if (!subclass || force) |
777 | lock->class_cache = class; | 787 | lock->class_cache[0] = class; |
788 | else if (subclass < NR_LOCKDEP_CACHING_CLASSES) | ||
789 | lock->class_cache[subclass] = class; | ||
778 | 790 | ||
779 | if (DEBUG_LOCKS_WARN_ON(class->subclass != subclass)) | 791 | if (DEBUG_LOCKS_WARN_ON(class->subclass != subclass)) |
780 | return NULL; | 792 | return NULL; |
@@ -2679,7 +2691,11 @@ static int mark_lock(struct task_struct *curr, struct held_lock *this, | |||
2679 | void lockdep_init_map(struct lockdep_map *lock, const char *name, | 2691 | void lockdep_init_map(struct lockdep_map *lock, const char *name, |
2680 | struct lock_class_key *key, int subclass) | 2692 | struct lock_class_key *key, int subclass) |
2681 | { | 2693 | { |
2682 | lock->class_cache = NULL; | 2694 | int i; |
2695 | |||
2696 | for (i = 0; i < NR_LOCKDEP_CACHING_CLASSES; i++) | ||
2697 | lock->class_cache[i] = NULL; | ||
2698 | |||
2683 | #ifdef CONFIG_LOCK_STAT | 2699 | #ifdef CONFIG_LOCK_STAT |
2684 | lock->cpu = raw_smp_processor_id(); | 2700 | lock->cpu = raw_smp_processor_id(); |
2685 | #endif | 2701 | #endif |
@@ -2739,21 +2755,13 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass, | |||
2739 | if (DEBUG_LOCKS_WARN_ON(!irqs_disabled())) | 2755 | if (DEBUG_LOCKS_WARN_ON(!irqs_disabled())) |
2740 | return 0; | 2756 | return 0; |
2741 | 2757 | ||
2742 | if (unlikely(subclass >= MAX_LOCKDEP_SUBCLASSES)) { | ||
2743 | debug_locks_off(); | ||
2744 | printk("BUG: MAX_LOCKDEP_SUBCLASSES too low!\n"); | ||
2745 | printk("turning off the locking correctness validator.\n"); | ||
2746 | dump_stack(); | ||
2747 | return 0; | ||
2748 | } | ||
2749 | |||
2750 | if (lock->key == &__lockdep_no_validate__) | 2758 | if (lock->key == &__lockdep_no_validate__) |
2751 | check = 1; | 2759 | check = 1; |
2752 | 2760 | ||
2753 | if (!subclass) | 2761 | if (subclass < NR_LOCKDEP_CACHING_CLASSES) |
2754 | class = lock->class_cache; | 2762 | class = lock->class_cache[subclass]; |
2755 | /* | 2763 | /* |
2756 | * Not cached yet or subclass? | 2764 | * Not cached? |
2757 | */ | 2765 | */ |
2758 | if (unlikely(!class)) { | 2766 | if (unlikely(!class)) { |
2759 | class = register_lock_class(lock, subclass, 0); | 2767 | class = register_lock_class(lock, subclass, 0); |
@@ -2918,7 +2926,7 @@ static int match_held_lock(struct held_lock *hlock, struct lockdep_map *lock) | |||
2918 | return 1; | 2926 | return 1; |
2919 | 2927 | ||
2920 | if (hlock->references) { | 2928 | if (hlock->references) { |
2921 | struct lock_class *class = lock->class_cache; | 2929 | struct lock_class *class = lock->class_cache[0]; |
2922 | 2930 | ||
2923 | if (!class) | 2931 | if (!class) |
2924 | class = look_up_lock_class(lock, 0); | 2932 | class = look_up_lock_class(lock, 0); |
@@ -3559,7 +3567,12 @@ void lockdep_reset_lock(struct lockdep_map *lock) | |||
3559 | if (list_empty(head)) | 3567 | if (list_empty(head)) |
3560 | continue; | 3568 | continue; |
3561 | list_for_each_entry_safe(class, next, head, hash_entry) { | 3569 | list_for_each_entry_safe(class, next, head, hash_entry) { |
3562 | if (unlikely(class == lock->class_cache)) { | 3570 | int match = 0; |
3571 | |||
3572 | for (j = 0; j < NR_LOCKDEP_CACHING_CLASSES; j++) | ||
3573 | match |= class == lock->class_cache[j]; | ||
3574 | |||
3575 | if (unlikely(match)) { | ||
3563 | if (debug_locks_off_graph_unlock()) | 3576 | if (debug_locks_off_graph_unlock()) |
3564 | WARN_ON(1); | 3577 | WARN_ON(1); |
3565 | goto out_restore; | 3578 | goto out_restore; |
@@ -3775,7 +3788,7 @@ EXPORT_SYMBOL_GPL(debug_show_all_locks); | |||
3775 | * Careful: only use this function if you are sure that | 3788 | * Careful: only use this function if you are sure that |
3776 | * the task cannot run in parallel! | 3789 | * the task cannot run in parallel! |
3777 | */ | 3790 | */ |
3778 | void __debug_show_held_locks(struct task_struct *task) | 3791 | void debug_show_held_locks(struct task_struct *task) |
3779 | { | 3792 | { |
3780 | if (unlikely(!debug_locks)) { | 3793 | if (unlikely(!debug_locks)) { |
3781 | printk("INFO: lockdep is turned off.\n"); | 3794 | printk("INFO: lockdep is turned off.\n"); |
@@ -3783,12 +3796,6 @@ void __debug_show_held_locks(struct task_struct *task) | |||
3783 | } | 3796 | } |
3784 | lockdep_print_held_locks(task); | 3797 | lockdep_print_held_locks(task); |
3785 | } | 3798 | } |
3786 | EXPORT_SYMBOL_GPL(__debug_show_held_locks); | ||
3787 | |||
3788 | void debug_show_held_locks(struct task_struct *task) | ||
3789 | { | ||
3790 | __debug_show_held_locks(task); | ||
3791 | } | ||
3792 | EXPORT_SYMBOL_GPL(debug_show_held_locks); | 3799 | EXPORT_SYMBOL_GPL(debug_show_held_locks); |
3793 | 3800 | ||
3794 | void lockdep_sys_exit(void) | 3801 | void lockdep_sys_exit(void) |
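As background (not from the patch): lockdep subclasses come from nesting annotations such as spin_lock_nested(), and the class_cache change above caches the first NR_LOCKDEP_CACHING_CLASSES subclasses per lock so those acquisitions skip the class hash lookup. A sketch of the kind of caller that benefits, using hypothetical node locks:

#include <linux/spinlock.h>

struct node {
	spinlock_t lock;
	struct node *parent;
};

static void lock_node_and_parent(struct node *n)
{
	spin_lock(&n->parent->lock);
	/* Same lock class taken nested: annotate with subclass 1 so lockdep
	 * keeps it distinct; with the hunks above that subclass now hits
	 * lock->class_cache[1] instead of look_up_lock_class(). */
	spin_lock_nested(&n->lock, SINGLE_DEPTH_NESTING);

	spin_unlock(&n->lock);
	spin_unlock(&n->parent->lock);
}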
diff --git a/kernel/module.c b/kernel/module.c index 6c562828c85c..437a74a7524a 100644 --- a/kernel/module.c +++ b/kernel/module.c | |||
@@ -1,6 +1,6 @@ | |||
1 | /* | 1 | /* |
2 | Copyright (C) 2002 Richard Henderson | 2 | Copyright (C) 2002 Richard Henderson |
3 | Copyright (C) 2001 Rusty Russell, 2002 Rusty Russell IBM. | 3 | Copyright (C) 2001 Rusty Russell, 2002, 2010 Rusty Russell IBM. |
4 | 4 | ||
5 | This program is free software; you can redistribute it and/or modify | 5 | This program is free software; you can redistribute it and/or modify |
6 | it under the terms of the GNU General Public License as published by | 6 | it under the terms of the GNU General Public License as published by |
@@ -55,6 +55,7 @@ | |||
55 | #include <linux/async.h> | 55 | #include <linux/async.h> |
56 | #include <linux/percpu.h> | 56 | #include <linux/percpu.h> |
57 | #include <linux/kmemleak.h> | 57 | #include <linux/kmemleak.h> |
58 | #include <linux/jump_label.h> | ||
58 | 59 | ||
59 | #define CREATE_TRACE_POINTS | 60 | #define CREATE_TRACE_POINTS |
60 | #include <trace/events/module.h> | 61 | #include <trace/events/module.h> |
@@ -110,6 +111,20 @@ int unregister_module_notifier(struct notifier_block * nb) | |||
110 | } | 111 | } |
111 | EXPORT_SYMBOL(unregister_module_notifier); | 112 | EXPORT_SYMBOL(unregister_module_notifier); |
112 | 113 | ||
114 | struct load_info { | ||
115 | Elf_Ehdr *hdr; | ||
116 | unsigned long len; | ||
117 | Elf_Shdr *sechdrs; | ||
118 | char *secstrings, *strtab; | ||
119 | unsigned long *strmap; | ||
120 | unsigned long symoffs, stroffs; | ||
121 | struct _ddebug *debug; | ||
122 | unsigned int num_debug; | ||
123 | struct { | ||
124 | unsigned int sym, str, mod, vers, info, pcpu; | ||
125 | } index; | ||
126 | }; | ||
127 | |||
113 | /* We require a truly strong try_module_get(): 0 means failure due to | 128 | /* We require a truly strong try_module_get(): 0 means failure due to |
114 | ongoing or failed initialization etc. */ | 129 | ongoing or failed initialization etc. */ |
115 | static inline int strong_try_module_get(struct module *mod) | 130 | static inline int strong_try_module_get(struct module *mod) |
@@ -140,42 +155,38 @@ void __module_put_and_exit(struct module *mod, long code) | |||
140 | EXPORT_SYMBOL(__module_put_and_exit); | 155 | EXPORT_SYMBOL(__module_put_and_exit); |
141 | 156 | ||
142 | /* Find a module section: 0 means not found. */ | 157 | /* Find a module section: 0 means not found. */ |
143 | static unsigned int find_sec(Elf_Ehdr *hdr, | 158 | static unsigned int find_sec(const struct load_info *info, const char *name) |
144 | Elf_Shdr *sechdrs, | ||
145 | const char *secstrings, | ||
146 | const char *name) | ||
147 | { | 159 | { |
148 | unsigned int i; | 160 | unsigned int i; |
149 | 161 | ||
150 | for (i = 1; i < hdr->e_shnum; i++) | 162 | for (i = 1; i < info->hdr->e_shnum; i++) { |
163 | Elf_Shdr *shdr = &info->sechdrs[i]; | ||
151 | /* Alloc bit cleared means "ignore it." */ | 164 | /* Alloc bit cleared means "ignore it." */ |
152 | if ((sechdrs[i].sh_flags & SHF_ALLOC) | 165 | if ((shdr->sh_flags & SHF_ALLOC) |
153 | && strcmp(secstrings+sechdrs[i].sh_name, name) == 0) | 166 | && strcmp(info->secstrings + shdr->sh_name, name) == 0) |
154 | return i; | 167 | return i; |
168 | } | ||
155 | return 0; | 169 | return 0; |
156 | } | 170 | } |
157 | 171 | ||
158 | /* Find a module section, or NULL. */ | 172 | /* Find a module section, or NULL. */ |
159 | static void *section_addr(Elf_Ehdr *hdr, Elf_Shdr *shdrs, | 173 | static void *section_addr(const struct load_info *info, const char *name) |
160 | const char *secstrings, const char *name) | ||
161 | { | 174 | { |
162 | /* Section 0 has sh_addr 0. */ | 175 | /* Section 0 has sh_addr 0. */ |
163 | return (void *)shdrs[find_sec(hdr, shdrs, secstrings, name)].sh_addr; | 176 | return (void *)info->sechdrs[find_sec(info, name)].sh_addr; |
164 | } | 177 | } |
165 | 178 | ||
166 | /* Find a module section, or NULL. Fill in number of "objects" in section. */ | 179 | /* Find a module section, or NULL. Fill in number of "objects" in section. */ |
167 | static void *section_objs(Elf_Ehdr *hdr, | 180 | static void *section_objs(const struct load_info *info, |
168 | Elf_Shdr *sechdrs, | ||
169 | const char *secstrings, | ||
170 | const char *name, | 181 | const char *name, |
171 | size_t object_size, | 182 | size_t object_size, |
172 | unsigned int *num) | 183 | unsigned int *num) |
173 | { | 184 | { |
174 | unsigned int sec = find_sec(hdr, sechdrs, secstrings, name); | 185 | unsigned int sec = find_sec(info, name); |
175 | 186 | ||
176 | /* Section 0 has sh_addr 0 and sh_size 0. */ | 187 | /* Section 0 has sh_addr 0 and sh_size 0. */ |
177 | *num = sechdrs[sec].sh_size / object_size; | 188 | *num = info->sechdrs[sec].sh_size / object_size; |
178 | return (void *)sechdrs[sec].sh_addr; | 189 | return (void *)info->sechdrs[sec].sh_addr; |
179 | } | 190 | } |
180 | 191 | ||
181 | /* Provided by the linker */ | 192 | /* Provided by the linker */ |
@@ -227,7 +238,7 @@ bool each_symbol(bool (*fn)(const struct symsearch *arr, struct module *owner, | |||
227 | unsigned int symnum, void *data), void *data) | 238 | unsigned int symnum, void *data), void *data) |
228 | { | 239 | { |
229 | struct module *mod; | 240 | struct module *mod; |
230 | const struct symsearch arr[] = { | 241 | static const struct symsearch arr[] = { |
231 | { __start___ksymtab, __stop___ksymtab, __start___kcrctab, | 242 | { __start___ksymtab, __stop___ksymtab, __start___kcrctab, |
232 | NOT_GPL_ONLY, false }, | 243 | NOT_GPL_ONLY, false }, |
233 | { __start___ksymtab_gpl, __stop___ksymtab_gpl, | 244 | { __start___ksymtab_gpl, __stop___ksymtab_gpl, |
@@ -392,7 +403,8 @@ static int percpu_modalloc(struct module *mod, | |||
392 | mod->percpu = __alloc_reserved_percpu(size, align); | 403 | mod->percpu = __alloc_reserved_percpu(size, align); |
393 | if (!mod->percpu) { | 404 | if (!mod->percpu) { |
394 | printk(KERN_WARNING | 405 | printk(KERN_WARNING |
395 | "Could not allocate %lu bytes percpu data\n", size); | 406 | "%s: Could not allocate %lu bytes percpu data\n", |
407 | mod->name, size); | ||
396 | return -ENOMEM; | 408 | return -ENOMEM; |
397 | } | 409 | } |
398 | mod->percpu_size = size; | 410 | mod->percpu_size = size; |
@@ -404,11 +416,9 @@ static void percpu_modfree(struct module *mod) | |||
404 | free_percpu(mod->percpu); | 416 | free_percpu(mod->percpu); |
405 | } | 417 | } |
406 | 418 | ||
407 | static unsigned int find_pcpusec(Elf_Ehdr *hdr, | 419 | static unsigned int find_pcpusec(struct load_info *info) |
408 | Elf_Shdr *sechdrs, | ||
409 | const char *secstrings) | ||
410 | { | 420 | { |
411 | return find_sec(hdr, sechdrs, secstrings, ".data..percpu"); | 421 | return find_sec(info, ".data..percpu"); |
412 | } | 422 | } |
413 | 423 | ||
414 | static void percpu_modcopy(struct module *mod, | 424 | static void percpu_modcopy(struct module *mod, |
@@ -468,9 +478,7 @@ static inline int percpu_modalloc(struct module *mod, | |||
468 | static inline void percpu_modfree(struct module *mod) | 478 | static inline void percpu_modfree(struct module *mod) |
469 | { | 479 | { |
470 | } | 480 | } |
471 | static inline unsigned int find_pcpusec(Elf_Ehdr *hdr, | 481 | static unsigned int find_pcpusec(struct load_info *info) |
472 | Elf_Shdr *sechdrs, | ||
473 | const char *secstrings) | ||
474 | { | 482 | { |
475 | return 0; | 483 | return 0; |
476 | } | 484 | } |
@@ -524,21 +532,21 @@ static char last_unloaded_module[MODULE_NAME_LEN+1]; | |||
524 | EXPORT_TRACEPOINT_SYMBOL(module_get); | 532 | EXPORT_TRACEPOINT_SYMBOL(module_get); |
525 | 533 | ||
526 | /* Init the unload section of the module. */ | 534 | /* Init the unload section of the module. */ |
527 | static void module_unload_init(struct module *mod) | 535 | static int module_unload_init(struct module *mod) |
528 | { | 536 | { |
529 | int cpu; | 537 | mod->refptr = alloc_percpu(struct module_ref); |
538 | if (!mod->refptr) | ||
539 | return -ENOMEM; | ||
530 | 540 | ||
531 | INIT_LIST_HEAD(&mod->source_list); | 541 | INIT_LIST_HEAD(&mod->source_list); |
532 | INIT_LIST_HEAD(&mod->target_list); | 542 | INIT_LIST_HEAD(&mod->target_list); |
533 | for_each_possible_cpu(cpu) { | ||
534 | per_cpu_ptr(mod->refptr, cpu)->incs = 0; | ||
535 | per_cpu_ptr(mod->refptr, cpu)->decs = 0; | ||
536 | } | ||
537 | 543 | ||
538 | /* Hold reference count during initialization. */ | 544 | /* Hold reference count during initialization. */ |
539 | __this_cpu_write(mod->refptr->incs, 1); | 545 | __this_cpu_write(mod->refptr->incs, 1); |
540 | /* Backwards compatibility macros put refcount during init. */ | 546 | /* Backwards compatibility macros put refcount during init. */ |
541 | mod->waiter = current; | 547 | mod->waiter = current; |
548 | |||
549 | return 0; | ||
542 | } | 550 | } |
543 | 551 | ||
544 | /* Does a already use b? */ | 552 | /* Does a already use b? */ |
@@ -618,6 +626,8 @@ static void module_unload_free(struct module *mod) | |||
618 | kfree(use); | 626 | kfree(use); |
619 | } | 627 | } |
620 | mutex_unlock(&module_mutex); | 628 | mutex_unlock(&module_mutex); |
629 | |||
630 | free_percpu(mod->refptr); | ||
621 | } | 631 | } |
622 | 632 | ||
623 | #ifdef CONFIG_MODULE_FORCE_UNLOAD | 633 | #ifdef CONFIG_MODULE_FORCE_UNLOAD |
@@ -891,8 +901,9 @@ int ref_module(struct module *a, struct module *b) | |||
891 | } | 901 | } |
892 | EXPORT_SYMBOL_GPL(ref_module); | 902 | EXPORT_SYMBOL_GPL(ref_module); |
893 | 903 | ||
894 | static inline void module_unload_init(struct module *mod) | 904 | static inline int module_unload_init(struct module *mod) |
895 | { | 905 | { |
906 | return 0; | ||
896 | } | 907 | } |
897 | #endif /* CONFIG_MODULE_UNLOAD */ | 908 | #endif /* CONFIG_MODULE_UNLOAD */ |
898 | 909 | ||
@@ -1051,10 +1062,9 @@ static inline int same_magic(const char *amagic, const char *bmagic, | |||
1051 | #endif /* CONFIG_MODVERSIONS */ | 1062 | #endif /* CONFIG_MODVERSIONS */ |
1052 | 1063 | ||
1053 | /* Resolve a symbol for this module. I.e. if we find one, record usage. */ | 1064 | /* Resolve a symbol for this module. I.e. if we find one, record usage. */ |
1054 | static const struct kernel_symbol *resolve_symbol(Elf_Shdr *sechdrs, | 1065 | static const struct kernel_symbol *resolve_symbol(struct module *mod, |
1055 | unsigned int versindex, | 1066 | const struct load_info *info, |
1056 | const char *name, | 1067 | const char *name, |
1057 | struct module *mod, | ||
1058 | char ownername[]) | 1068 | char ownername[]) |
1059 | { | 1069 | { |
1060 | struct module *owner; | 1070 | struct module *owner; |
@@ -1068,7 +1078,8 @@ static const struct kernel_symbol *resolve_symbol(Elf_Shdr *sechdrs, | |||
1068 | if (!sym) | 1078 | if (!sym) |
1069 | goto unlock; | 1079 | goto unlock; |
1070 | 1080 | ||
1071 | if (!check_version(sechdrs, versindex, name, mod, crc, owner)) { | 1081 | if (!check_version(info->sechdrs, info->index.vers, name, mod, crc, |
1082 | owner)) { | ||
1072 | sym = ERR_PTR(-EINVAL); | 1083 | sym = ERR_PTR(-EINVAL); |
1073 | goto getname; | 1084 | goto getname; |
1074 | } | 1085 | } |
@@ -1087,21 +1098,20 @@ unlock: | |||
1087 | return sym; | 1098 | return sym; |
1088 | } | 1099 | } |
1089 | 1100 | ||
1090 | static const struct kernel_symbol *resolve_symbol_wait(Elf_Shdr *sechdrs, | 1101 | static const struct kernel_symbol * |
1091 | unsigned int versindex, | 1102 | resolve_symbol_wait(struct module *mod, |
1092 | const char *name, | 1103 | const struct load_info *info, |
1093 | struct module *mod) | 1104 | const char *name) |
1094 | { | 1105 | { |
1095 | const struct kernel_symbol *ksym; | 1106 | const struct kernel_symbol *ksym; |
1096 | char ownername[MODULE_NAME_LEN]; | 1107 | char owner[MODULE_NAME_LEN]; |
1097 | 1108 | ||
1098 | if (wait_event_interruptible_timeout(module_wq, | 1109 | if (wait_event_interruptible_timeout(module_wq, |
1099 | !IS_ERR(ksym = resolve_symbol(sechdrs, versindex, name, | 1110 | !IS_ERR(ksym = resolve_symbol(mod, info, name, owner)) |
1100 | mod, ownername)) || | 1111 | || PTR_ERR(ksym) != -EBUSY, |
1101 | PTR_ERR(ksym) != -EBUSY, | ||
1102 | 30 * HZ) <= 0) { | 1112 | 30 * HZ) <= 0) { |
1103 | printk(KERN_WARNING "%s: gave up waiting for init of module %s.\n", | 1113 | printk(KERN_WARNING "%s: gave up waiting for init of module %s.\n", |
1104 | mod->name, ownername); | 1114 | mod->name, owner); |
1105 | } | 1115 | } |
1106 | return ksym; | 1116 | return ksym; |
1107 | } | 1117 | } |
@@ -1110,8 +1120,9 @@ static const struct kernel_symbol *resolve_symbol_wait(Elf_Shdr *sechdrs, | |||
1110 | * /sys/module/foo/sections stuff | 1120 | * /sys/module/foo/sections stuff |
1111 | * J. Corbet <corbet@lwn.net> | 1121 | * J. Corbet <corbet@lwn.net> |
1112 | */ | 1122 | */ |
1113 | #if defined(CONFIG_KALLSYMS) && defined(CONFIG_SYSFS) | 1123 | #ifdef CONFIG_SYSFS |
1114 | 1124 | ||
1125 | #ifdef CONFIG_KALLSYMS | ||
1115 | static inline bool sect_empty(const Elf_Shdr *sect) | 1126 | static inline bool sect_empty(const Elf_Shdr *sect) |
1116 | { | 1127 | { |
1117 | return !(sect->sh_flags & SHF_ALLOC) || sect->sh_size == 0; | 1128 | return !(sect->sh_flags & SHF_ALLOC) || sect->sh_size == 0; |
@@ -1148,8 +1159,7 @@ static void free_sect_attrs(struct module_sect_attrs *sect_attrs) | |||
1148 | kfree(sect_attrs); | 1159 | kfree(sect_attrs); |
1149 | } | 1160 | } |
1150 | 1161 | ||
1151 | static void add_sect_attrs(struct module *mod, unsigned int nsect, | 1162 | static void add_sect_attrs(struct module *mod, const struct load_info *info) |
1152 | char *secstrings, Elf_Shdr *sechdrs) | ||
1153 | { | 1163 | { |
1154 | unsigned int nloaded = 0, i, size[2]; | 1164 | unsigned int nloaded = 0, i, size[2]; |
1155 | struct module_sect_attrs *sect_attrs; | 1165 | struct module_sect_attrs *sect_attrs; |
@@ -1157,8 +1167,8 @@ static void add_sect_attrs(struct module *mod, unsigned int nsect, | |||
1157 | struct attribute **gattr; | 1167 | struct attribute **gattr; |
1158 | 1168 | ||
1159 | /* Count loaded sections and allocate structures */ | 1169 | /* Count loaded sections and allocate structures */ |
1160 | for (i = 0; i < nsect; i++) | 1170 | for (i = 0; i < info->hdr->e_shnum; i++) |
1161 | if (!sect_empty(&sechdrs[i])) | 1171 | if (!sect_empty(&info->sechdrs[i])) |
1162 | nloaded++; | 1172 | nloaded++; |
1163 | size[0] = ALIGN(sizeof(*sect_attrs) | 1173 | size[0] = ALIGN(sizeof(*sect_attrs) |
1164 | + nloaded * sizeof(sect_attrs->attrs[0]), | 1174 | + nloaded * sizeof(sect_attrs->attrs[0]), |
@@ -1175,11 +1185,12 @@ static void add_sect_attrs(struct module *mod, unsigned int nsect, | |||
1175 | sect_attrs->nsections = 0; | 1185 | sect_attrs->nsections = 0; |
1176 | sattr = §_attrs->attrs[0]; | 1186 | sattr = §_attrs->attrs[0]; |
1177 | gattr = §_attrs->grp.attrs[0]; | 1187 | gattr = §_attrs->grp.attrs[0]; |
1178 | for (i = 0; i < nsect; i++) { | 1188 | for (i = 0; i < info->hdr->e_shnum; i++) { |
1179 | if (sect_empty(&sechdrs[i])) | 1189 | Elf_Shdr *sec = &info->sechdrs[i]; |
1190 | if (sect_empty(sec)) | ||
1180 | continue; | 1191 | continue; |
1181 | sattr->address = sechdrs[i].sh_addr; | 1192 | sattr->address = sec->sh_addr; |
1182 | sattr->name = kstrdup(secstrings + sechdrs[i].sh_name, | 1193 | sattr->name = kstrdup(info->secstrings + sec->sh_name, |
1183 | GFP_KERNEL); | 1194 | GFP_KERNEL); |
1184 | if (sattr->name == NULL) | 1195 | if (sattr->name == NULL) |
1185 | goto out; | 1196 | goto out; |
@@ -1247,8 +1258,7 @@ static void free_notes_attrs(struct module_notes_attrs *notes_attrs, | |||
1247 | kfree(notes_attrs); | 1258 | kfree(notes_attrs); |
1248 | } | 1259 | } |
1249 | 1260 | ||
1250 | static void add_notes_attrs(struct module *mod, unsigned int nsect, | 1261 | static void add_notes_attrs(struct module *mod, const struct load_info *info) |
1251 | char *secstrings, Elf_Shdr *sechdrs) | ||
1252 | { | 1262 | { |
1253 | unsigned int notes, loaded, i; | 1263 | unsigned int notes, loaded, i; |
1254 | struct module_notes_attrs *notes_attrs; | 1264 | struct module_notes_attrs *notes_attrs; |
@@ -1260,9 +1270,9 @@ static void add_notes_attrs(struct module *mod, unsigned int nsect, | |||
1260 | 1270 | ||
1261 | /* Count notes sections and allocate structures. */ | 1271 | /* Count notes sections and allocate structures. */ |
1262 | notes = 0; | 1272 | notes = 0; |
1263 | for (i = 0; i < nsect; i++) | 1273 | for (i = 0; i < info->hdr->e_shnum; i++) |
1264 | if (!sect_empty(&sechdrs[i]) && | 1274 | if (!sect_empty(&info->sechdrs[i]) && |
1265 | (sechdrs[i].sh_type == SHT_NOTE)) | 1275 | (info->sechdrs[i].sh_type == SHT_NOTE)) |
1266 | ++notes; | 1276 | ++notes; |
1267 | 1277 | ||
1268 | if (notes == 0) | 1278 | if (notes == 0) |
@@ -1276,15 +1286,15 @@ static void add_notes_attrs(struct module *mod, unsigned int nsect, | |||
1276 | 1286 | ||
1277 | notes_attrs->notes = notes; | 1287 | notes_attrs->notes = notes; |
1278 | nattr = &notes_attrs->attrs[0]; | 1288 | nattr = &notes_attrs->attrs[0]; |
1279 | for (loaded = i = 0; i < nsect; ++i) { | 1289 | for (loaded = i = 0; i < info->hdr->e_shnum; ++i) { |
1280 | if (sect_empty(&sechdrs[i])) | 1290 | if (sect_empty(&info->sechdrs[i])) |
1281 | continue; | 1291 | continue; |
1282 | if (sechdrs[i].sh_type == SHT_NOTE) { | 1292 | if (info->sechdrs[i].sh_type == SHT_NOTE) { |
1283 | sysfs_bin_attr_init(nattr); | 1293 | sysfs_bin_attr_init(nattr); |
1284 | nattr->attr.name = mod->sect_attrs->attrs[loaded].name; | 1294 | nattr->attr.name = mod->sect_attrs->attrs[loaded].name; |
1285 | nattr->attr.mode = S_IRUGO; | 1295 | nattr->attr.mode = S_IRUGO; |
1286 | nattr->size = sechdrs[i].sh_size; | 1296 | nattr->size = info->sechdrs[i].sh_size; |
1287 | nattr->private = (void *) sechdrs[i].sh_addr; | 1297 | nattr->private = (void *) info->sechdrs[i].sh_addr; |
1288 | nattr->read = module_notes_read; | 1298 | nattr->read = module_notes_read; |
1289 | ++nattr; | 1299 | ++nattr; |
1290 | } | 1300 | } |
@@ -1315,8 +1325,8 @@ static void remove_notes_attrs(struct module *mod) | |||
1315 | 1325 | ||
1316 | #else | 1326 | #else |
1317 | 1327 | ||
1318 | static inline void add_sect_attrs(struct module *mod, unsigned int nsect, | 1328 | static inline void add_sect_attrs(struct module *mod, |
1319 | char *sectstrings, Elf_Shdr *sechdrs) | 1329 | const struct load_info *info) |
1320 | { | 1330 | { |
1321 | } | 1331 | } |
1322 | 1332 | ||
@@ -1324,17 +1334,16 @@ static inline void remove_sect_attrs(struct module *mod) | |||
1324 | { | 1334 | { |
1325 | } | 1335 | } |
1326 | 1336 | ||
1327 | static inline void add_notes_attrs(struct module *mod, unsigned int nsect, | 1337 | static inline void add_notes_attrs(struct module *mod, |
1328 | char *sectstrings, Elf_Shdr *sechdrs) | 1338 | const struct load_info *info) |
1329 | { | 1339 | { |
1330 | } | 1340 | } |
1331 | 1341 | ||
1332 | static inline void remove_notes_attrs(struct module *mod) | 1342 | static inline void remove_notes_attrs(struct module *mod) |
1333 | { | 1343 | { |
1334 | } | 1344 | } |
1335 | #endif | 1345 | #endif /* CONFIG_KALLSYMS */ |
1336 | 1346 | ||
1337 | #ifdef CONFIG_SYSFS | ||
1338 | static void add_usage_links(struct module *mod) | 1347 | static void add_usage_links(struct module *mod) |
1339 | { | 1348 | { |
1340 | #ifdef CONFIG_MODULE_UNLOAD | 1349 | #ifdef CONFIG_MODULE_UNLOAD |
@@ -1439,6 +1448,7 @@ out: | |||
1439 | } | 1448 | } |
1440 | 1449 | ||
1441 | static int mod_sysfs_setup(struct module *mod, | 1450 | static int mod_sysfs_setup(struct module *mod, |
1451 | const struct load_info *info, | ||
1442 | struct kernel_param *kparam, | 1452 | struct kernel_param *kparam, |
1443 | unsigned int num_params) | 1453 | unsigned int num_params) |
1444 | { | 1454 | { |
@@ -1463,6 +1473,8 @@ static int mod_sysfs_setup(struct module *mod, | |||
1463 | goto out_unreg_param; | 1473 | goto out_unreg_param; |
1464 | 1474 | ||
1465 | add_usage_links(mod); | 1475 | add_usage_links(mod); |
1476 | add_sect_attrs(mod, info); | ||
1477 | add_notes_attrs(mod, info); | ||
1466 | 1478 | ||
1467 | kobject_uevent(&mod->mkobj.kobj, KOBJ_ADD); | 1479 | kobject_uevent(&mod->mkobj.kobj, KOBJ_ADD); |
1468 | return 0; | 1480 | return 0; |
@@ -1479,33 +1491,26 @@ out: | |||
1479 | 1491 | ||
1480 | static void mod_sysfs_fini(struct module *mod) | 1492 | static void mod_sysfs_fini(struct module *mod) |
1481 | { | 1493 | { |
1494 | remove_notes_attrs(mod); | ||
1495 | remove_sect_attrs(mod); | ||
1482 | kobject_put(&mod->mkobj.kobj); | 1496 | kobject_put(&mod->mkobj.kobj); |
1483 | } | 1497 | } |
1484 | 1498 | ||
1485 | #else /* CONFIG_SYSFS */ | 1499 | #else /* !CONFIG_SYSFS */ |
1486 | |||
1487 | static inline int mod_sysfs_init(struct module *mod) | ||
1488 | { | ||
1489 | return 0; | ||
1490 | } | ||
1491 | 1500 | ||
1492 | static inline int mod_sysfs_setup(struct module *mod, | 1501 | static int mod_sysfs_setup(struct module *mod, |
1502 | const struct load_info *info, | ||
1493 | struct kernel_param *kparam, | 1503 | struct kernel_param *kparam, |
1494 | unsigned int num_params) | 1504 | unsigned int num_params) |
1495 | { | 1505 | { |
1496 | return 0; | 1506 | return 0; |
1497 | } | 1507 | } |
1498 | 1508 | ||
1499 | static inline int module_add_modinfo_attrs(struct module *mod) | 1509 | static void mod_sysfs_fini(struct module *mod) |
1500 | { | ||
1501 | return 0; | ||
1502 | } | ||
1503 | |||
1504 | static inline void module_remove_modinfo_attrs(struct module *mod) | ||
1505 | { | 1510 | { |
1506 | } | 1511 | } |
1507 | 1512 | ||
1508 | static void mod_sysfs_fini(struct module *mod) | 1513 | static void module_remove_modinfo_attrs(struct module *mod) |
1509 | { | 1514 | { |
1510 | } | 1515 | } |
1511 | 1516 | ||
@@ -1515,7 +1520,7 @@ static void del_usage_links(struct module *mod) | |||
1515 | 1520 | ||
1516 | #endif /* CONFIG_SYSFS */ | 1521 | #endif /* CONFIG_SYSFS */ |
1517 | 1522 | ||
1518 | static void mod_kobject_remove(struct module *mod) | 1523 | static void mod_sysfs_teardown(struct module *mod) |
1519 | { | 1524 | { |
1520 | del_usage_links(mod); | 1525 | del_usage_links(mod); |
1521 | module_remove_modinfo_attrs(mod); | 1526 | module_remove_modinfo_attrs(mod); |
@@ -1533,6 +1538,7 @@ static int __unlink_module(void *_mod) | |||
1533 | { | 1538 | { |
1534 | struct module *mod = _mod; | 1539 | struct module *mod = _mod; |
1535 | list_del(&mod->list); | 1540 | list_del(&mod->list); |
1541 | module_bug_cleanup(mod); | ||
1536 | return 0; | 1542 | return 0; |
1537 | } | 1543 | } |
1538 | 1544 | ||
@@ -1545,9 +1551,7 @@ static void free_module(struct module *mod) | |||
1545 | mutex_lock(&module_mutex); | 1551 | mutex_lock(&module_mutex); |
1546 | stop_machine(__unlink_module, mod, NULL); | 1552 | stop_machine(__unlink_module, mod, NULL); |
1547 | mutex_unlock(&module_mutex); | 1553 | mutex_unlock(&module_mutex); |
1548 | remove_notes_attrs(mod); | 1554 | mod_sysfs_teardown(mod); |
1549 | remove_sect_attrs(mod); | ||
1550 | mod_kobject_remove(mod); | ||
1551 | 1555 | ||
1552 | /* Remove dynamic debug info */ | 1556 | /* Remove dynamic debug info */ |
1553 | ddebug_remove_module(mod->name); | 1557 | ddebug_remove_module(mod->name); |
@@ -1565,10 +1569,7 @@ static void free_module(struct module *mod) | |||
1565 | module_free(mod, mod->module_init); | 1569 | module_free(mod, mod->module_init); |
1566 | kfree(mod->args); | 1570 | kfree(mod->args); |
1567 | percpu_modfree(mod); | 1571 | percpu_modfree(mod); |
1568 | #if defined(CONFIG_MODULE_UNLOAD) | 1572 | |
1569 | if (mod->refptr) | ||
1570 | free_percpu(mod->refptr); | ||
1571 | #endif | ||
1572 | /* Free lock-classes: */ | 1573 | /* Free lock-classes: */ |
1573 | lockdep_free_key_range(mod->module_core, mod->core_size); | 1574 | lockdep_free_key_range(mod->module_core, mod->core_size); |
1574 | 1575 | ||
@@ -1634,25 +1635,23 @@ static int verify_export_symbols(struct module *mod) | |||
1634 | } | 1635 | } |
1635 | 1636 | ||
1636 | /* Change all symbols so that st_value encodes the pointer directly. */ | 1637 | /* Change all symbols so that st_value encodes the pointer directly. */ |
1637 | static int simplify_symbols(Elf_Shdr *sechdrs, | 1638 | static int simplify_symbols(struct module *mod, const struct load_info *info) |
1638 | unsigned int symindex, | 1639 | { |
1639 | const char *strtab, | 1640 | Elf_Shdr *symsec = &info->sechdrs[info->index.sym]; |
1640 | unsigned int versindex, | 1641 | Elf_Sym *sym = (void *)symsec->sh_addr; |
1641 | unsigned int pcpuindex, | ||
1642 | struct module *mod) | ||
1643 | { | ||
1644 | Elf_Sym *sym = (void *)sechdrs[symindex].sh_addr; | ||
1645 | unsigned long secbase; | 1642 | unsigned long secbase; |
1646 | unsigned int i, n = sechdrs[symindex].sh_size / sizeof(Elf_Sym); | 1643 | unsigned int i; |
1647 | int ret = 0; | 1644 | int ret = 0; |
1648 | const struct kernel_symbol *ksym; | 1645 | const struct kernel_symbol *ksym; |
1649 | 1646 | ||
1650 | for (i = 1; i < n; i++) { | 1647 | for (i = 1; i < symsec->sh_size / sizeof(Elf_Sym); i++) { |
1648 | const char *name = info->strtab + sym[i].st_name; | ||
1649 | |||
1651 | switch (sym[i].st_shndx) { | 1650 | switch (sym[i].st_shndx) { |
1652 | case SHN_COMMON: | 1651 | case SHN_COMMON: |
1653 | /* We compiled with -fno-common. These are not | 1652 | /* We compiled with -fno-common. These are not |
1654 | supposed to happen. */ | 1653 | supposed to happen. */ |
1655 | DEBUGP("Common symbol: %s\n", strtab + sym[i].st_name); | 1654 | DEBUGP("Common symbol: %s\n", name); |
1656 | printk("%s: please compile with -fno-common\n", | 1655 | printk("%s: please compile with -fno-common\n", |
1657 | mod->name); | 1656 | mod->name); |
1658 | ret = -ENOEXEC; | 1657 | ret = -ENOEXEC; |
@@ -1665,9 +1664,7 @@ static int simplify_symbols(Elf_Shdr *sechdrs, | |||
1665 | break; | 1664 | break; |
1666 | 1665 | ||
1667 | case SHN_UNDEF: | 1666 | case SHN_UNDEF: |
1668 | ksym = resolve_symbol_wait(sechdrs, versindex, | 1667 | ksym = resolve_symbol_wait(mod, info, name); |
1669 | strtab + sym[i].st_name, | ||
1670 | mod); | ||
1671 | /* Ok if resolved. */ | 1668 | /* Ok if resolved. */ |
1672 | if (ksym && !IS_ERR(ksym)) { | 1669 | if (ksym && !IS_ERR(ksym)) { |
1673 | sym[i].st_value = ksym->value; | 1670 | sym[i].st_value = ksym->value; |
@@ -1679,17 +1676,16 @@ static int simplify_symbols(Elf_Shdr *sechdrs, | |||
1679 | break; | 1676 | break; |
1680 | 1677 | ||
1681 | printk(KERN_WARNING "%s: Unknown symbol %s (err %li)\n", | 1678 | printk(KERN_WARNING "%s: Unknown symbol %s (err %li)\n", |
1682 | mod->name, strtab + sym[i].st_name, | 1679 | mod->name, name, PTR_ERR(ksym)); |
1683 | PTR_ERR(ksym)); | ||
1684 | ret = PTR_ERR(ksym) ?: -ENOENT; | 1680 | ret = PTR_ERR(ksym) ?: -ENOENT; |
1685 | break; | 1681 | break; |
1686 | 1682 | ||
1687 | default: | 1683 | default: |
1688 | /* Divert to percpu allocation if a percpu var. */ | 1684 | /* Divert to percpu allocation if a percpu var. */ |
1689 | if (sym[i].st_shndx == pcpuindex) | 1685 | if (sym[i].st_shndx == info->index.pcpu) |
1690 | secbase = (unsigned long)mod_percpu(mod); | 1686 | secbase = (unsigned long)mod_percpu(mod); |
1691 | else | 1687 | else |
1692 | secbase = sechdrs[sym[i].st_shndx].sh_addr; | 1688 | secbase = info->sechdrs[sym[i].st_shndx].sh_addr; |
1693 | sym[i].st_value += secbase; | 1689 | sym[i].st_value += secbase; |
1694 | break; | 1690 | break; |
1695 | } | 1691 | } |
@@ -1698,6 +1694,35 @@ static int simplify_symbols(Elf_Shdr *sechdrs, | |||
1698 | return ret; | 1694 | return ret; |
1699 | } | 1695 | } |
1700 | 1696 | ||
1697 | static int apply_relocations(struct module *mod, const struct load_info *info) | ||
1698 | { | ||
1699 | unsigned int i; | ||
1700 | int err = 0; | ||
1701 | |||
1702 | /* Now do relocations. */ | ||
1703 | for (i = 1; i < info->hdr->e_shnum; i++) { | ||
1704 | unsigned int infosec = info->sechdrs[i].sh_info; | ||
1705 | |||
1706 | /* Not a valid relocation section? */ | ||
1707 | if (infosec >= info->hdr->e_shnum) | ||
1708 | continue; | ||
1709 | |||
1710 | /* Don't bother with non-allocated sections */ | ||
1711 | if (!(info->sechdrs[infosec].sh_flags & SHF_ALLOC)) | ||
1712 | continue; | ||
1713 | |||
1714 | if (info->sechdrs[i].sh_type == SHT_REL) | ||
1715 | err = apply_relocate(info->sechdrs, info->strtab, | ||
1716 | info->index.sym, i, mod); | ||
1717 | else if (info->sechdrs[i].sh_type == SHT_RELA) | ||
1718 | err = apply_relocate_add(info->sechdrs, info->strtab, | ||
1719 | info->index.sym, i, mod); | ||
1720 | if (err < 0) | ||
1721 | break; | ||
1722 | } | ||
1723 | return err; | ||
1724 | } | ||
1725 | |||
1701 | /* Additional bytes needed by arch in front of individual sections */ | 1726 | /* Additional bytes needed by arch in front of individual sections */ |
1702 | unsigned int __weak arch_mod_section_prepend(struct module *mod, | 1727 | unsigned int __weak arch_mod_section_prepend(struct module *mod, |
1703 | unsigned int section) | 1728 | unsigned int section) |
@@ -1722,10 +1747,7 @@ static long get_offset(struct module *mod, unsigned int *size, | |||
1722 | might -- code, read-only data, read-write data, small data. Tally | 1747 | might -- code, read-only data, read-write data, small data. Tally |
1723 | sizes, and place the offsets into sh_entsize fields: high bit means it | 1748 | sizes, and place the offsets into sh_entsize fields: high bit means it |
1724 | belongs in init. */ | 1749 | belongs in init. */ |
1725 | static void layout_sections(struct module *mod, | 1750 | static void layout_sections(struct module *mod, struct load_info *info) |
1726 | const Elf_Ehdr *hdr, | ||
1727 | Elf_Shdr *sechdrs, | ||
1728 | const char *secstrings) | ||
1729 | { | 1751 | { |
1730 | static unsigned long const masks[][2] = { | 1752 | static unsigned long const masks[][2] = { |
1731 | /* NOTE: all executable code must be the first section | 1753 | /* NOTE: all executable code must be the first section |
@@ -1738,21 +1760,22 @@ static void layout_sections(struct module *mod, | |||
1738 | }; | 1760 | }; |
1739 | unsigned int m, i; | 1761 | unsigned int m, i; |
1740 | 1762 | ||
1741 | for (i = 0; i < hdr->e_shnum; i++) | 1763 | for (i = 0; i < info->hdr->e_shnum; i++) |
1742 | sechdrs[i].sh_entsize = ~0UL; | 1764 | info->sechdrs[i].sh_entsize = ~0UL; |
1743 | 1765 | ||
1744 | DEBUGP("Core section allocation order:\n"); | 1766 | DEBUGP("Core section allocation order:\n"); |
1745 | for (m = 0; m < ARRAY_SIZE(masks); ++m) { | 1767 | for (m = 0; m < ARRAY_SIZE(masks); ++m) { |
1746 | for (i = 0; i < hdr->e_shnum; ++i) { | 1768 | for (i = 0; i < info->hdr->e_shnum; ++i) { |
1747 | Elf_Shdr *s = &sechdrs[i]; | 1769 | Elf_Shdr *s = &info->sechdrs[i]; |
1770 | const char *sname = info->secstrings + s->sh_name; | ||
1748 | 1771 | ||
1749 | if ((s->sh_flags & masks[m][0]) != masks[m][0] | 1772 | if ((s->sh_flags & masks[m][0]) != masks[m][0] |
1750 | || (s->sh_flags & masks[m][1]) | 1773 | || (s->sh_flags & masks[m][1]) |
1751 | || s->sh_entsize != ~0UL | 1774 | || s->sh_entsize != ~0UL |
1752 | || strstarts(secstrings + s->sh_name, ".init")) | 1775 | || strstarts(sname, ".init")) |
1753 | continue; | 1776 | continue; |
1754 | s->sh_entsize = get_offset(mod, &mod->core_size, s, i); | 1777 | s->sh_entsize = get_offset(mod, &mod->core_size, s, i); |
1755 | DEBUGP("\t%s\n", secstrings + s->sh_name); | 1778 | DEBUGP("\t%s\n", sname); |
1756 | } | 1779 | } |
1757 | if (m == 0) | 1780 | if (m == 0) |
1758 | mod->core_text_size = mod->core_size; | 1781 | mod->core_text_size = mod->core_size; |
@@ -1760,17 +1783,18 @@ static void layout_sections(struct module *mod, | |||
1760 | 1783 | ||
1761 | DEBUGP("Init section allocation order:\n"); | 1784 | DEBUGP("Init section allocation order:\n"); |
1762 | for (m = 0; m < ARRAY_SIZE(masks); ++m) { | 1785 | for (m = 0; m < ARRAY_SIZE(masks); ++m) { |
1763 | for (i = 0; i < hdr->e_shnum; ++i) { | 1786 | for (i = 0; i < info->hdr->e_shnum; ++i) { |
1764 | Elf_Shdr *s = &sechdrs[i]; | 1787 | Elf_Shdr *s = &info->sechdrs[i]; |
1788 | const char *sname = info->secstrings + s->sh_name; | ||
1765 | 1789 | ||
1766 | if ((s->sh_flags & masks[m][0]) != masks[m][0] | 1790 | if ((s->sh_flags & masks[m][0]) != masks[m][0] |
1767 | || (s->sh_flags & masks[m][1]) | 1791 | || (s->sh_flags & masks[m][1]) |
1768 | || s->sh_entsize != ~0UL | 1792 | || s->sh_entsize != ~0UL |
1769 | || !strstarts(secstrings + s->sh_name, ".init")) | 1793 | || !strstarts(sname, ".init")) |
1770 | continue; | 1794 | continue; |
1771 | s->sh_entsize = (get_offset(mod, &mod->init_size, s, i) | 1795 | s->sh_entsize = (get_offset(mod, &mod->init_size, s, i) |
1772 | | INIT_OFFSET_MASK); | 1796 | | INIT_OFFSET_MASK); |
1773 | DEBUGP("\t%s\n", secstrings + s->sh_name); | 1797 | DEBUGP("\t%s\n", sname); |
1774 | } | 1798 | } |
1775 | if (m == 0) | 1799 | if (m == 0) |
1776 | mod->init_text_size = mod->init_size; | 1800 | mod->init_text_size = mod->init_size; |
@@ -1809,33 +1833,28 @@ static char *next_string(char *string, unsigned long *secsize) | |||
1809 | return string; | 1833 | return string; |
1810 | } | 1834 | } |
1811 | 1835 | ||
1812 | static char *get_modinfo(Elf_Shdr *sechdrs, | 1836 | static char *get_modinfo(struct load_info *info, const char *tag) |
1813 | unsigned int info, | ||
1814 | const char *tag) | ||
1815 | { | 1837 | { |
1816 | char *p; | 1838 | char *p; |
1817 | unsigned int taglen = strlen(tag); | 1839 | unsigned int taglen = strlen(tag); |
1818 | unsigned long size = sechdrs[info].sh_size; | 1840 | Elf_Shdr *infosec = &info->sechdrs[info->index.info]; |
1841 | unsigned long size = infosec->sh_size; | ||
1819 | 1842 | ||
1820 | for (p = (char *)sechdrs[info].sh_addr; p; p = next_string(p, &size)) { | 1843 | for (p = (char *)infosec->sh_addr; p; p = next_string(p, &size)) { |
1821 | if (strncmp(p, tag, taglen) == 0 && p[taglen] == '=') | 1844 | if (strncmp(p, tag, taglen) == 0 && p[taglen] == '=') |
1822 | return p + taglen + 1; | 1845 | return p + taglen + 1; |
1823 | } | 1846 | } |
1824 | return NULL; | 1847 | return NULL; |
1825 | } | 1848 | } |
1826 | 1849 | ||
1827 | static void setup_modinfo(struct module *mod, Elf_Shdr *sechdrs, | 1850 | static void setup_modinfo(struct module *mod, struct load_info *info) |
1828 | unsigned int infoindex) | ||
1829 | { | 1851 | { |
1830 | struct module_attribute *attr; | 1852 | struct module_attribute *attr; |
1831 | int i; | 1853 | int i; |
1832 | 1854 | ||
1833 | for (i = 0; (attr = modinfo_attrs[i]); i++) { | 1855 | for (i = 0; (attr = modinfo_attrs[i]); i++) { |
1834 | if (attr->setup) | 1856 | if (attr->setup) |
1835 | attr->setup(mod, | 1857 | attr->setup(mod, get_modinfo(info, attr->attr.name)); |
1836 | get_modinfo(sechdrs, | ||
1837 | infoindex, | ||
1838 | attr->attr.name)); | ||
1839 | } | 1858 | } |
1840 | } | 1859 | } |
1841 | 1860 | ||
@@ -1876,11 +1895,10 @@ static int is_exported(const char *name, unsigned long value, | |||
1876 | } | 1895 | } |
1877 | 1896 | ||
1878 | /* As per nm */ | 1897 | /* As per nm */ |
1879 | static char elf_type(const Elf_Sym *sym, | 1898 | static char elf_type(const Elf_Sym *sym, const struct load_info *info) |
1880 | Elf_Shdr *sechdrs, | ||
1881 | const char *secstrings, | ||
1882 | struct module *mod) | ||
1883 | { | 1899 | { |
1900 | const Elf_Shdr *sechdrs = info->sechdrs; | ||
1901 | |||
1884 | if (ELF_ST_BIND(sym->st_info) == STB_WEAK) { | 1902 | if (ELF_ST_BIND(sym->st_info) == STB_WEAK) { |
1885 | if (ELF_ST_TYPE(sym->st_info) == STT_OBJECT) | 1903 | if (ELF_ST_TYPE(sym->st_info) == STT_OBJECT) |
1886 | return 'v'; | 1904 | return 'v'; |
@@ -1910,8 +1928,10 @@ static char elf_type(const Elf_Sym *sym, | |||
1910 | else | 1928 | else |
1911 | return 'b'; | 1929 | return 'b'; |
1912 | } | 1930 | } |
1913 | if (strstarts(secstrings + sechdrs[sym->st_shndx].sh_name, ".debug")) | 1931 | if (strstarts(info->secstrings + sechdrs[sym->st_shndx].sh_name, |
1932 | ".debug")) { | ||
1914 | return 'n'; | 1933 | return 'n'; |
1934 | } | ||
1915 | return '?'; | 1935 | return '?'; |
1916 | } | 1936 | } |
1917 | 1937 | ||
@@ -1936,127 +1956,96 @@ static bool is_core_symbol(const Elf_Sym *src, const Elf_Shdr *sechdrs, | |||
1936 | return true; | 1956 | return true; |
1937 | } | 1957 | } |
1938 | 1958 | ||
1939 | static unsigned long layout_symtab(struct module *mod, | 1959 | static void layout_symtab(struct module *mod, struct load_info *info) |
1940 | Elf_Shdr *sechdrs, | ||
1941 | unsigned int symindex, | ||
1942 | unsigned int strindex, | ||
1943 | const Elf_Ehdr *hdr, | ||
1944 | const char *secstrings, | ||
1945 | unsigned long *pstroffs, | ||
1946 | unsigned long *strmap) | ||
1947 | { | 1960 | { |
1948 | unsigned long symoffs; | 1961 | Elf_Shdr *symsect = info->sechdrs + info->index.sym; |
1949 | Elf_Shdr *symsect = sechdrs + symindex; | 1962 | Elf_Shdr *strsect = info->sechdrs + info->index.str; |
1950 | Elf_Shdr *strsect = sechdrs + strindex; | ||
1951 | const Elf_Sym *src; | 1963 | const Elf_Sym *src; |
1952 | const char *strtab; | ||
1953 | unsigned int i, nsrc, ndst; | 1964 | unsigned int i, nsrc, ndst; |
1954 | 1965 | ||
1955 | /* Put symbol section at end of init part of module. */ | 1966 | /* Put symbol section at end of init part of module. */ |
1956 | symsect->sh_flags |= SHF_ALLOC; | 1967 | symsect->sh_flags |= SHF_ALLOC; |
1957 | symsect->sh_entsize = get_offset(mod, &mod->init_size, symsect, | 1968 | symsect->sh_entsize = get_offset(mod, &mod->init_size, symsect, |
1958 | symindex) | INIT_OFFSET_MASK; | 1969 | info->index.sym) | INIT_OFFSET_MASK; |
1959 | DEBUGP("\t%s\n", secstrings + symsect->sh_name); | 1970 | DEBUGP("\t%s\n", info->secstrings + symsect->sh_name); |
1960 | 1971 | ||
1961 | src = (void *)hdr + symsect->sh_offset; | 1972 | src = (void *)info->hdr + symsect->sh_offset; |
1962 | nsrc = symsect->sh_size / sizeof(*src); | 1973 | nsrc = symsect->sh_size / sizeof(*src); |
1963 | strtab = (void *)hdr + strsect->sh_offset; | ||
1964 | for (ndst = i = 1; i < nsrc; ++i, ++src) | 1974 | for (ndst = i = 1; i < nsrc; ++i, ++src) |
1965 | if (is_core_symbol(src, sechdrs, hdr->e_shnum)) { | 1975 | if (is_core_symbol(src, info->sechdrs, info->hdr->e_shnum)) { |
1966 | unsigned int j = src->st_name; | 1976 | unsigned int j = src->st_name; |
1967 | 1977 | ||
1968 | while(!__test_and_set_bit(j, strmap) && strtab[j]) | 1978 | while (!__test_and_set_bit(j, info->strmap) |
1979 | && info->strtab[j]) | ||
1969 | ++j; | 1980 | ++j; |
1970 | ++ndst; | 1981 | ++ndst; |
1971 | } | 1982 | } |
1972 | 1983 | ||
1973 | /* Append room for core symbols at end of core part. */ | 1984 | /* Append room for core symbols at end of core part. */ |
1974 | symoffs = ALIGN(mod->core_size, symsect->sh_addralign ?: 1); | 1985 | info->symoffs = ALIGN(mod->core_size, symsect->sh_addralign ?: 1); |
1975 | mod->core_size = symoffs + ndst * sizeof(Elf_Sym); | 1986 | mod->core_size = info->symoffs + ndst * sizeof(Elf_Sym); |
1976 | 1987 | ||
1977 | /* Put string table section at end of init part of module. */ | 1988 | /* Put string table section at end of init part of module. */ |
1978 | strsect->sh_flags |= SHF_ALLOC; | 1989 | strsect->sh_flags |= SHF_ALLOC; |
1979 | strsect->sh_entsize = get_offset(mod, &mod->init_size, strsect, | 1990 | strsect->sh_entsize = get_offset(mod, &mod->init_size, strsect, |
1980 | strindex) | INIT_OFFSET_MASK; | 1991 | info->index.str) | INIT_OFFSET_MASK; |
1981 | DEBUGP("\t%s\n", secstrings + strsect->sh_name); | 1992 | DEBUGP("\t%s\n", info->secstrings + strsect->sh_name); |
1982 | 1993 | ||
1983 | /* Append room for core symbols' strings at end of core part. */ | 1994 | /* Append room for core symbols' strings at end of core part. */ |
1984 | *pstroffs = mod->core_size; | 1995 | info->stroffs = mod->core_size; |
1985 | __set_bit(0, strmap); | 1996 | __set_bit(0, info->strmap); |
1986 | mod->core_size += bitmap_weight(strmap, strsect->sh_size); | 1997 | mod->core_size += bitmap_weight(info->strmap, strsect->sh_size); |
1987 | |||
1988 | return symoffs; | ||
1989 | } | 1998 | } |
1990 | 1999 | ||
1991 | static void add_kallsyms(struct module *mod, | 2000 | static void add_kallsyms(struct module *mod, const struct load_info *info) |
1992 | Elf_Shdr *sechdrs, | ||
1993 | unsigned int shnum, | ||
1994 | unsigned int symindex, | ||
1995 | unsigned int strindex, | ||
1996 | unsigned long symoffs, | ||
1997 | unsigned long stroffs, | ||
1998 | const char *secstrings, | ||
1999 | unsigned long *strmap) | ||
2000 | { | 2001 | { |
2001 | unsigned int i, ndst; | 2002 | unsigned int i, ndst; |
2002 | const Elf_Sym *src; | 2003 | const Elf_Sym *src; |
2003 | Elf_Sym *dst; | 2004 | Elf_Sym *dst; |
2004 | char *s; | 2005 | char *s; |
2006 | Elf_Shdr *symsec = &info->sechdrs[info->index.sym]; | ||
2005 | 2007 | ||
2006 | mod->symtab = (void *)sechdrs[symindex].sh_addr; | 2008 | mod->symtab = (void *)symsec->sh_addr; |
2007 | mod->num_symtab = sechdrs[symindex].sh_size / sizeof(Elf_Sym); | 2009 | mod->num_symtab = symsec->sh_size / sizeof(Elf_Sym); |
2008 | mod->strtab = (void *)sechdrs[strindex].sh_addr; | 2010 | /* Make sure we get permanent strtab: don't use info->strtab. */ |
2011 | mod->strtab = (void *)info->sechdrs[info->index.str].sh_addr; | ||
2009 | 2012 | ||
2010 | /* Set types up while we still have access to sections. */ | 2013 | /* Set types up while we still have access to sections. */ |
2011 | for (i = 0; i < mod->num_symtab; i++) | 2014 | for (i = 0; i < mod->num_symtab; i++) |
2012 | mod->symtab[i].st_info | 2015 | mod->symtab[i].st_info = elf_type(&mod->symtab[i], info); |
2013 | = elf_type(&mod->symtab[i], sechdrs, secstrings, mod); | ||
2014 | 2016 | ||
2015 | mod->core_symtab = dst = mod->module_core + symoffs; | 2017 | mod->core_symtab = dst = mod->module_core + info->symoffs; |
2016 | src = mod->symtab; | 2018 | src = mod->symtab; |
2017 | *dst = *src; | 2019 | *dst = *src; |
2018 | for (ndst = i = 1; i < mod->num_symtab; ++i, ++src) { | 2020 | for (ndst = i = 1; i < mod->num_symtab; ++i, ++src) { |
2019 | if (!is_core_symbol(src, sechdrs, shnum)) | 2021 | if (!is_core_symbol(src, info->sechdrs, info->hdr->e_shnum)) |
2020 | continue; | 2022 | continue; |
2021 | dst[ndst] = *src; | 2023 | dst[ndst] = *src; |
2022 | dst[ndst].st_name = bitmap_weight(strmap, dst[ndst].st_name); | 2024 | dst[ndst].st_name = bitmap_weight(info->strmap, |
2025 | dst[ndst].st_name); | ||
2023 | ++ndst; | 2026 | ++ndst; |
2024 | } | 2027 | } |
2025 | mod->core_num_syms = ndst; | 2028 | mod->core_num_syms = ndst; |
2026 | 2029 | ||
2027 | mod->core_strtab = s = mod->module_core + stroffs; | 2030 | mod->core_strtab = s = mod->module_core + info->stroffs; |
2028 | for (*s = 0, i = 1; i < sechdrs[strindex].sh_size; ++i) | 2031 | for (*s = 0, i = 1; i < info->sechdrs[info->index.str].sh_size; ++i) |
2029 | if (test_bit(i, strmap)) | 2032 | if (test_bit(i, info->strmap)) |
2030 | *++s = mod->strtab[i]; | 2033 | *++s = mod->strtab[i]; |
2031 | } | 2034 | } |
2032 | #else | 2035 | #else |
2033 | static inline unsigned long layout_symtab(struct module *mod, | 2036 | static inline void layout_symtab(struct module *mod, struct load_info *info) |
2034 | Elf_Shdr *sechdrs, | ||
2035 | unsigned int symindex, | ||
2036 | unsigned int strindex, | ||
2037 | const Elf_Ehdr *hdr, | ||
2038 | const char *secstrings, | ||
2039 | unsigned long *pstroffs, | ||
2040 | unsigned long *strmap) | ||
2041 | { | 2037 | { |
2042 | return 0; | ||
2043 | } | 2038 | } |
2044 | 2039 | ||
2045 | static inline void add_kallsyms(struct module *mod, | 2040 | static void add_kallsyms(struct module *mod, const struct load_info *info) |
2046 | Elf_Shdr *sechdrs, | ||
2047 | unsigned int shnum, | ||
2048 | unsigned int symindex, | ||
2049 | unsigned int strindex, | ||
2050 | unsigned long symoffs, | ||
2051 | unsigned long stroffs, | ||
2052 | const char *secstrings, | ||
2053 | const unsigned long *strmap) | ||
2054 | { | 2041 | { |
2055 | } | 2042 | } |
2056 | #endif /* CONFIG_KALLSYMS */ | 2043 | #endif /* CONFIG_KALLSYMS */ |
2057 | 2044 | ||
2058 | static void dynamic_debug_setup(struct _ddebug *debug, unsigned int num) | 2045 | static void dynamic_debug_setup(struct _ddebug *debug, unsigned int num) |
2059 | { | 2046 | { |
2047 | if (!debug) | ||
2048 | return; | ||
2060 | #ifdef CONFIG_DYNAMIC_DEBUG | 2049 | #ifdef CONFIG_DYNAMIC_DEBUG |
2061 | if (ddebug_add_module(debug, num, debug->modname)) | 2050 | if (ddebug_add_module(debug, num, debug->modname)) |
2062 | printk(KERN_ERR "dynamic debug error adding module: %s\n", | 2051 | printk(KERN_ERR "dynamic debug error adding module: %s\n", |
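Editor's note: the layout_symtab()/add_kallsyms() hunks above compact the module's string table by marking, in a bitmap (info->strmap), every byte of every string a core symbol needs, then renumbering each st_name as the count of marked bytes before the old offset (bitmap_weight()). A minimal userspace sketch of that idea follows; the toy string table and offsets are invented for illustration and are not taken from the patch.

	#include <stdio.h>

	/* toy string table; offset 0 is the empty string, as in ELF */
	static const char strtab[] = "\0init_fn\0core_fn\0exit_fn";
	static unsigned char keep[sizeof(strtab)];

	/* number of kept bytes strictly before 'off' == the new offset,
	 * mirroring bitmap_weight(info->strmap, st_name) */
	static unsigned int new_offset(unsigned int off)
	{
		unsigned int i, w = 0;
		for (i = 0; i < off; i++)
			w += keep[i];
		return w;
	}

	int main(void)
	{
		unsigned int off = 9;	/* old st_name of the one "core" symbol */
		unsigned int i;

		keep[0] = 1;			/* __set_bit(0, strmap) */
		for (i = off; ; i++) {		/* mark the string and its NUL */
			keep[i] = 1;
			if (!strtab[i])
				break;
		}

		printf("\"%s\": old offset %u -> new offset %u\n",
		       strtab + off, off, new_offset(off));
		return 0;
	}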
@@ -2087,65 +2076,47 @@ static void *module_alloc_update_bounds(unsigned long size) | |||
2087 | } | 2076 | } |
2088 | 2077 | ||
2089 | #ifdef CONFIG_DEBUG_KMEMLEAK | 2078 | #ifdef CONFIG_DEBUG_KMEMLEAK |
2090 | static void kmemleak_load_module(struct module *mod, Elf_Ehdr *hdr, | 2079 | static void kmemleak_load_module(const struct module *mod, |
2091 | Elf_Shdr *sechdrs, char *secstrings) | 2080 | const struct load_info *info) |
2092 | { | 2081 | { |
2093 | unsigned int i; | 2082 | unsigned int i; |
2094 | 2083 | ||
2095 | /* only scan the sections containing data */ | 2084 | /* only scan the sections containing data */ |
2096 | kmemleak_scan_area(mod, sizeof(struct module), GFP_KERNEL); | 2085 | kmemleak_scan_area(mod, sizeof(struct module), GFP_KERNEL); |
2097 | 2086 | ||
2098 | for (i = 1; i < hdr->e_shnum; i++) { | 2087 | for (i = 1; i < info->hdr->e_shnum; i++) { |
2099 | if (!(sechdrs[i].sh_flags & SHF_ALLOC)) | 2088 | const char *name = info->secstrings + info->sechdrs[i].sh_name; |
2089 | if (!(info->sechdrs[i].sh_flags & SHF_ALLOC)) | ||
2100 | continue; | 2090 | continue; |
2101 | if (strncmp(secstrings + sechdrs[i].sh_name, ".data", 5) != 0 | 2091 | if (!strstarts(name, ".data") && !strstarts(name, ".bss")) |
2102 | && strncmp(secstrings + sechdrs[i].sh_name, ".bss", 4) != 0) | ||
2103 | continue; | 2092 | continue; |
2104 | 2093 | ||
2105 | kmemleak_scan_area((void *)sechdrs[i].sh_addr, | 2094 | kmemleak_scan_area((void *)info->sechdrs[i].sh_addr, |
2106 | sechdrs[i].sh_size, GFP_KERNEL); | 2095 | info->sechdrs[i].sh_size, GFP_KERNEL); |
2107 | } | 2096 | } |
2108 | } | 2097 | } |
2109 | #else | 2098 | #else |
2110 | static inline void kmemleak_load_module(struct module *mod, Elf_Ehdr *hdr, | 2099 | static inline void kmemleak_load_module(const struct module *mod, |
2111 | Elf_Shdr *sechdrs, char *secstrings) | 2100 | const struct load_info *info) |
2112 | { | 2101 | { |
2113 | } | 2102 | } |
2114 | #endif | 2103 | #endif |
2115 | 2104 | ||
2116 | /* Allocate and load the module: note that size of section 0 is always | 2105 | /* Sets info->hdr and info->len. */ |
2117 | zero, and we rely on this for optional sections. */ | 2106 | static int copy_and_check(struct load_info *info, |
2118 | static noinline struct module *load_module(void __user *umod, | 2107 | const void __user *umod, unsigned long len, |
2119 | unsigned long len, | 2108 | const char __user *uargs) |
2120 | const char __user *uargs) | ||
2121 | { | 2109 | { |
2110 | int err; | ||
2122 | Elf_Ehdr *hdr; | 2111 | Elf_Ehdr *hdr; |
2123 | Elf_Shdr *sechdrs; | ||
2124 | char *secstrings, *args, *modmagic, *strtab = NULL; | ||
2125 | char *staging; | ||
2126 | unsigned int i; | ||
2127 | unsigned int symindex = 0; | ||
2128 | unsigned int strindex = 0; | ||
2129 | unsigned int modindex, versindex, infoindex, pcpuindex; | ||
2130 | struct module *mod; | ||
2131 | long err = 0; | ||
2132 | void *ptr = NULL; /* Stops spurious gcc warning */ | ||
2133 | unsigned long symoffs, stroffs, *strmap; | ||
2134 | void __percpu *percpu; | ||
2135 | struct _ddebug *debug = NULL; | ||
2136 | unsigned int num_debug = 0; | ||
2137 | 2112 | ||
2138 | mm_segment_t old_fs; | ||
2139 | |||
2140 | DEBUGP("load_module: umod=%p, len=%lu, uargs=%p\n", | ||
2141 | umod, len, uargs); | ||
2142 | if (len < sizeof(*hdr)) | 2113 | if (len < sizeof(*hdr)) |
2143 | return ERR_PTR(-ENOEXEC); | 2114 | return -ENOEXEC; |
2144 | 2115 | ||
2145 | /* Suck in entire file: we'll want most of it. */ | 2116 | /* Suck in entire file: we'll want most of it. */ |
2146 | /* vmalloc barfs on "unusual" numbers. Check here */ | 2117 | /* vmalloc barfs on "unusual" numbers. Check here */ |
2147 | if (len > 64 * 1024 * 1024 || (hdr = vmalloc(len)) == NULL) | 2118 | if (len > 64 * 1024 * 1024 || (hdr = vmalloc(len)) == NULL) |
2148 | return ERR_PTR(-ENOMEM); | 2119 | return -ENOMEM; |
2149 | 2120 | ||
2150 | if (copy_from_user(hdr, umod, len) != 0) { | 2121 | if (copy_from_user(hdr, umod, len) != 0) { |
2151 | err = -EFAULT; | 2122 | err = -EFAULT; |
@@ -2153,135 +2124,230 @@ static noinline struct module *load_module(void __user *umod, | |||
2153 | } | 2124 | } |
2154 | 2125 | ||
2155 | /* Sanity checks against insmoding binaries or wrong arch, | 2126 | /* Sanity checks against insmoding binaries or wrong arch, |
2156 | weird elf version */ | 2127 | weird elf version */ |
2157 | if (memcmp(hdr->e_ident, ELFMAG, SELFMAG) != 0 | 2128 | if (memcmp(hdr->e_ident, ELFMAG, SELFMAG) != 0 |
2158 | || hdr->e_type != ET_REL | 2129 | || hdr->e_type != ET_REL |
2159 | || !elf_check_arch(hdr) | 2130 | || !elf_check_arch(hdr) |
2160 | || hdr->e_shentsize != sizeof(*sechdrs)) { | 2131 | || hdr->e_shentsize != sizeof(Elf_Shdr)) { |
2161 | err = -ENOEXEC; | 2132 | err = -ENOEXEC; |
2162 | goto free_hdr; | 2133 | goto free_hdr; |
2163 | } | 2134 | } |
2164 | 2135 | ||
2165 | if (len < hdr->e_shoff + hdr->e_shnum * sizeof(Elf_Shdr)) | 2136 | if (len < hdr->e_shoff + hdr->e_shnum * sizeof(Elf_Shdr)) { |
2166 | goto truncated; | 2137 | err = -ENOEXEC; |
2138 | goto free_hdr; | ||
2139 | } | ||
2167 | 2140 | ||
2168 | /* Convenience variables */ | 2141 | info->hdr = hdr; |
2169 | sechdrs = (void *)hdr + hdr->e_shoff; | 2142 | info->len = len; |
2170 | secstrings = (void *)hdr + sechdrs[hdr->e_shstrndx].sh_offset; | 2143 | return 0; |
2171 | sechdrs[0].sh_addr = 0; | ||
2172 | 2144 | ||
2173 | for (i = 1; i < hdr->e_shnum; i++) { | 2145 | free_hdr: |
2174 | if (sechdrs[i].sh_type != SHT_NOBITS | 2146 | vfree(hdr); |
2175 | && len < sechdrs[i].sh_offset + sechdrs[i].sh_size) | 2147 | return err; |
2176 | goto truncated; | 2148 | } |
2149 | |||
2150 | static void free_copy(struct load_info *info) | ||
2151 | { | ||
2152 | vfree(info->hdr); | ||
2153 | } | ||
2154 | |||
2155 | static int rewrite_section_headers(struct load_info *info) | ||
2156 | { | ||
2157 | unsigned int i; | ||
2158 | |||
2159 | /* This should always be true, but let's be sure. */ | ||
2160 | info->sechdrs[0].sh_addr = 0; | ||
2161 | |||
2162 | for (i = 1; i < info->hdr->e_shnum; i++) { | ||
2163 | Elf_Shdr *shdr = &info->sechdrs[i]; | ||
2164 | if (shdr->sh_type != SHT_NOBITS | ||
2165 | && info->len < shdr->sh_offset + shdr->sh_size) { | ||
2166 | printk(KERN_ERR "Module len %lu truncated\n", | ||
2167 | info->len); | ||
2168 | return -ENOEXEC; | ||
2169 | } | ||
2177 | 2170 | ||
2178 | /* Mark all sections sh_addr with their address in the | 2171 | /* Mark all sections sh_addr with their address in the |
2179 | temporary image. */ | 2172 | temporary image. */ |
2180 | sechdrs[i].sh_addr = (size_t)hdr + sechdrs[i].sh_offset; | 2173 | shdr->sh_addr = (size_t)info->hdr + shdr->sh_offset; |
2181 | 2174 | ||
2182 | /* Internal symbols and strings. */ | ||
2183 | if (sechdrs[i].sh_type == SHT_SYMTAB) { | ||
2184 | symindex = i; | ||
2185 | strindex = sechdrs[i].sh_link; | ||
2186 | strtab = (char *)hdr + sechdrs[strindex].sh_offset; | ||
2187 | } | ||
2188 | #ifndef CONFIG_MODULE_UNLOAD | 2175 | #ifndef CONFIG_MODULE_UNLOAD |
2189 | /* Don't load .exit sections */ | 2176 | /* Don't load .exit sections */ |
2190 | if (strstarts(secstrings+sechdrs[i].sh_name, ".exit")) | 2177 | if (strstarts(info->secstrings+shdr->sh_name, ".exit")) |
2191 | sechdrs[i].sh_flags &= ~(unsigned long)SHF_ALLOC; | 2178 | shdr->sh_flags &= ~(unsigned long)SHF_ALLOC; |
2192 | #endif | 2179 | #endif |
2193 | } | 2180 | } |
2194 | 2181 | ||
2195 | modindex = find_sec(hdr, sechdrs, secstrings, | 2182 | /* Track but don't keep modinfo and version sections. */ |
2196 | ".gnu.linkonce.this_module"); | 2183 | info->index.vers = find_sec(info, "__versions"); |
2197 | if (!modindex) { | 2184 | info->index.info = find_sec(info, ".modinfo"); |
2185 | info->sechdrs[info->index.info].sh_flags &= ~(unsigned long)SHF_ALLOC; | ||
2186 | info->sechdrs[info->index.vers].sh_flags &= ~(unsigned long)SHF_ALLOC; | ||
2187 | return 0; | ||
2188 | } | ||
2189 | |||
2190 | /* | ||
2191 | * Set up our basic convenience variables (pointers to section headers, | ||
2192 | * search for module section index etc), and do some basic section | ||
2193 | * verification. | ||
2194 | * | ||
2195 | * Return the temporary module pointer (we'll replace it with the final | ||
2196 | * one when we move the module sections around). | ||
2197 | */ | ||
2198 | static struct module *setup_load_info(struct load_info *info) | ||
2199 | { | ||
2200 | unsigned int i; | ||
2201 | int err; | ||
2202 | struct module *mod; | ||
2203 | |||
2204 | /* Set up the convenience variables */ | ||
2205 | info->sechdrs = (void *)info->hdr + info->hdr->e_shoff; | ||
2206 | info->secstrings = (void *)info->hdr | ||
2207 | + info->sechdrs[info->hdr->e_shstrndx].sh_offset; | ||
2208 | |||
2209 | err = rewrite_section_headers(info); | ||
2210 | if (err) | ||
2211 | return ERR_PTR(err); | ||
2212 | |||
2213 | /* Find internal symbols and strings. */ | ||
2214 | for (i = 1; i < info->hdr->e_shnum; i++) { | ||
2215 | if (info->sechdrs[i].sh_type == SHT_SYMTAB) { | ||
2216 | info->index.sym = i; | ||
2217 | info->index.str = info->sechdrs[i].sh_link; | ||
2218 | info->strtab = (char *)info->hdr | ||
2219 | + info->sechdrs[info->index.str].sh_offset; | ||
2220 | break; | ||
2221 | } | ||
2222 | } | ||
2223 | |||
2224 | info->index.mod = find_sec(info, ".gnu.linkonce.this_module"); | ||
2225 | if (!info->index.mod) { | ||
2198 | printk(KERN_WARNING "No module found in object\n"); | 2226 | printk(KERN_WARNING "No module found in object\n"); |
2199 | err = -ENOEXEC; | 2227 | return ERR_PTR(-ENOEXEC); |
2200 | goto free_hdr; | ||
2201 | } | 2228 | } |
2202 | /* This is temporary: point mod into copy of data. */ | 2229 | /* This is temporary: point mod into copy of data. */ |
2203 | mod = (void *)sechdrs[modindex].sh_addr; | 2230 | mod = (void *)info->sechdrs[info->index.mod].sh_addr; |
2204 | 2231 | ||
2205 | if (symindex == 0) { | 2232 | if (info->index.sym == 0) { |
2206 | printk(KERN_WARNING "%s: module has no symbols (stripped?)\n", | 2233 | printk(KERN_WARNING "%s: module has no symbols (stripped?)\n", |
2207 | mod->name); | 2234 | mod->name); |
2208 | err = -ENOEXEC; | 2235 | return ERR_PTR(-ENOEXEC); |
2209 | goto free_hdr; | ||
2210 | } | 2236 | } |
2211 | 2237 | ||
2212 | versindex = find_sec(hdr, sechdrs, secstrings, "__versions"); | 2238 | info->index.pcpu = find_pcpusec(info); |
2213 | infoindex = find_sec(hdr, sechdrs, secstrings, ".modinfo"); | ||
2214 | pcpuindex = find_pcpusec(hdr, sechdrs, secstrings); | ||
2215 | |||
2216 | /* Don't keep modinfo and version sections. */ | ||
2217 | sechdrs[infoindex].sh_flags &= ~(unsigned long)SHF_ALLOC; | ||
2218 | sechdrs[versindex].sh_flags &= ~(unsigned long)SHF_ALLOC; | ||
2219 | 2239 | ||
2220 | /* Check module struct version now, before we try to use module. */ | 2240 | /* Check module struct version now, before we try to use module. */ |
2221 | if (!check_modstruct_version(sechdrs, versindex, mod)) { | 2241 | if (!check_modstruct_version(info->sechdrs, info->index.vers, mod)) |
2222 | err = -ENOEXEC; | 2242 | return ERR_PTR(-ENOEXEC); |
2223 | goto free_hdr; | 2243 | |
2224 | } | 2244 | return mod; |
2245 | } | ||
2246 | |||
2247 | static int check_modinfo(struct module *mod, struct load_info *info) | ||
2248 | { | ||
2249 | const char *modmagic = get_modinfo(info, "vermagic"); | ||
2250 | int err; | ||
2225 | 2251 | ||
2226 | modmagic = get_modinfo(sechdrs, infoindex, "vermagic"); | ||
2227 | /* This is allowed: modprobe --force will invalidate it. */ | 2252 | /* This is allowed: modprobe --force will invalidate it. */ |
2228 | if (!modmagic) { | 2253 | if (!modmagic) { |
2229 | err = try_to_force_load(mod, "bad vermagic"); | 2254 | err = try_to_force_load(mod, "bad vermagic"); |
2230 | if (err) | 2255 | if (err) |
2231 | goto free_hdr; | 2256 | return err; |
2232 | } else if (!same_magic(modmagic, vermagic, versindex)) { | 2257 | } else if (!same_magic(modmagic, vermagic, info->index.vers)) { |
2233 | printk(KERN_ERR "%s: version magic '%s' should be '%s'\n", | 2258 | printk(KERN_ERR "%s: version magic '%s' should be '%s'\n", |
2234 | mod->name, modmagic, vermagic); | 2259 | mod->name, modmagic, vermagic); |
2235 | err = -ENOEXEC; | 2260 | return -ENOEXEC; |
2236 | goto free_hdr; | ||
2237 | } | 2261 | } |
2238 | 2262 | ||
2239 | staging = get_modinfo(sechdrs, infoindex, "staging"); | 2263 | if (get_modinfo(info, "staging")) { |
2240 | if (staging) { | ||
2241 | add_taint_module(mod, TAINT_CRAP); | 2264 | add_taint_module(mod, TAINT_CRAP); |
2242 | printk(KERN_WARNING "%s: module is from the staging directory," | 2265 | printk(KERN_WARNING "%s: module is from the staging directory," |
2243 | " the quality is unknown, you have been warned.\n", | 2266 | " the quality is unknown, you have been warned.\n", |
2244 | mod->name); | 2267 | mod->name); |
2245 | } | 2268 | } |
2246 | 2269 | ||
2247 | /* Now copy in args */ | 2270 | /* Set up license info based on the info section */ |
2248 | args = strndup_user(uargs, ~0UL >> 1); | 2271 | set_license(mod, get_modinfo(info, "license")); |
2249 | if (IS_ERR(args)) { | ||
2250 | err = PTR_ERR(args); | ||
2251 | goto free_hdr; | ||
2252 | } | ||
2253 | 2272 | ||
2254 | strmap = kzalloc(BITS_TO_LONGS(sechdrs[strindex].sh_size) | 2273 | return 0; |
2255 | * sizeof(long), GFP_KERNEL); | 2274 | } |
2256 | if (!strmap) { | ||
2257 | err = -ENOMEM; | ||
2258 | goto free_mod; | ||
2259 | } | ||
2260 | 2275 | ||
2261 | mod->state = MODULE_STATE_COMING; | 2276 | static void find_module_sections(struct module *mod, struct load_info *info) |
2277 | { | ||
2278 | mod->kp = section_objs(info, "__param", | ||
2279 | sizeof(*mod->kp), &mod->num_kp); | ||
2280 | mod->syms = section_objs(info, "__ksymtab", | ||
2281 | sizeof(*mod->syms), &mod->num_syms); | ||
2282 | mod->crcs = section_addr(info, "__kcrctab"); | ||
2283 | mod->gpl_syms = section_objs(info, "__ksymtab_gpl", | ||
2284 | sizeof(*mod->gpl_syms), | ||
2285 | &mod->num_gpl_syms); | ||
2286 | mod->gpl_crcs = section_addr(info, "__kcrctab_gpl"); | ||
2287 | mod->gpl_future_syms = section_objs(info, | ||
2288 | "__ksymtab_gpl_future", | ||
2289 | sizeof(*mod->gpl_future_syms), | ||
2290 | &mod->num_gpl_future_syms); | ||
2291 | mod->gpl_future_crcs = section_addr(info, "__kcrctab_gpl_future"); | ||
2262 | 2292 | ||
2263 | /* Allow arches to frob section contents and sizes. */ | 2293 | #ifdef CONFIG_UNUSED_SYMBOLS |
2264 | err = module_frob_arch_sections(hdr, sechdrs, secstrings, mod); | 2294 | mod->unused_syms = section_objs(info, "__ksymtab_unused", |
2265 | if (err < 0) | 2295 | sizeof(*mod->unused_syms), |
2266 | goto free_mod; | 2296 | &mod->num_unused_syms); |
2297 | mod->unused_crcs = section_addr(info, "__kcrctab_unused"); | ||
2298 | mod->unused_gpl_syms = section_objs(info, "__ksymtab_unused_gpl", | ||
2299 | sizeof(*mod->unused_gpl_syms), | ||
2300 | &mod->num_unused_gpl_syms); | ||
2301 | mod->unused_gpl_crcs = section_addr(info, "__kcrctab_unused_gpl"); | ||
2302 | #endif | ||
2303 | #ifdef CONFIG_CONSTRUCTORS | ||
2304 | mod->ctors = section_objs(info, ".ctors", | ||
2305 | sizeof(*mod->ctors), &mod->num_ctors); | ||
2306 | #endif | ||
2267 | 2307 | ||
2268 | if (pcpuindex) { | 2308 | #ifdef CONFIG_TRACEPOINTS |
2269 | /* We have a special allocation for this section. */ | 2309 | mod->tracepoints = section_objs(info, "__tracepoints", |
2270 | err = percpu_modalloc(mod, sechdrs[pcpuindex].sh_size, | 2310 | sizeof(*mod->tracepoints), |
2271 | sechdrs[pcpuindex].sh_addralign); | 2311 | &mod->num_tracepoints); |
2272 | if (err) | 2312 | #endif |
2273 | goto free_mod; | 2313 | #ifdef HAVE_JUMP_LABEL |
2274 | sechdrs[pcpuindex].sh_flags &= ~(unsigned long)SHF_ALLOC; | 2314 | mod->jump_entries = section_objs(info, "__jump_table", |
2275 | } | 2315 | sizeof(*mod->jump_entries), |
2276 | /* Keep this around for failure path. */ | 2316 | &mod->num_jump_entries); |
2277 | percpu = mod_percpu(mod); | 2317 | #endif |
2318 | #ifdef CONFIG_EVENT_TRACING | ||
2319 | mod->trace_events = section_objs(info, "_ftrace_events", | ||
2320 | sizeof(*mod->trace_events), | ||
2321 | &mod->num_trace_events); | ||
2322 | /* | ||
2323 | * This section contains pointers to allocated objects in the trace | ||
2324 | * code and not scanning it leads to false positives. | ||
2325 | */ | ||
2326 | kmemleak_scan_area(mod->trace_events, sizeof(*mod->trace_events) * | ||
2327 | mod->num_trace_events, GFP_KERNEL); | ||
2328 | #endif | ||
2329 | #ifdef CONFIG_FTRACE_MCOUNT_RECORD | ||
2330 | /* sechdrs[0].sh_size is always zero */ | ||
2331 | mod->ftrace_callsites = section_objs(info, "__mcount_loc", | ||
2332 | sizeof(*mod->ftrace_callsites), | ||
2333 | &mod->num_ftrace_callsites); | ||
2334 | #endif | ||
2278 | 2335 | ||
2279 | /* Determine total sizes, and put offsets in sh_entsize. For now | 2336 | mod->extable = section_objs(info, "__ex_table", |
2280 | this is done generically; there doesn't appear to be any | 2337 | sizeof(*mod->extable), &mod->num_exentries); |
2281 | special cases for the architectures. */ | 2338 | |
2282 | layout_sections(mod, hdr, sechdrs, secstrings); | 2339 | if (section_addr(info, "__obsparm")) |
2283 | symoffs = layout_symtab(mod, sechdrs, symindex, strindex, hdr, | 2340 | printk(KERN_WARNING "%s: Ignoring obsolete parameters\n", |
2284 | secstrings, &stroffs, strmap); | 2341 | mod->name); |
2342 | |||
2343 | info->debug = section_objs(info, "__verbose", | ||
2344 | sizeof(*info->debug), &info->num_debug); | ||
2345 | } | ||
2346 | |||
2347 | static int move_module(struct module *mod, struct load_info *info) | ||
2348 | { | ||
2349 | int i; | ||
2350 | void *ptr; | ||
2285 | 2351 | ||
2286 | /* Do the allocs. */ | 2352 | /* Do the allocs. */ |
2287 | ptr = module_alloc_update_bounds(mod->core_size); | 2353 | ptr = module_alloc_update_bounds(mod->core_size); |
@@ -2291,10 +2357,9 @@ static noinline struct module *load_module(void __user *umod, | |||
2291 | * leak. | 2357 | * leak. |
2292 | */ | 2358 | */ |
2293 | kmemleak_not_leak(ptr); | 2359 | kmemleak_not_leak(ptr); |
2294 | if (!ptr) { | 2360 | if (!ptr) |
2295 | err = -ENOMEM; | 2361 | return -ENOMEM; |
2296 | goto free_percpu; | 2362 | |
2297 | } | ||
2298 | memset(ptr, 0, mod->core_size); | 2363 | memset(ptr, 0, mod->core_size); |
2299 | mod->module_core = ptr; | 2364 | mod->module_core = ptr; |
2300 | 2365 | ||
@@ -2307,50 +2372,40 @@ static noinline struct module *load_module(void __user *umod, | |||
2307 | */ | 2372 | */ |
2308 | kmemleak_ignore(ptr); | 2373 | kmemleak_ignore(ptr); |
2309 | if (!ptr && mod->init_size) { | 2374 | if (!ptr && mod->init_size) { |
2310 | err = -ENOMEM; | 2375 | module_free(mod, mod->module_core); |
2311 | goto free_core; | 2376 | return -ENOMEM; |
2312 | } | 2377 | } |
2313 | memset(ptr, 0, mod->init_size); | 2378 | memset(ptr, 0, mod->init_size); |
2314 | mod->module_init = ptr; | 2379 | mod->module_init = ptr; |
2315 | 2380 | ||
2316 | /* Transfer each section which specifies SHF_ALLOC */ | 2381 | /* Transfer each section which specifies SHF_ALLOC */ |
2317 | DEBUGP("final section addresses:\n"); | 2382 | DEBUGP("final section addresses:\n"); |
2318 | for (i = 0; i < hdr->e_shnum; i++) { | 2383 | for (i = 0; i < info->hdr->e_shnum; i++) { |
2319 | void *dest; | 2384 | void *dest; |
2385 | Elf_Shdr *shdr = &info->sechdrs[i]; | ||
2320 | 2386 | ||
2321 | if (!(sechdrs[i].sh_flags & SHF_ALLOC)) | 2387 | if (!(shdr->sh_flags & SHF_ALLOC)) |
2322 | continue; | 2388 | continue; |
2323 | 2389 | ||
2324 | if (sechdrs[i].sh_entsize & INIT_OFFSET_MASK) | 2390 | if (shdr->sh_entsize & INIT_OFFSET_MASK) |
2325 | dest = mod->module_init | 2391 | dest = mod->module_init |
2326 | + (sechdrs[i].sh_entsize & ~INIT_OFFSET_MASK); | 2392 | + (shdr->sh_entsize & ~INIT_OFFSET_MASK); |
2327 | else | 2393 | else |
2328 | dest = mod->module_core + sechdrs[i].sh_entsize; | 2394 | dest = mod->module_core + shdr->sh_entsize; |
2329 | 2395 | ||
2330 | if (sechdrs[i].sh_type != SHT_NOBITS) | 2396 | if (shdr->sh_type != SHT_NOBITS) |
2331 | memcpy(dest, (void *)sechdrs[i].sh_addr, | 2397 | memcpy(dest, (void *)shdr->sh_addr, shdr->sh_size); |
2332 | sechdrs[i].sh_size); | ||
2333 | /* Update sh_addr to point to copy in image. */ | 2398 | /* Update sh_addr to point to copy in image. */ |
2334 | sechdrs[i].sh_addr = (unsigned long)dest; | 2399 | shdr->sh_addr = (unsigned long)dest; |
2335 | DEBUGP("\t0x%lx %s\n", sechdrs[i].sh_addr, secstrings + sechdrs[i].sh_name); | 2400 | DEBUGP("\t0x%lx %s\n", |
2336 | } | 2401 | shdr->sh_addr, info->secstrings + shdr->sh_name); |
2337 | /* Module has been moved. */ | ||
2338 | mod = (void *)sechdrs[modindex].sh_addr; | ||
2339 | kmemleak_load_module(mod, hdr, sechdrs, secstrings); | ||
2340 | |||
2341 | #if defined(CONFIG_MODULE_UNLOAD) | ||
2342 | mod->refptr = alloc_percpu(struct module_ref); | ||
2343 | if (!mod->refptr) { | ||
2344 | err = -ENOMEM; | ||
2345 | goto free_init; | ||
2346 | } | 2402 | } |
2347 | #endif | ||
2348 | /* Now we've moved module, initialize linked lists, etc. */ | ||
2349 | module_unload_init(mod); | ||
2350 | 2403 | ||
2351 | /* Set up license info based on the info section */ | 2404 | return 0; |
2352 | set_license(mod, get_modinfo(sechdrs, infoindex, "license")); | 2405 | } |
2353 | 2406 | ||
2407 | static int check_module_license_and_versions(struct module *mod) | ||
2408 | { | ||
2354 | /* | 2409 | /* |
2355 | * ndiswrapper is under GPL by itself, but loads proprietary modules. | 2410 | * ndiswrapper is under GPL by itself, but loads proprietary modules. |
2356 | * Don't use add_taint_module(), as it would prevent ndiswrapper from | 2411 | * Don't use add_taint_module(), as it would prevent ndiswrapper from |
@@ -2363,77 +2418,6 @@ static noinline struct module *load_module(void __user *umod, | |||
2363 | if (strcmp(mod->name, "driverloader") == 0) | 2418 | if (strcmp(mod->name, "driverloader") == 0) |
2364 | add_taint_module(mod, TAINT_PROPRIETARY_MODULE); | 2419 | add_taint_module(mod, TAINT_PROPRIETARY_MODULE); |
2365 | 2420 | ||
2366 | /* Set up MODINFO_ATTR fields */ | ||
2367 | setup_modinfo(mod, sechdrs, infoindex); | ||
2368 | |||
2369 | /* Fix up syms, so that st_value is a pointer to location. */ | ||
2370 | err = simplify_symbols(sechdrs, symindex, strtab, versindex, pcpuindex, | ||
2371 | mod); | ||
2372 | if (err < 0) | ||
2373 | goto cleanup; | ||
2374 | |||
2375 | /* Now we've got everything in the final locations, we can | ||
2376 | * find optional sections. */ | ||
2377 | mod->kp = section_objs(hdr, sechdrs, secstrings, "__param", | ||
2378 | sizeof(*mod->kp), &mod->num_kp); | ||
2379 | mod->syms = section_objs(hdr, sechdrs, secstrings, "__ksymtab", | ||
2380 | sizeof(*mod->syms), &mod->num_syms); | ||
2381 | mod->crcs = section_addr(hdr, sechdrs, secstrings, "__kcrctab"); | ||
2382 | mod->gpl_syms = section_objs(hdr, sechdrs, secstrings, "__ksymtab_gpl", | ||
2383 | sizeof(*mod->gpl_syms), | ||
2384 | &mod->num_gpl_syms); | ||
2385 | mod->gpl_crcs = section_addr(hdr, sechdrs, secstrings, "__kcrctab_gpl"); | ||
2386 | mod->gpl_future_syms = section_objs(hdr, sechdrs, secstrings, | ||
2387 | "__ksymtab_gpl_future", | ||
2388 | sizeof(*mod->gpl_future_syms), | ||
2389 | &mod->num_gpl_future_syms); | ||
2390 | mod->gpl_future_crcs = section_addr(hdr, sechdrs, secstrings, | ||
2391 | "__kcrctab_gpl_future"); | ||
2392 | |||
2393 | #ifdef CONFIG_UNUSED_SYMBOLS | ||
2394 | mod->unused_syms = section_objs(hdr, sechdrs, secstrings, | ||
2395 | "__ksymtab_unused", | ||
2396 | sizeof(*mod->unused_syms), | ||
2397 | &mod->num_unused_syms); | ||
2398 | mod->unused_crcs = section_addr(hdr, sechdrs, secstrings, | ||
2399 | "__kcrctab_unused"); | ||
2400 | mod->unused_gpl_syms = section_objs(hdr, sechdrs, secstrings, | ||
2401 | "__ksymtab_unused_gpl", | ||
2402 | sizeof(*mod->unused_gpl_syms), | ||
2403 | &mod->num_unused_gpl_syms); | ||
2404 | mod->unused_gpl_crcs = section_addr(hdr, sechdrs, secstrings, | ||
2405 | "__kcrctab_unused_gpl"); | ||
2406 | #endif | ||
2407 | #ifdef CONFIG_CONSTRUCTORS | ||
2408 | mod->ctors = section_objs(hdr, sechdrs, secstrings, ".ctors", | ||
2409 | sizeof(*mod->ctors), &mod->num_ctors); | ||
2410 | #endif | ||
2411 | |||
2412 | #ifdef CONFIG_TRACEPOINTS | ||
2413 | mod->tracepoints = section_objs(hdr, sechdrs, secstrings, | ||
2414 | "__tracepoints", | ||
2415 | sizeof(*mod->tracepoints), | ||
2416 | &mod->num_tracepoints); | ||
2417 | #endif | ||
2418 | #ifdef CONFIG_EVENT_TRACING | ||
2419 | mod->trace_events = section_objs(hdr, sechdrs, secstrings, | ||
2420 | "_ftrace_events", | ||
2421 | sizeof(*mod->trace_events), | ||
2422 | &mod->num_trace_events); | ||
2423 | /* | ||
2424 | * This section contains pointers to allocated objects in the trace | ||
2425 | * code and not scanning it leads to false positives. | ||
2426 | */ | ||
2427 | kmemleak_scan_area(mod->trace_events, sizeof(*mod->trace_events) * | ||
2428 | mod->num_trace_events, GFP_KERNEL); | ||
2429 | #endif | ||
2430 | #ifdef CONFIG_FTRACE_MCOUNT_RECORD | ||
2431 | /* sechdrs[0].sh_size is always zero */ | ||
2432 | mod->ftrace_callsites = section_objs(hdr, sechdrs, secstrings, | ||
2433 | "__mcount_loc", | ||
2434 | sizeof(*mod->ftrace_callsites), | ||
2435 | &mod->num_ftrace_callsites); | ||
2436 | #endif | ||
2437 | #ifdef CONFIG_MODVERSIONS | 2421 | #ifdef CONFIG_MODVERSIONS |
2438 | if ((mod->num_syms && !mod->crcs) | 2422 | if ((mod->num_syms && !mod->crcs) |
2439 | || (mod->num_gpl_syms && !mod->gpl_crcs) | 2423 | || (mod->num_gpl_syms && !mod->gpl_crcs) |
@@ -2443,56 +2427,16 @@ static noinline struct module *load_module(void __user *umod, | |||
2443 | || (mod->num_unused_gpl_syms && !mod->unused_gpl_crcs) | 2427 | || (mod->num_unused_gpl_syms && !mod->unused_gpl_crcs) |
2444 | #endif | 2428 | #endif |
2445 | ) { | 2429 | ) { |
2446 | err = try_to_force_load(mod, | 2430 | return try_to_force_load(mod, |
2447 | "no versions for exported symbols"); | 2431 | "no versions for exported symbols"); |
2448 | if (err) | ||
2449 | goto cleanup; | ||
2450 | } | 2432 | } |
2451 | #endif | 2433 | #endif |
2434 | return 0; | ||
2435 | } | ||
2452 | 2436 | ||
2453 | /* Now do relocations. */ | 2437 | static void flush_module_icache(const struct module *mod) |
2454 | for (i = 1; i < hdr->e_shnum; i++) { | 2438 | { |
2455 | const char *strtab = (char *)sechdrs[strindex].sh_addr; | 2439 | mm_segment_t old_fs; |
2456 | unsigned int info = sechdrs[i].sh_info; | ||
2457 | |||
2458 | /* Not a valid relocation section? */ | ||
2459 | if (info >= hdr->e_shnum) | ||
2460 | continue; | ||
2461 | |||
2462 | /* Don't bother with non-allocated sections */ | ||
2463 | if (!(sechdrs[info].sh_flags & SHF_ALLOC)) | ||
2464 | continue; | ||
2465 | |||
2466 | if (sechdrs[i].sh_type == SHT_REL) | ||
2467 | err = apply_relocate(sechdrs, strtab, symindex, i,mod); | ||
2468 | else if (sechdrs[i].sh_type == SHT_RELA) | ||
2469 | err = apply_relocate_add(sechdrs, strtab, symindex, i, | ||
2470 | mod); | ||
2471 | if (err < 0) | ||
2472 | goto cleanup; | ||
2473 | } | ||
2474 | |||
2475 | /* Set up and sort exception table */ | ||
2476 | mod->extable = section_objs(hdr, sechdrs, secstrings, "__ex_table", | ||
2477 | sizeof(*mod->extable), &mod->num_exentries); | ||
2478 | sort_extable(mod->extable, mod->extable + mod->num_exentries); | ||
2479 | |||
2480 | /* Finally, copy percpu area over. */ | ||
2481 | percpu_modcopy(mod, (void *)sechdrs[pcpuindex].sh_addr, | ||
2482 | sechdrs[pcpuindex].sh_size); | ||
2483 | |||
2484 | add_kallsyms(mod, sechdrs, hdr->e_shnum, symindex, strindex, | ||
2485 | symoffs, stroffs, secstrings, strmap); | ||
2486 | kfree(strmap); | ||
2487 | strmap = NULL; | ||
2488 | |||
2489 | if (!mod->taints) | ||
2490 | debug = section_objs(hdr, sechdrs, secstrings, "__verbose", | ||
2491 | sizeof(*debug), &num_debug); | ||
2492 | |||
2493 | err = module_finalize(hdr, sechdrs, mod); | ||
2494 | if (err < 0) | ||
2495 | goto cleanup; | ||
2496 | 2440 | ||
2497 | /* flush the icache in correct context */ | 2441 | /* flush the icache in correct context */ |
2498 | old_fs = get_fs(); | 2442 | old_fs = get_fs(); |
@@ -2511,11 +2455,160 @@ static noinline struct module *load_module(void __user *umod, | |||
2511 | (unsigned long)mod->module_core + mod->core_size); | 2455 | (unsigned long)mod->module_core + mod->core_size); |
2512 | 2456 | ||
2513 | set_fs(old_fs); | 2457 | set_fs(old_fs); |
2458 | } | ||
2514 | 2459 | ||
2515 | mod->args = args; | 2460 | static struct module *layout_and_allocate(struct load_info *info) |
2516 | if (section_addr(hdr, sechdrs, secstrings, "__obsparm")) | 2461 | { |
2517 | printk(KERN_WARNING "%s: Ignoring obsolete parameters\n", | 2462 | /* Module within temporary copy. */ |
2518 | mod->name); | 2463 | struct module *mod; |
2464 | Elf_Shdr *pcpusec; | ||
2465 | int err; | ||
2466 | |||
2467 | mod = setup_load_info(info); | ||
2468 | if (IS_ERR(mod)) | ||
2469 | return mod; | ||
2470 | |||
2471 | err = check_modinfo(mod, info); | ||
2472 | if (err) | ||
2473 | return ERR_PTR(err); | ||
2474 | |||
2475 | /* Allow arches to frob section contents and sizes. */ | ||
2476 | err = module_frob_arch_sections(info->hdr, info->sechdrs, | ||
2477 | info->secstrings, mod); | ||
2478 | if (err < 0) | ||
2479 | goto out; | ||
2480 | |||
2481 | pcpusec = &info->sechdrs[info->index.pcpu]; | ||
2482 | if (pcpusec->sh_size) { | ||
2483 | /* We have a special allocation for this section. */ | ||
2484 | err = percpu_modalloc(mod, | ||
2485 | pcpusec->sh_size, pcpusec->sh_addralign); | ||
2486 | if (err) | ||
2487 | goto out; | ||
2488 | pcpusec->sh_flags &= ~(unsigned long)SHF_ALLOC; | ||
2489 | } | ||
2490 | |||
2491 | /* Determine total sizes, and put offsets in sh_entsize. For now | ||
2492 | this is done generically; there doesn't appear to be any | ||
2493 | special cases for the architectures. */ | ||
2494 | layout_sections(mod, info); | ||
2495 | |||
2496 | info->strmap = kzalloc(BITS_TO_LONGS(info->sechdrs[info->index.str].sh_size) | ||
2497 | * sizeof(long), GFP_KERNEL); | ||
2498 | if (!info->strmap) { | ||
2499 | err = -ENOMEM; | ||
2500 | goto free_percpu; | ||
2501 | } | ||
2502 | layout_symtab(mod, info); | ||
2503 | |||
2504 | /* Allocate and move to the final place */ | ||
2505 | err = move_module(mod, info); | ||
2506 | if (err) | ||
2507 | goto free_strmap; | ||
2508 | |||
2509 | /* Module has been copied to its final place now: return it. */ | ||
2510 | mod = (void *)info->sechdrs[info->index.mod].sh_addr; | ||
2511 | kmemleak_load_module(mod, info); | ||
2512 | return mod; | ||
2513 | |||
2514 | free_strmap: | ||
2515 | kfree(info->strmap); | ||
2516 | free_percpu: | ||
2517 | percpu_modfree(mod); | ||
2518 | out: | ||
2519 | return ERR_PTR(err); | ||
2520 | } | ||
2521 | |||
2522 | /* mod is no longer valid after this! */ | ||
2523 | static void module_deallocate(struct module *mod, struct load_info *info) | ||
2524 | { | ||
2525 | kfree(info->strmap); | ||
2526 | percpu_modfree(mod); | ||
2527 | module_free(mod, mod->module_init); | ||
2528 | module_free(mod, mod->module_core); | ||
2529 | } | ||
2530 | |||
2531 | static int post_relocation(struct module *mod, const struct load_info *info) | ||
2532 | { | ||
2533 | /* Sort exception table now relocations are done. */ | ||
2534 | sort_extable(mod->extable, mod->extable + mod->num_exentries); | ||
2535 | |||
2536 | /* Copy relocated percpu area over. */ | ||
2537 | percpu_modcopy(mod, (void *)info->sechdrs[info->index.pcpu].sh_addr, | ||
2538 | info->sechdrs[info->index.pcpu].sh_size); | ||
2539 | |||
2540 | /* Setup kallsyms-specific fields. */ | ||
2541 | add_kallsyms(mod, info); | ||
2542 | |||
2543 | /* Arch-specific module finalizing. */ | ||
2544 | return module_finalize(info->hdr, info->sechdrs, mod); | ||
2545 | } | ||
2546 | |||
2547 | /* Allocate and load the module: note that size of section 0 is always | ||
2548 | zero, and we rely on this for optional sections. */ | ||
2549 | static struct module *load_module(void __user *umod, | ||
2550 | unsigned long len, | ||
2551 | const char __user *uargs) | ||
2552 | { | ||
2553 | struct load_info info = { NULL, }; | ||
2554 | struct module *mod; | ||
2555 | long err; | ||
2556 | |||
2557 | DEBUGP("load_module: umod=%p, len=%lu, uargs=%p\n", | ||
2558 | umod, len, uargs); | ||
2559 | |||
2560 | /* Copy in the blobs from userspace, check they are vaguely sane. */ | ||
2561 | err = copy_and_check(&info, umod, len, uargs); | ||
2562 | if (err) | ||
2563 | return ERR_PTR(err); | ||
2564 | |||
2565 | /* Figure out module layout, and allocate all the memory. */ | ||
2566 | mod = layout_and_allocate(&info); | ||
2567 | if (IS_ERR(mod)) { | ||
2568 | err = PTR_ERR(mod); | ||
2569 | goto free_copy; | ||
2570 | } | ||
2571 | |||
2572 | /* Now module is in final location, initialize linked lists, etc. */ | ||
2573 | err = module_unload_init(mod); | ||
2574 | if (err) | ||
2575 | goto free_module; | ||
2576 | |||
2577 | /* Now we've got everything in the final locations, we can | ||
2578 | * find optional sections. */ | ||
2579 | find_module_sections(mod, &info); | ||
2580 | |||
2581 | err = check_module_license_and_versions(mod); | ||
2582 | if (err) | ||
2583 | goto free_unload; | ||
2584 | |||
2585 | /* Set up MODINFO_ATTR fields */ | ||
2586 | setup_modinfo(mod, &info); | ||
2587 | |||
2588 | /* Fix up syms, so that st_value is a pointer to location. */ | ||
2589 | err = simplify_symbols(mod, &info); | ||
2590 | if (err < 0) | ||
2591 | goto free_modinfo; | ||
2592 | |||
2593 | err = apply_relocations(mod, &info); | ||
2594 | if (err < 0) | ||
2595 | goto free_modinfo; | ||
2596 | |||
2597 | err = post_relocation(mod, &info); | ||
2598 | if (err < 0) | ||
2599 | goto free_modinfo; | ||
2600 | |||
2601 | flush_module_icache(mod); | ||
2602 | |||
2603 | /* Now copy in args */ | ||
2604 | mod->args = strndup_user(uargs, ~0UL >> 1); | ||
2605 | if (IS_ERR(mod->args)) { | ||
2606 | err = PTR_ERR(mod->args); | ||
2607 | goto free_arch_cleanup; | ||
2608 | } | ||
2609 | |||
2610 | /* Mark state as coming so strong_try_module_get() ignores us. */ | ||
2611 | mod->state = MODULE_STATE_COMING; | ||
2519 | 2612 | ||
2520 | /* Now sew it into the lists so we can get lockdep and oops | 2613 | /* Now sew it into the lists so we can get lockdep and oops |
2521 | * info during argument parsing. Noone should access us, since | 2614 | * info during argument parsing. Noone should access us, since |
@@ -2530,70 +2623,61 @@ static noinline struct module *load_module(void __user *umod, | |||
2530 | goto unlock; | 2623 | goto unlock; |
2531 | } | 2624 | } |
2532 | 2625 | ||
2533 | if (debug) | 2626 | /* This has to be done once we're sure module name is unique. */ |
2534 | dynamic_debug_setup(debug, num_debug); | 2627 | if (!mod->taints) |
2628 | dynamic_debug_setup(info.debug, info.num_debug); | ||
2535 | 2629 | ||
2536 | /* Find duplicate symbols */ | 2630 | /* Find duplicate symbols */ |
2537 | err = verify_export_symbols(mod); | 2631 | err = verify_export_symbols(mod); |
2538 | if (err < 0) | 2632 | if (err < 0) |
2539 | goto ddebug; | 2633 | goto ddebug; |
2540 | 2634 | ||
2635 | module_bug_finalize(info.hdr, info.sechdrs, mod); | ||
2541 | list_add_rcu(&mod->list, &modules); | 2636 | list_add_rcu(&mod->list, &modules); |
2542 | mutex_unlock(&module_mutex); | 2637 | mutex_unlock(&module_mutex); |
2543 | 2638 | ||
2639 | /* Module is ready to execute: parsing args may do that. */ | ||
2544 | err = parse_args(mod->name, mod->args, mod->kp, mod->num_kp, NULL); | 2640 | err = parse_args(mod->name, mod->args, mod->kp, mod->num_kp, NULL); |
2545 | if (err < 0) | 2641 | if (err < 0) |
2546 | goto unlink; | 2642 | goto unlink; |
2547 | 2643 | ||
2548 | err = mod_sysfs_setup(mod, mod->kp, mod->num_kp); | 2644 | /* Link in to sysfs. */ |
2645 | err = mod_sysfs_setup(mod, &info, mod->kp, mod->num_kp); | ||
2549 | if (err < 0) | 2646 | if (err < 0) |
2550 | goto unlink; | 2647 | goto unlink; |
2551 | 2648 | ||
2552 | add_sect_attrs(mod, hdr->e_shnum, secstrings, sechdrs); | 2649 | /* Get rid of temporary copy and strmap. */ |
2553 | add_notes_attrs(mod, hdr->e_shnum, secstrings, sechdrs); | 2650 | kfree(info.strmap); |
2554 | 2651 | free_copy(&info); | |
2555 | /* Get rid of temporary copy */ | ||
2556 | vfree(hdr); | ||
2557 | |||
2558 | trace_module_load(mod); | ||
2559 | 2652 | ||
2560 | /* Done! */ | 2653 | /* Done! */ |
2654 | trace_module_load(mod); | ||
2561 | return mod; | 2655 | return mod; |
2562 | 2656 | ||
2563 | unlink: | 2657 | unlink: |
2564 | mutex_lock(&module_mutex); | 2658 | mutex_lock(&module_mutex); |
2565 | /* Unlink carefully: kallsyms could be walking list. */ | 2659 | /* Unlink carefully: kallsyms could be walking list. */ |
2566 | list_del_rcu(&mod->list); | 2660 | list_del_rcu(&mod->list); |
2661 | module_bug_cleanup(mod); | ||
2662 | |||
2567 | ddebug: | 2663 | ddebug: |
2568 | dynamic_debug_remove(debug); | 2664 | if (!mod->taints) |
2665 | dynamic_debug_remove(info.debug); | ||
2569 | unlock: | 2666 | unlock: |
2570 | mutex_unlock(&module_mutex); | 2667 | mutex_unlock(&module_mutex); |
2571 | synchronize_sched(); | 2668 | synchronize_sched(); |
2669 | kfree(mod->args); | ||
2670 | free_arch_cleanup: | ||
2572 | module_arch_cleanup(mod); | 2671 | module_arch_cleanup(mod); |
2573 | cleanup: | 2672 | free_modinfo: |
2574 | free_modinfo(mod); | 2673 | free_modinfo(mod); |
2674 | free_unload: | ||
2575 | module_unload_free(mod); | 2675 | module_unload_free(mod); |
2576 | #if defined(CONFIG_MODULE_UNLOAD) | 2676 | free_module: |
2577 | free_percpu(mod->refptr); | 2677 | module_deallocate(mod, &info); |
2578 | free_init: | 2678 | free_copy: |
2579 | #endif | 2679 | free_copy(&info); |
2580 | module_free(mod, mod->module_init); | ||
2581 | free_core: | ||
2582 | module_free(mod, mod->module_core); | ||
2583 | /* mod will be freed with core. Don't access it beyond this line! */ | ||
2584 | free_percpu: | ||
2585 | free_percpu(percpu); | ||
2586 | free_mod: | ||
2587 | kfree(args); | ||
2588 | kfree(strmap); | ||
2589 | free_hdr: | ||
2590 | vfree(hdr); | ||
2591 | return ERR_PTR(err); | 2680 | return ERR_PTR(err); |
2592 | |||
2593 | truncated: | ||
2594 | printk(KERN_ERR "Module len %lu truncated\n", len); | ||
2595 | err = -ENOEXEC; | ||
2596 | goto free_hdr; | ||
2597 | } | 2681 | } |
2598 | 2682 | ||
2599 | /* Call module constructors. */ | 2683 | /* Call module constructors. */ |
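Editor's note: the module.c changes above split the old monolithic load_module() into small helpers (copy_and_check(), layout_and_allocate(), find_module_sections(), post_relocation(), ...), each returning an error that is unwound through goto labels in reverse order. A hedged userspace sketch of that structure, with stub functions standing in for the real helpers; nothing below is kernel code.

	#include <stdio.h>

	/* stubs standing in for the helpers named in the patch */
	static int copy_and_check(void)      { return 0; }
	static int layout_and_allocate(void) { return 0; }
	static int apply_relocations(void)   { return 0; }
	static void undo_layout(void)        { puts("undo layout/allocation"); }
	static void undo_copy(void)          { puts("free temporary copy"); }

	static int load(void)
	{
		int err;

		err = copy_and_check();
		if (err)
			return err;

		err = layout_and_allocate();
		if (err)
			goto free_copy;

		err = apply_relocations();
		if (err)
			goto free_layout;

		return 0;		/* success: module is live */

	free_layout:
		undo_layout();
	free_copy:
		undo_copy();
		return err;
	}

	int main(void)
	{
		return load() ? 1 : 0;
	}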
diff --git a/kernel/mutex.c b/kernel/mutex.c index 4c0b7b3e6d2e..200407c1502f 100644 --- a/kernel/mutex.c +++ b/kernel/mutex.c | |||
@@ -36,15 +36,6 @@ | |||
36 | # include <asm/mutex.h> | 36 | # include <asm/mutex.h> |
37 | #endif | 37 | #endif |
38 | 38 | ||
39 | /*** | ||
40 | * mutex_init - initialize the mutex | ||
41 | * @lock: the mutex to be initialized | ||
42 | * @key: the lock_class_key for the class; used by mutex lock debugging | ||
43 | * | ||
44 | * Initialize the mutex to unlocked state. | ||
45 | * | ||
46 | * It is not allowed to initialize an already locked mutex. | ||
47 | */ | ||
48 | void | 39 | void |
49 | __mutex_init(struct mutex *lock, const char *name, struct lock_class_key *key) | 40 | __mutex_init(struct mutex *lock, const char *name, struct lock_class_key *key) |
50 | { | 41 | { |
@@ -68,7 +59,7 @@ EXPORT_SYMBOL(__mutex_init); | |||
68 | static __used noinline void __sched | 59 | static __used noinline void __sched |
69 | __mutex_lock_slowpath(atomic_t *lock_count); | 60 | __mutex_lock_slowpath(atomic_t *lock_count); |
70 | 61 | ||
71 | /*** | 62 | /** |
72 | * mutex_lock - acquire the mutex | 63 | * mutex_lock - acquire the mutex |
73 | * @lock: the mutex to be acquired | 64 | * @lock: the mutex to be acquired |
74 | * | 65 | * |
@@ -105,7 +96,7 @@ EXPORT_SYMBOL(mutex_lock); | |||
105 | 96 | ||
106 | static __used noinline void __sched __mutex_unlock_slowpath(atomic_t *lock_count); | 97 | static __used noinline void __sched __mutex_unlock_slowpath(atomic_t *lock_count); |
107 | 98 | ||
108 | /*** | 99 | /** |
109 | * mutex_unlock - release the mutex | 100 | * mutex_unlock - release the mutex |
110 | * @lock: the mutex to be released | 101 | * @lock: the mutex to be released |
111 | * | 102 | * |
@@ -364,8 +355,8 @@ __mutex_lock_killable_slowpath(atomic_t *lock_count); | |||
364 | static noinline int __sched | 355 | static noinline int __sched |
365 | __mutex_lock_interruptible_slowpath(atomic_t *lock_count); | 356 | __mutex_lock_interruptible_slowpath(atomic_t *lock_count); |
366 | 357 | ||
367 | /*** | 358 | /** |
368 | * mutex_lock_interruptible - acquire the mutex, interruptable | 359 | * mutex_lock_interruptible - acquire the mutex, interruptible |
369 | * @lock: the mutex to be acquired | 360 | * @lock: the mutex to be acquired |
370 | * | 361 | * |
371 | * Lock the mutex like mutex_lock(), and return 0 if the mutex has | 362 | * Lock the mutex like mutex_lock(), and return 0 if the mutex has |
@@ -456,15 +447,15 @@ static inline int __mutex_trylock_slowpath(atomic_t *lock_count) | |||
456 | return prev == 1; | 447 | return prev == 1; |
457 | } | 448 | } |
458 | 449 | ||
459 | /*** | 450 | /** |
460 | * mutex_trylock - try acquire the mutex, without waiting | 451 | * mutex_trylock - try to acquire the mutex, without waiting |
461 | * @lock: the mutex to be acquired | 452 | * @lock: the mutex to be acquired |
462 | * | 453 | * |
463 | * Try to acquire the mutex atomically. Returns 1 if the mutex | 454 | * Try to acquire the mutex atomically. Returns 1 if the mutex |
464 | * has been acquired successfully, and 0 on contention. | 455 | * has been acquired successfully, and 0 on contention. |
465 | * | 456 | * |
466 | * NOTE: this function follows the spin_trylock() convention, so | 457 | * NOTE: this function follows the spin_trylock() convention, so |
467 | * it is negated to the down_trylock() return values! Be careful | 458 | * it is negated from the down_trylock() return values! Be careful |
468 | * about this when converting semaphore users to mutexes. | 459 | * about this when converting semaphore users to mutexes. |
469 | * | 460 | * |
470 | * This function must not be used in interrupt context. The | 461 | * This function must not be used in interrupt context. The |
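Editor's note: the corrected kernel-doc above stresses that mutex_trylock() follows the spin_trylock() convention (1 on success), the opposite of down_trylock() (0 on success). A small userspace illustration of why that matters when converting callers; it uses pthreads rather than the kernel API, and the wrapper name is invented for the demo.

	#include <pthread.h>
	#include <stdio.h>

	static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;

	/* kernel-style convention: 1 if acquired, 0 on contention;
	 * pthread_mutex_trylock() itself returns 0 on success, like down_trylock() */
	static int kernel_style_trylock(pthread_mutex_t *m)
	{
		return pthread_mutex_trylock(m) == 0;
	}

	int main(void)
	{
		if (kernel_style_trylock(&lock)) {	/* truth test, not == 0 */
			puts("acquired");
			pthread_mutex_unlock(&lock);
		} else {
			puts("contended");
		}
		return 0;
	}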
diff --git a/kernel/ns_cgroup.c b/kernel/ns_cgroup.c index 2a5dfec8efe0..2c98ad94ba0e 100644 --- a/kernel/ns_cgroup.c +++ b/kernel/ns_cgroup.c | |||
@@ -85,6 +85,14 @@ static struct cgroup_subsys_state *ns_create(struct cgroup_subsys *ss, | |||
85 | return ERR_PTR(-EPERM); | 85 | return ERR_PTR(-EPERM); |
86 | if (!cgroup_is_descendant(cgroup, current)) | 86 | if (!cgroup_is_descendant(cgroup, current)) |
87 | return ERR_PTR(-EPERM); | 87 | return ERR_PTR(-EPERM); |
88 | if (test_bit(CGRP_CLONE_CHILDREN, &cgroup->flags)) { | ||
89 | printk("ns_cgroup can't be created with parent " | ||
90 | "'clone_children' set.\n"); | ||
91 | return ERR_PTR(-EINVAL); | ||
92 | } | ||
93 | |||
94 | printk_once("ns_cgroup deprecated: consider using the " | ||
95 | "'clone_children' flag without the ns_cgroup.\n"); | ||
88 | 96 | ||
89 | ns_cgroup = kzalloc(sizeof(*ns_cgroup), GFP_KERNEL); | 97 | ns_cgroup = kzalloc(sizeof(*ns_cgroup), GFP_KERNEL); |
90 | if (!ns_cgroup) | 98 | if (!ns_cgroup) |
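Editor's note: printk_once() in the ns_cgroup hunk above prints its deprecation warning only the first time the code path is hit. A rough single-threaded userspace analogue of that warn-once pattern; the macro below is an invented stand-in, not the kernel implementation.

	#include <stdio.h>

	#define warn_once(msg)					\
		do {						\
			static int warned;			\
			if (!warned) {				\
				warned = 1;			\
				fprintf(stderr, "%s\n", msg);	\
			}					\
		} while (0)

	int main(void)
	{
		int i;

		for (i = 0; i < 3; i++)
			warn_once("ns_cgroup deprecated: consider 'clone_children'");
		return 0;	/* the message is printed once, not three times */
	}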
diff --git a/kernel/padata.c b/kernel/padata.c index fdd8ae609ce3..751019415d23 100644 --- a/kernel/padata.c +++ b/kernel/padata.c | |||
@@ -26,18 +26,19 @@ | |||
26 | #include <linux/mutex.h> | 26 | #include <linux/mutex.h> |
27 | #include <linux/sched.h> | 27 | #include <linux/sched.h> |
28 | #include <linux/slab.h> | 28 | #include <linux/slab.h> |
29 | #include <linux/sysfs.h> | ||
29 | #include <linux/rcupdate.h> | 30 | #include <linux/rcupdate.h> |
30 | 31 | ||
31 | #define MAX_SEQ_NR INT_MAX - NR_CPUS | 32 | #define MAX_SEQ_NR (INT_MAX - NR_CPUS) |
32 | #define MAX_OBJ_NUM 1000 | 33 | #define MAX_OBJ_NUM 1000 |
33 | 34 | ||
34 | static int padata_index_to_cpu(struct parallel_data *pd, int cpu_index) | 35 | static int padata_index_to_cpu(struct parallel_data *pd, int cpu_index) |
35 | { | 36 | { |
36 | int cpu, target_cpu; | 37 | int cpu, target_cpu; |
37 | 38 | ||
38 | target_cpu = cpumask_first(pd->cpumask); | 39 | target_cpu = cpumask_first(pd->cpumask.pcpu); |
39 | for (cpu = 0; cpu < cpu_index; cpu++) | 40 | for (cpu = 0; cpu < cpu_index; cpu++) |
40 | target_cpu = cpumask_next(target_cpu, pd->cpumask); | 41 | target_cpu = cpumask_next(target_cpu, pd->cpumask.pcpu); |
41 | 42 | ||
42 | return target_cpu; | 43 | return target_cpu; |
43 | } | 44 | } |
@@ -53,26 +54,27 @@ static int padata_cpu_hash(struct padata_priv *padata) | |||
53 | * Hash the sequence numbers to the cpus by taking | 54 | * Hash the sequence numbers to the cpus by taking |
54 | * seq_nr mod. number of cpus in use. | 55 | * seq_nr mod. number of cpus in use. |
55 | */ | 56 | */ |
56 | cpu_index = padata->seq_nr % cpumask_weight(pd->cpumask); | 57 | cpu_index = padata->seq_nr % cpumask_weight(pd->cpumask.pcpu); |
57 | 58 | ||
58 | return padata_index_to_cpu(pd, cpu_index); | 59 | return padata_index_to_cpu(pd, cpu_index); |
59 | } | 60 | } |
60 | 61 | ||
61 | static void padata_parallel_worker(struct work_struct *work) | 62 | static void padata_parallel_worker(struct work_struct *parallel_work) |
62 | { | 63 | { |
63 | struct padata_queue *queue; | 64 | struct padata_parallel_queue *pqueue; |
64 | struct parallel_data *pd; | 65 | struct parallel_data *pd; |
65 | struct padata_instance *pinst; | 66 | struct padata_instance *pinst; |
66 | LIST_HEAD(local_list); | 67 | LIST_HEAD(local_list); |
67 | 68 | ||
68 | local_bh_disable(); | 69 | local_bh_disable(); |
69 | queue = container_of(work, struct padata_queue, pwork); | 70 | pqueue = container_of(parallel_work, |
70 | pd = queue->pd; | 71 | struct padata_parallel_queue, work); |
72 | pd = pqueue->pd; | ||
71 | pinst = pd->pinst; | 73 | pinst = pd->pinst; |
72 | 74 | ||
73 | spin_lock(&queue->parallel.lock); | 75 | spin_lock(&pqueue->parallel.lock); |
74 | list_replace_init(&queue->parallel.list, &local_list); | 76 | list_replace_init(&pqueue->parallel.list, &local_list); |
75 | spin_unlock(&queue->parallel.lock); | 77 | spin_unlock(&pqueue->parallel.lock); |
76 | 78 | ||
77 | while (!list_empty(&local_list)) { | 79 | while (!list_empty(&local_list)) { |
78 | struct padata_priv *padata; | 80 | struct padata_priv *padata; |
@@ -94,7 +96,7 @@ static void padata_parallel_worker(struct work_struct *work) | |||
94 | * @pinst: padata instance | 96 | * @pinst: padata instance |
95 | * @padata: object to be parallelized | 97 | * @padata: object to be parallelized |
96 | * @cb_cpu: cpu the serialization callback function will run on, | 98 | * @cb_cpu: cpu the serialization callback function will run on, |
97 | * must be in the cpumask of padata. | 99 | * must be in the serial cpumask of padata (i.e. cpumask.cbcpu). |
98 | * | 100 | * |
99 | * The parallelization callback function will run with BHs off. | 101 | * The parallelization callback function will run with BHs off. |
100 | * Note: Every object which is parallelized by padata_do_parallel | 102 | * Note: Every object which is parallelized by padata_do_parallel |
@@ -104,15 +106,18 @@ int padata_do_parallel(struct padata_instance *pinst, | |||
104 | struct padata_priv *padata, int cb_cpu) | 106 | struct padata_priv *padata, int cb_cpu) |
105 | { | 107 | { |
106 | int target_cpu, err; | 108 | int target_cpu, err; |
107 | struct padata_queue *queue; | 109 | struct padata_parallel_queue *queue; |
108 | struct parallel_data *pd; | 110 | struct parallel_data *pd; |
109 | 111 | ||
110 | rcu_read_lock_bh(); | 112 | rcu_read_lock_bh(); |
111 | 113 | ||
112 | pd = rcu_dereference(pinst->pd); | 114 | pd = rcu_dereference(pinst->pd); |
113 | 115 | ||
114 | err = 0; | 116 | err = -EINVAL; |
115 | if (!(pinst->flags & PADATA_INIT)) | 117 | if (!(pinst->flags & PADATA_INIT) || pinst->flags & PADATA_INVALID) |
118 | goto out; | ||
119 | |||
120 | if (!cpumask_test_cpu(cb_cpu, pd->cpumask.cbcpu)) | ||
116 | goto out; | 121 | goto out; |
117 | 122 | ||
118 | err = -EBUSY; | 123 | err = -EBUSY; |
@@ -122,11 +127,7 @@ int padata_do_parallel(struct padata_instance *pinst, | |||
122 | if (atomic_read(&pd->refcnt) >= MAX_OBJ_NUM) | 127 | if (atomic_read(&pd->refcnt) >= MAX_OBJ_NUM) |
123 | goto out; | 128 | goto out; |
124 | 129 | ||
125 | err = -EINVAL; | 130 | err = 0; |
126 | if (!cpumask_test_cpu(cb_cpu, pd->cpumask)) | ||
127 | goto out; | ||
128 | |||
129 | err = -EINPROGRESS; | ||
130 | atomic_inc(&pd->refcnt); | 131 | atomic_inc(&pd->refcnt); |
131 | padata->pd = pd; | 132 | padata->pd = pd; |
132 | padata->cb_cpu = cb_cpu; | 133 | padata->cb_cpu = cb_cpu; |
@@ -137,13 +138,13 @@ int padata_do_parallel(struct padata_instance *pinst, | |||
137 | padata->seq_nr = atomic_inc_return(&pd->seq_nr); | 138 | padata->seq_nr = atomic_inc_return(&pd->seq_nr); |
138 | 139 | ||
139 | target_cpu = padata_cpu_hash(padata); | 140 | target_cpu = padata_cpu_hash(padata); |
140 | queue = per_cpu_ptr(pd->queue, target_cpu); | 141 | queue = per_cpu_ptr(pd->pqueue, target_cpu); |
141 | 142 | ||
142 | spin_lock(&queue->parallel.lock); | 143 | spin_lock(&queue->parallel.lock); |
143 | list_add_tail(&padata->list, &queue->parallel.list); | 144 | list_add_tail(&padata->list, &queue->parallel.list); |
144 | spin_unlock(&queue->parallel.lock); | 145 | spin_unlock(&queue->parallel.lock); |
145 | 146 | ||
146 | queue_work_on(target_cpu, pinst->wq, &queue->pwork); | 147 | queue_work_on(target_cpu, pinst->wq, &queue->work); |
147 | 148 | ||
148 | out: | 149 | out: |
149 | rcu_read_unlock_bh(); | 150 | rcu_read_unlock_bh(); |
@@ -171,84 +172,52 @@ EXPORT_SYMBOL(padata_do_parallel); | |||
171 | */ | 172 | */ |
172 | static struct padata_priv *padata_get_next(struct parallel_data *pd) | 173 | static struct padata_priv *padata_get_next(struct parallel_data *pd) |
173 | { | 174 | { |
174 | int cpu, num_cpus, empty, calc_seq_nr; | 175 | int cpu, num_cpus; |
175 | int seq_nr, next_nr, overrun, next_overrun; | 176 | int next_nr, next_index; |
176 | struct padata_queue *queue, *next_queue; | 177 | struct padata_parallel_queue *queue, *next_queue; |
177 | struct padata_priv *padata; | 178 | struct padata_priv *padata; |
178 | struct padata_list *reorder; | 179 | struct padata_list *reorder; |
179 | 180 | ||
180 | empty = 0; | 181 | num_cpus = cpumask_weight(pd->cpumask.pcpu); |
181 | next_nr = -1; | ||
182 | next_overrun = 0; | ||
183 | next_queue = NULL; | ||
184 | |||
185 | num_cpus = cpumask_weight(pd->cpumask); | ||
186 | |||
187 | for_each_cpu(cpu, pd->cpumask) { | ||
188 | queue = per_cpu_ptr(pd->queue, cpu); | ||
189 | reorder = &queue->reorder; | ||
190 | |||
191 | /* | ||
192 | * Calculate the seq_nr of the object that should be | ||
193 | * next in this reorder queue. | ||
194 | */ | ||
195 | overrun = 0; | ||
196 | calc_seq_nr = (atomic_read(&queue->num_obj) * num_cpus) | ||
197 | + queue->cpu_index; | ||
198 | 182 | ||
199 | if (unlikely(calc_seq_nr > pd->max_seq_nr)) { | 183 | /* |
200 | calc_seq_nr = calc_seq_nr - pd->max_seq_nr - 1; | 184 | * Calculate the percpu reorder queue and the sequence |
201 | overrun = 1; | 185 | * number of the next object. |
202 | } | 186 | */ |
203 | 187 | next_nr = pd->processed; | |
204 | if (!list_empty(&reorder->list)) { | 188 | next_index = next_nr % num_cpus; |
205 | padata = list_entry(reorder->list.next, | 189 | cpu = padata_index_to_cpu(pd, next_index); |
206 | struct padata_priv, list); | 190 | next_queue = per_cpu_ptr(pd->pqueue, cpu); |
207 | 191 | ||
208 | seq_nr = padata->seq_nr; | 192 | if (unlikely(next_nr > pd->max_seq_nr)) { |
209 | BUG_ON(calc_seq_nr != seq_nr); | 193 | next_nr = next_nr - pd->max_seq_nr - 1; |
210 | } else { | 194 | next_index = next_nr % num_cpus; |
211 | seq_nr = calc_seq_nr; | 195 | cpu = padata_index_to_cpu(pd, next_index); |
212 | empty++; | 196 | next_queue = per_cpu_ptr(pd->pqueue, cpu); |
213 | } | 197 | pd->processed = 0; |
214 | |||
215 | if (next_nr < 0 || seq_nr < next_nr | ||
216 | || (next_overrun && !overrun)) { | ||
217 | next_nr = seq_nr; | ||
218 | next_overrun = overrun; | ||
219 | next_queue = queue; | ||
220 | } | ||
221 | } | 198 | } |
222 | 199 | ||
223 | padata = NULL; | 200 | padata = NULL; |
224 | 201 | ||
225 | if (empty == num_cpus) | ||
226 | goto out; | ||
227 | |||
228 | reorder = &next_queue->reorder; | 202 | reorder = &next_queue->reorder; |
229 | 203 | ||
230 | if (!list_empty(&reorder->list)) { | 204 | if (!list_empty(&reorder->list)) { |
231 | padata = list_entry(reorder->list.next, | 205 | padata = list_entry(reorder->list.next, |
232 | struct padata_priv, list); | 206 | struct padata_priv, list); |
233 | 207 | ||
234 | if (unlikely(next_overrun)) { | 208 | BUG_ON(next_nr != padata->seq_nr); |
235 | for_each_cpu(cpu, pd->cpumask) { | ||
236 | queue = per_cpu_ptr(pd->queue, cpu); | ||
237 | atomic_set(&queue->num_obj, 0); | ||
238 | } | ||
239 | } | ||
240 | 209 | ||
241 | spin_lock(&reorder->lock); | 210 | spin_lock(&reorder->lock); |
242 | list_del_init(&padata->list); | 211 | list_del_init(&padata->list); |
243 | atomic_dec(&pd->reorder_objects); | 212 | atomic_dec(&pd->reorder_objects); |
244 | spin_unlock(&reorder->lock); | 213 | spin_unlock(&reorder->lock); |
245 | 214 | ||
246 | atomic_inc(&next_queue->num_obj); | 215 | pd->processed++; |
247 | 216 | ||
248 | goto out; | 217 | goto out; |
249 | } | 218 | } |
250 | 219 | ||
251 | queue = per_cpu_ptr(pd->queue, smp_processor_id()); | 220 | queue = per_cpu_ptr(pd->pqueue, smp_processor_id()); |
252 | if (queue->cpu_index == next_queue->cpu_index) { | 221 | if (queue->cpu_index == next_queue->cpu_index) { |
253 | padata = ERR_PTR(-ENODATA); | 222 | padata = ERR_PTR(-ENODATA); |
254 | goto out; | 223 | goto out; |
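For concreteness, a worked example of the new bookkeeping above: with three CPUs in pd->cpumask.pcpu and pd->processed == 7, next_index is 7 % 3 = 1, so padata_index_to_cpu() selects the second CPU of the mask and the next in-order object is expected on that CPU's reorder queue. When next_nr exceeds pd->max_seq_nr, both the expected sequence number and pd->processed wrap back to the start, mirroring the wrap of the per-object seq_nr.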
@@ -262,7 +231,7 @@ out: | |||
262 | static void padata_reorder(struct parallel_data *pd) | 231 | static void padata_reorder(struct parallel_data *pd) |
263 | { | 232 | { |
264 | struct padata_priv *padata; | 233 | struct padata_priv *padata; |
265 | struct padata_queue *queue; | 234 | struct padata_serial_queue *squeue; |
266 | struct padata_instance *pinst = pd->pinst; | 235 | struct padata_instance *pinst = pd->pinst; |
267 | 236 | ||
268 | /* | 237 | /* |
@@ -301,13 +270,13 @@ static void padata_reorder(struct parallel_data *pd) | |||
301 | return; | 270 | return; |
302 | } | 271 | } |
303 | 272 | ||
304 | queue = per_cpu_ptr(pd->queue, padata->cb_cpu); | 273 | squeue = per_cpu_ptr(pd->squeue, padata->cb_cpu); |
305 | 274 | ||
306 | spin_lock(&queue->serial.lock); | 275 | spin_lock(&squeue->serial.lock); |
307 | list_add_tail(&padata->list, &queue->serial.list); | 276 | list_add_tail(&padata->list, &squeue->serial.list); |
308 | spin_unlock(&queue->serial.lock); | 277 | spin_unlock(&squeue->serial.lock); |
309 | 278 | ||
310 | queue_work_on(padata->cb_cpu, pinst->wq, &queue->swork); | 279 | queue_work_on(padata->cb_cpu, pinst->wq, &squeue->work); |
311 | } | 280 | } |
312 | 281 | ||
313 | spin_unlock_bh(&pd->lock); | 282 | spin_unlock_bh(&pd->lock); |
@@ -333,19 +302,19 @@ static void padata_reorder_timer(unsigned long arg) | |||
333 | padata_reorder(pd); | 302 | padata_reorder(pd); |
334 | } | 303 | } |
335 | 304 | ||
336 | static void padata_serial_worker(struct work_struct *work) | 305 | static void padata_serial_worker(struct work_struct *serial_work) |
337 | { | 306 | { |
338 | struct padata_queue *queue; | 307 | struct padata_serial_queue *squeue; |
339 | struct parallel_data *pd; | 308 | struct parallel_data *pd; |
340 | LIST_HEAD(local_list); | 309 | LIST_HEAD(local_list); |
341 | 310 | ||
342 | local_bh_disable(); | 311 | local_bh_disable(); |
343 | queue = container_of(work, struct padata_queue, swork); | 312 | squeue = container_of(serial_work, struct padata_serial_queue, work); |
344 | pd = queue->pd; | 313 | pd = squeue->pd; |
345 | 314 | ||
346 | spin_lock(&queue->serial.lock); | 315 | spin_lock(&squeue->serial.lock); |
347 | list_replace_init(&queue->serial.list, &local_list); | 316 | list_replace_init(&squeue->serial.list, &local_list); |
348 | spin_unlock(&queue->serial.lock); | 317 | spin_unlock(&squeue->serial.lock); |
349 | 318 | ||
350 | while (!list_empty(&local_list)) { | 319 | while (!list_empty(&local_list)) { |
351 | struct padata_priv *padata; | 320 | struct padata_priv *padata; |
@@ -372,18 +341,18 @@ static void padata_serial_worker(struct work_struct *work) | |||
372 | void padata_do_serial(struct padata_priv *padata) | 341 | void padata_do_serial(struct padata_priv *padata) |
373 | { | 342 | { |
374 | int cpu; | 343 | int cpu; |
375 | struct padata_queue *queue; | 344 | struct padata_parallel_queue *pqueue; |
376 | struct parallel_data *pd; | 345 | struct parallel_data *pd; |
377 | 346 | ||
378 | pd = padata->pd; | 347 | pd = padata->pd; |
379 | 348 | ||
380 | cpu = get_cpu(); | 349 | cpu = get_cpu(); |
381 | queue = per_cpu_ptr(pd->queue, cpu); | 350 | pqueue = per_cpu_ptr(pd->pqueue, cpu); |
382 | 351 | ||
383 | spin_lock(&queue->reorder.lock); | 352 | spin_lock(&pqueue->reorder.lock); |
384 | atomic_inc(&pd->reorder_objects); | 353 | atomic_inc(&pd->reorder_objects); |
385 | list_add_tail(&padata->list, &queue->reorder.list); | 354 | list_add_tail(&padata->list, &pqueue->reorder.list); |
386 | spin_unlock(&queue->reorder.lock); | 355 | spin_unlock(&pqueue->reorder.lock); |
387 | 356 | ||
388 | put_cpu(); | 357 | put_cpu(); |
389 | 358 | ||
@@ -391,52 +360,89 @@ void padata_do_serial(struct padata_priv *padata) | |||
391 | } | 360 | } |
392 | EXPORT_SYMBOL(padata_do_serial); | 361 | EXPORT_SYMBOL(padata_do_serial); |
393 | 362 | ||
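A hedged sketch of how a client drives the parallel/serial pair exported above. The struct and function names (my_request, my_parallel, my_serial, my_submit) are hypothetical, and the parallel/serial callback members of struct padata_priv come from the padata header rather than this hunk.

	struct my_request {
		struct padata_priv padata;	/* must be embedded in the user object */
		/* ... request-specific fields ... */
	};

	static void my_parallel(struct padata_priv *padata)
	{
		/* runs on a CPU from the parallel cpumask */
		/* ... heavy per-object work ... */
		padata_do_serial(padata);	/* hand the object back for reordering */
	}

	static void my_serial(struct padata_priv *padata)
	{
		/* runs on the requested callback CPU, in submission order */
	}

	static int my_submit(struct padata_instance *pinst,
			     struct my_request *req, int cb_cpu)
	{
		req->padata.parallel = my_parallel;
		req->padata.serial = my_serial;
		return padata_do_parallel(pinst, &req->padata, cb_cpu);
	}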
394 | /* Allocate and initialize the internal cpumask dependend resources. */ | 363 | static int padata_setup_cpumasks(struct parallel_data *pd, |
395 | static struct parallel_data *padata_alloc_pd(struct padata_instance *pinst, | 364 | const struct cpumask *pcpumask, |
396 | const struct cpumask *cpumask) | 365 | const struct cpumask *cbcpumask) |
397 | { | 366 | { |
398 | int cpu, cpu_index, num_cpus; | 367 | if (!alloc_cpumask_var(&pd->cpumask.pcpu, GFP_KERNEL)) |
399 | struct padata_queue *queue; | 368 | return -ENOMEM; |
400 | struct parallel_data *pd; | ||
401 | |||
402 | cpu_index = 0; | ||
403 | 369 | ||
404 | pd = kzalloc(sizeof(struct parallel_data), GFP_KERNEL); | 370 | cpumask_and(pd->cpumask.pcpu, pcpumask, cpu_active_mask); |
405 | if (!pd) | 371 | if (!alloc_cpumask_var(&pd->cpumask.cbcpu, GFP_KERNEL)) { |
406 | goto err; | 372 | free_cpumask_var(pd->cpumask.pcpu); |
373 | return -ENOMEM; | ||
374 | } | ||
407 | 375 | ||
408 | pd->queue = alloc_percpu(struct padata_queue); | 376 | cpumask_and(pd->cpumask.cbcpu, cbcpumask, cpu_active_mask); |
409 | if (!pd->queue) | 377 | return 0; |
410 | goto err_free_pd; | 378 | } |
411 | 379 | ||
412 | if (!alloc_cpumask_var(&pd->cpumask, GFP_KERNEL)) | 380 | static void __padata_list_init(struct padata_list *pd_list) |
413 | goto err_free_queue; | 381 | { |
382 | INIT_LIST_HEAD(&pd_list->list); | ||
383 | spin_lock_init(&pd_list->lock); | ||
384 | } | ||
414 | 385 | ||
415 | cpumask_and(pd->cpumask, cpumask, cpu_active_mask); | 386 | /* Initialize all percpu queues used by serial workers */ |
387 | static void padata_init_squeues(struct parallel_data *pd) | ||
388 | { | ||
389 | int cpu; | ||
390 | struct padata_serial_queue *squeue; | ||
416 | 391 | ||
417 | for_each_cpu(cpu, pd->cpumask) { | 392 | for_each_cpu(cpu, pd->cpumask.cbcpu) { |
418 | queue = per_cpu_ptr(pd->queue, cpu); | 393 | squeue = per_cpu_ptr(pd->squeue, cpu); |
394 | squeue->pd = pd; | ||
395 | __padata_list_init(&squeue->serial); | ||
396 | INIT_WORK(&squeue->work, padata_serial_worker); | ||
397 | } | ||
398 | } | ||
419 | 399 | ||
420 | queue->pd = pd; | 400 | /* Initialize all percpu queues used by parallel workers */ |
401 | static void padata_init_pqueues(struct parallel_data *pd) | ||
402 | { | ||
403 | int cpu_index, num_cpus, cpu; | ||
404 | struct padata_parallel_queue *pqueue; | ||
421 | 405 | ||
422 | queue->cpu_index = cpu_index; | 406 | cpu_index = 0; |
407 | for_each_cpu(cpu, pd->cpumask.pcpu) { | ||
408 | pqueue = per_cpu_ptr(pd->pqueue, cpu); | ||
409 | pqueue->pd = pd; | ||
410 | pqueue->cpu_index = cpu_index; | ||
423 | cpu_index++; | 411 | cpu_index++; |
424 | 412 | ||
425 | INIT_LIST_HEAD(&queue->reorder.list); | 413 | __padata_list_init(&pqueue->reorder); |
426 | INIT_LIST_HEAD(&queue->parallel.list); | 414 | __padata_list_init(&pqueue->parallel); |
427 | INIT_LIST_HEAD(&queue->serial.list); | 415 | INIT_WORK(&pqueue->work, padata_parallel_worker); |
428 | spin_lock_init(&queue->reorder.lock); | 416 | atomic_set(&pqueue->num_obj, 0); |
429 | spin_lock_init(&queue->parallel.lock); | ||
430 | spin_lock_init(&queue->serial.lock); | ||
431 | |||
432 | INIT_WORK(&queue->pwork, padata_parallel_worker); | ||
433 | INIT_WORK(&queue->swork, padata_serial_worker); | ||
434 | atomic_set(&queue->num_obj, 0); | ||
435 | } | 417 | } |
436 | 418 | ||
437 | num_cpus = cpumask_weight(pd->cpumask); | 419 | num_cpus = cpumask_weight(pd->cpumask.pcpu); |
438 | pd->max_seq_nr = (MAX_SEQ_NR / num_cpus) * num_cpus - 1; | 420 | pd->max_seq_nr = num_cpus ? (MAX_SEQ_NR / num_cpus) * num_cpus - 1 : 0; |
421 | } | ||
422 | |||
423 | /* Allocate and initialize the internal cpumask dependent resources. */ | ||
424 | static struct parallel_data *padata_alloc_pd(struct padata_instance *pinst, | ||
425 | const struct cpumask *pcpumask, | ||
426 | const struct cpumask *cbcpumask) | ||
427 | { | ||
428 | struct parallel_data *pd; | ||
439 | 429 | ||
430 | pd = kzalloc(sizeof(struct parallel_data), GFP_KERNEL); | ||
431 | if (!pd) | ||
432 | goto err; | ||
433 | |||
434 | pd->pqueue = alloc_percpu(struct padata_parallel_queue); | ||
435 | if (!pd->pqueue) | ||
436 | goto err_free_pd; | ||
437 | |||
438 | pd->squeue = alloc_percpu(struct padata_serial_queue); | ||
439 | if (!pd->squeue) | ||
440 | goto err_free_pqueue; | ||
441 | if (padata_setup_cpumasks(pd, pcpumask, cbcpumask) < 0) | ||
442 | goto err_free_squeue; | ||
443 | |||
444 | padata_init_pqueues(pd); | ||
445 | padata_init_squeues(pd); | ||
440 | setup_timer(&pd->timer, padata_reorder_timer, (unsigned long)pd); | 446 | setup_timer(&pd->timer, padata_reorder_timer, (unsigned long)pd); |
441 | atomic_set(&pd->seq_nr, -1); | 447 | atomic_set(&pd->seq_nr, -1); |
442 | atomic_set(&pd->reorder_objects, 0); | 448 | atomic_set(&pd->reorder_objects, 0); |
@@ -446,8 +452,10 @@ static struct parallel_data *padata_alloc_pd(struct padata_instance *pinst, | |||
446 | 452 | ||
447 | return pd; | 453 | return pd; |
448 | 454 | ||
449 | err_free_queue: | 455 | err_free_squeue: |
450 | free_percpu(pd->queue); | 456 | free_percpu(pd->squeue); |
457 | err_free_pqueue: | ||
458 | free_percpu(pd->pqueue); | ||
451 | err_free_pd: | 459 | err_free_pd: |
452 | kfree(pd); | 460 | kfree(pd); |
453 | err: | 461 | err: |
@@ -456,8 +464,10 @@ err: | |||
456 | 464 | ||
457 | static void padata_free_pd(struct parallel_data *pd) | 465 | static void padata_free_pd(struct parallel_data *pd) |
458 | { | 466 | { |
459 | free_cpumask_var(pd->cpumask); | 467 | free_cpumask_var(pd->cpumask.pcpu); |
460 | free_percpu(pd->queue); | 468 | free_cpumask_var(pd->cpumask.cbcpu); |
469 | free_percpu(pd->pqueue); | ||
470 | free_percpu(pd->squeue); | ||
461 | kfree(pd); | 471 | kfree(pd); |
462 | } | 472 | } |
463 | 473 | ||
@@ -465,11 +475,12 @@ static void padata_free_pd(struct parallel_data *pd) | |||
465 | static void padata_flush_queues(struct parallel_data *pd) | 475 | static void padata_flush_queues(struct parallel_data *pd) |
466 | { | 476 | { |
467 | int cpu; | 477 | int cpu; |
468 | struct padata_queue *queue; | 478 | struct padata_parallel_queue *pqueue; |
479 | struct padata_serial_queue *squeue; | ||
469 | 480 | ||
470 | for_each_cpu(cpu, pd->cpumask) { | 481 | for_each_cpu(cpu, pd->cpumask.pcpu) { |
471 | queue = per_cpu_ptr(pd->queue, cpu); | 482 | pqueue = per_cpu_ptr(pd->pqueue, cpu); |
472 | flush_work(&queue->pwork); | 483 | flush_work(&pqueue->work); |
473 | } | 484 | } |
474 | 485 | ||
475 | del_timer_sync(&pd->timer); | 486 | del_timer_sync(&pd->timer); |
@@ -477,19 +488,39 @@ static void padata_flush_queues(struct parallel_data *pd) | |||
477 | if (atomic_read(&pd->reorder_objects)) | 488 | if (atomic_read(&pd->reorder_objects)) |
478 | padata_reorder(pd); | 489 | padata_reorder(pd); |
479 | 490 | ||
480 | for_each_cpu(cpu, pd->cpumask) { | 491 | for_each_cpu(cpu, pd->cpumask.cbcpu) { |
481 | queue = per_cpu_ptr(pd->queue, cpu); | 492 | squeue = per_cpu_ptr(pd->squeue, cpu); |
482 | flush_work(&queue->swork); | 493 | flush_work(&squeue->work); |
483 | } | 494 | } |
484 | 495 | ||
485 | BUG_ON(atomic_read(&pd->refcnt) != 0); | 496 | BUG_ON(atomic_read(&pd->refcnt) != 0); |
486 | } | 497 | } |
487 | 498 | ||
499 | static void __padata_start(struct padata_instance *pinst) | ||
500 | { | ||
501 | pinst->flags |= PADATA_INIT; | ||
502 | } | ||
503 | |||
504 | static void __padata_stop(struct padata_instance *pinst) | ||
505 | { | ||
506 | if (!(pinst->flags & PADATA_INIT)) | ||
507 | return; | ||
508 | |||
509 | pinst->flags &= ~PADATA_INIT; | ||
510 | |||
511 | synchronize_rcu(); | ||
512 | |||
513 | get_online_cpus(); | ||
514 | padata_flush_queues(pinst->pd); | ||
515 | put_online_cpus(); | ||
516 | } | ||
517 | |||
488 | /* Replace the internal control structure with a new one. */ | 518 | /* Replace the internal control structure with a new one. */ |
489 | static void padata_replace(struct padata_instance *pinst, | 519 | static void padata_replace(struct padata_instance *pinst, |
490 | struct parallel_data *pd_new) | 520 | struct parallel_data *pd_new) |
491 | { | 521 | { |
492 | struct parallel_data *pd_old = pinst->pd; | 522 | struct parallel_data *pd_old = pinst->pd; |
523 | int notification_mask = 0; | ||
493 | 524 | ||
494 | pinst->flags |= PADATA_RESET; | 525 | pinst->flags |= PADATA_RESET; |
495 | 526 | ||
@@ -497,41 +528,162 @@ static void padata_replace(struct padata_instance *pinst, | |||
497 | 528 | ||
498 | synchronize_rcu(); | 529 | synchronize_rcu(); |
499 | 530 | ||
531 | if (!cpumask_equal(pd_old->cpumask.pcpu, pd_new->cpumask.pcpu)) | ||
532 | notification_mask |= PADATA_CPU_PARALLEL; | ||
533 | if (!cpumask_equal(pd_old->cpumask.cbcpu, pd_new->cpumask.cbcpu)) | ||
534 | notification_mask |= PADATA_CPU_SERIAL; | ||
535 | |||
500 | padata_flush_queues(pd_old); | 536 | padata_flush_queues(pd_old); |
501 | padata_free_pd(pd_old); | 537 | padata_free_pd(pd_old); |
502 | 538 | ||
539 | if (notification_mask) | ||
540 | blocking_notifier_call_chain(&pinst->cpumask_change_notifier, | ||
541 | notification_mask, | ||
542 | &pd_new->cpumask); | ||
543 | |||
503 | pinst->flags &= ~PADATA_RESET; | 544 | pinst->flags &= ~PADATA_RESET; |
504 | } | 545 | } |
505 | 546 | ||
506 | /** | 547 | /** |
507 | * padata_set_cpumask - set the cpumask that padata should use | 548 | * padata_register_cpumask_notifier - Registers a notifier that will be called |
549 | * if either pcpu or cbcpu or both cpumasks change. | ||
508 | * | 550 | * |
509 | * @pinst: padata instance | 551 | * @pinst: A pointer to the padata instance. |
510 | * @cpumask: the cpumask to use | 552 | * @nblock: A pointer to notifier block. |
511 | */ | 553 | */ |
512 | int padata_set_cpumask(struct padata_instance *pinst, | 554 | int padata_register_cpumask_notifier(struct padata_instance *pinst, |
513 | cpumask_var_t cpumask) | 555 | struct notifier_block *nblock) |
514 | { | 556 | { |
557 | return blocking_notifier_chain_register(&pinst->cpumask_change_notifier, | ||
558 | nblock); | ||
559 | } | ||
560 | EXPORT_SYMBOL(padata_register_cpumask_notifier); | ||
561 | |||
562 | /** | ||
563 | * padata_unregister_cpumask_notifier - Unregisters cpumask notifier | ||
564 | * registered earlier using padata_register_cpumask_notifier | ||
565 | * | ||
566 | * @pinst: A pointer to the padata instance. | ||
567 | * @nblock: A pointer to notifier block. | ||
568 | */ | ||
569 | int padata_unregister_cpumask_notifier(struct padata_instance *pinst, | ||
570 | struct notifier_block *nblock) | ||
571 | { | ||
572 | return blocking_notifier_chain_unregister( | ||
573 | &pinst->cpumask_change_notifier, | ||
574 | nblock); | ||
575 | } | ||
576 | EXPORT_SYMBOL(padata_unregister_cpumask_notifier); | ||
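A hedged sketch of hooking the cpumask notifier registered above. It assumes the notifier data points at the instance's new cpumask pair (the &pd_new->cpumask argument passed in padata_replace()); the struct name padata_cpumask comes from the padata header, not this hunk, and the callback name is hypothetical.

	static int my_cpumask_change(struct notifier_block *nb,
				     unsigned long mask, void *data)
	{
		struct padata_cpumask *new_masks = data;	/* assumed type, see above */

		if (mask & PADATA_CPU_PARALLEL)
			pr_info("padata: parallel cpumask now spans %u cpus\n",
				cpumask_weight(new_masks->pcpu));
		if (mask & PADATA_CPU_SERIAL)
			pr_info("padata: serial cpumask now spans %u cpus\n",
				cpumask_weight(new_masks->cbcpu));
		return NOTIFY_OK;
	}

	static struct notifier_block my_cpumask_nb = {
		.notifier_call = my_cpumask_change,
	};

	/* somewhere after the instance is allocated: */
	padata_register_cpumask_notifier(pinst, &my_cpumask_nb);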
577 | |||
578 | |||
579 | /* If cpumask contains no active cpu, we mark the instance as invalid. */ | ||
580 | static bool padata_validate_cpumask(struct padata_instance *pinst, | ||
581 | const struct cpumask *cpumask) | ||
582 | { | ||
583 | if (!cpumask_intersects(cpumask, cpu_active_mask)) { | ||
584 | pinst->flags |= PADATA_INVALID; | ||
585 | return false; | ||
586 | } | ||
587 | |||
588 | pinst->flags &= ~PADATA_INVALID; | ||
589 | return true; | ||
590 | } | ||
591 | |||
592 | static int __padata_set_cpumasks(struct padata_instance *pinst, | ||
593 | cpumask_var_t pcpumask, | ||
594 | cpumask_var_t cbcpumask) | ||
595 | { | ||
596 | int valid; | ||
515 | struct parallel_data *pd; | 597 | struct parallel_data *pd; |
516 | int err = 0; | 598 | |
599 | valid = padata_validate_cpumask(pinst, pcpumask); | ||
600 | if (!valid) { | ||
601 | __padata_stop(pinst); | ||
602 | goto out_replace; | ||
603 | } | ||
604 | |||
605 | valid = padata_validate_cpumask(pinst, cbcpumask); | ||
606 | if (!valid) | ||
607 | __padata_stop(pinst); | ||
608 | |||
609 | out_replace: | ||
610 | pd = padata_alloc_pd(pinst, pcpumask, cbcpumask); | ||
611 | if (!pd) | ||
612 | return -ENOMEM; | ||
613 | |||
614 | cpumask_copy(pinst->cpumask.pcpu, pcpumask); | ||
615 | cpumask_copy(pinst->cpumask.cbcpu, cbcpumask); | ||
616 | |||
617 | padata_replace(pinst, pd); | ||
618 | |||
619 | if (valid) | ||
620 | __padata_start(pinst); | ||
621 | |||
622 | return 0; | ||
623 | } | ||
624 | |||
625 | /** | ||
626 | * padata_set_cpumasks - Set both parallel and serial cpumasks. The first | ||
627 | * one is used by parallel workers and the second one | ||
628 | * by the wokers doing serialization. | ||
629 | * | ||
630 | * @pinst: padata instance | ||
631 | * @pcpumask: the cpumask to use for parallel workers | ||
632 | * @cbcpumask: the cpumask to use for serial workers | ||
633 | */ | ||
634 | int padata_set_cpumasks(struct padata_instance *pinst, cpumask_var_t pcpumask, | ||
635 | cpumask_var_t cbcpumask) | ||
636 | { | ||
637 | int err; | ||
517 | 638 | ||
518 | mutex_lock(&pinst->lock); | 639 | mutex_lock(&pinst->lock); |
640 | get_online_cpus(); | ||
519 | 641 | ||
642 | err = __padata_set_cpumasks(pinst, pcpumask, cbcpumask); | ||
643 | |||
644 | put_online_cpus(); | ||
645 | mutex_unlock(&pinst->lock); | ||
646 | |||
647 | return err; | ||
648 | |||
649 | } | ||
650 | EXPORT_SYMBOL(padata_set_cpumasks); | ||
651 | |||
652 | /** | ||
653 | * padata_set_cpumask: Sets the cpumask specified by @cpumask_type to the | ||
654 | * value equivalent to @cpumask. | ||
655 | * | ||
656 | * @pinst: padata instance | ||
657 | * @cpumask_type: PADATA_CPU_SERIAL or PADATA_CPU_PARALLEL corresponding | ||
658 | * to parallel and serial cpumasks respectively. | ||
659 | * @cpumask: the cpumask to use | ||
660 | */ | ||
661 | int padata_set_cpumask(struct padata_instance *pinst, int cpumask_type, | ||
662 | cpumask_var_t cpumask) | ||
663 | { | ||
664 | struct cpumask *serial_mask, *parallel_mask; | ||
665 | int err = -EINVAL; | ||
666 | |||
667 | mutex_lock(&pinst->lock); | ||
520 | get_online_cpus(); | 668 | get_online_cpus(); |
521 | 669 | ||
522 | pd = padata_alloc_pd(pinst, cpumask); | 670 | switch (cpumask_type) { |
523 | if (!pd) { | 671 | case PADATA_CPU_PARALLEL: |
524 | err = -ENOMEM; | 672 | serial_mask = pinst->cpumask.cbcpu; |
525 | goto out; | 673 | parallel_mask = cpumask; |
674 | break; | ||
675 | case PADATA_CPU_SERIAL: | ||
676 | parallel_mask = pinst->cpumask.pcpu; | ||
677 | serial_mask = cpumask; | ||
678 | break; | ||
679 | default: | ||
680 | goto out; | ||
526 | } | 681 | } |
527 | 682 | ||
528 | cpumask_copy(pinst->cpumask, cpumask); | 683 | err = __padata_set_cpumasks(pinst, parallel_mask, serial_mask); |
529 | |||
530 | padata_replace(pinst, pd); | ||
531 | 684 | ||
532 | out: | 685 | out: |
533 | put_online_cpus(); | 686 | put_online_cpus(); |
534 | |||
535 | mutex_unlock(&pinst->lock); | 687 | mutex_unlock(&pinst->lock); |
536 | 688 | ||
537 | return err; | 689 | return err; |
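A short usage sketch of the per-type setter above (error handling abbreviated; pinst is assumed to be a previously allocated instance):

	cpumask_var_t mask;
	int err;

	if (!alloc_cpumask_var(&mask, GFP_KERNEL))
		return -ENOMEM;

	cpumask_copy(mask, cpu_online_mask);
	cpumask_clear_cpu(0, mask);	/* keep cpu 0 out of the parallel workers */

	err = padata_set_cpumask(pinst, PADATA_CPU_PARALLEL, mask);
	free_cpumask_var(mask);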
@@ -543,30 +695,48 @@ static int __padata_add_cpu(struct padata_instance *pinst, int cpu) | |||
543 | struct parallel_data *pd; | 695 | struct parallel_data *pd; |
544 | 696 | ||
545 | if (cpumask_test_cpu(cpu, cpu_active_mask)) { | 697 | if (cpumask_test_cpu(cpu, cpu_active_mask)) { |
546 | pd = padata_alloc_pd(pinst, pinst->cpumask); | 698 | pd = padata_alloc_pd(pinst, pinst->cpumask.pcpu, |
699 | pinst->cpumask.cbcpu); | ||
547 | if (!pd) | 700 | if (!pd) |
548 | return -ENOMEM; | 701 | return -ENOMEM; |
549 | 702 | ||
550 | padata_replace(pinst, pd); | 703 | padata_replace(pinst, pd); |
704 | |||
705 | if (padata_validate_cpumask(pinst, pinst->cpumask.pcpu) && | ||
706 | padata_validate_cpumask(pinst, pinst->cpumask.cbcpu)) | ||
707 | __padata_start(pinst); | ||
551 | } | 708 | } |
552 | 709 | ||
553 | return 0; | 710 | return 0; |
554 | } | 711 | } |
555 | 712 | ||
556 | /** | 713 | /** |
557 | * padata_add_cpu - add a cpu to the padata cpumask | 714 | * padata_add_cpu - add a cpu to one or both(parallel and serial) |
715 | * padata cpumasks. | ||
558 | * | 716 | * |
559 | * @pinst: padata instance | 717 | * @pinst: padata instance |
560 | * @cpu: cpu to add | 718 | * @cpu: cpu to add |
719 | * @mask: bitmask of flags specifying to which cpumask @cpu should be added. | ||
720 | * The @mask may be any combination of the following flags: | ||
721 | * PADATA_CPU_SERIAL - serial cpumask | ||
722 | * PADATA_CPU_PARALLEL - parallel cpumask | ||
561 | */ | 723 | */ |
562 | int padata_add_cpu(struct padata_instance *pinst, int cpu) | 724 | |
725 | int padata_add_cpu(struct padata_instance *pinst, int cpu, int mask) | ||
563 | { | 726 | { |
564 | int err; | 727 | int err; |
565 | 728 | ||
729 | if (!(mask & (PADATA_CPU_SERIAL | PADATA_CPU_PARALLEL))) | ||
730 | return -EINVAL; | ||
731 | |||
566 | mutex_lock(&pinst->lock); | 732 | mutex_lock(&pinst->lock); |
567 | 733 | ||
568 | get_online_cpus(); | 734 | get_online_cpus(); |
569 | cpumask_set_cpu(cpu, pinst->cpumask); | 735 | if (mask & PADATA_CPU_SERIAL) |
736 | cpumask_set_cpu(cpu, pinst->cpumask.cbcpu); | ||
737 | if (mask & PADATA_CPU_PARALLEL) | ||
738 | cpumask_set_cpu(cpu, pinst->cpumask.pcpu); | ||
739 | |||
570 | err = __padata_add_cpu(pinst, cpu); | 740 | err = __padata_add_cpu(pinst, cpu); |
571 | put_online_cpus(); | 741 | put_online_cpus(); |
572 | 742 | ||
@@ -578,10 +748,16 @@ EXPORT_SYMBOL(padata_add_cpu); | |||
578 | 748 | ||
579 | static int __padata_remove_cpu(struct padata_instance *pinst, int cpu) | 749 | static int __padata_remove_cpu(struct padata_instance *pinst, int cpu) |
580 | { | 750 | { |
581 | struct parallel_data *pd; | 751 | struct parallel_data *pd = NULL; |
582 | 752 | ||
583 | if (cpumask_test_cpu(cpu, cpu_online_mask)) { | 753 | if (cpumask_test_cpu(cpu, cpu_online_mask)) { |
584 | pd = padata_alloc_pd(pinst, pinst->cpumask); | 754 | |
755 | if (!padata_validate_cpumask(pinst, pinst->cpumask.pcpu) || | ||
756 | !padata_validate_cpumask(pinst, pinst->cpumask.cbcpu)) | ||
757 | __padata_stop(pinst); | ||
758 | |||
759 | pd = padata_alloc_pd(pinst, pinst->cpumask.pcpu, | ||
760 | pinst->cpumask.cbcpu); | ||
585 | if (!pd) | 761 | if (!pd) |
586 | return -ENOMEM; | 762 | return -ENOMEM; |
587 | 763 | ||
@@ -591,20 +767,32 @@ static int __padata_remove_cpu(struct padata_instance *pinst, int cpu) | |||
591 | return 0; | 767 | return 0; |
592 | } | 768 | } |
593 | 769 | ||
594 | /** | 770 | /** |
595 | * padata_remove_cpu - remove a cpu from the padata cpumask | 771 | * padata_remove_cpu - remove a cpu from one or both (serial and parallel) |
772 | * padata cpumasks. | ||
596 | * | 773 | * |
597 | * @pinst: padata instance | 774 | * @pinst: padata instance |
598 | * @cpu: cpu to remove | 775 | * @cpu: cpu to remove |
776 | * @mask: bitmask specifying from which cpumask @cpu should be removed | ||
777 | * The @mask may be any combination of the following flags: | ||
778 | * PADATA_CPU_SERIAL - serial cpumask | ||
779 | * PADATA_CPU_PARALLEL - parallel cpumask | ||
599 | */ | 780 | */ |
600 | int padata_remove_cpu(struct padata_instance *pinst, int cpu) | 781 | int padata_remove_cpu(struct padata_instance *pinst, int cpu, int mask) |
601 | { | 782 | { |
602 | int err; | 783 | int err; |
603 | 784 | ||
785 | if (!(mask & (PADATA_CPU_SERIAL | PADATA_CPU_PARALLEL))) | ||
786 | return -EINVAL; | ||
787 | |||
604 | mutex_lock(&pinst->lock); | 788 | mutex_lock(&pinst->lock); |
605 | 789 | ||
606 | get_online_cpus(); | 790 | get_online_cpus(); |
607 | cpumask_clear_cpu(cpu, pinst->cpumask); | 791 | if (mask & PADATA_CPU_SERIAL) |
792 | cpumask_clear_cpu(cpu, pinst->cpumask.cbcpu); | ||
793 | if (mask & PADATA_CPU_PARALLEL) | ||
794 | cpumask_clear_cpu(cpu, pinst->cpumask.pcpu); | ||
795 | |||
608 | err = __padata_remove_cpu(pinst, cpu); | 796 | err = __padata_remove_cpu(pinst, cpu); |
609 | put_online_cpus(); | 797 | put_online_cpus(); |
610 | 798 | ||
@@ -619,11 +807,20 @@ EXPORT_SYMBOL(padata_remove_cpu); | |||
619 | * | 807 | * |
620 | * @pinst: padata instance to start | 808 | * @pinst: padata instance to start |
621 | */ | 809 | */ |
622 | void padata_start(struct padata_instance *pinst) | 810 | int padata_start(struct padata_instance *pinst) |
623 | { | 811 | { |
812 | int err = 0; | ||
813 | |||
624 | mutex_lock(&pinst->lock); | 814 | mutex_lock(&pinst->lock); |
625 | pinst->flags |= PADATA_INIT; | 815 | |
816 | if (pinst->flags & PADATA_INVALID) | ||
817 | err = -EINVAL; | ||
818 | |||
819 | __padata_start(pinst); | ||
820 | |||
626 | mutex_unlock(&pinst->lock); | 821 | mutex_unlock(&pinst->lock); |
822 | |||
823 | return err; | ||
627 | } | 824 | } |
628 | EXPORT_SYMBOL(padata_start); | 825 | EXPORT_SYMBOL(padata_start); |
629 | 826 | ||
@@ -635,12 +832,20 @@ EXPORT_SYMBOL(padata_start); | |||
635 | void padata_stop(struct padata_instance *pinst) | 832 | void padata_stop(struct padata_instance *pinst) |
636 | { | 833 | { |
637 | mutex_lock(&pinst->lock); | 834 | mutex_lock(&pinst->lock); |
638 | pinst->flags &= ~PADATA_INIT; | 835 | __padata_stop(pinst); |
639 | mutex_unlock(&pinst->lock); | 836 | mutex_unlock(&pinst->lock); |
640 | } | 837 | } |
641 | EXPORT_SYMBOL(padata_stop); | 838 | EXPORT_SYMBOL(padata_stop); |
642 | 839 | ||
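Because padata_start() can now fail when an instance carries a cpumask without any active cpu (PADATA_INVALID), callers should check its return value. A minimal sketch, assuming pinst was allocated earlier:

	err = padata_start(pinst);
	if (err) {
		/* at least one cpumask had no active cpu; do not submit work */
		padata_free(pinst);
		return err;
	}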
643 | #ifdef CONFIG_HOTPLUG_CPU | 840 | #ifdef CONFIG_HOTPLUG_CPU |
841 | |||
842 | static inline int pinst_has_cpu(struct padata_instance *pinst, int cpu) | ||
843 | { | ||
844 | return cpumask_test_cpu(cpu, pinst->cpumask.pcpu) || | ||
845 | cpumask_test_cpu(cpu, pinst->cpumask.cbcpu); | ||
846 | } | ||
847 | |||
848 | |||
644 | static int padata_cpu_callback(struct notifier_block *nfb, | 849 | static int padata_cpu_callback(struct notifier_block *nfb, |
645 | unsigned long action, void *hcpu) | 850 | unsigned long action, void *hcpu) |
646 | { | 851 | { |
@@ -653,7 +858,7 @@ static int padata_cpu_callback(struct notifier_block *nfb, | |||
653 | switch (action) { | 858 | switch (action) { |
654 | case CPU_ONLINE: | 859 | case CPU_ONLINE: |
655 | case CPU_ONLINE_FROZEN: | 860 | case CPU_ONLINE_FROZEN: |
656 | if (!cpumask_test_cpu(cpu, pinst->cpumask)) | 861 | if (!pinst_has_cpu(pinst, cpu)) |
657 | break; | 862 | break; |
658 | mutex_lock(&pinst->lock); | 863 | mutex_lock(&pinst->lock); |
659 | err = __padata_add_cpu(pinst, cpu); | 864 | err = __padata_add_cpu(pinst, cpu); |
@@ -664,7 +869,7 @@ static int padata_cpu_callback(struct notifier_block *nfb, | |||
664 | 869 | ||
665 | case CPU_DOWN_PREPARE: | 870 | case CPU_DOWN_PREPARE: |
666 | case CPU_DOWN_PREPARE_FROZEN: | 871 | case CPU_DOWN_PREPARE_FROZEN: |
667 | if (!cpumask_test_cpu(cpu, pinst->cpumask)) | 872 | if (!pinst_has_cpu(pinst, cpu)) |
668 | break; | 873 | break; |
669 | mutex_lock(&pinst->lock); | 874 | mutex_lock(&pinst->lock); |
670 | err = __padata_remove_cpu(pinst, cpu); | 875 | err = __padata_remove_cpu(pinst, cpu); |
@@ -675,7 +880,7 @@ static int padata_cpu_callback(struct notifier_block *nfb, | |||
675 | 880 | ||
676 | case CPU_UP_CANCELED: | 881 | case CPU_UP_CANCELED: |
677 | case CPU_UP_CANCELED_FROZEN: | 882 | case CPU_UP_CANCELED_FROZEN: |
678 | if (!cpumask_test_cpu(cpu, pinst->cpumask)) | 883 | if (!pinst_has_cpu(pinst, cpu)) |
679 | break; | 884 | break; |
680 | mutex_lock(&pinst->lock); | 885 | mutex_lock(&pinst->lock); |
681 | __padata_remove_cpu(pinst, cpu); | 886 | __padata_remove_cpu(pinst, cpu); |
@@ -683,7 +888,7 @@ static int padata_cpu_callback(struct notifier_block *nfb, | |||
683 | 888 | ||
684 | case CPU_DOWN_FAILED: | 889 | case CPU_DOWN_FAILED: |
685 | case CPU_DOWN_FAILED_FROZEN: | 890 | case CPU_DOWN_FAILED_FROZEN: |
686 | if (!cpumask_test_cpu(cpu, pinst->cpumask)) | 891 | if (!pinst_has_cpu(pinst, cpu)) |
687 | break; | 892 | break; |
688 | mutex_lock(&pinst->lock); | 893 | mutex_lock(&pinst->lock); |
689 | __padata_add_cpu(pinst, cpu); | 894 | __padata_add_cpu(pinst, cpu); |
@@ -694,36 +899,202 @@ static int padata_cpu_callback(struct notifier_block *nfb, | |||
694 | } | 899 | } |
695 | #endif | 900 | #endif |
696 | 901 | ||
902 | static void __padata_free(struct padata_instance *pinst) | ||
903 | { | ||
904 | #ifdef CONFIG_HOTPLUG_CPU | ||
905 | unregister_hotcpu_notifier(&pinst->cpu_notifier); | ||
906 | #endif | ||
907 | |||
908 | padata_stop(pinst); | ||
909 | padata_free_pd(pinst->pd); | ||
910 | free_cpumask_var(pinst->cpumask.pcpu); | ||
911 | free_cpumask_var(pinst->cpumask.cbcpu); | ||
912 | kfree(pinst); | ||
913 | } | ||
914 | |||
915 | #define kobj2pinst(_kobj) \ | ||
916 | container_of(_kobj, struct padata_instance, kobj) | ||
917 | #define attr2pentry(_attr) \ | ||
918 | container_of(_attr, struct padata_sysfs_entry, attr) | ||
919 | |||
920 | static void padata_sysfs_release(struct kobject *kobj) | ||
921 | { | ||
922 | struct padata_instance *pinst = kobj2pinst(kobj); | ||
923 | __padata_free(pinst); | ||
924 | } | ||
925 | |||
926 | struct padata_sysfs_entry { | ||
927 | struct attribute attr; | ||
928 | ssize_t (*show)(struct padata_instance *, struct attribute *, char *); | ||
929 | ssize_t (*store)(struct padata_instance *, struct attribute *, | ||
930 | const char *, size_t); | ||
931 | }; | ||
932 | |||
933 | static ssize_t show_cpumask(struct padata_instance *pinst, | ||
934 | struct attribute *attr, char *buf) | ||
935 | { | ||
936 | struct cpumask *cpumask; | ||
937 | ssize_t len; | ||
938 | |||
939 | mutex_lock(&pinst->lock); | ||
940 | if (!strcmp(attr->name, "serial_cpumask")) | ||
941 | cpumask = pinst->cpumask.cbcpu; | ||
942 | else | ||
943 | cpumask = pinst->cpumask.pcpu; | ||
944 | |||
945 | len = bitmap_scnprintf(buf, PAGE_SIZE, cpumask_bits(cpumask), | ||
946 | nr_cpu_ids); | ||
947 | if (PAGE_SIZE - len < 2) | ||
948 | len = -EINVAL; | ||
949 | else | ||
950 | len += sprintf(buf + len, "\n"); | ||
951 | |||
952 | mutex_unlock(&pinst->lock); | ||
953 | return len; | ||
954 | } | ||
955 | |||
956 | static ssize_t store_cpumask(struct padata_instance *pinst, | ||
957 | struct attribute *attr, | ||
958 | const char *buf, size_t count) | ||
959 | { | ||
960 | cpumask_var_t new_cpumask; | ||
961 | ssize_t ret; | ||
962 | int mask_type; | ||
963 | |||
964 | if (!alloc_cpumask_var(&new_cpumask, GFP_KERNEL)) | ||
965 | return -ENOMEM; | ||
966 | |||
967 | ret = bitmap_parse(buf, count, cpumask_bits(new_cpumask), | ||
968 | nr_cpumask_bits); | ||
969 | if (ret < 0) | ||
970 | goto out; | ||
971 | |||
972 | mask_type = !strcmp(attr->name, "serial_cpumask") ? | ||
973 | PADATA_CPU_SERIAL : PADATA_CPU_PARALLEL; | ||
974 | ret = padata_set_cpumask(pinst, mask_type, new_cpumask); | ||
975 | if (!ret) | ||
976 | ret = count; | ||
977 | |||
978 | out: | ||
979 | free_cpumask_var(new_cpumask); | ||
980 | return ret; | ||
981 | } | ||
982 | |||
983 | #define PADATA_ATTR_RW(_name, _show_name, _store_name) \ | ||
984 | static struct padata_sysfs_entry _name##_attr = \ | ||
985 | __ATTR(_name, 0644, _show_name, _store_name) | ||
986 | #define PADATA_ATTR_RO(_name, _show_name) \ | ||
987 | static struct padata_sysfs_entry _name##_attr = \ | ||
988 | __ATTR(_name, 0400, _show_name, NULL) | ||
989 | |||
990 | PADATA_ATTR_RW(serial_cpumask, show_cpumask, store_cpumask); | ||
991 | PADATA_ATTR_RW(parallel_cpumask, show_cpumask, store_cpumask); | ||
992 | |||
993 | /* | ||
994 | * Padata sysfs provides the following objects: | ||
995 | * serial_cpumask [RW] - cpumask for serial workers | ||
996 | * parallel_cpumask [RW] - cpumask for parallel workers | ||
997 | */ | ||
998 | static struct attribute *padata_default_attrs[] = { | ||
999 | &serial_cpumask_attr.attr, | ||
1000 | ¶llel_cpumask_attr.attr, | ||
1001 | NULL, | ||
1002 | }; | ||
1003 | |||
1004 | static ssize_t padata_sysfs_show(struct kobject *kobj, | ||
1005 | struct attribute *attr, char *buf) | ||
1006 | { | ||
1007 | struct padata_instance *pinst; | ||
1008 | struct padata_sysfs_entry *pentry; | ||
1009 | ssize_t ret = -EIO; | ||
1010 | |||
1011 | pinst = kobj2pinst(kobj); | ||
1012 | pentry = attr2pentry(attr); | ||
1013 | if (pentry->show) | ||
1014 | ret = pentry->show(pinst, attr, buf); | ||
1015 | |||
1016 | return ret; | ||
1017 | } | ||
1018 | |||
1019 | static ssize_t padata_sysfs_store(struct kobject *kobj, struct attribute *attr, | ||
1020 | const char *buf, size_t count) | ||
1021 | { | ||
1022 | struct padata_instance *pinst; | ||
1023 | struct padata_sysfs_entry *pentry; | ||
1024 | ssize_t ret = -EIO; | ||
1025 | |||
1026 | pinst = kobj2pinst(kobj); | ||
1027 | pentry = attr2pentry(attr); | ||
1028 | if (pentry->store) | ||
1029 | ret = pentry->store(pinst, attr, buf, count); | ||
1030 | |||
1031 | return ret; | ||
1032 | } | ||
1033 | |||
1034 | static const struct sysfs_ops padata_sysfs_ops = { | ||
1035 | .show = padata_sysfs_show, | ||
1036 | .store = padata_sysfs_store, | ||
1037 | }; | ||
1038 | |||
1039 | static struct kobj_type padata_attr_type = { | ||
1040 | .sysfs_ops = &padata_sysfs_ops, | ||
1041 | .default_attrs = padata_default_attrs, | ||
1042 | .release = padata_sysfs_release, | ||
1043 | }; | ||
1044 | |||
697 | /** | 1045 | /** |
698 | * padata_alloc - allocate and initialize a padata instance | 1046 | * padata_alloc_possible - Allocate and initialize padata instance. |
1047 | * Use the cpu_possible_mask for serial and | ||
1048 | * parallel workers. | ||
699 | * | 1049 | * |
700 | * @cpumask: cpumask that padata uses for parallelization | ||
701 | * @wq: workqueue to use for the allocated padata instance | 1050 | * @wq: workqueue to use for the allocated padata instance |
702 | */ | 1051 | */ |
703 | struct padata_instance *padata_alloc(const struct cpumask *cpumask, | 1052 | struct padata_instance *padata_alloc_possible(struct workqueue_struct *wq) |
704 | struct workqueue_struct *wq) | 1053 | { |
1054 | return padata_alloc(wq, cpu_possible_mask, cpu_possible_mask); | ||
1055 | } | ||
1056 | EXPORT_SYMBOL(padata_alloc_possible); | ||
1057 | |||
1058 | /** | ||
1059 | * padata_alloc - allocate and initialize a padata instance and specify | ||
1060 | * cpumasks for serial and parallel workers. | ||
1061 | * | ||
1062 | * @wq: workqueue to use for the allocated padata instance | ||
1063 | * @pcpumask: cpumask that will be used for padata parallelization | ||
1064 | * @cbcpumask: cpumask that will be used for padata serialization | ||
1065 | */ | ||
1066 | struct padata_instance *padata_alloc(struct workqueue_struct *wq, | ||
1067 | const struct cpumask *pcpumask, | ||
1068 | const struct cpumask *cbcpumask) | ||
705 | { | 1069 | { |
706 | struct padata_instance *pinst; | 1070 | struct padata_instance *pinst; |
707 | struct parallel_data *pd; | 1071 | struct parallel_data *pd = NULL; |
708 | 1072 | ||
709 | pinst = kzalloc(sizeof(struct padata_instance), GFP_KERNEL); | 1073 | pinst = kzalloc(sizeof(struct padata_instance), GFP_KERNEL); |
710 | if (!pinst) | 1074 | if (!pinst) |
711 | goto err; | 1075 | goto err; |
712 | 1076 | ||
713 | get_online_cpus(); | 1077 | get_online_cpus(); |
714 | 1078 | if (!alloc_cpumask_var(&pinst->cpumask.pcpu, GFP_KERNEL)) | |
715 | pd = padata_alloc_pd(pinst, cpumask); | ||
716 | if (!pd) | ||
717 | goto err_free_inst; | 1079 | goto err_free_inst; |
1080 | if (!alloc_cpumask_var(&pinst->cpumask.cbcpu, GFP_KERNEL)) { | ||
1081 | free_cpumask_var(pinst->cpumask.pcpu); | ||
1082 | goto err_free_inst; | ||
1083 | } | ||
1084 | if (!padata_validate_cpumask(pinst, pcpumask) || | ||
1085 | !padata_validate_cpumask(pinst, cbcpumask)) | ||
1086 | goto err_free_masks; | ||
718 | 1087 | ||
719 | if (!alloc_cpumask_var(&pinst->cpumask, GFP_KERNEL)) | 1088 | pd = padata_alloc_pd(pinst, pcpumask, cbcpumask); |
720 | goto err_free_pd; | 1089 | if (!pd) |
1090 | goto err_free_masks; | ||
721 | 1091 | ||
722 | rcu_assign_pointer(pinst->pd, pd); | 1092 | rcu_assign_pointer(pinst->pd, pd); |
723 | 1093 | ||
724 | pinst->wq = wq; | 1094 | pinst->wq = wq; |
725 | 1095 | ||
726 | cpumask_copy(pinst->cpumask, cpumask); | 1096 | cpumask_copy(pinst->cpumask.pcpu, pcpumask); |
1097 | cpumask_copy(pinst->cpumask.cbcpu, cbcpumask); | ||
727 | 1098 | ||
728 | pinst->flags = 0; | 1099 | pinst->flags = 0; |
729 | 1100 | ||
@@ -735,12 +1106,15 @@ struct padata_instance *padata_alloc(const struct cpumask *cpumask, | |||
735 | 1106 | ||
736 | put_online_cpus(); | 1107 | put_online_cpus(); |
737 | 1108 | ||
1109 | BLOCKING_INIT_NOTIFIER_HEAD(&pinst->cpumask_change_notifier); | ||
1110 | kobject_init(&pinst->kobj, &padata_attr_type); | ||
738 | mutex_init(&pinst->lock); | 1111 | mutex_init(&pinst->lock); |
739 | 1112 | ||
740 | return pinst; | 1113 | return pinst; |
741 | 1114 | ||
742 | err_free_pd: | 1115 | err_free_masks: |
743 | padata_free_pd(pd); | 1116 | free_cpumask_var(pinst->cpumask.pcpu); |
1117 | free_cpumask_var(pinst->cpumask.cbcpu); | ||
744 | err_free_inst: | 1118 | err_free_inst: |
745 | kfree(pinst); | 1119 | kfree(pinst); |
746 | put_online_cpus(); | 1120 | put_online_cpus(); |
@@ -756,19 +1130,6 @@ EXPORT_SYMBOL(padata_alloc); | |||
756 | */ | 1130 | */ |
757 | void padata_free(struct padata_instance *pinst) | 1131 | void padata_free(struct padata_instance *pinst) |
758 | { | 1132 | { |
759 | padata_stop(pinst); | 1133 | kobject_put(&pinst->kobj); |
760 | |||
761 | synchronize_rcu(); | ||
762 | |||
763 | #ifdef CONFIG_HOTPLUG_CPU | ||
764 | unregister_hotcpu_notifier(&pinst->cpu_notifier); | ||
765 | #endif | ||
766 | get_online_cpus(); | ||
767 | padata_flush_queues(pinst->pd); | ||
768 | put_online_cpus(); | ||
769 | |||
770 | padata_free_pd(pinst->pd); | ||
771 | free_cpumask_var(pinst->cpumask); | ||
772 | kfree(pinst); | ||
773 | } | 1134 | } |
774 | EXPORT_SYMBOL(padata_free); | 1135 | EXPORT_SYMBOL(padata_free); |
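Putting the padata.c changes together, a hedged end-to-end sketch of the new instance lifecycle; the workqueue name is hypothetical and error handling is kept minimal:

	struct workqueue_struct *wq;
	struct padata_instance *pinst;
	int err;

	wq = create_workqueue("my_padata_wq");
	if (!wq)
		return -ENOMEM;

	pinst = padata_alloc_possible(wq);	/* cpu_possible_mask for both cpumasks */
	if (!pinst) {
		destroy_workqueue(wq);
		return -ENOMEM;
	}

	err = padata_start(pinst);
	if (!err) {
		/* ... submit objects with padata_do_parallel() ... */
		padata_stop(pinst);
	}

	padata_free(pinst);	/* drops the kobject reference; __padata_free() runs on release */
	destroy_workqueue(wq);
	return err;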
diff --git a/kernel/panic.c b/kernel/panic.c index 3b16cd93fa7d..4c13b1a88ebb 100644 --- a/kernel/panic.c +++ b/kernel/panic.c | |||
@@ -24,6 +24,9 @@ | |||
24 | #include <linux/nmi.h> | 24 | #include <linux/nmi.h> |
25 | #include <linux/dmi.h> | 25 | #include <linux/dmi.h> |
26 | 26 | ||
27 | #define PANIC_TIMER_STEP 100 | ||
28 | #define PANIC_BLINK_SPD 18 | ||
29 | |||
27 | int panic_on_oops; | 30 | int panic_on_oops; |
28 | static unsigned long tainted_mask; | 31 | static unsigned long tainted_mask; |
29 | static int pause_on_oops; | 32 | static int pause_on_oops; |
@@ -36,36 +39,15 @@ ATOMIC_NOTIFIER_HEAD(panic_notifier_list); | |||
36 | 39 | ||
37 | EXPORT_SYMBOL(panic_notifier_list); | 40 | EXPORT_SYMBOL(panic_notifier_list); |
38 | 41 | ||
39 | /* Returns how long it waited in ms */ | 42 | static long no_blink(int state) |
40 | long (*panic_blink)(long time); | ||
41 | EXPORT_SYMBOL(panic_blink); | ||
42 | |||
43 | static void panic_blink_one_second(void) | ||
44 | { | 43 | { |
45 | static long i = 0, end; | 44 | return 0; |
46 | |||
47 | if (panic_blink) { | ||
48 | end = i + MSEC_PER_SEC; | ||
49 | |||
50 | while (i < end) { | ||
51 | i += panic_blink(i); | ||
52 | mdelay(1); | ||
53 | i++; | ||
54 | } | ||
55 | } else { | ||
56 | /* | ||
57 | * When running under a hypervisor a small mdelay may get | ||
58 | * rounded up to the hypervisor timeslice. For example, with | ||
59 | * a 1ms in 10ms hypervisor timeslice we might inflate a | ||
60 | * mdelay(1) loop by 10x. | ||
61 | * | ||
62 | * If we have nothing to blink, spin on 1 second calls to | ||
63 | * mdelay to avoid this. | ||
64 | */ | ||
65 | mdelay(MSEC_PER_SEC); | ||
66 | } | ||
67 | } | 45 | } |
68 | 46 | ||
47 | /* Returns how long it waited in ms */ | ||
48 | long (*panic_blink)(int state); | ||
49 | EXPORT_SYMBOL(panic_blink); | ||
50 | |||
69 | /** | 51 | /** |
70 | * panic - halt the system | 52 | * panic - halt the system |
71 | * @fmt: The text string to print | 53 | * @fmt: The text string to print |
@@ -78,7 +60,8 @@ NORET_TYPE void panic(const char * fmt, ...) | |||
78 | { | 60 | { |
79 | static char buf[1024]; | 61 | static char buf[1024]; |
80 | va_list args; | 62 | va_list args; |
81 | long i; | 63 | long i, i_next = 0; |
64 | int state = 0; | ||
82 | 65 | ||
83 | /* | 66 | /* |
84 | * It's possible to come here directly from a panic-assertion and | 67 | * It's possible to come here directly from a panic-assertion and |
@@ -117,6 +100,9 @@ NORET_TYPE void panic(const char * fmt, ...) | |||
117 | 100 | ||
118 | bust_spinlocks(0); | 101 | bust_spinlocks(0); |
119 | 102 | ||
103 | if (!panic_blink) | ||
104 | panic_blink = no_blink; | ||
105 | |||
120 | if (panic_timeout > 0) { | 106 | if (panic_timeout > 0) { |
121 | /* | 107 | /* |
122 | * Delay timeout seconds before rebooting the machine. | 108 | * Delay timeout seconds before rebooting the machine. |
@@ -124,9 +110,13 @@ NORET_TYPE void panic(const char * fmt, ...) | |||
124 | */ | 110 | */ |
125 | printk(KERN_EMERG "Rebooting in %d seconds..", panic_timeout); | 111 | printk(KERN_EMERG "Rebooting in %d seconds..", panic_timeout); |
126 | 112 | ||
127 | for (i = 0; i < panic_timeout; i++) { | 113 | for (i = 0; i < panic_timeout * 1000; i += PANIC_TIMER_STEP) { |
128 | touch_nmi_watchdog(); | 114 | touch_nmi_watchdog(); |
129 | panic_blink_one_second(); | 115 | if (i >= i_next) { |
116 | i += panic_blink(state ^= 1); | ||
117 | i_next = i + 3600 / PANIC_BLINK_SPD; | ||
118 | } | ||
119 | mdelay(PANIC_TIMER_STEP); | ||
130 | } | 120 | } |
131 | /* | 121 | /* |
132 | * This will not be a clean reboot, with everything | 122 | * This will not be a clean reboot, with everything |
@@ -152,9 +142,13 @@ NORET_TYPE void panic(const char * fmt, ...) | |||
152 | } | 142 | } |
153 | #endif | 143 | #endif |
154 | local_irq_enable(); | 144 | local_irq_enable(); |
155 | while (1) { | 145 | for (i = 0; ; i += PANIC_TIMER_STEP) { |
156 | touch_softlockup_watchdog(); | 146 | touch_softlockup_watchdog(); |
157 | panic_blink_one_second(); | 147 | if (i >= i_next) { |
148 | i += panic_blink(state ^= 1); | ||
149 | i_next = i + 3600 / PANIC_BLINK_SPD; | ||
150 | } | ||
151 | mdelay(PANIC_TIMER_STEP); | ||
158 | } | 152 | } |
159 | } | 153 | } |
160 | 154 | ||
@@ -344,7 +338,7 @@ static int init_oops_id(void) | |||
344 | } | 338 | } |
345 | late_initcall(init_oops_id); | 339 | late_initcall(init_oops_id); |
346 | 340 | ||
347 | static void print_oops_end_marker(void) | 341 | void print_oops_end_marker(void) |
348 | { | 342 | { |
349 | init_oops_id(); | 343 | init_oops_id(); |
350 | printk(KERN_WARNING "---[ end trace %016llx ]---\n", | 344 | printk(KERN_WARNING "---[ end trace %016llx ]---\n", |
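For reference, a hedged sketch of a blink hook under the new panic_blink(int state) contract shown above: the callback is handed an alternating 0/1 state and returns how many milliseconds it spent, which the panic loops add to their elapsed-time counter. The LED helper below is hypothetical.

	static long my_panic_blink(int state)
	{
		my_board_set_led(state);	/* hypothetical hardware poke */
		return 0;			/* consumed (approximately) no time */
	}

	/* during driver initialization: */
	panic_blink = my_panic_blink;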
diff --git a/kernel/params.c b/kernel/params.c index 0b30ecd53a52..08107d181758 100644 --- a/kernel/params.c +++ b/kernel/params.c | |||
@@ -31,6 +31,42 @@ | |||
31 | #define DEBUGP(fmt, a...) | 31 | #define DEBUGP(fmt, a...) |
32 | #endif | 32 | #endif |
33 | 33 | ||
34 | /* Protects all parameters, and incidentally kmalloced_param list. */ | ||
35 | static DEFINE_MUTEX(param_lock); | ||
36 | |||
37 | /* This just allows us to keep track of which parameters are kmalloced. */ | ||
38 | struct kmalloced_param { | ||
39 | struct list_head list; | ||
40 | char val[]; | ||
41 | }; | ||
42 | static LIST_HEAD(kmalloced_params); | ||
43 | |||
44 | static void *kmalloc_parameter(unsigned int size) | ||
45 | { | ||
46 | struct kmalloced_param *p; | ||
47 | |||
48 | p = kmalloc(sizeof(*p) + size, GFP_KERNEL); | ||
49 | if (!p) | ||
50 | return NULL; | ||
51 | |||
52 | list_add(&p->list, &kmalloced_params); | ||
53 | return p->val; | ||
54 | } | ||
55 | |||
56 | /* Does nothing if parameter wasn't kmalloced above. */ | ||
57 | static void maybe_kfree_parameter(void *param) | ||
58 | { | ||
59 | struct kmalloced_param *p; | ||
60 | |||
61 | list_for_each_entry(p, &kmalloced_params, list) { | ||
62 | if (p->val == param) { | ||
63 | list_del(&p->list); | ||
64 | kfree(p); | ||
65 | break; | ||
66 | } | ||
67 | } | ||
68 | } | ||
69 | |||
34 | static inline char dash2underscore(char c) | 70 | static inline char dash2underscore(char c) |
35 | { | 71 | { |
36 | if (c == '-') | 72 | if (c == '-') |
@@ -49,18 +85,25 @@ static inline int parameq(const char *input, const char *paramname) | |||
49 | 85 | ||
50 | static int parse_one(char *param, | 86 | static int parse_one(char *param, |
51 | char *val, | 87 | char *val, |
52 | struct kernel_param *params, | 88 | const struct kernel_param *params, |
53 | unsigned num_params, | 89 | unsigned num_params, |
54 | int (*handle_unknown)(char *param, char *val)) | 90 | int (*handle_unknown)(char *param, char *val)) |
55 | { | 91 | { |
56 | unsigned int i; | 92 | unsigned int i; |
93 | int err; | ||
57 | 94 | ||
58 | /* Find parameter */ | 95 | /* Find parameter */ |
59 | for (i = 0; i < num_params; i++) { | 96 | for (i = 0; i < num_params; i++) { |
60 | if (parameq(param, params[i].name)) { | 97 | if (parameq(param, params[i].name)) { |
98 | /* No one handled NULL, so do it here. */ | ||
99 | if (!val && params[i].ops->set != param_set_bool) | ||
100 | return -EINVAL; | ||
61 | DEBUGP("They are equal! Calling %p\n", | 101 | DEBUGP("They are equal! Calling %p\n", |
62 | params[i].set); | 102 | params[i].ops->set); |
63 | return params[i].set(val, ¶ms[i]); | 103 | mutex_lock(¶m_lock); |
104 | err = params[i].ops->set(val, ¶ms[i]); | ||
105 | mutex_unlock(¶m_lock); | ||
106 | return err; | ||
64 | } | 107 | } |
65 | } | 108 | } |
66 | 109 | ||
@@ -128,7 +171,7 @@ static char *next_arg(char *args, char **param, char **val) | |||
128 | /* Args looks like "foo=bar,bar2 baz=fuz wiz". */ | 171 | /* Args looks like "foo=bar,bar2 baz=fuz wiz". */ |
129 | int parse_args(const char *name, | 172 | int parse_args(const char *name, |
130 | char *args, | 173 | char *args, |
131 | struct kernel_param *params, | 174 | const struct kernel_param *params, |
132 | unsigned num, | 175 | unsigned num, |
133 | int (*unknown)(char *param, char *val)) | 176 | int (*unknown)(char *param, char *val)) |
134 | { | 177 | { |
@@ -176,22 +219,29 @@ int parse_args(const char *name, | |||
176 | 219 | ||
177 | /* Lazy bastard, eh? */ | 220 | /* Lazy bastard, eh? */ |
178 | #define STANDARD_PARAM_DEF(name, type, format, tmptype, strtolfn) \ | 221 | #define STANDARD_PARAM_DEF(name, type, format, tmptype, strtolfn) \ |
179 | int param_set_##name(const char *val, struct kernel_param *kp) \ | 222 | int param_set_##name(const char *val, const struct kernel_param *kp) \ |
180 | { \ | 223 | { \ |
181 | tmptype l; \ | 224 | tmptype l; \ |
182 | int ret; \ | 225 | int ret; \ |
183 | \ | 226 | \ |
184 | if (!val) return -EINVAL; \ | ||
185 | ret = strtolfn(val, 0, &l); \ | 227 | ret = strtolfn(val, 0, &l); \ |
186 | if (ret == -EINVAL || ((type)l != l)) \ | 228 | if (ret == -EINVAL || ((type)l != l)) \ |
187 | return -EINVAL; \ | 229 | return -EINVAL; \ |
188 | *((type *)kp->arg) = l; \ | 230 | *((type *)kp->arg) = l; \ |
189 | return 0; \ | 231 | return 0; \ |
190 | } \ | 232 | } \ |
191 | int param_get_##name(char *buffer, struct kernel_param *kp) \ | 233 | int param_get_##name(char *buffer, const struct kernel_param *kp) \ |
192 | { \ | 234 | { \ |
193 | return sprintf(buffer, format, *((type *)kp->arg)); \ | 235 | return sprintf(buffer, format, *((type *)kp->arg)); \ |
194 | } | 236 | } \ |
237 | struct kernel_param_ops param_ops_##name = { \ | ||
238 | .set = param_set_##name, \ | ||
239 | .get = param_get_##name, \ | ||
240 | }; \ | ||
241 | EXPORT_SYMBOL(param_set_##name); \ | ||
242 | EXPORT_SYMBOL(param_get_##name); \ | ||
243 | EXPORT_SYMBOL(param_ops_##name) | ||
244 | |||
195 | 245 | ||
196 | STANDARD_PARAM_DEF(byte, unsigned char, "%c", unsigned long, strict_strtoul); | 246 | STANDARD_PARAM_DEF(byte, unsigned char, "%c", unsigned long, strict_strtoul); |
197 | STANDARD_PARAM_DEF(short, short, "%hi", long, strict_strtol); | 247 | STANDARD_PARAM_DEF(short, short, "%hi", long, strict_strtol); |
@@ -201,39 +251,50 @@ STANDARD_PARAM_DEF(uint, unsigned int, "%u", unsigned long, strict_strtoul); | |||
201 | STANDARD_PARAM_DEF(long, long, "%li", long, strict_strtol); | 251 | STANDARD_PARAM_DEF(long, long, "%li", long, strict_strtol); |
202 | STANDARD_PARAM_DEF(ulong, unsigned long, "%lu", unsigned long, strict_strtoul); | 252 | STANDARD_PARAM_DEF(ulong, unsigned long, "%lu", unsigned long, strict_strtoul); |
203 | 253 | ||
204 | int param_set_charp(const char *val, struct kernel_param *kp) | 254 | int param_set_charp(const char *val, const struct kernel_param *kp) |
205 | { | 255 | { |
206 | if (!val) { | ||
207 | printk(KERN_ERR "%s: string parameter expected\n", | ||
208 | kp->name); | ||
209 | return -EINVAL; | ||
210 | } | ||
211 | |||
212 | if (strlen(val) > 1024) { | 256 | if (strlen(val) > 1024) { |
213 | printk(KERN_ERR "%s: string parameter too long\n", | 257 | printk(KERN_ERR "%s: string parameter too long\n", |
214 | kp->name); | 258 | kp->name); |
215 | return -ENOSPC; | 259 | return -ENOSPC; |
216 | } | 260 | } |
217 | 261 | ||
218 | /* This is a hack. We can't need to strdup in early boot, and we | 262 | maybe_kfree_parameter(*(char **)kp->arg); |
263 | |||
264 | /* This is a hack. We can't kmalloc in early boot, and we | ||
219 | * don't need to; this mangled commandline is preserved. */ | 265 | * don't need to; this mangled commandline is preserved. */ |
220 | if (slab_is_available()) { | 266 | if (slab_is_available()) { |
221 | *(char **)kp->arg = kstrdup(val, GFP_KERNEL); | 267 | *(char **)kp->arg = kmalloc_parameter(strlen(val)+1); |
222 | if (!*(char **)kp->arg) | 268 | if (!*(char **)kp->arg) |
223 | return -ENOMEM; | 269 | return -ENOMEM; |
270 | strcpy(*(char **)kp->arg, val); | ||
224 | } else | 271 | } else |
225 | *(const char **)kp->arg = val; | 272 | *(const char **)kp->arg = val; |
226 | 273 | ||
227 | return 0; | 274 | return 0; |
228 | } | 275 | } |
276 | EXPORT_SYMBOL(param_set_charp); | ||
229 | 277 | ||
230 | int param_get_charp(char *buffer, struct kernel_param *kp) | 278 | int param_get_charp(char *buffer, const struct kernel_param *kp) |
231 | { | 279 | { |
232 | return sprintf(buffer, "%s", *((char **)kp->arg)); | 280 | return sprintf(buffer, "%s", *((char **)kp->arg)); |
233 | } | 281 | } |
282 | EXPORT_SYMBOL(param_get_charp); | ||
283 | |||
284 | static void param_free_charp(void *arg) | ||
285 | { | ||
286 | maybe_kfree_parameter(*((char **)arg)); | ||
287 | } | ||
288 | |||
289 | struct kernel_param_ops param_ops_charp = { | ||
290 | .set = param_set_charp, | ||
291 | .get = param_get_charp, | ||
292 | .free = param_free_charp, | ||
293 | }; | ||
294 | EXPORT_SYMBOL(param_ops_charp); | ||
234 | 295 | ||
235 | /* Actually could be a bool or an int, for historical reasons. */ | 296 | /* Actually could be a bool or an int, for historical reasons. */ |
236 | int param_set_bool(const char *val, struct kernel_param *kp) | 297 | int param_set_bool(const char *val, const struct kernel_param *kp) |
237 | { | 298 | { |
238 | bool v; | 299 | bool v; |
239 | 300 | ||
@@ -258,8 +319,9 @@ int param_set_bool(const char *val, struct kernel_param *kp) | |||
258 | *(int *)kp->arg = v; | 319 | *(int *)kp->arg = v; |
259 | return 0; | 320 | return 0; |
260 | } | 321 | } |
322 | EXPORT_SYMBOL(param_set_bool); | ||
261 | 323 | ||
262 | int param_get_bool(char *buffer, struct kernel_param *kp) | 324 | int param_get_bool(char *buffer, const struct kernel_param *kp) |
263 | { | 325 | { |
264 | bool val; | 326 | bool val; |
265 | if (kp->flags & KPARAM_ISBOOL) | 327 | if (kp->flags & KPARAM_ISBOOL) |
@@ -270,9 +332,16 @@ int param_get_bool(char *buffer, struct kernel_param *kp) | |||
270 | /* Y and N chosen as being relatively non-coder friendly */ | 332 | /* Y and N chosen as being relatively non-coder friendly */ |
271 | return sprintf(buffer, "%c", val ? 'Y' : 'N'); | 333 | return sprintf(buffer, "%c", val ? 'Y' : 'N'); |
272 | } | 334 | } |
335 | EXPORT_SYMBOL(param_get_bool); | ||
336 | |||
337 | struct kernel_param_ops param_ops_bool = { | ||
338 | .set = param_set_bool, | ||
339 | .get = param_get_bool, | ||
340 | }; | ||
341 | EXPORT_SYMBOL(param_ops_bool); | ||
273 | 342 | ||
274 | /* This one must be bool. */ | 343 | /* This one must be bool. */ |
275 | int param_set_invbool(const char *val, struct kernel_param *kp) | 344 | int param_set_invbool(const char *val, const struct kernel_param *kp) |
276 | { | 345 | { |
277 | int ret; | 346 | int ret; |
278 | bool boolval; | 347 | bool boolval; |
@@ -285,18 +354,26 @@ int param_set_invbool(const char *val, struct kernel_param *kp) | |||
285 | *(bool *)kp->arg = !boolval; | 354 | *(bool *)kp->arg = !boolval; |
286 | return ret; | 355 | return ret; |
287 | } | 356 | } |
357 | EXPORT_SYMBOL(param_set_invbool); | ||
288 | 358 | ||
289 | int param_get_invbool(char *buffer, struct kernel_param *kp) | 359 | int param_get_invbool(char *buffer, const struct kernel_param *kp) |
290 | { | 360 | { |
291 | return sprintf(buffer, "%c", (*(bool *)kp->arg) ? 'N' : 'Y'); | 361 | return sprintf(buffer, "%c", (*(bool *)kp->arg) ? 'N' : 'Y'); |
292 | } | 362 | } |
363 | EXPORT_SYMBOL(param_get_invbool); | ||
364 | |||
365 | struct kernel_param_ops param_ops_invbool = { | ||
366 | .set = param_set_invbool, | ||
367 | .get = param_get_invbool, | ||
368 | }; | ||
369 | EXPORT_SYMBOL(param_ops_invbool); | ||
293 | 370 | ||
294 | /* We break the rule and mangle the string. */ | 371 | /* We break the rule and mangle the string. */ |
295 | static int param_array(const char *name, | 372 | static int param_array(const char *name, |
296 | const char *val, | 373 | const char *val, |
297 | unsigned int min, unsigned int max, | 374 | unsigned int min, unsigned int max, |
298 | void *elem, int elemsize, | 375 | void *elem, int elemsize, |
299 | int (*set)(const char *, struct kernel_param *kp), | 376 | int (*set)(const char *, const struct kernel_param *kp), |
300 | u16 flags, | 377 | u16 flags, |
301 | unsigned int *num) | 378 | unsigned int *num) |
302 | { | 379 | { |
@@ -309,12 +386,6 @@ static int param_array(const char *name, | |||
309 | kp.arg = elem; | 386 | kp.arg = elem; |
310 | kp.flags = flags; | 387 | kp.flags = flags; |
311 | 388 | ||
312 | /* No equals sign? */ | ||
313 | if (!val) { | ||
314 | printk(KERN_ERR "%s: expects arguments\n", name); | ||
315 | return -EINVAL; | ||
316 | } | ||
317 | |||
318 | *num = 0; | 389 | *num = 0; |
319 | /* We expect a comma-separated list of values. */ | 390 | /* We expect a comma-separated list of values. */ |
320 | do { | 391 | do { |
@@ -330,6 +401,7 @@ static int param_array(const char *name, | |||
330 | /* nul-terminate and parse */ | 401 | /* nul-terminate and parse */ |
331 | save = val[len]; | 402 | save = val[len]; |
332 | ((char *)val)[len] = '\0'; | 403 | ((char *)val)[len] = '\0'; |
404 | BUG_ON(!mutex_is_locked(¶m_lock)); | ||
333 | ret = set(val, &kp); | 405 | ret = set(val, &kp); |
334 | 406 | ||
335 | if (ret != 0) | 407 | if (ret != 0) |
@@ -347,17 +419,17 @@ static int param_array(const char *name, | |||
347 | return 0; | 419 | return 0; |
348 | } | 420 | } |
349 | 421 | ||
350 | int param_array_set(const char *val, struct kernel_param *kp) | 422 | static int param_array_set(const char *val, const struct kernel_param *kp) |
351 | { | 423 | { |
352 | const struct kparam_array *arr = kp->arr; | 424 | const struct kparam_array *arr = kp->arr; |
353 | unsigned int temp_num; | 425 | unsigned int temp_num; |
354 | 426 | ||
355 | return param_array(kp->name, val, 1, arr->max, arr->elem, | 427 | return param_array(kp->name, val, 1, arr->max, arr->elem, |
356 | arr->elemsize, arr->set, kp->flags, | 428 | arr->elemsize, arr->ops->set, kp->flags, |
357 | arr->num ?: &temp_num); | 429 | arr->num ?: &temp_num); |
358 | } | 430 | } |
359 | 431 | ||
360 | int param_array_get(char *buffer, struct kernel_param *kp) | 432 | static int param_array_get(char *buffer, const struct kernel_param *kp) |
361 | { | 433 | { |
362 | int i, off, ret; | 434 | int i, off, ret; |
363 | const struct kparam_array *arr = kp->arr; | 435 | const struct kparam_array *arr = kp->arr; |
@@ -368,7 +440,8 @@ int param_array_get(char *buffer, struct kernel_param *kp) | |||
368 | if (i) | 440 | if (i) |
369 | buffer[off++] = ','; | 441 | buffer[off++] = ','; |
370 | p.arg = arr->elem + arr->elemsize * i; | 442 | p.arg = arr->elem + arr->elemsize * i; |
371 | ret = arr->get(buffer + off, &p); | 443 | BUG_ON(!mutex_is_locked(¶m_lock)); |
444 | ret = arr->ops->get(buffer + off, &p); | ||
372 | if (ret < 0) | 445 | if (ret < 0) |
373 | return ret; | 446 | return ret; |
374 | off += ret; | 447 | off += ret; |
@@ -377,14 +450,27 @@ int param_array_get(char *buffer, struct kernel_param *kp) | |||
377 | return off; | 450 | return off; |
378 | } | 451 | } |
379 | 452 | ||
380 | int param_set_copystring(const char *val, struct kernel_param *kp) | 453 | static void param_array_free(void *arg) |
454 | { | ||
455 | unsigned int i; | ||
456 | const struct kparam_array *arr = arg; | ||
457 | |||
458 | if (arr->ops->free) | ||
459 | for (i = 0; i < (arr->num ? *arr->num : arr->max); i++) | ||
460 | arr->ops->free(arr->elem + arr->elemsize * i); | ||
461 | } | ||
462 | |||
463 | struct kernel_param_ops param_array_ops = { | ||
464 | .set = param_array_set, | ||
465 | .get = param_array_get, | ||
466 | .free = param_array_free, | ||
467 | }; | ||
468 | EXPORT_SYMBOL(param_array_ops); | ||
469 | |||
470 | int param_set_copystring(const char *val, const struct kernel_param *kp) | ||
381 | { | 471 | { |
382 | const struct kparam_string *kps = kp->str; | 472 | const struct kparam_string *kps = kp->str; |
383 | 473 | ||
384 | if (!val) { | ||
385 | printk(KERN_ERR "%s: missing param set value\n", kp->name); | ||
386 | return -EINVAL; | ||
387 | } | ||
388 | if (strlen(val)+1 > kps->maxlen) { | 474 | if (strlen(val)+1 > kps->maxlen) { |
389 | printk(KERN_ERR "%s: string doesn't fit in %u chars.\n", | 475 | printk(KERN_ERR "%s: string doesn't fit in %u chars.\n", |
390 | kp->name, kps->maxlen-1); | 476 | kp->name, kps->maxlen-1); |
@@ -393,12 +479,20 @@ int param_set_copystring(const char *val, struct kernel_param *kp) | |||
393 | strcpy(kps->string, val); | 479 | strcpy(kps->string, val); |
394 | return 0; | 480 | return 0; |
395 | } | 481 | } |
482 | EXPORT_SYMBOL(param_set_copystring); | ||
396 | 483 | ||
397 | int param_get_string(char *buffer, struct kernel_param *kp) | 484 | int param_get_string(char *buffer, const struct kernel_param *kp) |
398 | { | 485 | { |
399 | const struct kparam_string *kps = kp->str; | 486 | const struct kparam_string *kps = kp->str; |
400 | return strlcpy(buffer, kps->string, kps->maxlen); | 487 | return strlcpy(buffer, kps->string, kps->maxlen); |
401 | } | 488 | } |
489 | EXPORT_SYMBOL(param_get_string); | ||
490 | |||
491 | struct kernel_param_ops param_ops_string = { | ||
492 | .set = param_set_copystring, | ||
493 | .get = param_get_string, | ||
494 | }; | ||
495 | EXPORT_SYMBOL(param_ops_string); | ||
402 | 496 | ||
403 | /* sysfs output in /sys/modules/XYZ/parameters/ */ | 497 | /* sysfs output in /sys/modules/XYZ/parameters/ */ |
404 | #define to_module_attr(n) container_of(n, struct module_attribute, attr) | 498 | #define to_module_attr(n) container_of(n, struct module_attribute, attr) |
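A minimal module-side sketch of the converted interface shown in the hunks above, assuming module_param_cb() from <linux/moduleparam.h>: a parameter supplies a struct kernel_param_ops instead of separate set/get function pointers, here reusing the stock param_set_int()/param_get_int() handlers. The parameter name and the logging wrapper are purely illustrative, not part of this patch.

#include <linux/kernel.h>
#include <linux/moduleparam.h>

static int level = 1;

/* wrap the stock integer parser so each change is logged */
static int level_set(const char *val, const struct kernel_param *kp)
{
    int ret = param_set_int(val, kp);

    if (!ret)
        printk(KERN_INFO "level is now %d\n", *(int *)kp->arg);
    return ret;
}

static struct kernel_param_ops level_ops = {
    .set = level_set,
    .get = param_get_int,
};
module_param_cb(level, &level_ops, &level, 0644);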
@@ -409,7 +503,7 @@ extern struct kernel_param __start___param[], __stop___param[]; | |||
409 | struct param_attribute | 503 | struct param_attribute |
410 | { | 504 | { |
411 | struct module_attribute mattr; | 505 | struct module_attribute mattr; |
412 | struct kernel_param *param; | 506 | const struct kernel_param *param; |
413 | }; | 507 | }; |
414 | 508 | ||
415 | struct module_param_attrs | 509 | struct module_param_attrs |
@@ -428,10 +522,12 @@ static ssize_t param_attr_show(struct module_attribute *mattr, | |||
428 | int count; | 522 | int count; |
429 | struct param_attribute *attribute = to_param_attr(mattr); | 523 | struct param_attribute *attribute = to_param_attr(mattr); |
430 | 524 | ||
431 | if (!attribute->param->get) | 525 | if (!attribute->param->ops->get) |
432 | return -EPERM; | 526 | return -EPERM; |
433 | 527 | ||
434 | count = attribute->param->get(buf, attribute->param); | 528 | mutex_lock(¶m_lock); |
529 | count = attribute->param->ops->get(buf, attribute->param); | ||
530 | mutex_unlock(¶m_lock); | ||
435 | if (count > 0) { | 531 | if (count > 0) { |
436 | strcat(buf, "\n"); | 532 | strcat(buf, "\n"); |
437 | ++count; | 533 | ++count; |
@@ -447,10 +543,12 @@ static ssize_t param_attr_store(struct module_attribute *mattr, | |||
447 | int err; | 543 | int err; |
448 | struct param_attribute *attribute = to_param_attr(mattr); | 544 | struct param_attribute *attribute = to_param_attr(mattr); |
449 | 545 | ||
450 | if (!attribute->param->set) | 546 | if (!attribute->param->ops->set) |
451 | return -EPERM; | 547 | return -EPERM; |
452 | 548 | ||
453 | err = attribute->param->set(buf, attribute->param); | 549 | mutex_lock(¶m_lock); |
550 | err = attribute->param->ops->set(buf, attribute->param); | ||
551 | mutex_unlock(¶m_lock); | ||
454 | if (!err) | 552 | if (!err) |
455 | return len; | 553 | return len; |
456 | return err; | 554 | return err; |
@@ -464,6 +562,18 @@ static ssize_t param_attr_store(struct module_attribute *mattr, | |||
464 | #endif | 562 | #endif |
465 | 563 | ||
466 | #ifdef CONFIG_SYSFS | 564 | #ifdef CONFIG_SYSFS |
565 | void __kernel_param_lock(void) | ||
566 | { | ||
567 | mutex_lock(¶m_lock); | ||
568 | } | ||
569 | EXPORT_SYMBOL(__kernel_param_lock); | ||
570 | |||
571 | void __kernel_param_unlock(void) | ||
572 | { | ||
573 | mutex_unlock(¶m_lock); | ||
574 | } | ||
575 | EXPORT_SYMBOL(__kernel_param_unlock); | ||
576 | |||
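A sketch of how a module might use the __kernel_param_lock()/__kernel_param_unlock() helpers exported above to read one of its own charp parameters without racing a concurrent sysfs write, since param_attr_show()/param_attr_store() now run under the same param_lock. The parameter name and helper are illustrative only.

#include <linux/types.h>
#include <linux/string.h>
#include <linux/moduleparam.h>

static char *mode = "fast";
module_param(mode, charp, 0644);

static bool mode_is_fast(void)
{
    bool fast;

    __kernel_param_lock();          /* same mutex the sysfs writers take */
    fast = !strcmp(mode, "fast");
    __kernel_param_unlock();

    return fast;
}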
467 | /* | 577 | /* |
468 | * add_sysfs_param - add a parameter to sysfs | 578 | * add_sysfs_param - add a parameter to sysfs |
469 | * @mk: struct module_kobject | 579 | * @mk: struct module_kobject |
@@ -475,7 +585,7 @@ static ssize_t param_attr_store(struct module_attribute *mattr, | |||
475 | * if there's an error. | 585 | * if there's an error. |
476 | */ | 586 | */ |
477 | static __modinit int add_sysfs_param(struct module_kobject *mk, | 587 | static __modinit int add_sysfs_param(struct module_kobject *mk, |
478 | struct kernel_param *kp, | 588 | const struct kernel_param *kp, |
479 | const char *name) | 589 | const char *name) |
480 | { | 590 | { |
481 | struct module_param_attrs *new; | 591 | struct module_param_attrs *new; |
@@ -557,7 +667,7 @@ static void free_module_param_attrs(struct module_kobject *mk) | |||
557 | * /sys/module/[mod->name]/parameters/ | 667 | * /sys/module/[mod->name]/parameters/ |
558 | */ | 668 | */ |
559 | int module_param_sysfs_setup(struct module *mod, | 669 | int module_param_sysfs_setup(struct module *mod, |
560 | struct kernel_param *kparam, | 670 | const struct kernel_param *kparam, |
561 | unsigned int num_params) | 671 | unsigned int num_params) |
562 | { | 672 | { |
563 | int i, err; | 673 | int i, err; |
@@ -602,7 +712,11 @@ void module_param_sysfs_remove(struct module *mod) | |||
602 | 712 | ||
603 | void destroy_params(const struct kernel_param *params, unsigned num) | 713 | void destroy_params(const struct kernel_param *params, unsigned num) |
604 | { | 714 | { |
605 | /* FIXME: This should free kmalloced charp parameters. It doesn't. */ | 715 | unsigned int i; |
716 | |||
717 | for (i = 0; i < num; i++) | ||
718 | if (params[i].ops->free) | ||
719 | params[i].ops->free(params[i].arg); | ||
606 | } | 720 | } |
607 | 721 | ||
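A hypothetical sketch of why the ops->free() walk in destroy_params() above matters: a module that kstrdup()s its parameter value can now hand cleanup to the core, which invokes the hook for every parameter. Only the .free callback shape and param_get_charp() come from the patch context; all other names are illustrative.

#include <linux/errno.h>
#include <linux/slab.h>
#include <linux/string.h>
#include <linux/moduleparam.h>

static char *label;

static int label_set(const char *val, const struct kernel_param *kp)
{
    char *copy = kstrdup(val, GFP_KERNEL);

    if (!copy)
        return -ENOMEM;
    kfree(*(char **)kp->arg);       /* drop any previous copy */
    *(char **)kp->arg = copy;
    return 0;
}

static void label_free(void *arg)
{
    kfree(*(char **)arg);           /* called from destroy_params() */
}

static struct kernel_param_ops label_ops = {
    .set  = label_set,
    .get  = param_get_charp,
    .free = label_free,
};
module_param_cb(label, &label_ops, &label, 0644);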
608 | static void __init kernel_add_sysfs_param(const char *name, | 722 | static void __init kernel_add_sysfs_param(const char *name, |
@@ -768,28 +882,3 @@ static int __init param_sysfs_init(void) | |||
768 | subsys_initcall(param_sysfs_init); | 882 | subsys_initcall(param_sysfs_init); |
769 | 883 | ||
770 | #endif /* CONFIG_SYSFS */ | 884 | #endif /* CONFIG_SYSFS */ |
771 | |||
772 | EXPORT_SYMBOL(param_set_byte); | ||
773 | EXPORT_SYMBOL(param_get_byte); | ||
774 | EXPORT_SYMBOL(param_set_short); | ||
775 | EXPORT_SYMBOL(param_get_short); | ||
776 | EXPORT_SYMBOL(param_set_ushort); | ||
777 | EXPORT_SYMBOL(param_get_ushort); | ||
778 | EXPORT_SYMBOL(param_set_int); | ||
779 | EXPORT_SYMBOL(param_get_int); | ||
780 | EXPORT_SYMBOL(param_set_uint); | ||
781 | EXPORT_SYMBOL(param_get_uint); | ||
782 | EXPORT_SYMBOL(param_set_long); | ||
783 | EXPORT_SYMBOL(param_get_long); | ||
784 | EXPORT_SYMBOL(param_set_ulong); | ||
785 | EXPORT_SYMBOL(param_get_ulong); | ||
786 | EXPORT_SYMBOL(param_set_charp); | ||
787 | EXPORT_SYMBOL(param_get_charp); | ||
788 | EXPORT_SYMBOL(param_set_bool); | ||
789 | EXPORT_SYMBOL(param_get_bool); | ||
790 | EXPORT_SYMBOL(param_set_invbool); | ||
791 | EXPORT_SYMBOL(param_get_invbool); | ||
792 | EXPORT_SYMBOL(param_array_set); | ||
793 | EXPORT_SYMBOL(param_array_get); | ||
794 | EXPORT_SYMBOL(param_set_copystring); | ||
795 | EXPORT_SYMBOL(param_get_string); | ||
diff --git a/kernel/perf_event.c b/kernel/perf_event.c index ff86c558af4c..517d827f4982 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c | |||
@@ -31,24 +31,18 @@ | |||
31 | #include <linux/kernel_stat.h> | 31 | #include <linux/kernel_stat.h> |
32 | #include <linux/perf_event.h> | 32 | #include <linux/perf_event.h> |
33 | #include <linux/ftrace_event.h> | 33 | #include <linux/ftrace_event.h> |
34 | #include <linux/hw_breakpoint.h> | ||
35 | 34 | ||
36 | #include <asm/irq_regs.h> | 35 | #include <asm/irq_regs.h> |
37 | 36 | ||
38 | /* | 37 | atomic_t perf_task_events __read_mostly; |
39 | * Each CPU has a list of per CPU events: | ||
40 | */ | ||
41 | static DEFINE_PER_CPU(struct perf_cpu_context, perf_cpu_context); | ||
42 | |||
43 | int perf_max_events __read_mostly = 1; | ||
44 | static int perf_reserved_percpu __read_mostly; | ||
45 | static int perf_overcommit __read_mostly = 1; | ||
46 | |||
47 | static atomic_t nr_events __read_mostly; | ||
48 | static atomic_t nr_mmap_events __read_mostly; | 38 | static atomic_t nr_mmap_events __read_mostly; |
49 | static atomic_t nr_comm_events __read_mostly; | 39 | static atomic_t nr_comm_events __read_mostly; |
50 | static atomic_t nr_task_events __read_mostly; | 40 | static atomic_t nr_task_events __read_mostly; |
51 | 41 | ||
42 | static LIST_HEAD(pmus); | ||
43 | static DEFINE_MUTEX(pmus_lock); | ||
44 | static struct srcu_struct pmus_srcu; | ||
45 | |||
52 | /* | 46 | /* |
53 | * perf event paranoia level: | 47 | * perf event paranoia level: |
54 | * -1 - not paranoid at all | 48 | * -1 - not paranoid at all |
@@ -67,36 +61,43 @@ int sysctl_perf_event_sample_rate __read_mostly = 100000; | |||
67 | 61 | ||
68 | static atomic64_t perf_event_id; | 62 | static atomic64_t perf_event_id; |
69 | 63 | ||
70 | /* | 64 | void __weak perf_event_print_debug(void) { } |
71 | * Lock for (sysadmin-configurable) event reservations: | ||
72 | */ | ||
73 | static DEFINE_SPINLOCK(perf_resource_lock); | ||
74 | 65 | ||
75 | /* | 66 | extern __weak const char *perf_pmu_name(void) |
76 | * Architecture provided APIs - weak aliases: | ||
77 | */ | ||
78 | extern __weak const struct pmu *hw_perf_event_init(struct perf_event *event) | ||
79 | { | 67 | { |
80 | return NULL; | 68 | return "pmu"; |
81 | } | 69 | } |
82 | 70 | ||
83 | void __weak hw_perf_disable(void) { barrier(); } | 71 | void perf_pmu_disable(struct pmu *pmu) |
84 | void __weak hw_perf_enable(void) { barrier(); } | 72 | { |
85 | 73 | int *count = this_cpu_ptr(pmu->pmu_disable_count); | |
86 | void __weak perf_event_print_debug(void) { } | 74 | if (!(*count)++) |
87 | 75 | pmu->pmu_disable(pmu); | |
88 | static DEFINE_PER_CPU(int, perf_disable_count); | 76 | } |
89 | 77 | ||
90 | void perf_disable(void) | 78 | void perf_pmu_enable(struct pmu *pmu) |
91 | { | 79 | { |
92 | if (!__get_cpu_var(perf_disable_count)++) | 80 | int *count = this_cpu_ptr(pmu->pmu_disable_count); |
93 | hw_perf_disable(); | 81 | if (!--(*count)) |
82 | pmu->pmu_enable(pmu); | ||
94 | } | 83 | } |
95 | 84 | ||
96 | void perf_enable(void) | 85 | static DEFINE_PER_CPU(struct list_head, rotation_list); |
86 | |||
87 | /* | ||
88 | * perf_pmu_rotate_start() and perf_rotate_context() are fully serialized | ||
89 | * because they're strictly cpu affine and rotate_start is called with IRQs | ||
90 | * disabled, while rotate_context is called from IRQ context. | ||
91 | */ | ||
92 | static void perf_pmu_rotate_start(struct pmu *pmu) | ||
97 | { | 93 | { |
98 | if (!--__get_cpu_var(perf_disable_count)) | 94 | struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context); |
99 | hw_perf_enable(); | 95 | struct list_head *head = &__get_cpu_var(rotation_list); |
96 | |||
97 | WARN_ON(!irqs_disabled()); | ||
98 | |||
99 | if (list_empty(&cpuctx->rotation_list)) | ||
100 | list_add(&cpuctx->rotation_list, head); | ||
100 | } | 101 | } |
101 | 102 | ||
102 | static void get_ctx(struct perf_event_context *ctx) | 103 | static void get_ctx(struct perf_event_context *ctx) |
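The perf_pmu_disable()/perf_pmu_enable() pair above replaces the global hw_perf_disable()/hw_perf_enable() with a per-pmu, per-cpu nesting count, so only the outermost pair touches the hardware. A small standalone C model of that refcounting, illustrative only and not kernel code:

#include <stdio.h>

struct pmu {
    int disable_count;
};

static void pmu_disable(struct pmu *pmu)
{
    if (!pmu->disable_count++)
        printf("hardware disabled\n");
}

static void pmu_enable(struct pmu *pmu)
{
    if (!--pmu->disable_count)
        printf("hardware enabled\n");
}

int main(void)
{
    struct pmu pmu = { 0 };

    pmu_disable(&pmu);      /* prints: hardware disabled */
    pmu_disable(&pmu);      /* nested, no hardware access */
    pmu_enable(&pmu);       /* nested, no hardware access */
    pmu_enable(&pmu);       /* prints: hardware enabled */
    return 0;
}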
@@ -151,13 +152,13 @@ static u64 primary_event_id(struct perf_event *event) | |||
151 | * the context could get moved to another task. | 152 | * the context could get moved to another task. |
152 | */ | 153 | */ |
153 | static struct perf_event_context * | 154 | static struct perf_event_context * |
154 | perf_lock_task_context(struct task_struct *task, unsigned long *flags) | 155 | perf_lock_task_context(struct task_struct *task, int ctxn, unsigned long *flags) |
155 | { | 156 | { |
156 | struct perf_event_context *ctx; | 157 | struct perf_event_context *ctx; |
157 | 158 | ||
158 | rcu_read_lock(); | 159 | rcu_read_lock(); |
159 | retry: | 160 | retry: |
160 | ctx = rcu_dereference(task->perf_event_ctxp); | 161 | ctx = rcu_dereference(task->perf_event_ctxp[ctxn]); |
161 | if (ctx) { | 162 | if (ctx) { |
162 | /* | 163 | /* |
163 | * If this context is a clone of another, it might | 164 | * If this context is a clone of another, it might |
@@ -170,7 +171,7 @@ perf_lock_task_context(struct task_struct *task, unsigned long *flags) | |||
170 | * can't get swapped on us any more. | 171 | * can't get swapped on us any more. |
171 | */ | 172 | */ |
172 | raw_spin_lock_irqsave(&ctx->lock, *flags); | 173 | raw_spin_lock_irqsave(&ctx->lock, *flags); |
173 | if (ctx != rcu_dereference(task->perf_event_ctxp)) { | 174 | if (ctx != rcu_dereference(task->perf_event_ctxp[ctxn])) { |
174 | raw_spin_unlock_irqrestore(&ctx->lock, *flags); | 175 | raw_spin_unlock_irqrestore(&ctx->lock, *flags); |
175 | goto retry; | 176 | goto retry; |
176 | } | 177 | } |
@@ -189,12 +190,13 @@ perf_lock_task_context(struct task_struct *task, unsigned long *flags) | |||
189 | * can't get swapped to another task. This also increments its | 190 | * can't get swapped to another task. This also increments its |
190 | * reference count so that the context can't get freed. | 191 | * reference count so that the context can't get freed. |
191 | */ | 192 | */ |
192 | static struct perf_event_context *perf_pin_task_context(struct task_struct *task) | 193 | static struct perf_event_context * |
194 | perf_pin_task_context(struct task_struct *task, int ctxn) | ||
193 | { | 195 | { |
194 | struct perf_event_context *ctx; | 196 | struct perf_event_context *ctx; |
195 | unsigned long flags; | 197 | unsigned long flags; |
196 | 198 | ||
197 | ctx = perf_lock_task_context(task, &flags); | 199 | ctx = perf_lock_task_context(task, ctxn, &flags); |
198 | if (ctx) { | 200 | if (ctx) { |
199 | ++ctx->pin_count; | 201 | ++ctx->pin_count; |
200 | raw_spin_unlock_irqrestore(&ctx->lock, flags); | 202 | raw_spin_unlock_irqrestore(&ctx->lock, flags); |
@@ -214,7 +216,7 @@ static void perf_unpin_context(struct perf_event_context *ctx) | |||
214 | 216 | ||
215 | static inline u64 perf_clock(void) | 217 | static inline u64 perf_clock(void) |
216 | { | 218 | { |
217 | return cpu_clock(raw_smp_processor_id()); | 219 | return local_clock(); |
218 | } | 220 | } |
219 | 221 | ||
220 | /* | 222 | /* |
@@ -302,6 +304,8 @@ list_add_event(struct perf_event *event, struct perf_event_context *ctx) | |||
302 | } | 304 | } |
303 | 305 | ||
304 | list_add_rcu(&event->event_entry, &ctx->event_list); | 306 | list_add_rcu(&event->event_entry, &ctx->event_list); |
307 | if (!ctx->nr_events) | ||
308 | perf_pmu_rotate_start(ctx->pmu); | ||
305 | ctx->nr_events++; | 309 | ctx->nr_events++; |
306 | if (event->attr.inherit_stat) | 310 | if (event->attr.inherit_stat) |
307 | ctx->nr_stat++; | 311 | ctx->nr_stat++; |
@@ -311,7 +315,12 @@ static void perf_group_attach(struct perf_event *event) | |||
311 | { | 315 | { |
312 | struct perf_event *group_leader = event->group_leader; | 316 | struct perf_event *group_leader = event->group_leader; |
313 | 317 | ||
314 | WARN_ON_ONCE(event->attach_state & PERF_ATTACH_GROUP); | 318 | /* |
319 | * We can have double attach due to group movement in perf_event_open. | ||
320 | */ | ||
321 | if (event->attach_state & PERF_ATTACH_GROUP) | ||
322 | return; | ||
323 | |||
315 | event->attach_state |= PERF_ATTACH_GROUP; | 324 | event->attach_state |= PERF_ATTACH_GROUP; |
316 | 325 | ||
317 | if (group_leader == event) | 326 | if (group_leader == event) |
@@ -402,11 +411,31 @@ static void perf_group_detach(struct perf_event *event) | |||
402 | } | 411 | } |
403 | } | 412 | } |
404 | 413 | ||
414 | static inline int | ||
415 | event_filter_match(struct perf_event *event) | ||
416 | { | ||
417 | return event->cpu == -1 || event->cpu == smp_processor_id(); | ||
418 | } | ||
419 | |||
405 | static void | 420 | static void |
406 | event_sched_out(struct perf_event *event, | 421 | event_sched_out(struct perf_event *event, |
407 | struct perf_cpu_context *cpuctx, | 422 | struct perf_cpu_context *cpuctx, |
408 | struct perf_event_context *ctx) | 423 | struct perf_event_context *ctx) |
409 | { | 424 | { |
425 | u64 delta; | ||
426 | /* | ||
427 | * An event which could not be activated because of | ||
428 | * filter mismatch still needs to have its timings | ||
428 | * maintained, otherwise bogus information is returned | ||
428 | * maintained, otherwise bogus information is returned | ||
430 | * via read() for time_enabled, time_running: | ||
431 | */ | ||
432 | if (event->state == PERF_EVENT_STATE_INACTIVE | ||
433 | && !event_filter_match(event)) { | ||
434 | delta = ctx->time - event->tstamp_stopped; | ||
435 | event->tstamp_running += delta; | ||
436 | event->tstamp_stopped = ctx->time; | ||
437 | } | ||
438 | |||
410 | if (event->state != PERF_EVENT_STATE_ACTIVE) | 439 | if (event->state != PERF_EVENT_STATE_ACTIVE) |
411 | return; | 440 | return; |
412 | 441 | ||
@@ -416,7 +445,7 @@ event_sched_out(struct perf_event *event, | |||
416 | event->state = PERF_EVENT_STATE_OFF; | 445 | event->state = PERF_EVENT_STATE_OFF; |
417 | } | 446 | } |
418 | event->tstamp_stopped = ctx->time; | 447 | event->tstamp_stopped = ctx->time; |
419 | event->pmu->disable(event); | 448 | event->pmu->del(event, 0); |
420 | event->oncpu = -1; | 449 | event->oncpu = -1; |
421 | 450 | ||
422 | if (!is_software_event(event)) | 451 | if (!is_software_event(event)) |
@@ -432,9 +461,7 @@ group_sched_out(struct perf_event *group_event, | |||
432 | struct perf_event_context *ctx) | 461 | struct perf_event_context *ctx) |
433 | { | 462 | { |
434 | struct perf_event *event; | 463 | struct perf_event *event; |
435 | 464 | int state = group_event->state; | |
436 | if (group_event->state != PERF_EVENT_STATE_ACTIVE) | ||
437 | return; | ||
438 | 465 | ||
439 | event_sched_out(group_event, cpuctx, ctx); | 466 | event_sched_out(group_event, cpuctx, ctx); |
440 | 467 | ||
@@ -444,10 +471,16 @@ group_sched_out(struct perf_event *group_event, | |||
444 | list_for_each_entry(event, &group_event->sibling_list, group_entry) | 471 | list_for_each_entry(event, &group_event->sibling_list, group_entry) |
445 | event_sched_out(event, cpuctx, ctx); | 472 | event_sched_out(event, cpuctx, ctx); |
446 | 473 | ||
447 | if (group_event->attr.exclusive) | 474 | if (state == PERF_EVENT_STATE_ACTIVE && group_event->attr.exclusive) |
448 | cpuctx->exclusive = 0; | 475 | cpuctx->exclusive = 0; |
449 | } | 476 | } |
450 | 477 | ||
478 | static inline struct perf_cpu_context * | ||
479 | __get_cpu_context(struct perf_event_context *ctx) | ||
480 | { | ||
481 | return this_cpu_ptr(ctx->pmu->pmu_cpu_context); | ||
482 | } | ||
483 | |||
451 | /* | 484 | /* |
452 | * Cross CPU call to remove a performance event | 485 | * Cross CPU call to remove a performance event |
453 | * | 486 | * |
@@ -456,9 +489,9 @@ group_sched_out(struct perf_event *group_event, | |||
456 | */ | 489 | */ |
457 | static void __perf_event_remove_from_context(void *info) | 490 | static void __perf_event_remove_from_context(void *info) |
458 | { | 491 | { |
459 | struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); | ||
460 | struct perf_event *event = info; | 492 | struct perf_event *event = info; |
461 | struct perf_event_context *ctx = event->ctx; | 493 | struct perf_event_context *ctx = event->ctx; |
494 | struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); | ||
462 | 495 | ||
463 | /* | 496 | /* |
464 | * If this is a task context, we need to check whether it is | 497 | * If this is a task context, we need to check whether it is |
@@ -469,27 +502,11 @@ static void __perf_event_remove_from_context(void *info) | |||
469 | return; | 502 | return; |
470 | 503 | ||
471 | raw_spin_lock(&ctx->lock); | 504 | raw_spin_lock(&ctx->lock); |
472 | /* | ||
473 | * Protect the list operation against NMI by disabling the | ||
474 | * events on a global level. | ||
475 | */ | ||
476 | perf_disable(); | ||
477 | 505 | ||
478 | event_sched_out(event, cpuctx, ctx); | 506 | event_sched_out(event, cpuctx, ctx); |
479 | 507 | ||
480 | list_del_event(event, ctx); | 508 | list_del_event(event, ctx); |
481 | 509 | ||
482 | if (!ctx->task) { | ||
483 | /* | ||
484 | * Allow more per task events with respect to the | ||
485 | * reservation: | ||
486 | */ | ||
487 | cpuctx->max_pertask = | ||
488 | min(perf_max_events - ctx->nr_events, | ||
489 | perf_max_events - perf_reserved_percpu); | ||
490 | } | ||
491 | |||
492 | perf_enable(); | ||
493 | raw_spin_unlock(&ctx->lock); | 510 | raw_spin_unlock(&ctx->lock); |
494 | } | 511 | } |
495 | 512 | ||
@@ -554,8 +571,8 @@ retry: | |||
554 | static void __perf_event_disable(void *info) | 571 | static void __perf_event_disable(void *info) |
555 | { | 572 | { |
556 | struct perf_event *event = info; | 573 | struct perf_event *event = info; |
557 | struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); | ||
558 | struct perf_event_context *ctx = event->ctx; | 574 | struct perf_event_context *ctx = event->ctx; |
575 | struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); | ||
559 | 576 | ||
560 | /* | 577 | /* |
561 | * If this is a per-task event, need to check whether this | 578 | * If this is a per-task event, need to check whether this |
@@ -610,7 +627,7 @@ void perf_event_disable(struct perf_event *event) | |||
610 | return; | 627 | return; |
611 | } | 628 | } |
612 | 629 | ||
613 | retry: | 630 | retry: |
614 | task_oncpu_function_call(task, __perf_event_disable, event); | 631 | task_oncpu_function_call(task, __perf_event_disable, event); |
615 | 632 | ||
616 | raw_spin_lock_irq(&ctx->lock); | 633 | raw_spin_lock_irq(&ctx->lock); |
@@ -649,7 +666,7 @@ event_sched_in(struct perf_event *event, | |||
649 | */ | 666 | */ |
650 | smp_wmb(); | 667 | smp_wmb(); |
651 | 668 | ||
652 | if (event->pmu->enable(event)) { | 669 | if (event->pmu->add(event, PERF_EF_START)) { |
653 | event->state = PERF_EVENT_STATE_INACTIVE; | 670 | event->state = PERF_EVENT_STATE_INACTIVE; |
654 | event->oncpu = -1; | 671 | event->oncpu = -1; |
655 | return -EAGAIN; | 672 | return -EAGAIN; |
@@ -673,23 +690,17 @@ group_sched_in(struct perf_event *group_event, | |||
673 | struct perf_event_context *ctx) | 690 | struct perf_event_context *ctx) |
674 | { | 691 | { |
675 | struct perf_event *event, *partial_group = NULL; | 692 | struct perf_event *event, *partial_group = NULL; |
676 | const struct pmu *pmu = group_event->pmu; | 693 | struct pmu *pmu = group_event->pmu; |
677 | bool txn = false; | 694 | u64 now = ctx->time; |
678 | int ret; | 695 | bool simulate = false; |
679 | 696 | ||
680 | if (group_event->state == PERF_EVENT_STATE_OFF) | 697 | if (group_event->state == PERF_EVENT_STATE_OFF) |
681 | return 0; | 698 | return 0; |
682 | 699 | ||
683 | /* Check if group transaction availabe */ | 700 | pmu->start_txn(pmu); |
684 | if (pmu->start_txn) | ||
685 | txn = true; | ||
686 | |||
687 | if (txn) | ||
688 | pmu->start_txn(pmu); | ||
689 | 701 | ||
690 | if (event_sched_in(group_event, cpuctx, ctx)) { | 702 | if (event_sched_in(group_event, cpuctx, ctx)) { |
691 | if (txn) | 703 | pmu->cancel_txn(pmu); |
692 | pmu->cancel_txn(pmu); | ||
693 | return -EAGAIN; | 704 | return -EAGAIN; |
694 | } | 705 | } |
695 | 706 | ||
@@ -703,29 +714,38 @@ group_sched_in(struct perf_event *group_event, | |||
703 | } | 714 | } |
704 | } | 715 | } |
705 | 716 | ||
706 | if (!txn) | 717 | if (!pmu->commit_txn(pmu)) |
707 | return 0; | 718 | return 0; |
708 | 719 | ||
709 | ret = pmu->commit_txn(pmu); | ||
710 | if (!ret) { | ||
711 | pmu->cancel_txn(pmu); | ||
712 | return 0; | ||
713 | } | ||
714 | |||
715 | group_error: | 720 | group_error: |
716 | /* | 721 | /* |
717 | * Groups can be scheduled in as one unit only, so undo any | 722 | * Groups can be scheduled in as one unit only, so undo any |
718 | * partial group before returning: | 723 | * partial group before returning: |
724 | * The events up to the failed event are scheduled out normally, | ||
725 | * tstamp_stopped will be updated. | ||
726 | * | ||
727 | * The failed events and the remaining siblings need to have | ||
728 | * their timings updated as if they had gone thru event_sched_in() | ||
729 | * and event_sched_out(). This is required to get consistent timings | ||
730 | * across the group. This also takes care of the case where the group | ||
731 | * could never be scheduled by ensuring tstamp_stopped is set to mark | ||
732 | * the time the event was actually stopped, such that time delta | ||
733 | * calculation in update_event_times() is correct. | ||
719 | */ | 734 | */ |
720 | list_for_each_entry(event, &group_event->sibling_list, group_entry) { | 735 | list_for_each_entry(event, &group_event->sibling_list, group_entry) { |
721 | if (event == partial_group) | 736 | if (event == partial_group) |
722 | break; | 737 | simulate = true; |
723 | event_sched_out(event, cpuctx, ctx); | 738 | |
739 | if (simulate) { | ||
740 | event->tstamp_running += now - event->tstamp_stopped; | ||
741 | event->tstamp_stopped = now; | ||
742 | } else { | ||
743 | event_sched_out(event, cpuctx, ctx); | ||
744 | } | ||
724 | } | 745 | } |
725 | event_sched_out(group_event, cpuctx, ctx); | 746 | event_sched_out(group_event, cpuctx, ctx); |
726 | 747 | ||
727 | if (txn) | 748 | pmu->cancel_txn(pmu); |
728 | pmu->cancel_txn(pmu); | ||
729 | 749 | ||
730 | return -EAGAIN; | 750 | return -EAGAIN; |
731 | } | 751 | } |
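With start_txn()/commit_txn()/cancel_txn() now unconditional pmu methods, group scheduling follows one transaction: open it, add every group member, then commit; any failure cancels and rolls the whole group back. A standalone C model of that control flow, illustrative only:

#include <stdio.h>

static void start_txn(void)  { printf("txn open\n"); }
static void cancel_txn(void) { printf("txn cancelled, group rolled back\n"); }
static int  commit_txn(void) { printf("txn committed\n"); return 0; /* 0 == ok */ }

/* pretend the PMU runs out of counters after free_counters adds */
static int event_add(int idx, int free_counters)
{
    return idx < free_counters ? 0 : -1;
}

static int group_sched_in(int nr_events, int free_counters)
{
    int i;

    start_txn();
    for (i = 0; i < nr_events; i++) {
        if (event_add(i, free_counters)) {
            cancel_txn();
            return -1;
        }
    }
    if (!commit_txn())
        return 0;
    cancel_txn();
    return -1;
}

int main(void)
{
    printf("3 events, 4 counters: %d\n", group_sched_in(3, 4)); /* succeeds */
    printf("3 events, 2 counters: %d\n", group_sched_in(3, 2)); /* rolls back */
    return 0;
}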
@@ -778,10 +798,10 @@ static void add_event_to_ctx(struct perf_event *event, | |||
778 | */ | 798 | */ |
779 | static void __perf_install_in_context(void *info) | 799 | static void __perf_install_in_context(void *info) |
780 | { | 800 | { |
781 | struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); | ||
782 | struct perf_event *event = info; | 801 | struct perf_event *event = info; |
783 | struct perf_event_context *ctx = event->ctx; | 802 | struct perf_event_context *ctx = event->ctx; |
784 | struct perf_event *leader = event->group_leader; | 803 | struct perf_event *leader = event->group_leader; |
804 | struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); | ||
785 | int err; | 805 | int err; |
786 | 806 | ||
787 | /* | 807 | /* |
@@ -801,12 +821,6 @@ static void __perf_install_in_context(void *info) | |||
801 | ctx->is_active = 1; | 821 | ctx->is_active = 1; |
802 | update_context_time(ctx); | 822 | update_context_time(ctx); |
803 | 823 | ||
804 | /* | ||
805 | * Protect the list operation against NMI by disabling the | ||
806 | * events on a global level. NOP for non NMI based events. | ||
807 | */ | ||
808 | perf_disable(); | ||
809 | |||
810 | add_event_to_ctx(event, ctx); | 824 | add_event_to_ctx(event, ctx); |
811 | 825 | ||
812 | if (event->cpu != -1 && event->cpu != smp_processor_id()) | 826 | if (event->cpu != -1 && event->cpu != smp_processor_id()) |
@@ -844,12 +858,7 @@ static void __perf_install_in_context(void *info) | |||
844 | } | 858 | } |
845 | } | 859 | } |
846 | 860 | ||
847 | if (!err && !ctx->task && cpuctx->max_pertask) | 861 | unlock: |
848 | cpuctx->max_pertask--; | ||
849 | |||
850 | unlock: | ||
851 | perf_enable(); | ||
852 | |||
853 | raw_spin_unlock(&ctx->lock); | 862 | raw_spin_unlock(&ctx->lock); |
854 | } | 863 | } |
855 | 864 | ||
@@ -872,6 +881,8 @@ perf_install_in_context(struct perf_event_context *ctx, | |||
872 | { | 881 | { |
873 | struct task_struct *task = ctx->task; | 882 | struct task_struct *task = ctx->task; |
874 | 883 | ||
884 | event->ctx = ctx; | ||
885 | |||
875 | if (!task) { | 886 | if (!task) { |
876 | /* | 887 | /* |
877 | * Per cpu events are installed via an smp call and | 888 | * Per cpu events are installed via an smp call and |
@@ -920,10 +931,12 @@ static void __perf_event_mark_enabled(struct perf_event *event, | |||
920 | 931 | ||
921 | event->state = PERF_EVENT_STATE_INACTIVE; | 932 | event->state = PERF_EVENT_STATE_INACTIVE; |
922 | event->tstamp_enabled = ctx->time - event->total_time_enabled; | 933 | event->tstamp_enabled = ctx->time - event->total_time_enabled; |
923 | list_for_each_entry(sub, &event->sibling_list, group_entry) | 934 | list_for_each_entry(sub, &event->sibling_list, group_entry) { |
924 | if (sub->state >= PERF_EVENT_STATE_INACTIVE) | 935 | if (sub->state >= PERF_EVENT_STATE_INACTIVE) { |
925 | sub->tstamp_enabled = | 936 | sub->tstamp_enabled = |
926 | ctx->time - sub->total_time_enabled; | 937 | ctx->time - sub->total_time_enabled; |
938 | } | ||
939 | } | ||
927 | } | 940 | } |
928 | 941 | ||
929 | /* | 942 | /* |
@@ -932,9 +945,9 @@ static void __perf_event_mark_enabled(struct perf_event *event, | |||
932 | static void __perf_event_enable(void *info) | 945 | static void __perf_event_enable(void *info) |
933 | { | 946 | { |
934 | struct perf_event *event = info; | 947 | struct perf_event *event = info; |
935 | struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); | ||
936 | struct perf_event_context *ctx = event->ctx; | 948 | struct perf_event_context *ctx = event->ctx; |
937 | struct perf_event *leader = event->group_leader; | 949 | struct perf_event *leader = event->group_leader; |
950 | struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); | ||
938 | int err; | 951 | int err; |
939 | 952 | ||
940 | /* | 953 | /* |
@@ -968,12 +981,10 @@ static void __perf_event_enable(void *info) | |||
968 | if (!group_can_go_on(event, cpuctx, 1)) { | 981 | if (!group_can_go_on(event, cpuctx, 1)) { |
969 | err = -EEXIST; | 982 | err = -EEXIST; |
970 | } else { | 983 | } else { |
971 | perf_disable(); | ||
972 | if (event == leader) | 984 | if (event == leader) |
973 | err = group_sched_in(event, cpuctx, ctx); | 985 | err = group_sched_in(event, cpuctx, ctx); |
974 | else | 986 | else |
975 | err = event_sched_in(event, cpuctx, ctx); | 987 | err = event_sched_in(event, cpuctx, ctx); |
976 | perf_enable(); | ||
977 | } | 988 | } |
978 | 989 | ||
979 | if (err) { | 990 | if (err) { |
@@ -989,7 +1000,7 @@ static void __perf_event_enable(void *info) | |||
989 | } | 1000 | } |
990 | } | 1001 | } |
991 | 1002 | ||
992 | unlock: | 1003 | unlock: |
993 | raw_spin_unlock(&ctx->lock); | 1004 | raw_spin_unlock(&ctx->lock); |
994 | } | 1005 | } |
995 | 1006 | ||
@@ -1030,7 +1041,7 @@ void perf_event_enable(struct perf_event *event) | |||
1030 | if (event->state == PERF_EVENT_STATE_ERROR) | 1041 | if (event->state == PERF_EVENT_STATE_ERROR) |
1031 | event->state = PERF_EVENT_STATE_OFF; | 1042 | event->state = PERF_EVENT_STATE_OFF; |
1032 | 1043 | ||
1033 | retry: | 1044 | retry: |
1034 | raw_spin_unlock_irq(&ctx->lock); | 1045 | raw_spin_unlock_irq(&ctx->lock); |
1035 | task_oncpu_function_call(task, __perf_event_enable, event); | 1046 | task_oncpu_function_call(task, __perf_event_enable, event); |
1036 | 1047 | ||
@@ -1050,7 +1061,7 @@ void perf_event_enable(struct perf_event *event) | |||
1050 | if (event->state == PERF_EVENT_STATE_OFF) | 1061 | if (event->state == PERF_EVENT_STATE_OFF) |
1051 | __perf_event_mark_enabled(event, ctx); | 1062 | __perf_event_mark_enabled(event, ctx); |
1052 | 1063 | ||
1053 | out: | 1064 | out: |
1054 | raw_spin_unlock_irq(&ctx->lock); | 1065 | raw_spin_unlock_irq(&ctx->lock); |
1055 | } | 1066 | } |
1056 | 1067 | ||
@@ -1081,26 +1092,26 @@ static void ctx_sched_out(struct perf_event_context *ctx, | |||
1081 | struct perf_event *event; | 1092 | struct perf_event *event; |
1082 | 1093 | ||
1083 | raw_spin_lock(&ctx->lock); | 1094 | raw_spin_lock(&ctx->lock); |
1095 | perf_pmu_disable(ctx->pmu); | ||
1084 | ctx->is_active = 0; | 1096 | ctx->is_active = 0; |
1085 | if (likely(!ctx->nr_events)) | 1097 | if (likely(!ctx->nr_events)) |
1086 | goto out; | 1098 | goto out; |
1087 | update_context_time(ctx); | 1099 | update_context_time(ctx); |
1088 | 1100 | ||
1089 | perf_disable(); | ||
1090 | if (!ctx->nr_active) | 1101 | if (!ctx->nr_active) |
1091 | goto out_enable; | 1102 | goto out; |
1092 | 1103 | ||
1093 | if (event_type & EVENT_PINNED) | 1104 | if (event_type & EVENT_PINNED) { |
1094 | list_for_each_entry(event, &ctx->pinned_groups, group_entry) | 1105 | list_for_each_entry(event, &ctx->pinned_groups, group_entry) |
1095 | group_sched_out(event, cpuctx, ctx); | 1106 | group_sched_out(event, cpuctx, ctx); |
1107 | } | ||
1096 | 1108 | ||
1097 | if (event_type & EVENT_FLEXIBLE) | 1109 | if (event_type & EVENT_FLEXIBLE) { |
1098 | list_for_each_entry(event, &ctx->flexible_groups, group_entry) | 1110 | list_for_each_entry(event, &ctx->flexible_groups, group_entry) |
1099 | group_sched_out(event, cpuctx, ctx); | 1111 | group_sched_out(event, cpuctx, ctx); |
1100 | 1112 | } | |
1101 | out_enable: | 1113 | out: |
1102 | perf_enable(); | 1114 | perf_pmu_enable(ctx->pmu); |
1103 | out: | ||
1104 | raw_spin_unlock(&ctx->lock); | 1115 | raw_spin_unlock(&ctx->lock); |
1105 | } | 1116 | } |
1106 | 1117 | ||
@@ -1155,9 +1166,9 @@ static void __perf_event_sync_stat(struct perf_event *event, | |||
1155 | * In order to keep per-task stats reliable we need to flip the event | 1166 | * In order to keep per-task stats reliable we need to flip the event |
1156 | * values when we flip the contexts. | 1167 | * values when we flip the contexts. |
1157 | */ | 1168 | */ |
1158 | value = atomic64_read(&next_event->count); | 1169 | value = local64_read(&next_event->count); |
1159 | value = atomic64_xchg(&event->count, value); | 1170 | value = local64_xchg(&event->count, value); |
1160 | atomic64_set(&next_event->count, value); | 1171 | local64_set(&next_event->count, value); |
1161 | 1172 | ||
1162 | swap(event->total_time_enabled, next_event->total_time_enabled); | 1173 | swap(event->total_time_enabled, next_event->total_time_enabled); |
1163 | swap(event->total_time_running, next_event->total_time_running); | 1174 | swap(event->total_time_running, next_event->total_time_running); |
@@ -1198,34 +1209,25 @@ static void perf_event_sync_stat(struct perf_event_context *ctx, | |||
1198 | } | 1209 | } |
1199 | } | 1210 | } |
1200 | 1211 | ||
1201 | /* | 1212 | void perf_event_context_sched_out(struct task_struct *task, int ctxn, |
1202 | * Called from scheduler to remove the events of the current task, | 1213 | struct task_struct *next) |
1203 | * with interrupts disabled. | ||
1204 | * | ||
1205 | * We stop each event and update the event value in event->count. | ||
1206 | * | ||
1207 | * This does not protect us against NMI, but disable() | ||
1208 | * sets the disabled bit in the control field of event _before_ | ||
1209 | * accessing the event control register. If a NMI hits, then it will | ||
1210 | * not restart the event. | ||
1211 | */ | ||
1212 | void perf_event_task_sched_out(struct task_struct *task, | ||
1213 | struct task_struct *next) | ||
1214 | { | 1214 | { |
1215 | struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); | 1215 | struct perf_event_context *ctx = task->perf_event_ctxp[ctxn]; |
1216 | struct perf_event_context *ctx = task->perf_event_ctxp; | ||
1217 | struct perf_event_context *next_ctx; | 1216 | struct perf_event_context *next_ctx; |
1218 | struct perf_event_context *parent; | 1217 | struct perf_event_context *parent; |
1218 | struct perf_cpu_context *cpuctx; | ||
1219 | int do_switch = 1; | 1219 | int do_switch = 1; |
1220 | 1220 | ||
1221 | perf_sw_event(PERF_COUNT_SW_CONTEXT_SWITCHES, 1, 1, NULL, 0); | 1221 | if (likely(!ctx)) |
1222 | return; | ||
1222 | 1223 | ||
1223 | if (likely(!ctx || !cpuctx->task_ctx)) | 1224 | cpuctx = __get_cpu_context(ctx); |
1225 | if (!cpuctx->task_ctx) | ||
1224 | return; | 1226 | return; |
1225 | 1227 | ||
1226 | rcu_read_lock(); | 1228 | rcu_read_lock(); |
1227 | parent = rcu_dereference(ctx->parent_ctx); | 1229 | parent = rcu_dereference(ctx->parent_ctx); |
1228 | next_ctx = next->perf_event_ctxp; | 1230 | next_ctx = next->perf_event_ctxp[ctxn]; |
1229 | if (parent && next_ctx && | 1231 | if (parent && next_ctx && |
1230 | rcu_dereference(next_ctx->parent_ctx) == parent) { | 1232 | rcu_dereference(next_ctx->parent_ctx) == parent) { |
1231 | /* | 1233 | /* |
@@ -1244,8 +1246,8 @@ void perf_event_task_sched_out(struct task_struct *task, | |||
1244 | * XXX do we need a memory barrier of sorts | 1246 | * XXX do we need a memory barrier of sorts |
1245 | * wrt to rcu_dereference() of perf_event_ctxp | 1247 | * wrt to rcu_dereference() of perf_event_ctxp |
1246 | */ | 1248 | */ |
1247 | task->perf_event_ctxp = next_ctx; | 1249 | task->perf_event_ctxp[ctxn] = next_ctx; |
1248 | next->perf_event_ctxp = ctx; | 1250 | next->perf_event_ctxp[ctxn] = ctx; |
1249 | ctx->task = next; | 1251 | ctx->task = next; |
1250 | next_ctx->task = task; | 1252 | next_ctx->task = task; |
1251 | do_switch = 0; | 1253 | do_switch = 0; |
@@ -1263,10 +1265,35 @@ void perf_event_task_sched_out(struct task_struct *task, | |||
1263 | } | 1265 | } |
1264 | } | 1266 | } |
1265 | 1267 | ||
1268 | #define for_each_task_context_nr(ctxn) \ | ||
1269 | for ((ctxn) = 0; (ctxn) < perf_nr_task_contexts; (ctxn)++) | ||
1270 | |||
1271 | /* | ||
1272 | * Called from scheduler to remove the events of the current task, | ||
1273 | * with interrupts disabled. | ||
1274 | * | ||
1275 | * We stop each event and update the event value in event->count. | ||
1276 | * | ||
1277 | * This does not protect us against NMI, but disable() | ||
1278 | * sets the disabled bit in the control field of event _before_ | ||
1279 | * accessing the event control register. If a NMI hits, then it will | ||
1280 | * not restart the event. | ||
1281 | */ | ||
1282 | void __perf_event_task_sched_out(struct task_struct *task, | ||
1283 | struct task_struct *next) | ||
1284 | { | ||
1285 | int ctxn; | ||
1286 | |||
1287 | perf_sw_event(PERF_COUNT_SW_CONTEXT_SWITCHES, 1, 1, NULL, 0); | ||
1288 | |||
1289 | for_each_task_context_nr(ctxn) | ||
1290 | perf_event_context_sched_out(task, ctxn, next); | ||
1291 | } | ||
1292 | |||
1266 | static void task_ctx_sched_out(struct perf_event_context *ctx, | 1293 | static void task_ctx_sched_out(struct perf_event_context *ctx, |
1267 | enum event_type_t event_type) | 1294 | enum event_type_t event_type) |
1268 | { | 1295 | { |
1269 | struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); | 1296 | struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); |
1270 | 1297 | ||
1271 | if (!cpuctx->task_ctx) | 1298 | if (!cpuctx->task_ctx) |
1272 | return; | 1299 | return; |
@@ -1281,14 +1308,6 @@ static void task_ctx_sched_out(struct perf_event_context *ctx, | |||
1281 | /* | 1308 | /* |
1282 | * Called with IRQs disabled | 1309 | * Called with IRQs disabled |
1283 | */ | 1310 | */ |
1284 | static void __perf_event_task_sched_out(struct perf_event_context *ctx) | ||
1285 | { | ||
1286 | task_ctx_sched_out(ctx, EVENT_ALL); | ||
1287 | } | ||
1288 | |||
1289 | /* | ||
1290 | * Called with IRQs disabled | ||
1291 | */ | ||
1292 | static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx, | 1311 | static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx, |
1293 | enum event_type_t event_type) | 1312 | enum event_type_t event_type) |
1294 | { | 1313 | { |
@@ -1339,9 +1358,10 @@ ctx_flexible_sched_in(struct perf_event_context *ctx, | |||
1339 | if (event->cpu != -1 && event->cpu != smp_processor_id()) | 1358 | if (event->cpu != -1 && event->cpu != smp_processor_id()) |
1340 | continue; | 1359 | continue; |
1341 | 1360 | ||
1342 | if (group_can_go_on(event, cpuctx, can_add_hw)) | 1361 | if (group_can_go_on(event, cpuctx, can_add_hw)) { |
1343 | if (group_sched_in(event, cpuctx, ctx)) | 1362 | if (group_sched_in(event, cpuctx, ctx)) |
1344 | can_add_hw = 0; | 1363 | can_add_hw = 0; |
1364 | } | ||
1345 | } | 1365 | } |
1346 | } | 1366 | } |
1347 | 1367 | ||
@@ -1357,8 +1377,6 @@ ctx_sched_in(struct perf_event_context *ctx, | |||
1357 | 1377 | ||
1358 | ctx->timestamp = perf_clock(); | 1378 | ctx->timestamp = perf_clock(); |
1359 | 1379 | ||
1360 | perf_disable(); | ||
1361 | |||
1362 | /* | 1380 | /* |
1363 | * First go through the list and put on any pinned groups | 1381 | * First go through the list and put on any pinned groups |
1364 | * in order to give them the best chance of going on. | 1382 | * in order to give them the best chance of going on. |
@@ -1370,8 +1388,7 @@ ctx_sched_in(struct perf_event_context *ctx, | |||
1370 | if (event_type & EVENT_FLEXIBLE) | 1388 | if (event_type & EVENT_FLEXIBLE) |
1371 | ctx_flexible_sched_in(ctx, cpuctx); | 1389 | ctx_flexible_sched_in(ctx, cpuctx); |
1372 | 1390 | ||
1373 | perf_enable(); | 1391 | out: |
1374 | out: | ||
1375 | raw_spin_unlock(&ctx->lock); | 1392 | raw_spin_unlock(&ctx->lock); |
1376 | } | 1393 | } |
1377 | 1394 | ||
@@ -1383,43 +1400,28 @@ static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx, | |||
1383 | ctx_sched_in(ctx, cpuctx, event_type); | 1400 | ctx_sched_in(ctx, cpuctx, event_type); |
1384 | } | 1401 | } |
1385 | 1402 | ||
1386 | static void task_ctx_sched_in(struct task_struct *task, | 1403 | static void task_ctx_sched_in(struct perf_event_context *ctx, |
1387 | enum event_type_t event_type) | 1404 | enum event_type_t event_type) |
1388 | { | 1405 | { |
1389 | struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); | 1406 | struct perf_cpu_context *cpuctx; |
1390 | struct perf_event_context *ctx = task->perf_event_ctxp; | ||
1391 | 1407 | ||
1392 | if (likely(!ctx)) | 1408 | cpuctx = __get_cpu_context(ctx); |
1393 | return; | ||
1394 | if (cpuctx->task_ctx == ctx) | 1409 | if (cpuctx->task_ctx == ctx) |
1395 | return; | 1410 | return; |
1411 | |||
1396 | ctx_sched_in(ctx, cpuctx, event_type); | 1412 | ctx_sched_in(ctx, cpuctx, event_type); |
1397 | cpuctx->task_ctx = ctx; | 1413 | cpuctx->task_ctx = ctx; |
1398 | } | 1414 | } |
1399 | /* | ||
1400 | * Called from scheduler to add the events of the current task | ||
1401 | * with interrupts disabled. | ||
1402 | * | ||
1403 | * We restore the event value and then enable it. | ||
1404 | * | ||
1405 | * This does not protect us against NMI, but enable() | ||
1406 | * sets the enabled bit in the control field of event _before_ | ||
1407 | * accessing the event control register. If a NMI hits, then it will | ||
1408 | * keep the event running. | ||
1409 | */ | ||
1410 | void perf_event_task_sched_in(struct task_struct *task) | ||
1411 | { | ||
1412 | struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); | ||
1413 | struct perf_event_context *ctx = task->perf_event_ctxp; | ||
1414 | 1415 | ||
1415 | if (likely(!ctx)) | 1416 | void perf_event_context_sched_in(struct perf_event_context *ctx) |
1416 | return; | 1417 | { |
1418 | struct perf_cpu_context *cpuctx; | ||
1417 | 1419 | ||
1420 | cpuctx = __get_cpu_context(ctx); | ||
1418 | if (cpuctx->task_ctx == ctx) | 1421 | if (cpuctx->task_ctx == ctx) |
1419 | return; | 1422 | return; |
1420 | 1423 | ||
1421 | perf_disable(); | 1424 | perf_pmu_disable(ctx->pmu); |
1422 | |||
1423 | /* | 1425 | /* |
1424 | * We want to keep the following priority order: | 1426 | * We want to keep the following priority order: |
1425 | * cpu pinned (that don't need to move), task pinned, | 1427 | * cpu pinned (that don't need to move), task pinned, |
@@ -1433,7 +1435,37 @@ void perf_event_task_sched_in(struct task_struct *task) | |||
1433 | 1435 | ||
1434 | cpuctx->task_ctx = ctx; | 1436 | cpuctx->task_ctx = ctx; |
1435 | 1437 | ||
1436 | perf_enable(); | 1438 | /* |
1439 | * Since these rotations are per-cpu, we need to ensure the | ||
1440 | * cpu-context we got scheduled on is actually rotating. | ||
1441 | */ | ||
1442 | perf_pmu_rotate_start(ctx->pmu); | ||
1443 | perf_pmu_enable(ctx->pmu); | ||
1444 | } | ||
1445 | |||
1446 | /* | ||
1447 | * Called from scheduler to add the events of the current task | ||
1448 | * with interrupts disabled. | ||
1449 | * | ||
1450 | * We restore the event value and then enable it. | ||
1451 | * | ||
1452 | * This does not protect us against NMI, but enable() | ||
1453 | * sets the enabled bit in the control field of event _before_ | ||
1454 | * accessing the event control register. If a NMI hits, then it will | ||
1455 | * keep the event running. | ||
1456 | */ | ||
1457 | void __perf_event_task_sched_in(struct task_struct *task) | ||
1458 | { | ||
1459 | struct perf_event_context *ctx; | ||
1460 | int ctxn; | ||
1461 | |||
1462 | for_each_task_context_nr(ctxn) { | ||
1463 | ctx = task->perf_event_ctxp[ctxn]; | ||
1464 | if (likely(!ctx)) | ||
1465 | continue; | ||
1466 | |||
1467 | perf_event_context_sched_in(ctx); | ||
1468 | } | ||
1437 | } | 1469 | } |
1438 | 1470 | ||
1439 | #define MAX_INTERRUPTS (~0ULL) | 1471 | #define MAX_INTERRUPTS (~0ULL) |
@@ -1513,22 +1545,6 @@ do { \ | |||
1513 | return div64_u64(dividend, divisor); | 1545 | return div64_u64(dividend, divisor); |
1514 | } | 1546 | } |
1515 | 1547 | ||
1516 | static void perf_event_stop(struct perf_event *event) | ||
1517 | { | ||
1518 | if (!event->pmu->stop) | ||
1519 | return event->pmu->disable(event); | ||
1520 | |||
1521 | return event->pmu->stop(event); | ||
1522 | } | ||
1523 | |||
1524 | static int perf_event_start(struct perf_event *event) | ||
1525 | { | ||
1526 | if (!event->pmu->start) | ||
1527 | return event->pmu->enable(event); | ||
1528 | |||
1529 | return event->pmu->start(event); | ||
1530 | } | ||
1531 | |||
1532 | static void perf_adjust_period(struct perf_event *event, u64 nsec, u64 count) | 1548 | static void perf_adjust_period(struct perf_event *event, u64 nsec, u64 count) |
1533 | { | 1549 | { |
1534 | struct hw_perf_event *hwc = &event->hw; | 1550 | struct hw_perf_event *hwc = &event->hw; |
@@ -1547,16 +1563,14 @@ static void perf_adjust_period(struct perf_event *event, u64 nsec, u64 count) | |||
1547 | 1563 | ||
1548 | hwc->sample_period = sample_period; | 1564 | hwc->sample_period = sample_period; |
1549 | 1565 | ||
1550 | if (atomic64_read(&hwc->period_left) > 8*sample_period) { | 1566 | if (local64_read(&hwc->period_left) > 8*sample_period) { |
1551 | perf_disable(); | 1567 | event->pmu->stop(event, PERF_EF_UPDATE); |
1552 | perf_event_stop(event); | 1568 | local64_set(&hwc->period_left, 0); |
1553 | atomic64_set(&hwc->period_left, 0); | 1569 | event->pmu->start(event, PERF_EF_RELOAD); |
1554 | perf_event_start(event); | ||
1555 | perf_enable(); | ||
1556 | } | 1570 | } |
1557 | } | 1571 | } |
1558 | 1572 | ||
1559 | static void perf_ctx_adjust_freq(struct perf_event_context *ctx) | 1573 | static void perf_ctx_adjust_freq(struct perf_event_context *ctx, u64 period) |
1560 | { | 1574 | { |
1561 | struct perf_event *event; | 1575 | struct perf_event *event; |
1562 | struct hw_perf_event *hwc; | 1576 | struct hw_perf_event *hwc; |
@@ -1581,23 +1595,19 @@ static void perf_ctx_adjust_freq(struct perf_event_context *ctx) | |||
1581 | */ | 1595 | */ |
1582 | if (interrupts == MAX_INTERRUPTS) { | 1596 | if (interrupts == MAX_INTERRUPTS) { |
1583 | perf_log_throttle(event, 1); | 1597 | perf_log_throttle(event, 1); |
1584 | perf_disable(); | 1598 | event->pmu->start(event, 0); |
1585 | event->pmu->unthrottle(event); | ||
1586 | perf_enable(); | ||
1587 | } | 1599 | } |
1588 | 1600 | ||
1589 | if (!event->attr.freq || !event->attr.sample_freq) | 1601 | if (!event->attr.freq || !event->attr.sample_freq) |
1590 | continue; | 1602 | continue; |
1591 | 1603 | ||
1592 | perf_disable(); | ||
1593 | event->pmu->read(event); | 1604 | event->pmu->read(event); |
1594 | now = atomic64_read(&event->count); | 1605 | now = local64_read(&event->count); |
1595 | delta = now - hwc->freq_count_stamp; | 1606 | delta = now - hwc->freq_count_stamp; |
1596 | hwc->freq_count_stamp = now; | 1607 | hwc->freq_count_stamp = now; |
1597 | 1608 | ||
1598 | if (delta > 0) | 1609 | if (delta > 0) |
1599 | perf_adjust_period(event, TICK_NSEC, delta); | 1610 | perf_adjust_period(event, period, delta); |
1600 | perf_enable(); | ||
1601 | } | 1611 | } |
1602 | raw_spin_unlock(&ctx->lock); | 1612 | raw_spin_unlock(&ctx->lock); |
1603 | } | 1613 | } |
@@ -1615,32 +1625,38 @@ static void rotate_ctx(struct perf_event_context *ctx) | |||
1615 | raw_spin_unlock(&ctx->lock); | 1625 | raw_spin_unlock(&ctx->lock); |
1616 | } | 1626 | } |
1617 | 1627 | ||
1618 | void perf_event_task_tick(struct task_struct *curr) | 1628 | /* |
1629 | * perf_pmu_rotate_start() and perf_rotate_context() are fully serialized | ||
1630 | * because they're strictly cpu affine and rotate_start is called with IRQs | ||
1631 | * disabled, while rotate_context is called from IRQ context. | ||
1632 | */ | ||
1633 | static void perf_rotate_context(struct perf_cpu_context *cpuctx) | ||
1619 | { | 1634 | { |
1620 | struct perf_cpu_context *cpuctx; | 1635 | u64 interval = (u64)cpuctx->jiffies_interval * TICK_NSEC; |
1621 | struct perf_event_context *ctx; | 1636 | struct perf_event_context *ctx = NULL; |
1622 | int rotate = 0; | 1637 | int rotate = 0, remove = 1; |
1623 | 1638 | ||
1624 | if (!atomic_read(&nr_events)) | 1639 | if (cpuctx->ctx.nr_events) { |
1625 | return; | 1640 | remove = 0; |
1626 | 1641 | if (cpuctx->ctx.nr_events != cpuctx->ctx.nr_active) | |
1627 | cpuctx = &__get_cpu_var(perf_cpu_context); | 1642 | rotate = 1; |
1628 | if (cpuctx->ctx.nr_events && | 1643 | } |
1629 | cpuctx->ctx.nr_events != cpuctx->ctx.nr_active) | ||
1630 | rotate = 1; | ||
1631 | 1644 | ||
1632 | ctx = curr->perf_event_ctxp; | 1645 | ctx = cpuctx->task_ctx; |
1633 | if (ctx && ctx->nr_events && ctx->nr_events != ctx->nr_active) | 1646 | if (ctx && ctx->nr_events) { |
1634 | rotate = 1; | 1647 | remove = 0; |
1648 | if (ctx->nr_events != ctx->nr_active) | ||
1649 | rotate = 1; | ||
1650 | } | ||
1635 | 1651 | ||
1636 | perf_ctx_adjust_freq(&cpuctx->ctx); | 1652 | perf_pmu_disable(cpuctx->ctx.pmu); |
1653 | perf_ctx_adjust_freq(&cpuctx->ctx, interval); | ||
1637 | if (ctx) | 1654 | if (ctx) |
1638 | perf_ctx_adjust_freq(ctx); | 1655 | perf_ctx_adjust_freq(ctx, interval); |
1639 | 1656 | ||
1640 | if (!rotate) | 1657 | if (!rotate) |
1641 | return; | 1658 | goto done; |
1642 | 1659 | ||
1643 | perf_disable(); | ||
1644 | cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE); | 1660 | cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE); |
1645 | if (ctx) | 1661 | if (ctx) |
1646 | task_ctx_sched_out(ctx, EVENT_FLEXIBLE); | 1662 | task_ctx_sched_out(ctx, EVENT_FLEXIBLE); |
@@ -1651,8 +1667,27 @@ void perf_event_task_tick(struct task_struct *curr) | |||
1651 | 1667 | ||
1652 | cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE); | 1668 | cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE); |
1653 | if (ctx) | 1669 | if (ctx) |
1654 | task_ctx_sched_in(curr, EVENT_FLEXIBLE); | 1670 | task_ctx_sched_in(ctx, EVENT_FLEXIBLE); |
1655 | perf_enable(); | 1671 | |
1672 | done: | ||
1673 | if (remove) | ||
1674 | list_del_init(&cpuctx->rotation_list); | ||
1675 | |||
1676 | perf_pmu_enable(cpuctx->ctx.pmu); | ||
1677 | } | ||
1678 | |||
1679 | void perf_event_task_tick(void) | ||
1680 | { | ||
1681 | struct list_head *head = &__get_cpu_var(rotation_list); | ||
1682 | struct perf_cpu_context *cpuctx, *tmp; | ||
1683 | |||
1684 | WARN_ON(!irqs_disabled()); | ||
1685 | |||
1686 | list_for_each_entry_safe(cpuctx, tmp, head, rotation_list) { | ||
1687 | if (cpuctx->jiffies_interval == 1 || | ||
1688 | !(jiffies % cpuctx->jiffies_interval)) | ||
1689 | perf_rotate_context(cpuctx); | ||
1690 | } | ||
1656 | } | 1691 | } |
1657 | 1692 | ||
1658 | static int event_enable_on_exec(struct perf_event *event, | 1693 | static int event_enable_on_exec(struct perf_event *event, |
@@ -1674,20 +1709,18 @@ static int event_enable_on_exec(struct perf_event *event, | |||
1674 | * Enable all of a task's events that have been marked enable-on-exec. | 1709 | * Enable all of a task's events that have been marked enable-on-exec. |
1675 | * This expects task == current. | 1710 | * This expects task == current. |
1676 | */ | 1711 | */ |
1677 | static void perf_event_enable_on_exec(struct task_struct *task) | 1712 | static void perf_event_enable_on_exec(struct perf_event_context *ctx) |
1678 | { | 1713 | { |
1679 | struct perf_event_context *ctx; | ||
1680 | struct perf_event *event; | 1714 | struct perf_event *event; |
1681 | unsigned long flags; | 1715 | unsigned long flags; |
1682 | int enabled = 0; | 1716 | int enabled = 0; |
1683 | int ret; | 1717 | int ret; |
1684 | 1718 | ||
1685 | local_irq_save(flags); | 1719 | local_irq_save(flags); |
1686 | ctx = task->perf_event_ctxp; | ||
1687 | if (!ctx || !ctx->nr_events) | 1720 | if (!ctx || !ctx->nr_events) |
1688 | goto out; | 1721 | goto out; |
1689 | 1722 | ||
1690 | __perf_event_task_sched_out(ctx); | 1723 | task_ctx_sched_out(ctx, EVENT_ALL); |
1691 | 1724 | ||
1692 | raw_spin_lock(&ctx->lock); | 1725 | raw_spin_lock(&ctx->lock); |
1693 | 1726 | ||
@@ -1711,8 +1744,8 @@ static void perf_event_enable_on_exec(struct task_struct *task) | |||
1711 | 1744 | ||
1712 | raw_spin_unlock(&ctx->lock); | 1745 | raw_spin_unlock(&ctx->lock); |
1713 | 1746 | ||
1714 | perf_event_task_sched_in(task); | 1747 | perf_event_context_sched_in(ctx); |
1715 | out: | 1748 | out: |
1716 | local_irq_restore(flags); | 1749 | local_irq_restore(flags); |
1717 | } | 1750 | } |
1718 | 1751 | ||
@@ -1721,9 +1754,9 @@ static void perf_event_enable_on_exec(struct task_struct *task) | |||
1721 | */ | 1754 | */ |
1722 | static void __perf_event_read(void *info) | 1755 | static void __perf_event_read(void *info) |
1723 | { | 1756 | { |
1724 | struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); | ||
1725 | struct perf_event *event = info; | 1757 | struct perf_event *event = info; |
1726 | struct perf_event_context *ctx = event->ctx; | 1758 | struct perf_event_context *ctx = event->ctx; |
1759 | struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); | ||
1727 | 1760 | ||
1728 | /* | 1761 | /* |
1729 | * If this is a task context, we need to check whether it is | 1762 | * If this is a task context, we need to check whether it is |
@@ -1743,6 +1776,11 @@ static void __perf_event_read(void *info) | |||
1743 | event->pmu->read(event); | 1776 | event->pmu->read(event); |
1744 | } | 1777 | } |
1745 | 1778 | ||
1779 | static inline u64 perf_event_count(struct perf_event *event) | ||
1780 | { | ||
1781 | return local64_read(&event->count) + atomic64_read(&event->child_count); | ||
1782 | } | ||
1783 | |||
1746 | static u64 perf_event_read(struct perf_event *event) | 1784 | static u64 perf_event_read(struct perf_event *event) |
1747 | { | 1785 | { |
1748 | /* | 1786 | /* |
@@ -1757,20 +1795,234 @@ static u64 perf_event_read(struct perf_event *event) | |||
1757 | unsigned long flags; | 1795 | unsigned long flags; |
1758 | 1796 | ||
1759 | raw_spin_lock_irqsave(&ctx->lock, flags); | 1797 | raw_spin_lock_irqsave(&ctx->lock, flags); |
1760 | update_context_time(ctx); | 1798 | /* |
1799 | * may read while context is not active | ||
1800 | * (e.g., thread is blocked), in that case | ||
1801 | * we cannot update context time | ||
1802 | */ | ||
1803 | if (ctx->is_active) | ||
1804 | update_context_time(ctx); | ||
1761 | update_event_times(event); | 1805 | update_event_times(event); |
1762 | raw_spin_unlock_irqrestore(&ctx->lock, flags); | 1806 | raw_spin_unlock_irqrestore(&ctx->lock, flags); |
1763 | } | 1807 | } |
1764 | 1808 | ||
1765 | return atomic64_read(&event->count); | 1809 | return perf_event_count(event); |
1766 | } | 1810 | } |
1767 | 1811 | ||
1768 | /* | 1812 | /* |
1769 | * Initialize the perf_event context in a task_struct: | 1813 | * Callchain support |
1770 | */ | 1814 | */ |
1815 | |||
1816 | struct callchain_cpus_entries { | ||
1817 | struct rcu_head rcu_head; | ||
1818 | struct perf_callchain_entry *cpu_entries[0]; | ||
1819 | }; | ||
1820 | |||
1821 | static DEFINE_PER_CPU(int, callchain_recursion[PERF_NR_CONTEXTS]); | ||
1822 | static atomic_t nr_callchain_events; | ||
1823 | static DEFINE_MUTEX(callchain_mutex); | ||
1824 | struct callchain_cpus_entries *callchain_cpus_entries; | ||
1825 | |||
1826 | |||
1827 | __weak void perf_callchain_kernel(struct perf_callchain_entry *entry, | ||
1828 | struct pt_regs *regs) | ||
1829 | { | ||
1830 | } | ||
1831 | |||
1832 | __weak void perf_callchain_user(struct perf_callchain_entry *entry, | ||
1833 | struct pt_regs *regs) | ||
1834 | { | ||
1835 | } | ||
1836 | |||
1837 | static void release_callchain_buffers_rcu(struct rcu_head *head) | ||
1838 | { | ||
1839 | struct callchain_cpus_entries *entries; | ||
1840 | int cpu; | ||
1841 | |||
1842 | entries = container_of(head, struct callchain_cpus_entries, rcu_head); | ||
1843 | |||
1844 | for_each_possible_cpu(cpu) | ||
1845 | kfree(entries->cpu_entries[cpu]); | ||
1846 | |||
1847 | kfree(entries); | ||
1848 | } | ||
1849 | |||
1850 | static void release_callchain_buffers(void) | ||
1851 | { | ||
1852 | struct callchain_cpus_entries *entries; | ||
1853 | |||
1854 | entries = callchain_cpus_entries; | ||
1855 | rcu_assign_pointer(callchain_cpus_entries, NULL); | ||
1856 | call_rcu(&entries->rcu_head, release_callchain_buffers_rcu); | ||
1857 | } | ||
1858 | |||
1859 | static int alloc_callchain_buffers(void) | ||
1860 | { | ||
1861 | int cpu; | ||
1862 | int size; | ||
1863 | struct callchain_cpus_entries *entries; | ||
1864 | |||
1865 | /* | ||
1866 | * We can't use the percpu allocation API for data that can be | ||
1867 | * accessed from NMI. Use a temporary manual per cpu allocation | ||
1868 | * until that gets sorted out. | ||
1869 | */ | ||
1870 | size = sizeof(*entries) + sizeof(struct perf_callchain_entry *) * | ||
1871 | num_possible_cpus(); | ||
1872 | |||
1873 | entries = kzalloc(size, GFP_KERNEL); | ||
1874 | if (!entries) | ||
1875 | return -ENOMEM; | ||
1876 | |||
1877 | size = sizeof(struct perf_callchain_entry) * PERF_NR_CONTEXTS; | ||
1878 | |||
1879 | for_each_possible_cpu(cpu) { | ||
1880 | entries->cpu_entries[cpu] = kmalloc_node(size, GFP_KERNEL, | ||
1881 | cpu_to_node(cpu)); | ||
1882 | if (!entries->cpu_entries[cpu]) | ||
1883 | goto fail; | ||
1884 | } | ||
1885 | |||
1886 | rcu_assign_pointer(callchain_cpus_entries, entries); | ||
1887 | |||
1888 | return 0; | ||
1889 | |||
1890 | fail: | ||
1891 | for_each_possible_cpu(cpu) | ||
1892 | kfree(entries->cpu_entries[cpu]); | ||
1893 | kfree(entries); | ||
1894 | |||
1895 | return -ENOMEM; | ||
1896 | } | ||
1897 | |||
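The allocation above follows the comment's reasoning: because the entries are touched from NMI context, the patch sizes one structure with a flexible array of per-CPU pointers and fills it by hand instead of using the percpu allocator. A minimal user-space sketch of the same sizing and error-unwinding pattern (NR_CPUS, the entry layout and the names are stand-ins, not the kernel API):

#include <stdio.h>
#include <stdlib.h>

#define NR_CPUS      4   /* stand-in for num_possible_cpus() */
#define NR_CONTEXTS  4   /* task, softirq, hardirq, NMI */
#define MAX_STACK    127

struct entry {
	unsigned long nr;
	unsigned long ip[MAX_STACK];
};

struct cpu_entries {
	int nr_cpus;
	struct entry *cpu[];   /* flexible array member, one slot per CPU */
};

static struct cpu_entries *alloc_buffers(void)
{
	size_t size = sizeof(struct cpu_entries) +
		      NR_CPUS * sizeof(struct entry *);
	struct cpu_entries *e = calloc(1, size);
	int cpu;

	if (!e)
		return NULL;

	e->nr_cpus = NR_CPUS;
	for (cpu = 0; cpu < NR_CPUS; cpu++) {
		/* one block of NR_CONTEXTS scratch entries per CPU */
		e->cpu[cpu] = malloc(NR_CONTEXTS * sizeof(struct entry));
		if (!e->cpu[cpu])
			goto fail;
	}
	return e;

fail:
	while (cpu--)          /* free only what was actually allocated */
		free(e->cpu[cpu]);
	free(e);
	return NULL;
}

int main(void)
{
	struct cpu_entries *e = alloc_buffers();

	printf("allocation %s\n", e ? "succeeded" : "failed");
	return 0;
}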
1898 | static int get_callchain_buffers(void) | ||
1899 | { | ||
1900 | int err = 0; | ||
1901 | int count; | ||
1902 | |||
1903 | mutex_lock(&callchain_mutex); | ||
1904 | |||
1905 | count = atomic_inc_return(&nr_callchain_events); | ||
1906 | if (WARN_ON_ONCE(count < 1)) { | ||
1907 | err = -EINVAL; | ||
1908 | goto exit; | ||
1909 | } | ||
1910 | |||
1911 | if (count > 1) { | ||
1912 | /* If the allocation failed, give up */ | ||
1913 | if (!callchain_cpus_entries) | ||
1914 | err = -ENOMEM; | ||
1915 | goto exit; | ||
1916 | } | ||
1917 | |||
1918 | err = alloc_callchain_buffers(); | ||
1919 | if (err) | ||
1920 | release_callchain_buffers(); | ||
1921 | exit: | ||
1922 | mutex_unlock(&callchain_mutex); | ||
1923 | |||
1924 | return err; | ||
1925 | } | ||
1926 | |||
1927 | static void put_callchain_buffers(void) | ||
1928 | { | ||
1929 | if (atomic_dec_and_mutex_lock(&nr_callchain_events, &callchain_mutex)) { | ||
1930 | release_callchain_buffers(); | ||
1931 | mutex_unlock(&callchain_mutex); | ||
1932 | } | ||
1933 | } | ||
1934 | |||
1935 | static int get_recursion_context(int *recursion) | ||
1936 | { | ||
1937 | int rctx; | ||
1938 | |||
1939 | if (in_nmi()) | ||
1940 | rctx = 3; | ||
1941 | else if (in_irq()) | ||
1942 | rctx = 2; | ||
1943 | else if (in_softirq()) | ||
1944 | rctx = 1; | ||
1945 | else | ||
1946 | rctx = 0; | ||
1947 | |||
1948 | if (recursion[rctx]) | ||
1949 | return -1; | ||
1950 | |||
1951 | recursion[rctx]++; | ||
1952 | barrier(); | ||
1953 | |||
1954 | return rctx; | ||
1955 | } | ||
1956 | |||
1957 | static inline void put_recursion_context(int *recursion, int rctx) | ||
1958 | { | ||
1959 | barrier(); | ||
1960 | recursion[rctx]--; | ||
1961 | } | ||
1962 | |||
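The recursion counters above give each execution context on a CPU (task, softirq, hardirq, NMI) its own slot, so a callchain capture that interrupts another capture simply backs off instead of reusing the same scratch entry. A stand-alone sketch of that guard, with the context index passed in by the caller rather than derived from in_nmi()/in_irq()/in_softirq():

#include <stdio.h>

#define NR_CONTEXTS 4   /* 0 = task, 1 = softirq, 2 = hardirq, 3 = NMI */

static int recursion[NR_CONTEXTS];

/* Claim the slot for this context; -1 means it is already in use. */
static int get_recursion_context(int rctx)
{
	if (recursion[rctx])
		return -1;
	recursion[rctx]++;
	return rctx;
}

static void put_recursion_context(int rctx)
{
	recursion[rctx]--;
}

int main(void)
{
	int outer  = get_recursion_context(0);  /* task level: granted  */
	int nested = get_recursion_context(0);  /* same level: refused  */
	int nmi    = get_recursion_context(3);  /* NMI level: granted   */

	printf("outer=%d nested=%d nmi=%d\n", outer, nested, nmi);

	put_recursion_context(nmi);
	put_recursion_context(outer);
	return 0;
}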
1963 | static struct perf_callchain_entry *get_callchain_entry(int *rctx) | ||
1964 | { | ||
1965 | int cpu; | ||
1966 | struct callchain_cpus_entries *entries; | ||
1967 | |||
1968 | *rctx = get_recursion_context(__get_cpu_var(callchain_recursion)); | ||
1969 | if (*rctx == -1) | ||
1970 | return NULL; | ||
1971 | |||
1972 | entries = rcu_dereference(callchain_cpus_entries); | ||
1973 | if (!entries) | ||
1974 | return NULL; | ||
1975 | |||
1976 | cpu = smp_processor_id(); | ||
1977 | |||
1978 | return &entries->cpu_entries[cpu][*rctx]; | ||
1979 | } | ||
1980 | |||
1771 | static void | 1981 | static void |
1772 | __perf_event_init_context(struct perf_event_context *ctx, | 1982 | put_callchain_entry(int rctx) |
1773 | struct task_struct *task) | 1983 | { |
1984 | put_recursion_context(__get_cpu_var(callchain_recursion), rctx); | ||
1985 | } | ||
1986 | |||
1987 | static struct perf_callchain_entry *perf_callchain(struct pt_regs *regs) | ||
1988 | { | ||
1989 | int rctx; | ||
1990 | struct perf_callchain_entry *entry; | ||
1991 | |||
1992 | |||
1993 | entry = get_callchain_entry(&rctx); | ||
1994 | if (rctx == -1) | ||
1995 | return NULL; | ||
1996 | |||
1997 | if (!entry) | ||
1998 | goto exit_put; | ||
1999 | |||
2000 | entry->nr = 0; | ||
2001 | |||
2002 | if (!user_mode(regs)) { | ||
2003 | perf_callchain_store(entry, PERF_CONTEXT_KERNEL); | ||
2004 | perf_callchain_kernel(entry, regs); | ||
2005 | if (current->mm) | ||
2006 | regs = task_pt_regs(current); | ||
2007 | else | ||
2008 | regs = NULL; | ||
2009 | } | ||
2010 | |||
2011 | if (regs) { | ||
2012 | perf_callchain_store(entry, PERF_CONTEXT_USER); | ||
2013 | perf_callchain_user(entry, regs); | ||
2014 | } | ||
2015 | |||
2016 | exit_put: | ||
2017 | put_callchain_entry(rctx); | ||
2018 | |||
2019 | return entry; | ||
2020 | } | ||
2021 | |||
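Callchains built this way are a flat array of instruction pointers with context markers in front of the kernel and user portions, which is what the perf_callchain_store(entry, PERF_CONTEXT_KERNEL/PERF_CONTEXT_USER) calls above emit. A small consumer-side sketch, assuming the marker values from the perf uapi header (the addresses below are made up):

#include <stdio.h>
#include <stdint.h>

/* Marker values from enum perf_callchain_context in the perf ABI. */
#define PERF_CONTEXT_KERNEL ((uint64_t)-128)
#define PERF_CONTEXT_USER   ((uint64_t)-512)

static void print_callchain(const uint64_t *ips, size_t nr)
{
	const char *side = "?";
	size_t i;

	for (i = 0; i < nr; i++) {
		if (ips[i] == PERF_CONTEXT_KERNEL)
			side = "kernel";
		else if (ips[i] == PERF_CONTEXT_USER)
			side = "user";
		else
			printf("%-6s %#llx\n", side,
			       (unsigned long long)ips[i]);
	}
}

int main(void)
{
	uint64_t chain[] = {
		PERF_CONTEXT_KERNEL, 0xffffffff8100aaaa, 0xffffffff8100bbbb,
		PERF_CONTEXT_USER,   0x400123,           0x400456,
	};

	print_callchain(chain, sizeof(chain) / sizeof(chain[0]));
	return 0;
}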
2022 | /* | ||
2023 | * Initialize the perf_event context in a task_struct: | ||
2024 | */ | ||
2025 | static void __perf_event_init_context(struct perf_event_context *ctx) | ||
1774 | { | 2026 | { |
1775 | raw_spin_lock_init(&ctx->lock); | 2027 | raw_spin_lock_init(&ctx->lock); |
1776 | mutex_init(&ctx->mutex); | 2028 | mutex_init(&ctx->mutex); |
@@ -1778,45 +2030,38 @@ __perf_event_init_context(struct perf_event_context *ctx, | |||
1778 | INIT_LIST_HEAD(&ctx->flexible_groups); | 2030 | INIT_LIST_HEAD(&ctx->flexible_groups); |
1779 | INIT_LIST_HEAD(&ctx->event_list); | 2031 | INIT_LIST_HEAD(&ctx->event_list); |
1780 | atomic_set(&ctx->refcount, 1); | 2032 | atomic_set(&ctx->refcount, 1); |
1781 | ctx->task = task; | ||
1782 | } | 2033 | } |
1783 | 2034 | ||
1784 | static struct perf_event_context *find_get_context(pid_t pid, int cpu) | 2035 | static struct perf_event_context * |
2036 | alloc_perf_context(struct pmu *pmu, struct task_struct *task) | ||
1785 | { | 2037 | { |
1786 | struct perf_event_context *ctx; | 2038 | struct perf_event_context *ctx; |
1787 | struct perf_cpu_context *cpuctx; | ||
1788 | struct task_struct *task; | ||
1789 | unsigned long flags; | ||
1790 | int err; | ||
1791 | 2039 | ||
1792 | if (pid == -1 && cpu != -1) { | 2040 | ctx = kzalloc(sizeof(struct perf_event_context), GFP_KERNEL); |
1793 | /* Must be root to operate on a CPU event: */ | 2041 | if (!ctx) |
1794 | if (perf_paranoid_cpu() && !capable(CAP_SYS_ADMIN)) | 2042 | return NULL; |
1795 | return ERR_PTR(-EACCES); | ||
1796 | |||
1797 | if (cpu < 0 || cpu >= nr_cpumask_bits) | ||
1798 | return ERR_PTR(-EINVAL); | ||
1799 | 2043 | ||
1800 | /* | 2044 | __perf_event_init_context(ctx); |
1801 | * We could be clever and allow attaching an event to an | 2045 | if (task) { |
1802 | * offline CPU and activate it when the CPU comes up, but | 2046 | ctx->task = task; |
1803 | * that's for later. | 2047 | get_task_struct(task); |
1804 | */ | 2048 | } |
1805 | if (!cpu_online(cpu)) | 2049 | ctx->pmu = pmu; |
1806 | return ERR_PTR(-ENODEV); | ||
1807 | 2050 | ||
1808 | cpuctx = &per_cpu(perf_cpu_context, cpu); | 2051 | return ctx; |
1809 | ctx = &cpuctx->ctx; | 2052 | } |
1810 | get_ctx(ctx); | ||
1811 | 2053 | ||
1812 | return ctx; | 2054 | static struct task_struct * |
1813 | } | 2055 | find_lively_task_by_vpid(pid_t vpid) |
2056 | { | ||
2057 | struct task_struct *task; | ||
2058 | int err; | ||
1814 | 2059 | ||
1815 | rcu_read_lock(); | 2060 | rcu_read_lock(); |
1816 | if (!pid) | 2061 | if (!vpid) |
1817 | task = current; | 2062 | task = current; |
1818 | else | 2063 | else |
1819 | task = find_task_by_vpid(pid); | 2064 | task = find_task_by_vpid(vpid); |
1820 | if (task) | 2065 | if (task) |
1821 | get_task_struct(task); | 2066 | get_task_struct(task); |
1822 | rcu_read_unlock(); | 2067 | rcu_read_unlock(); |
@@ -1836,36 +2081,78 @@ static struct perf_event_context *find_get_context(pid_t pid, int cpu) | |||
1836 | if (!ptrace_may_access(task, PTRACE_MODE_READ)) | 2081 | if (!ptrace_may_access(task, PTRACE_MODE_READ)) |
1837 | goto errout; | 2082 | goto errout; |
1838 | 2083 | ||
1839 | retry: | 2084 | return task; |
1840 | ctx = perf_lock_task_context(task, &flags); | 2085 | errout: |
2086 | put_task_struct(task); | ||
2087 | return ERR_PTR(err); | ||
2088 | |||
2089 | } | ||
2090 | |||
2091 | static struct perf_event_context * | ||
2092 | find_get_context(struct pmu *pmu, struct task_struct *task, int cpu) | ||
2093 | { | ||
2094 | struct perf_event_context *ctx; | ||
2095 | struct perf_cpu_context *cpuctx; | ||
2096 | unsigned long flags; | ||
2097 | int ctxn, err; | ||
2098 | |||
2099 | if (!task && cpu != -1) { | ||
2100 | /* Must be root to operate on a CPU event: */ | ||
2101 | if (perf_paranoid_cpu() && !capable(CAP_SYS_ADMIN)) | ||
2102 | return ERR_PTR(-EACCES); | ||
2103 | |||
2104 | if (cpu < 0 || cpu >= nr_cpumask_bits) | ||
2105 | return ERR_PTR(-EINVAL); | ||
2106 | |||
2107 | /* | ||
2108 | * We could be clever and allow attaching an event to an | ||
2109 | * offline CPU and activate it when the CPU comes up, but | ||
2110 | * that's for later. | ||
2111 | */ | ||
2112 | if (!cpu_online(cpu)) | ||
2113 | return ERR_PTR(-ENODEV); | ||
2114 | |||
2115 | cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu); | ||
2116 | ctx = &cpuctx->ctx; | ||
2117 | get_ctx(ctx); | ||
2118 | |||
2119 | return ctx; | ||
2120 | } | ||
2121 | |||
2122 | err = -EINVAL; | ||
2123 | ctxn = pmu->task_ctx_nr; | ||
2124 | if (ctxn < 0) | ||
2125 | goto errout; | ||
2126 | |||
2127 | retry: | ||
2128 | ctx = perf_lock_task_context(task, ctxn, &flags); | ||
1841 | if (ctx) { | 2129 | if (ctx) { |
1842 | unclone_ctx(ctx); | 2130 | unclone_ctx(ctx); |
1843 | raw_spin_unlock_irqrestore(&ctx->lock, flags); | 2131 | raw_spin_unlock_irqrestore(&ctx->lock, flags); |
1844 | } | 2132 | } |
1845 | 2133 | ||
1846 | if (!ctx) { | 2134 | if (!ctx) { |
1847 | ctx = kzalloc(sizeof(struct perf_event_context), GFP_KERNEL); | 2135 | ctx = alloc_perf_context(pmu, task); |
1848 | err = -ENOMEM; | 2136 | err = -ENOMEM; |
1849 | if (!ctx) | 2137 | if (!ctx) |
1850 | goto errout; | 2138 | goto errout; |
1851 | __perf_event_init_context(ctx, task); | 2139 | |
1852 | get_ctx(ctx); | 2140 | get_ctx(ctx); |
1853 | if (cmpxchg(&task->perf_event_ctxp, NULL, ctx)) { | 2141 | |
2142 | if (cmpxchg(&task->perf_event_ctxp[ctxn], NULL, ctx)) { | ||
1854 | /* | 2143 | /* |
1855 | * We raced with some other task; use | 2144 | * We raced with some other task; use |
1856 | * the context they set. | 2145 | * the context they set. |
1857 | */ | 2146 | */ |
2147 | put_task_struct(task); | ||
1858 | kfree(ctx); | 2148 | kfree(ctx); |
1859 | goto retry; | 2149 | goto retry; |
1860 | } | 2150 | } |
1861 | get_task_struct(task); | ||
1862 | } | 2151 | } |
1863 | 2152 | ||
1864 | put_task_struct(task); | ||
1865 | return ctx; | 2153 | return ctx; |
1866 | 2154 | ||
1867 | errout: | 2155 | errout: |
1868 | put_task_struct(task); | ||
1869 | return ERR_PTR(err); | 2156 | return ERR_PTR(err); |
1870 | } | 2157 | } |
1871 | 2158 | ||
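The retry path above is the classic publish-with-cmpxchg idiom: each racer allocates its own context, exactly one compare-and-swap on the task's context pointer succeeds, and the losers free their copy and go around again to pick up the winner's. A simplified user-space analogue (single slot, C11 atomics; the refcounting and the perf_lock_task_context() step are omitted):

#include <stdio.h>
#include <stdlib.h>
#include <stdatomic.h>

struct ctx { int dummy; };

/* One published slot, analogous to task->perf_event_ctxp[ctxn]. */
static _Atomic(struct ctx *) slot;

static struct ctx *find_get_ctx(void)
{
	struct ctx *expected, *ctx;

retry:
	ctx = atomic_load(&slot);
	if (ctx)
		return ctx;             /* already installed */

	ctx = calloc(1, sizeof(*ctx));
	if (!ctx)
		return NULL;

	expected = NULL;
	if (!atomic_compare_exchange_strong(&slot, &expected, ctx)) {
		/* We raced with some other thread; use the context they set. */
		free(ctx);
		goto retry;
	}
	return ctx;
}

int main(void)
{
	printf("first : %p\n", (void *)find_get_ctx());
	printf("second: %p\n", (void *)find_get_ctx());  /* same pointer */
	return 0;
}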
@@ -1882,32 +2169,36 @@ static void free_event_rcu(struct rcu_head *head) | |||
1882 | kfree(event); | 2169 | kfree(event); |
1883 | } | 2170 | } |
1884 | 2171 | ||
1885 | static void perf_pending_sync(struct perf_event *event); | 2172 | static void perf_buffer_put(struct perf_buffer *buffer); |
1886 | static void perf_mmap_data_put(struct perf_mmap_data *data); | ||
1887 | 2173 | ||
1888 | static void free_event(struct perf_event *event) | 2174 | static void free_event(struct perf_event *event) |
1889 | { | 2175 | { |
1890 | perf_pending_sync(event); | 2176 | irq_work_sync(&event->pending); |
1891 | 2177 | ||
1892 | if (!event->parent) { | 2178 | if (!event->parent) { |
1893 | atomic_dec(&nr_events); | 2179 | if (event->attach_state & PERF_ATTACH_TASK) |
1894 | if (event->attr.mmap) | 2180 | jump_label_dec(&perf_task_events); |
2181 | if (event->attr.mmap || event->attr.mmap_data) | ||
1895 | atomic_dec(&nr_mmap_events); | 2182 | atomic_dec(&nr_mmap_events); |
1896 | if (event->attr.comm) | 2183 | if (event->attr.comm) |
1897 | atomic_dec(&nr_comm_events); | 2184 | atomic_dec(&nr_comm_events); |
1898 | if (event->attr.task) | 2185 | if (event->attr.task) |
1899 | atomic_dec(&nr_task_events); | 2186 | atomic_dec(&nr_task_events); |
2187 | if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) | ||
2188 | put_callchain_buffers(); | ||
1900 | } | 2189 | } |
1901 | 2190 | ||
1902 | if (event->data) { | 2191 | if (event->buffer) { |
1903 | perf_mmap_data_put(event->data); | 2192 | perf_buffer_put(event->buffer); |
1904 | event->data = NULL; | 2193 | event->buffer = NULL; |
1905 | } | 2194 | } |
1906 | 2195 | ||
1907 | if (event->destroy) | 2196 | if (event->destroy) |
1908 | event->destroy(event); | 2197 | event->destroy(event); |
1909 | 2198 | ||
1910 | put_ctx(event->ctx); | 2199 | if (event->ctx) |
2200 | put_ctx(event->ctx); | ||
2201 | |||
1911 | call_rcu(&event->rcu_head, free_event_rcu); | 2202 | call_rcu(&event->rcu_head, free_event_rcu); |
1912 | } | 2203 | } |
1913 | 2204 | ||
@@ -2126,13 +2417,13 @@ perf_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) | |||
2126 | static unsigned int perf_poll(struct file *file, poll_table *wait) | 2417 | static unsigned int perf_poll(struct file *file, poll_table *wait) |
2127 | { | 2418 | { |
2128 | struct perf_event *event = file->private_data; | 2419 | struct perf_event *event = file->private_data; |
2129 | struct perf_mmap_data *data; | 2420 | struct perf_buffer *buffer; |
2130 | unsigned int events = POLL_HUP; | 2421 | unsigned int events = POLL_HUP; |
2131 | 2422 | ||
2132 | rcu_read_lock(); | 2423 | rcu_read_lock(); |
2133 | data = rcu_dereference(event->data); | 2424 | buffer = rcu_dereference(event->buffer); |
2134 | if (data) | 2425 | if (buffer) |
2135 | events = atomic_xchg(&data->poll, 0); | 2426 | events = atomic_xchg(&buffer->poll, 0); |
2136 | rcu_read_unlock(); | 2427 | rcu_read_unlock(); |
2137 | 2428 | ||
2138 | poll_wait(file, &event->waitq, wait); | 2429 | poll_wait(file, &event->waitq, wait); |
@@ -2143,7 +2434,7 @@ static unsigned int perf_poll(struct file *file, poll_table *wait) | |||
2143 | static void perf_event_reset(struct perf_event *event) | 2434 | static void perf_event_reset(struct perf_event *event) |
2144 | { | 2435 | { |
2145 | (void)perf_event_read(event); | 2436 | (void)perf_event_read(event); |
2146 | atomic64_set(&event->count, 0); | 2437 | local64_set(&event->count, 0); |
2147 | perf_event_update_userpage(event); | 2438 | perf_event_update_userpage(event); |
2148 | } | 2439 | } |
2149 | 2440 | ||
@@ -2186,15 +2477,13 @@ static void perf_event_for_each(struct perf_event *event, | |||
2186 | static int perf_event_period(struct perf_event *event, u64 __user *arg) | 2477 | static int perf_event_period(struct perf_event *event, u64 __user *arg) |
2187 | { | 2478 | { |
2188 | struct perf_event_context *ctx = event->ctx; | 2479 | struct perf_event_context *ctx = event->ctx; |
2189 | unsigned long size; | ||
2190 | int ret = 0; | 2480 | int ret = 0; |
2191 | u64 value; | 2481 | u64 value; |
2192 | 2482 | ||
2193 | if (!event->attr.sample_period) | 2483 | if (!event->attr.sample_period) |
2194 | return -EINVAL; | 2484 | return -EINVAL; |
2195 | 2485 | ||
2196 | size = copy_from_user(&value, arg, sizeof(value)); | 2486 | if (copy_from_user(&value, arg, sizeof(value))) |
2197 | if (size != sizeof(value)) | ||
2198 | return -EFAULT; | 2487 | return -EFAULT; |
2199 | 2488 | ||
2200 | if (!value) | 2489 | if (!value) |
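The copy_from_user() change above is a correctness fix, not just a cleanup: copy_from_user() returns the number of bytes it could not copy, i.e. 0 on success, so comparing the return value against sizeof(value) inverted the test. The new form treats any nonzero return as -EFAULT. A user-space demonstration of the convention, with a trivial stand-in for copy_from_user():

#include <stdio.h>
#include <string.h>
#include <stdint.h>

/* Same return convention as copy_from_user(): bytes NOT copied, 0 on success. */
static unsigned long fake_copy_from_user(void *dst, const void *src,
					 unsigned long n)
{
	memcpy(dst, src, n);
	return 0;
}

int main(void)
{
	uint64_t src = 4096, value;
	unsigned long ret = fake_copy_from_user(&value, &src, sizeof(value));

	/* Old check: rejects a successful copy, because ret is 0. */
	printf("old check faults: %s\n",
	       ret != sizeof(value) ? "yes" : "no");

	/* Fixed check: only a nonzero return (partial copy) is a fault. */
	printf("new check faults: %s\n", ret ? "yes" : "no");
	return 0;
}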
@@ -2328,6 +2617,9 @@ int perf_event_task_disable(void) | |||
2328 | 2617 | ||
2329 | static int perf_event_index(struct perf_event *event) | 2618 | static int perf_event_index(struct perf_event *event) |
2330 | { | 2619 | { |
2620 | if (event->hw.state & PERF_HES_STOPPED) | ||
2621 | return 0; | ||
2622 | |||
2331 | if (event->state != PERF_EVENT_STATE_ACTIVE) | 2623 | if (event->state != PERF_EVENT_STATE_ACTIVE) |
2332 | return 0; | 2624 | return 0; |
2333 | 2625 | ||
@@ -2342,14 +2634,14 @@ static int perf_event_index(struct perf_event *event) | |||
2342 | void perf_event_update_userpage(struct perf_event *event) | 2634 | void perf_event_update_userpage(struct perf_event *event) |
2343 | { | 2635 | { |
2344 | struct perf_event_mmap_page *userpg; | 2636 | struct perf_event_mmap_page *userpg; |
2345 | struct perf_mmap_data *data; | 2637 | struct perf_buffer *buffer; |
2346 | 2638 | ||
2347 | rcu_read_lock(); | 2639 | rcu_read_lock(); |
2348 | data = rcu_dereference(event->data); | 2640 | buffer = rcu_dereference(event->buffer); |
2349 | if (!data) | 2641 | if (!buffer) |
2350 | goto unlock; | 2642 | goto unlock; |
2351 | 2643 | ||
2352 | userpg = data->user_page; | 2644 | userpg = buffer->user_page; |
2353 | 2645 | ||
2354 | /* | 2646 | /* |
2355 | * Disable preemption so as to not let the corresponding user-space | 2647 | * Disable preemption so as to not let the corresponding user-space |
@@ -2359,9 +2651,9 @@ void perf_event_update_userpage(struct perf_event *event) | |||
2359 | ++userpg->lock; | 2651 | ++userpg->lock; |
2360 | barrier(); | 2652 | barrier(); |
2361 | userpg->index = perf_event_index(event); | 2653 | userpg->index = perf_event_index(event); |
2362 | userpg->offset = atomic64_read(&event->count); | 2654 | userpg->offset = perf_event_count(event); |
2363 | if (event->state == PERF_EVENT_STATE_ACTIVE) | 2655 | if (event->state == PERF_EVENT_STATE_ACTIVE) |
2364 | userpg->offset -= atomic64_read(&event->hw.prev_count); | 2656 | userpg->offset -= local64_read(&event->hw.prev_count); |
2365 | 2657 | ||
2366 | userpg->time_enabled = event->total_time_enabled + | 2658 | userpg->time_enabled = event->total_time_enabled + |
2367 | atomic64_read(&event->child_total_time_enabled); | 2659 | atomic64_read(&event->child_total_time_enabled); |
@@ -2376,6 +2668,25 @@ unlock: | |||
2376 | rcu_read_unlock(); | 2668 | rcu_read_unlock(); |
2377 | } | 2669 | } |
2378 | 2670 | ||
2671 | static unsigned long perf_data_size(struct perf_buffer *buffer); | ||
2672 | |||
2673 | static void | ||
2674 | perf_buffer_init(struct perf_buffer *buffer, long watermark, int flags) | ||
2675 | { | ||
2676 | long max_size = perf_data_size(buffer); | ||
2677 | |||
2678 | if (watermark) | ||
2679 | buffer->watermark = min(max_size, watermark); | ||
2680 | |||
2681 | if (!buffer->watermark) | ||
2682 | buffer->watermark = max_size / 2; | ||
2683 | |||
2684 | if (flags & PERF_BUFFER_WRITABLE) | ||
2685 | buffer->writable = 1; | ||
2686 | |||
2687 | atomic_set(&buffer->refcount, 1); | ||
2688 | } | ||
2689 | |||
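perf_buffer_init() above folds the old per-event watermark setup into the allocator: the wakeup watermark requested through the event attributes is clamped to the size of the data area, and a request of 0 falls back to half the buffer. A tiny sketch of that clamping (the byte sizes are illustrative only):

#include <stdio.h>

static long effective_watermark(long buf_size, long requested)
{
	long wm = 0;

	if (requested)
		wm = requested < buf_size ? requested : buf_size;
	if (!wm)
		wm = buf_size / 2;      /* default: half the data area */
	return wm;
}

int main(void)
{
	printf("%ld\n", effective_watermark(32768, 0));      /* 16384 */
	printf("%ld\n", effective_watermark(32768, 4096));   /*  4096 */
	printf("%ld\n", effective_watermark(32768, 65536));  /* 32768 */
	return 0;
}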
2379 | #ifndef CONFIG_PERF_USE_VMALLOC | 2690 | #ifndef CONFIG_PERF_USE_VMALLOC |
2380 | 2691 | ||
2381 | /* | 2692 | /* |
@@ -2383,15 +2694,15 @@ unlock: | |||
2383 | */ | 2694 | */ |
2384 | 2695 | ||
2385 | static struct page * | 2696 | static struct page * |
2386 | perf_mmap_to_page(struct perf_mmap_data *data, unsigned long pgoff) | 2697 | perf_mmap_to_page(struct perf_buffer *buffer, unsigned long pgoff) |
2387 | { | 2698 | { |
2388 | if (pgoff > data->nr_pages) | 2699 | if (pgoff > buffer->nr_pages) |
2389 | return NULL; | 2700 | return NULL; |
2390 | 2701 | ||
2391 | if (pgoff == 0) | 2702 | if (pgoff == 0) |
2392 | return virt_to_page(data->user_page); | 2703 | return virt_to_page(buffer->user_page); |
2393 | 2704 | ||
2394 | return virt_to_page(data->data_pages[pgoff - 1]); | 2705 | return virt_to_page(buffer->data_pages[pgoff - 1]); |
2395 | } | 2706 | } |
2396 | 2707 | ||
2397 | static void *perf_mmap_alloc_page(int cpu) | 2708 | static void *perf_mmap_alloc_page(int cpu) |
@@ -2407,42 +2718,44 @@ static void *perf_mmap_alloc_page(int cpu) | |||
2407 | return page_address(page); | 2718 | return page_address(page); |
2408 | } | 2719 | } |
2409 | 2720 | ||
2410 | static struct perf_mmap_data * | 2721 | static struct perf_buffer * |
2411 | perf_mmap_data_alloc(struct perf_event *event, int nr_pages) | 2722 | perf_buffer_alloc(int nr_pages, long watermark, int cpu, int flags) |
2412 | { | 2723 | { |
2413 | struct perf_mmap_data *data; | 2724 | struct perf_buffer *buffer; |
2414 | unsigned long size; | 2725 | unsigned long size; |
2415 | int i; | 2726 | int i; |
2416 | 2727 | ||
2417 | size = sizeof(struct perf_mmap_data); | 2728 | size = sizeof(struct perf_buffer); |
2418 | size += nr_pages * sizeof(void *); | 2729 | size += nr_pages * sizeof(void *); |
2419 | 2730 | ||
2420 | data = kzalloc(size, GFP_KERNEL); | 2731 | buffer = kzalloc(size, GFP_KERNEL); |
2421 | if (!data) | 2732 | if (!buffer) |
2422 | goto fail; | 2733 | goto fail; |
2423 | 2734 | ||
2424 | data->user_page = perf_mmap_alloc_page(event->cpu); | 2735 | buffer->user_page = perf_mmap_alloc_page(cpu); |
2425 | if (!data->user_page) | 2736 | if (!buffer->user_page) |
2426 | goto fail_user_page; | 2737 | goto fail_user_page; |
2427 | 2738 | ||
2428 | for (i = 0; i < nr_pages; i++) { | 2739 | for (i = 0; i < nr_pages; i++) { |
2429 | data->data_pages[i] = perf_mmap_alloc_page(event->cpu); | 2740 | buffer->data_pages[i] = perf_mmap_alloc_page(cpu); |
2430 | if (!data->data_pages[i]) | 2741 | if (!buffer->data_pages[i]) |
2431 | goto fail_data_pages; | 2742 | goto fail_data_pages; |
2432 | } | 2743 | } |
2433 | 2744 | ||
2434 | data->nr_pages = nr_pages; | 2745 | buffer->nr_pages = nr_pages; |
2435 | 2746 | ||
2436 | return data; | 2747 | perf_buffer_init(buffer, watermark, flags); |
2748 | |||
2749 | return buffer; | ||
2437 | 2750 | ||
2438 | fail_data_pages: | 2751 | fail_data_pages: |
2439 | for (i--; i >= 0; i--) | 2752 | for (i--; i >= 0; i--) |
2440 | free_page((unsigned long)data->data_pages[i]); | 2753 | free_page((unsigned long)buffer->data_pages[i]); |
2441 | 2754 | ||
2442 | free_page((unsigned long)data->user_page); | 2755 | free_page((unsigned long)buffer->user_page); |
2443 | 2756 | ||
2444 | fail_user_page: | 2757 | fail_user_page: |
2445 | kfree(data); | 2758 | kfree(buffer); |
2446 | 2759 | ||
2447 | fail: | 2760 | fail: |
2448 | return NULL; | 2761 | return NULL; |
@@ -2456,17 +2769,17 @@ static void perf_mmap_free_page(unsigned long addr) | |||
2456 | __free_page(page); | 2769 | __free_page(page); |
2457 | } | 2770 | } |
2458 | 2771 | ||
2459 | static void perf_mmap_data_free(struct perf_mmap_data *data) | 2772 | static void perf_buffer_free(struct perf_buffer *buffer) |
2460 | { | 2773 | { |
2461 | int i; | 2774 | int i; |
2462 | 2775 | ||
2463 | perf_mmap_free_page((unsigned long)data->user_page); | 2776 | perf_mmap_free_page((unsigned long)buffer->user_page); |
2464 | for (i = 0; i < data->nr_pages; i++) | 2777 | for (i = 0; i < buffer->nr_pages; i++) |
2465 | perf_mmap_free_page((unsigned long)data->data_pages[i]); | 2778 | perf_mmap_free_page((unsigned long)buffer->data_pages[i]); |
2466 | kfree(data); | 2779 | kfree(buffer); |
2467 | } | 2780 | } |
2468 | 2781 | ||
2469 | static inline int page_order(struct perf_mmap_data *data) | 2782 | static inline int page_order(struct perf_buffer *buffer) |
2470 | { | 2783 | { |
2471 | return 0; | 2784 | return 0; |
2472 | } | 2785 | } |
@@ -2479,18 +2792,18 @@ static inline int page_order(struct perf_mmap_data *data) | |||
2479 | * Required for architectures that have d-cache aliasing issues. | 2792 | * Required for architectures that have d-cache aliasing issues. |
2480 | */ | 2793 | */ |
2481 | 2794 | ||
2482 | static inline int page_order(struct perf_mmap_data *data) | 2795 | static inline int page_order(struct perf_buffer *buffer) |
2483 | { | 2796 | { |
2484 | return data->page_order; | 2797 | return buffer->page_order; |
2485 | } | 2798 | } |
2486 | 2799 | ||
2487 | static struct page * | 2800 | static struct page * |
2488 | perf_mmap_to_page(struct perf_mmap_data *data, unsigned long pgoff) | 2801 | perf_mmap_to_page(struct perf_buffer *buffer, unsigned long pgoff) |
2489 | { | 2802 | { |
2490 | if (pgoff > (1UL << page_order(data))) | 2803 | if (pgoff > (1UL << page_order(buffer))) |
2491 | return NULL; | 2804 | return NULL; |
2492 | 2805 | ||
2493 | return vmalloc_to_page((void *)data->user_page + pgoff * PAGE_SIZE); | 2806 | return vmalloc_to_page((void *)buffer->user_page + pgoff * PAGE_SIZE); |
2494 | } | 2807 | } |
2495 | 2808 | ||
2496 | static void perf_mmap_unmark_page(void *addr) | 2809 | static void perf_mmap_unmark_page(void *addr) |
@@ -2500,57 +2813,59 @@ static void perf_mmap_unmark_page(void *addr) | |||
2500 | page->mapping = NULL; | 2813 | page->mapping = NULL; |
2501 | } | 2814 | } |
2502 | 2815 | ||
2503 | static void perf_mmap_data_free_work(struct work_struct *work) | 2816 | static void perf_buffer_free_work(struct work_struct *work) |
2504 | { | 2817 | { |
2505 | struct perf_mmap_data *data; | 2818 | struct perf_buffer *buffer; |
2506 | void *base; | 2819 | void *base; |
2507 | int i, nr; | 2820 | int i, nr; |
2508 | 2821 | ||
2509 | data = container_of(work, struct perf_mmap_data, work); | 2822 | buffer = container_of(work, struct perf_buffer, work); |
2510 | nr = 1 << page_order(data); | 2823 | nr = 1 << page_order(buffer); |
2511 | 2824 | ||
2512 | base = data->user_page; | 2825 | base = buffer->user_page; |
2513 | for (i = 0; i < nr + 1; i++) | 2826 | for (i = 0; i < nr + 1; i++) |
2514 | perf_mmap_unmark_page(base + (i * PAGE_SIZE)); | 2827 | perf_mmap_unmark_page(base + (i * PAGE_SIZE)); |
2515 | 2828 | ||
2516 | vfree(base); | 2829 | vfree(base); |
2517 | kfree(data); | 2830 | kfree(buffer); |
2518 | } | 2831 | } |
2519 | 2832 | ||
2520 | static void perf_mmap_data_free(struct perf_mmap_data *data) | 2833 | static void perf_buffer_free(struct perf_buffer *buffer) |
2521 | { | 2834 | { |
2522 | schedule_work(&data->work); | 2835 | schedule_work(&buffer->work); |
2523 | } | 2836 | } |
2524 | 2837 | ||
2525 | static struct perf_mmap_data * | 2838 | static struct perf_buffer * |
2526 | perf_mmap_data_alloc(struct perf_event *event, int nr_pages) | 2839 | perf_buffer_alloc(int nr_pages, long watermark, int cpu, int flags) |
2527 | { | 2840 | { |
2528 | struct perf_mmap_data *data; | 2841 | struct perf_buffer *buffer; |
2529 | unsigned long size; | 2842 | unsigned long size; |
2530 | void *all_buf; | 2843 | void *all_buf; |
2531 | 2844 | ||
2532 | size = sizeof(struct perf_mmap_data); | 2845 | size = sizeof(struct perf_buffer); |
2533 | size += sizeof(void *); | 2846 | size += sizeof(void *); |
2534 | 2847 | ||
2535 | data = kzalloc(size, GFP_KERNEL); | 2848 | buffer = kzalloc(size, GFP_KERNEL); |
2536 | if (!data) | 2849 | if (!buffer) |
2537 | goto fail; | 2850 | goto fail; |
2538 | 2851 | ||
2539 | INIT_WORK(&data->work, perf_mmap_data_free_work); | 2852 | INIT_WORK(&buffer->work, perf_buffer_free_work); |
2540 | 2853 | ||
2541 | all_buf = vmalloc_user((nr_pages + 1) * PAGE_SIZE); | 2854 | all_buf = vmalloc_user((nr_pages + 1) * PAGE_SIZE); |
2542 | if (!all_buf) | 2855 | if (!all_buf) |
2543 | goto fail_all_buf; | 2856 | goto fail_all_buf; |
2544 | 2857 | ||
2545 | data->user_page = all_buf; | 2858 | buffer->user_page = all_buf; |
2546 | data->data_pages[0] = all_buf + PAGE_SIZE; | 2859 | buffer->data_pages[0] = all_buf + PAGE_SIZE; |
2547 | data->page_order = ilog2(nr_pages); | 2860 | buffer->page_order = ilog2(nr_pages); |
2548 | data->nr_pages = 1; | 2861 | buffer->nr_pages = 1; |
2862 | |||
2863 | perf_buffer_init(buffer, watermark, flags); | ||
2549 | 2864 | ||
2550 | return data; | 2865 | return buffer; |
2551 | 2866 | ||
2552 | fail_all_buf: | 2867 | fail_all_buf: |
2553 | kfree(data); | 2868 | kfree(buffer); |
2554 | 2869 | ||
2555 | fail: | 2870 | fail: |
2556 | return NULL; | 2871 | return NULL; |
@@ -2558,15 +2873,15 @@ fail: | |||
2558 | 2873 | ||
2559 | #endif | 2874 | #endif |
2560 | 2875 | ||
2561 | static unsigned long perf_data_size(struct perf_mmap_data *data) | 2876 | static unsigned long perf_data_size(struct perf_buffer *buffer) |
2562 | { | 2877 | { |
2563 | return data->nr_pages << (PAGE_SHIFT + page_order(data)); | 2878 | return buffer->nr_pages << (PAGE_SHIFT + page_order(buffer)); |
2564 | } | 2879 | } |
2565 | 2880 | ||
2566 | static int perf_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) | 2881 | static int perf_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) |
2567 | { | 2882 | { |
2568 | struct perf_event *event = vma->vm_file->private_data; | 2883 | struct perf_event *event = vma->vm_file->private_data; |
2569 | struct perf_mmap_data *data; | 2884 | struct perf_buffer *buffer; |
2570 | int ret = VM_FAULT_SIGBUS; | 2885 | int ret = VM_FAULT_SIGBUS; |
2571 | 2886 | ||
2572 | if (vmf->flags & FAULT_FLAG_MKWRITE) { | 2887 | if (vmf->flags & FAULT_FLAG_MKWRITE) { |
@@ -2576,14 +2891,14 @@ static int perf_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) | |||
2576 | } | 2891 | } |
2577 | 2892 | ||
2578 | rcu_read_lock(); | 2893 | rcu_read_lock(); |
2579 | data = rcu_dereference(event->data); | 2894 | buffer = rcu_dereference(event->buffer); |
2580 | if (!data) | 2895 | if (!buffer) |
2581 | goto unlock; | 2896 | goto unlock; |
2582 | 2897 | ||
2583 | if (vmf->pgoff && (vmf->flags & FAULT_FLAG_WRITE)) | 2898 | if (vmf->pgoff && (vmf->flags & FAULT_FLAG_WRITE)) |
2584 | goto unlock; | 2899 | goto unlock; |
2585 | 2900 | ||
2586 | vmf->page = perf_mmap_to_page(data, vmf->pgoff); | 2901 | vmf->page = perf_mmap_to_page(buffer, vmf->pgoff); |
2587 | if (!vmf->page) | 2902 | if (!vmf->page) |
2588 | goto unlock; | 2903 | goto unlock; |
2589 | 2904 | ||
@@ -2598,52 +2913,35 @@ unlock: | |||
2598 | return ret; | 2913 | return ret; |
2599 | } | 2914 | } |
2600 | 2915 | ||
2601 | static void | 2916 | static void perf_buffer_free_rcu(struct rcu_head *rcu_head) |
2602 | perf_mmap_data_init(struct perf_event *event, struct perf_mmap_data *data) | ||
2603 | { | ||
2604 | long max_size = perf_data_size(data); | ||
2605 | |||
2606 | if (event->attr.watermark) { | ||
2607 | data->watermark = min_t(long, max_size, | ||
2608 | event->attr.wakeup_watermark); | ||
2609 | } | ||
2610 | |||
2611 | if (!data->watermark) | ||
2612 | data->watermark = max_size / 2; | ||
2613 | |||
2614 | atomic_set(&data->refcount, 1); | ||
2615 | rcu_assign_pointer(event->data, data); | ||
2616 | } | ||
2617 | |||
2618 | static void perf_mmap_data_free_rcu(struct rcu_head *rcu_head) | ||
2619 | { | 2917 | { |
2620 | struct perf_mmap_data *data; | 2918 | struct perf_buffer *buffer; |
2621 | 2919 | ||
2622 | data = container_of(rcu_head, struct perf_mmap_data, rcu_head); | 2920 | buffer = container_of(rcu_head, struct perf_buffer, rcu_head); |
2623 | perf_mmap_data_free(data); | 2921 | perf_buffer_free(buffer); |
2624 | } | 2922 | } |
2625 | 2923 | ||
2626 | static struct perf_mmap_data *perf_mmap_data_get(struct perf_event *event) | 2924 | static struct perf_buffer *perf_buffer_get(struct perf_event *event) |
2627 | { | 2925 | { |
2628 | struct perf_mmap_data *data; | 2926 | struct perf_buffer *buffer; |
2629 | 2927 | ||
2630 | rcu_read_lock(); | 2928 | rcu_read_lock(); |
2631 | data = rcu_dereference(event->data); | 2929 | buffer = rcu_dereference(event->buffer); |
2632 | if (data) { | 2930 | if (buffer) { |
2633 | if (!atomic_inc_not_zero(&data->refcount)) | 2931 | if (!atomic_inc_not_zero(&buffer->refcount)) |
2634 | data = NULL; | 2932 | buffer = NULL; |
2635 | } | 2933 | } |
2636 | rcu_read_unlock(); | 2934 | rcu_read_unlock(); |
2637 | 2935 | ||
2638 | return data; | 2936 | return buffer; |
2639 | } | 2937 | } |
2640 | 2938 | ||
2641 | static void perf_mmap_data_put(struct perf_mmap_data *data) | 2939 | static void perf_buffer_put(struct perf_buffer *buffer) |
2642 | { | 2940 | { |
2643 | if (!atomic_dec_and_test(&data->refcount)) | 2941 | if (!atomic_dec_and_test(&buffer->refcount)) |
2644 | return; | 2942 | return; |
2645 | 2943 | ||
2646 | call_rcu(&data->rcu_head, perf_mmap_data_free_rcu); | 2944 | call_rcu(&buffer->rcu_head, perf_buffer_free_rcu); |
2647 | } | 2945 | } |
2648 | 2946 | ||
2649 | static void perf_mmap_open(struct vm_area_struct *vma) | 2947 | static void perf_mmap_open(struct vm_area_struct *vma) |
@@ -2658,16 +2956,16 @@ static void perf_mmap_close(struct vm_area_struct *vma) | |||
2658 | struct perf_event *event = vma->vm_file->private_data; | 2956 | struct perf_event *event = vma->vm_file->private_data; |
2659 | 2957 | ||
2660 | if (atomic_dec_and_mutex_lock(&event->mmap_count, &event->mmap_mutex)) { | 2958 | if (atomic_dec_and_mutex_lock(&event->mmap_count, &event->mmap_mutex)) { |
2661 | unsigned long size = perf_data_size(event->data); | 2959 | unsigned long size = perf_data_size(event->buffer); |
2662 | struct user_struct *user = event->mmap_user; | 2960 | struct user_struct *user = event->mmap_user; |
2663 | struct perf_mmap_data *data = event->data; | 2961 | struct perf_buffer *buffer = event->buffer; |
2664 | 2962 | ||
2665 | atomic_long_sub((size >> PAGE_SHIFT) + 1, &user->locked_vm); | 2963 | atomic_long_sub((size >> PAGE_SHIFT) + 1, &user->locked_vm); |
2666 | vma->vm_mm->locked_vm -= event->mmap_locked; | 2964 | vma->vm_mm->locked_vm -= event->mmap_locked; |
2667 | rcu_assign_pointer(event->data, NULL); | 2965 | rcu_assign_pointer(event->buffer, NULL); |
2668 | mutex_unlock(&event->mmap_mutex); | 2966 | mutex_unlock(&event->mmap_mutex); |
2669 | 2967 | ||
2670 | perf_mmap_data_put(data); | 2968 | perf_buffer_put(buffer); |
2671 | free_uid(user); | 2969 | free_uid(user); |
2672 | } | 2970 | } |
2673 | } | 2971 | } |
@@ -2685,11 +2983,11 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma) | |||
2685 | unsigned long user_locked, user_lock_limit; | 2983 | unsigned long user_locked, user_lock_limit; |
2686 | struct user_struct *user = current_user(); | 2984 | struct user_struct *user = current_user(); |
2687 | unsigned long locked, lock_limit; | 2985 | unsigned long locked, lock_limit; |
2688 | struct perf_mmap_data *data; | 2986 | struct perf_buffer *buffer; |
2689 | unsigned long vma_size; | 2987 | unsigned long vma_size; |
2690 | unsigned long nr_pages; | 2988 | unsigned long nr_pages; |
2691 | long user_extra, extra; | 2989 | long user_extra, extra; |
2692 | int ret = 0; | 2990 | int ret = 0, flags = 0; |
2693 | 2991 | ||
2694 | /* | 2992 | /* |
2695 | * Don't allow mmap() of inherited per-task counters. This would | 2993 | * Don't allow mmap() of inherited per-task counters. This would |
@@ -2706,7 +3004,7 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma) | |||
2706 | nr_pages = (vma_size / PAGE_SIZE) - 1; | 3004 | nr_pages = (vma_size / PAGE_SIZE) - 1; |
2707 | 3005 | ||
2708 | /* | 3006 | /* |
2709 | * If we have data pages ensure they're a power-of-two number, so we | 3007 | * If we have buffer pages ensure they're a power-of-two number, so we |
2710 | * can do bitmasks instead of modulo. | 3008 | * can do bitmasks instead of modulo. |
2711 | */ | 3009 | */ |
2712 | if (nr_pages != 0 && !is_power_of_2(nr_pages)) | 3010 | if (nr_pages != 0 && !is_power_of_2(nr_pages)) |
@@ -2720,9 +3018,9 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma) | |||
2720 | 3018 | ||
2721 | WARN_ON_ONCE(event->ctx->parent_ctx); | 3019 | WARN_ON_ONCE(event->ctx->parent_ctx); |
2722 | mutex_lock(&event->mmap_mutex); | 3020 | mutex_lock(&event->mmap_mutex); |
2723 | if (event->data) { | 3021 | if (event->buffer) { |
2724 | if (event->data->nr_pages == nr_pages) | 3022 | if (event->buffer->nr_pages == nr_pages) |
2725 | atomic_inc(&event->data->refcount); | 3023 | atomic_inc(&event->buffer->refcount); |
2726 | else | 3024 | else |
2727 | ret = -EINVAL; | 3025 | ret = -EINVAL; |
2728 | goto unlock; | 3026 | goto unlock; |
@@ -2752,17 +3050,18 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma) | |||
2752 | goto unlock; | 3050 | goto unlock; |
2753 | } | 3051 | } |
2754 | 3052 | ||
2755 | WARN_ON(event->data); | 3053 | WARN_ON(event->buffer); |
3054 | |||
3055 | if (vma->vm_flags & VM_WRITE) | ||
3056 | flags |= PERF_BUFFER_WRITABLE; | ||
2756 | 3057 | ||
2757 | data = perf_mmap_data_alloc(event, nr_pages); | 3058 | buffer = perf_buffer_alloc(nr_pages, event->attr.wakeup_watermark, |
2758 | if (!data) { | 3059 | event->cpu, flags); |
3060 | if (!buffer) { | ||
2759 | ret = -ENOMEM; | 3061 | ret = -ENOMEM; |
2760 | goto unlock; | 3062 | goto unlock; |
2761 | } | 3063 | } |
2762 | 3064 | rcu_assign_pointer(event->buffer, buffer); | |
2763 | perf_mmap_data_init(event, data); | ||
2764 | if (vma->vm_flags & VM_WRITE) | ||
2765 | event->data->writable = 1; | ||
2766 | 3065 | ||
2767 | atomic_long_add(user_extra, &user->locked_vm); | 3066 | atomic_long_add(user_extra, &user->locked_vm); |
2768 | event->mmap_locked = extra; | 3067 | event->mmap_locked = extra; |
@@ -2824,16 +3123,7 @@ void perf_event_wakeup(struct perf_event *event) | |||
2824 | } | 3123 | } |
2825 | } | 3124 | } |
2826 | 3125 | ||
2827 | /* | 3126 | static void perf_pending_event(struct irq_work *entry) |
2828 | * Pending wakeups | ||
2829 | * | ||
2830 | * Handle the case where we need to wakeup up from NMI (or rq->lock) context. | ||
2831 | * | ||
2832 | * The NMI bit means we cannot possibly take locks. Therefore, maintain a | ||
2833 | * single linked list and use cmpxchg() to add entries lockless. | ||
2834 | */ | ||
2835 | |||
2836 | static void perf_pending_event(struct perf_pending_entry *entry) | ||
2837 | { | 3127 | { |
2838 | struct perf_event *event = container_of(entry, | 3128 | struct perf_event *event = container_of(entry, |
2839 | struct perf_event, pending); | 3129 | struct perf_event, pending); |
@@ -2849,104 +3139,6 @@ static void perf_pending_event(struct perf_pending_entry *entry) | |||
2849 | } | 3139 | } |
2850 | } | 3140 | } |
2851 | 3141 | ||
2852 | #define PENDING_TAIL ((struct perf_pending_entry *)-1UL) | ||
2853 | |||
2854 | static DEFINE_PER_CPU(struct perf_pending_entry *, perf_pending_head) = { | ||
2855 | PENDING_TAIL, | ||
2856 | }; | ||
2857 | |||
2858 | static void perf_pending_queue(struct perf_pending_entry *entry, | ||
2859 | void (*func)(struct perf_pending_entry *)) | ||
2860 | { | ||
2861 | struct perf_pending_entry **head; | ||
2862 | |||
2863 | if (cmpxchg(&entry->next, NULL, PENDING_TAIL) != NULL) | ||
2864 | return; | ||
2865 | |||
2866 | entry->func = func; | ||
2867 | |||
2868 | head = &get_cpu_var(perf_pending_head); | ||
2869 | |||
2870 | do { | ||
2871 | entry->next = *head; | ||
2872 | } while (cmpxchg(head, entry->next, entry) != entry->next); | ||
2873 | |||
2874 | set_perf_event_pending(); | ||
2875 | |||
2876 | put_cpu_var(perf_pending_head); | ||
2877 | } | ||
2878 | |||
2879 | static int __perf_pending_run(void) | ||
2880 | { | ||
2881 | struct perf_pending_entry *list; | ||
2882 | int nr = 0; | ||
2883 | |||
2884 | list = xchg(&__get_cpu_var(perf_pending_head), PENDING_TAIL); | ||
2885 | while (list != PENDING_TAIL) { | ||
2886 | void (*func)(struct perf_pending_entry *); | ||
2887 | struct perf_pending_entry *entry = list; | ||
2888 | |||
2889 | list = list->next; | ||
2890 | |||
2891 | func = entry->func; | ||
2892 | entry->next = NULL; | ||
2893 | /* | ||
2894 | * Ensure we observe the unqueue before we issue the wakeup, | ||
2895 | * so that we won't be waiting forever. | ||
2896 | * -- see perf_not_pending(). | ||
2897 | */ | ||
2898 | smp_wmb(); | ||
2899 | |||
2900 | func(entry); | ||
2901 | nr++; | ||
2902 | } | ||
2903 | |||
2904 | return nr; | ||
2905 | } | ||
2906 | |||
2907 | static inline int perf_not_pending(struct perf_event *event) | ||
2908 | { | ||
2909 | /* | ||
2910 | * If we flush on whatever cpu we run, there is a chance we don't | ||
2911 | * need to wait. | ||
2912 | */ | ||
2913 | get_cpu(); | ||
2914 | __perf_pending_run(); | ||
2915 | put_cpu(); | ||
2916 | |||
2917 | /* | ||
2918 | * Ensure we see the proper queue state before going to sleep | ||
2919 | * so that we do not miss the wakeup. -- see perf_pending_handle() | ||
2920 | */ | ||
2921 | smp_rmb(); | ||
2922 | return event->pending.next == NULL; | ||
2923 | } | ||
2924 | |||
2925 | static void perf_pending_sync(struct perf_event *event) | ||
2926 | { | ||
2927 | wait_event(event->waitq, perf_not_pending(event)); | ||
2928 | } | ||
2929 | |||
2930 | void perf_event_do_pending(void) | ||
2931 | { | ||
2932 | __perf_pending_run(); | ||
2933 | } | ||
2934 | |||
2935 | /* | ||
2936 | * Callchain support -- arch specific | ||
2937 | */ | ||
2938 | |||
2939 | __weak struct perf_callchain_entry *perf_callchain(struct pt_regs *regs) | ||
2940 | { | ||
2941 | return NULL; | ||
2942 | } | ||
2943 | |||
2944 | __weak | ||
2945 | void perf_arch_fetch_caller_regs(struct pt_regs *regs, unsigned long ip, int skip) | ||
2946 | { | ||
2947 | } | ||
2948 | |||
2949 | |||
2950 | /* | 3142 | /* |
2951 | * We assume there is only KVM supporting the callbacks. | 3143 | * We assume there is only KVM supporting the callbacks. |
2952 | * Later on, we might change it to a list if there is | 3144 | * Later on, we might change it to a list if there is |
@@ -2971,15 +3163,15 @@ EXPORT_SYMBOL_GPL(perf_unregister_guest_info_callbacks); | |||
2971 | /* | 3163 | /* |
2972 | * Output | 3164 | * Output |
2973 | */ | 3165 | */ |
2974 | static bool perf_output_space(struct perf_mmap_data *data, unsigned long tail, | 3166 | static bool perf_output_space(struct perf_buffer *buffer, unsigned long tail, |
2975 | unsigned long offset, unsigned long head) | 3167 | unsigned long offset, unsigned long head) |
2976 | { | 3168 | { |
2977 | unsigned long mask; | 3169 | unsigned long mask; |
2978 | 3170 | ||
2979 | if (!data->writable) | 3171 | if (!buffer->writable) |
2980 | return true; | 3172 | return true; |
2981 | 3173 | ||
2982 | mask = perf_data_size(data) - 1; | 3174 | mask = perf_data_size(buffer) - 1; |
2983 | 3175 | ||
2984 | offset = (offset - tail) & mask; | 3176 | offset = (offset - tail) & mask; |
2985 | head = (head - tail) & mask; | 3177 | head = (head - tail) & mask; |
@@ -2992,12 +3184,11 @@ static bool perf_output_space(struct perf_mmap_data *data, unsigned long tail, | |||
2992 | 3184 | ||
2993 | static void perf_output_wakeup(struct perf_output_handle *handle) | 3185 | static void perf_output_wakeup(struct perf_output_handle *handle) |
2994 | { | 3186 | { |
2995 | atomic_set(&handle->data->poll, POLL_IN); | 3187 | atomic_set(&handle->buffer->poll, POLL_IN); |
2996 | 3188 | ||
2997 | if (handle->nmi) { | 3189 | if (handle->nmi) { |
2998 | handle->event->pending_wakeup = 1; | 3190 | handle->event->pending_wakeup = 1; |
2999 | perf_pending_queue(&handle->event->pending, | 3191 | irq_work_queue(&handle->event->pending); |
3000 | perf_pending_event); | ||
3001 | } else | 3192 | } else |
3002 | perf_event_wakeup(handle->event); | 3193 | perf_event_wakeup(handle->event); |
3003 | } | 3194 | } |
@@ -3012,48 +3203,48 @@ static void perf_output_wakeup(struct perf_output_handle *handle) | |||
3012 | */ | 3203 | */ |
3013 | static void perf_output_get_handle(struct perf_output_handle *handle) | 3204 | static void perf_output_get_handle(struct perf_output_handle *handle) |
3014 | { | 3205 | { |
3015 | struct perf_mmap_data *data = handle->data; | 3206 | struct perf_buffer *buffer = handle->buffer; |
3016 | 3207 | ||
3017 | preempt_disable(); | 3208 | preempt_disable(); |
3018 | local_inc(&data->nest); | 3209 | local_inc(&buffer->nest); |
3019 | handle->wakeup = local_read(&data->wakeup); | 3210 | handle->wakeup = local_read(&buffer->wakeup); |
3020 | } | 3211 | } |
3021 | 3212 | ||
3022 | static void perf_output_put_handle(struct perf_output_handle *handle) | 3213 | static void perf_output_put_handle(struct perf_output_handle *handle) |
3023 | { | 3214 | { |
3024 | struct perf_mmap_data *data = handle->data; | 3215 | struct perf_buffer *buffer = handle->buffer; |
3025 | unsigned long head; | 3216 | unsigned long head; |
3026 | 3217 | ||
3027 | again: | 3218 | again: |
3028 | head = local_read(&data->head); | 3219 | head = local_read(&buffer->head); |
3029 | 3220 | ||
3030 | /* | 3221 | /* |
3031 | * IRQ/NMI can happen here, which means we can miss a head update. | 3222 | * IRQ/NMI can happen here, which means we can miss a head update. |
3032 | */ | 3223 | */ |
3033 | 3224 | ||
3034 | if (!local_dec_and_test(&data->nest)) | 3225 | if (!local_dec_and_test(&buffer->nest)) |
3035 | goto out; | 3226 | goto out; |
3036 | 3227 | ||
3037 | /* | 3228 | /* |
3038 | * Publish the known good head. Rely on the full barrier implied | 3229 | * Publish the known good head. Rely on the full barrier implied |
3039 | * by atomic_dec_and_test() to order the data->head read and this | 3230 | * by atomic_dec_and_test() to order the buffer->head read and this |
3040 | * write. | 3231 | * write. |
3041 | */ | 3232 | */ |
3042 | data->user_page->data_head = head; | 3233 | buffer->user_page->data_head = head; |
3043 | 3234 | ||
3044 | /* | 3235 | /* |
3045 | * Now check if we missed an update, rely on the (compiler) | 3236 | * Now check if we missed an update, rely on the (compiler) |
3046 | * barrier in atomic_dec_and_test() to re-read data->head. | 3237 | * barrier in atomic_dec_and_test() to re-read buffer->head. |
3047 | */ | 3238 | */ |
3048 | if (unlikely(head != local_read(&data->head))) { | 3239 | if (unlikely(head != local_read(&buffer->head))) { |
3049 | local_inc(&data->nest); | 3240 | local_inc(&buffer->nest); |
3050 | goto again; | 3241 | goto again; |
3051 | } | 3242 | } |
3052 | 3243 | ||
3053 | if (handle->wakeup != local_read(&data->wakeup)) | 3244 | if (handle->wakeup != local_read(&buffer->wakeup)) |
3054 | perf_output_wakeup(handle); | 3245 | perf_output_wakeup(handle); |
3055 | 3246 | ||
3056 | out: | 3247 | out: |
3057 | preempt_enable(); | 3248 | preempt_enable(); |
3058 | } | 3249 | } |
3059 | 3250 | ||
@@ -3070,12 +3261,12 @@ __always_inline void perf_output_copy(struct perf_output_handle *handle, | |||
3070 | buf += size; | 3261 | buf += size; |
3071 | handle->size -= size; | 3262 | handle->size -= size; |
3072 | if (!handle->size) { | 3263 | if (!handle->size) { |
3073 | struct perf_mmap_data *data = handle->data; | 3264 | struct perf_buffer *buffer = handle->buffer; |
3074 | 3265 | ||
3075 | handle->page++; | 3266 | handle->page++; |
3076 | handle->page &= data->nr_pages - 1; | 3267 | handle->page &= buffer->nr_pages - 1; |
3077 | handle->addr = data->data_pages[handle->page]; | 3268 | handle->addr = buffer->data_pages[handle->page]; |
3078 | handle->size = PAGE_SIZE << page_order(data); | 3269 | handle->size = PAGE_SIZE << page_order(buffer); |
3079 | } | 3270 | } |
3080 | } while (len); | 3271 | } while (len); |
3081 | } | 3272 | } |
@@ -3084,7 +3275,7 @@ int perf_output_begin(struct perf_output_handle *handle, | |||
3084 | struct perf_event *event, unsigned int size, | 3275 | struct perf_event *event, unsigned int size, |
3085 | int nmi, int sample) | 3276 | int nmi, int sample) |
3086 | { | 3277 | { |
3087 | struct perf_mmap_data *data; | 3278 | struct perf_buffer *buffer; |
3088 | unsigned long tail, offset, head; | 3279 | unsigned long tail, offset, head; |
3089 | int have_lost; | 3280 | int have_lost; |
3090 | struct { | 3281 | struct { |
@@ -3100,19 +3291,19 @@ int perf_output_begin(struct perf_output_handle *handle, | |||
3100 | if (event->parent) | 3291 | if (event->parent) |
3101 | event = event->parent; | 3292 | event = event->parent; |
3102 | 3293 | ||
3103 | data = rcu_dereference(event->data); | 3294 | buffer = rcu_dereference(event->buffer); |
3104 | if (!data) | 3295 | if (!buffer) |
3105 | goto out; | 3296 | goto out; |
3106 | 3297 | ||
3107 | handle->data = data; | 3298 | handle->buffer = buffer; |
3108 | handle->event = event; | 3299 | handle->event = event; |
3109 | handle->nmi = nmi; | 3300 | handle->nmi = nmi; |
3110 | handle->sample = sample; | 3301 | handle->sample = sample; |
3111 | 3302 | ||
3112 | if (!data->nr_pages) | 3303 | if (!buffer->nr_pages) |
3113 | goto out; | 3304 | goto out; |
3114 | 3305 | ||
3115 | have_lost = local_read(&data->lost); | 3306 | have_lost = local_read(&buffer->lost); |
3116 | if (have_lost) | 3307 | if (have_lost) |
3117 | size += sizeof(lost_event); | 3308 | size += sizeof(lost_event); |
3118 | 3309 | ||
@@ -3124,30 +3315,30 @@ int perf_output_begin(struct perf_output_handle *handle, | |||
3124 | * tail pointer. So that all reads will be completed before the | 3315 | * tail pointer. So that all reads will be completed before the |
3125 | * write is issued. | 3316 | * write is issued. |
3126 | */ | 3317 | */ |
3127 | tail = ACCESS_ONCE(data->user_page->data_tail); | 3318 | tail = ACCESS_ONCE(buffer->user_page->data_tail); |
3128 | smp_rmb(); | 3319 | smp_rmb(); |
3129 | offset = head = local_read(&data->head); | 3320 | offset = head = local_read(&buffer->head); |
3130 | head += size; | 3321 | head += size; |
3131 | if (unlikely(!perf_output_space(data, tail, offset, head))) | 3322 | if (unlikely(!perf_output_space(buffer, tail, offset, head))) |
3132 | goto fail; | 3323 | goto fail; |
3133 | } while (local_cmpxchg(&data->head, offset, head) != offset); | 3324 | } while (local_cmpxchg(&buffer->head, offset, head) != offset); |
3134 | 3325 | ||
3135 | if (head - local_read(&data->wakeup) > data->watermark) | 3326 | if (head - local_read(&buffer->wakeup) > buffer->watermark) |
3136 | local_add(data->watermark, &data->wakeup); | 3327 | local_add(buffer->watermark, &buffer->wakeup); |
3137 | 3328 | ||
3138 | handle->page = offset >> (PAGE_SHIFT + page_order(data)); | 3329 | handle->page = offset >> (PAGE_SHIFT + page_order(buffer)); |
3139 | handle->page &= data->nr_pages - 1; | 3330 | handle->page &= buffer->nr_pages - 1; |
3140 | handle->size = offset & ((PAGE_SIZE << page_order(data)) - 1); | 3331 | handle->size = offset & ((PAGE_SIZE << page_order(buffer)) - 1); |
3141 | handle->addr = data->data_pages[handle->page]; | 3332 | handle->addr = buffer->data_pages[handle->page]; |
3142 | handle->addr += handle->size; | 3333 | handle->addr += handle->size; |
3143 | handle->size = (PAGE_SIZE << page_order(data)) - handle->size; | 3334 | handle->size = (PAGE_SIZE << page_order(buffer)) - handle->size; |
3144 | 3335 | ||
3145 | if (have_lost) { | 3336 | if (have_lost) { |
3146 | lost_event.header.type = PERF_RECORD_LOST; | 3337 | lost_event.header.type = PERF_RECORD_LOST; |
3147 | lost_event.header.misc = 0; | 3338 | lost_event.header.misc = 0; |
3148 | lost_event.header.size = sizeof(lost_event); | 3339 | lost_event.header.size = sizeof(lost_event); |
3149 | lost_event.id = event->id; | 3340 | lost_event.id = event->id; |
3150 | lost_event.lost = local_xchg(&data->lost, 0); | 3341 | lost_event.lost = local_xchg(&buffer->lost, 0); |
3151 | 3342 | ||
3152 | perf_output_put(handle, lost_event); | 3343 | perf_output_put(handle, lost_event); |
3153 | } | 3344 | } |
@@ -3155,7 +3346,7 @@ int perf_output_begin(struct perf_output_handle *handle, | |||
3155 | return 0; | 3346 | return 0; |
3156 | 3347 | ||
3157 | fail: | 3348 | fail: |
3158 | local_inc(&data->lost); | 3349 | local_inc(&buffer->lost); |
3159 | perf_output_put_handle(handle); | 3350 | perf_output_put_handle(handle); |
3160 | out: | 3351 | out: |
3161 | rcu_read_unlock(); | 3352 | rcu_read_unlock(); |
@@ -3166,15 +3357,15 @@ out: | |||
3166 | void perf_output_end(struct perf_output_handle *handle) | 3357 | void perf_output_end(struct perf_output_handle *handle) |
3167 | { | 3358 | { |
3168 | struct perf_event *event = handle->event; | 3359 | struct perf_event *event = handle->event; |
3169 | struct perf_mmap_data *data = handle->data; | 3360 | struct perf_buffer *buffer = handle->buffer; |
3170 | 3361 | ||
3171 | int wakeup_events = event->attr.wakeup_events; | 3362 | int wakeup_events = event->attr.wakeup_events; |
3172 | 3363 | ||
3173 | if (handle->sample && wakeup_events) { | 3364 | if (handle->sample && wakeup_events) { |
3174 | int events = local_inc_return(&data->events); | 3365 | int events = local_inc_return(&buffer->events); |
3175 | if (events >= wakeup_events) { | 3366 | if (events >= wakeup_events) { |
3176 | local_sub(wakeup_events, &data->events); | 3367 | local_sub(wakeup_events, &buffer->events); |
3177 | local_inc(&data->wakeup); | 3368 | local_inc(&buffer->wakeup); |
3178 | } | 3369 | } |
3179 | } | 3370 | } |
3180 | 3371 | ||
@@ -3211,7 +3402,7 @@ static void perf_output_read_one(struct perf_output_handle *handle, | |||
3211 | u64 values[4]; | 3402 | u64 values[4]; |
3212 | int n = 0; | 3403 | int n = 0; |
3213 | 3404 | ||
3214 | values[n++] = atomic64_read(&event->count); | 3405 | values[n++] = perf_event_count(event); |
3215 | if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) { | 3406 | if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) { |
3216 | values[n++] = event->total_time_enabled + | 3407 | values[n++] = event->total_time_enabled + |
3217 | atomic64_read(&event->child_total_time_enabled); | 3408 | atomic64_read(&event->child_total_time_enabled); |
@@ -3248,7 +3439,7 @@ static void perf_output_read_group(struct perf_output_handle *handle, | |||
3248 | if (leader != event) | 3439 | if (leader != event) |
3249 | leader->pmu->read(leader); | 3440 | leader->pmu->read(leader); |
3250 | 3441 | ||
3251 | values[n++] = atomic64_read(&leader->count); | 3442 | values[n++] = perf_event_count(leader); |
3252 | if (read_format & PERF_FORMAT_ID) | 3443 | if (read_format & PERF_FORMAT_ID) |
3253 | values[n++] = primary_event_id(leader); | 3444 | values[n++] = primary_event_id(leader); |
3254 | 3445 | ||
@@ -3260,7 +3451,7 @@ static void perf_output_read_group(struct perf_output_handle *handle, | |||
3260 | if (sub != event) | 3451 | if (sub != event) |
3261 | sub->pmu->read(sub); | 3452 | sub->pmu->read(sub); |
3262 | 3453 | ||
3263 | values[n++] = atomic64_read(&sub->count); | 3454 | values[n++] = perf_event_count(sub); |
3264 | if (read_format & PERF_FORMAT_ID) | 3455 | if (read_format & PERF_FORMAT_ID) |
3265 | values[n++] = primary_event_id(sub); | 3456 | values[n++] = primary_event_id(sub); |
3266 | 3457 | ||
@@ -3441,14 +3632,20 @@ static void perf_event_output(struct perf_event *event, int nmi, | |||
3441 | struct perf_output_handle handle; | 3632 | struct perf_output_handle handle; |
3442 | struct perf_event_header header; | 3633 | struct perf_event_header header; |
3443 | 3634 | ||
3635 | /* protect the callchain buffers */ | ||
3636 | rcu_read_lock(); | ||
3637 | |||
3444 | perf_prepare_sample(&header, data, event, regs); | 3638 | perf_prepare_sample(&header, data, event, regs); |
3445 | 3639 | ||
3446 | if (perf_output_begin(&handle, event, header.size, nmi, 1)) | 3640 | if (perf_output_begin(&handle, event, header.size, nmi, 1)) |
3447 | return; | 3641 | goto exit; |
3448 | 3642 | ||
3449 | perf_output_sample(&handle, &header, data, event); | 3643 | perf_output_sample(&handle, &header, data, event); |
3450 | 3644 | ||
3451 | perf_output_end(&handle); | 3645 | perf_output_end(&handle); |
3646 | |||
3647 | exit: | ||
3648 | rcu_read_unlock(); | ||
3452 | } | 3649 | } |
3453 | 3650 | ||
3454 | /* | 3651 | /* |
@@ -3491,7 +3688,7 @@ perf_event_read_event(struct perf_event *event, | |||
3491 | /* | 3688 | /* |
3492 | * task tracking -- fork/exit | 3689 | * task tracking -- fork/exit |
3493 | * | 3690 | * |
3494 | * enabled by: attr.comm | attr.mmap | attr.task | 3691 | * enabled by: attr.comm | attr.mmap | attr.mmap_data | attr.task |
3495 | */ | 3692 | */ |
3496 | 3693 | ||
3497 | struct perf_task_event { | 3694 | struct perf_task_event { |
@@ -3541,7 +3738,8 @@ static int perf_event_task_match(struct perf_event *event) | |||
3541 | if (event->cpu != -1 && event->cpu != smp_processor_id()) | 3738 | if (event->cpu != -1 && event->cpu != smp_processor_id()) |
3542 | return 0; | 3739 | return 0; |
3543 | 3740 | ||
3544 | if (event->attr.comm || event->attr.mmap || event->attr.task) | 3741 | if (event->attr.comm || event->attr.mmap || |
3742 | event->attr.mmap_data || event->attr.task) | ||
3545 | return 1; | 3743 | return 1; |
3546 | 3744 | ||
3547 | return 0; | 3745 | return 0; |
@@ -3561,16 +3759,27 @@ static void perf_event_task_ctx(struct perf_event_context *ctx, | |||
3561 | static void perf_event_task_event(struct perf_task_event *task_event) | 3759 | static void perf_event_task_event(struct perf_task_event *task_event) |
3562 | { | 3760 | { |
3563 | struct perf_cpu_context *cpuctx; | 3761 | struct perf_cpu_context *cpuctx; |
3564 | struct perf_event_context *ctx = task_event->task_ctx; | 3762 | struct perf_event_context *ctx; |
3763 | struct pmu *pmu; | ||
3764 | int ctxn; | ||
3565 | 3765 | ||
3566 | rcu_read_lock(); | 3766 | rcu_read_lock(); |
3567 | cpuctx = &get_cpu_var(perf_cpu_context); | 3767 | list_for_each_entry_rcu(pmu, &pmus, entry) { |
3568 | perf_event_task_ctx(&cpuctx->ctx, task_event); | 3768 | cpuctx = get_cpu_ptr(pmu->pmu_cpu_context); |
3569 | if (!ctx) | 3769 | perf_event_task_ctx(&cpuctx->ctx, task_event); |
3570 | ctx = rcu_dereference(current->perf_event_ctxp); | 3770 | |
3571 | if (ctx) | 3771 | ctx = task_event->task_ctx; |
3572 | perf_event_task_ctx(ctx, task_event); | 3772 | if (!ctx) { |
3573 | put_cpu_var(perf_cpu_context); | 3773 | ctxn = pmu->task_ctx_nr; |
3774 | if (ctxn < 0) | ||
3775 | goto next; | ||
3776 | ctx = rcu_dereference(current->perf_event_ctxp[ctxn]); | ||
3777 | } | ||
3778 | if (ctx) | ||
3779 | perf_event_task_ctx(ctx, task_event); | ||
3780 | next: | ||
3781 | put_cpu_ptr(pmu->pmu_cpu_context); | ||
3782 | } | ||
3574 | rcu_read_unlock(); | 3783 | rcu_read_unlock(); |
3575 | } | 3784 | } |
3576 | 3785 | ||
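The rewritten perf_event_task_event() above no longer walks one global per-cpu context: every registered pmu now carries its own per-cpu context, and a task's contexts are indexed by pmu->task_ctx_nr. Below is a minimal userspace sketch of that fan-out, assuming illustrative struct and function names (deliver(), task_event_fanout()) that do not exist in the kernel:

    #include <stdio.h>

    struct ctx { const char *name; };

    struct pmu {
        const char *name;
        int task_ctx_nr;            /* index into the task's context array, or -1 */
        struct ctx cpu_ctx;         /* stand-in for this pmu's per-cpu context */
    };

    /* Stand-in for perf_event_task_ctx(): hand the record to one context. */
    static void deliver(struct ctx *c, const char *what)
    {
        if (c)
            printf("deliver %s to %s\n", what, c->name);
    }

    /* Fan a task event out to each pmu's cpu context and matching task context. */
    static void task_event_fanout(struct pmu *pmus, int npmus,
                                  struct ctx **task_ctx, const char *what)
    {
        for (int i = 0; i < npmus; i++) {
            deliver(&pmus[i].cpu_ctx, what);
            if (pmus[i].task_ctx_nr >= 0)
                deliver(task_ctx[pmus[i].task_ctx_nr], what);
        }
    }

    int main(void)
    {
        struct pmu pmus[] = {
            { "cpu",      0, { "hw cpuctx" } },
            { "software", 1, { "sw cpuctx" } },
        };
        struct ctx hw_task = { "hw taskctx" };
        struct ctx *task_ctx[] = { &hw_task, NULL };   /* no sw task context yet */

        task_event_fanout(pmus, 2, task_ctx, "PERF_RECORD_EXIT");
        return 0;
    }
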
@@ -3675,8 +3884,10 @@ static void perf_event_comm_event(struct perf_comm_event *comm_event) | |||
3675 | { | 3884 | { |
3676 | struct perf_cpu_context *cpuctx; | 3885 | struct perf_cpu_context *cpuctx; |
3677 | struct perf_event_context *ctx; | 3886 | struct perf_event_context *ctx; |
3678 | unsigned int size; | ||
3679 | char comm[TASK_COMM_LEN]; | 3887 | char comm[TASK_COMM_LEN]; |
3888 | unsigned int size; | ||
3889 | struct pmu *pmu; | ||
3890 | int ctxn; | ||
3680 | 3891 | ||
3681 | memset(comm, 0, sizeof(comm)); | 3892 | memset(comm, 0, sizeof(comm)); |
3682 | strlcpy(comm, comm_event->task->comm, sizeof(comm)); | 3893 | strlcpy(comm, comm_event->task->comm, sizeof(comm)); |
@@ -3688,21 +3899,36 @@ static void perf_event_comm_event(struct perf_comm_event *comm_event) | |||
3688 | comm_event->event_id.header.size = sizeof(comm_event->event_id) + size; | 3899 | comm_event->event_id.header.size = sizeof(comm_event->event_id) + size; |
3689 | 3900 | ||
3690 | rcu_read_lock(); | 3901 | rcu_read_lock(); |
3691 | cpuctx = &get_cpu_var(perf_cpu_context); | 3902 | list_for_each_entry_rcu(pmu, &pmus, entry) { |
3692 | perf_event_comm_ctx(&cpuctx->ctx, comm_event); | 3903 | cpuctx = get_cpu_ptr(pmu->pmu_cpu_context); |
3693 | ctx = rcu_dereference(current->perf_event_ctxp); | 3904 | perf_event_comm_ctx(&cpuctx->ctx, comm_event); |
3694 | if (ctx) | 3905 | |
3695 | perf_event_comm_ctx(ctx, comm_event); | 3906 | ctxn = pmu->task_ctx_nr; |
3696 | put_cpu_var(perf_cpu_context); | 3907 | if (ctxn < 0) |
3908 | goto next; | ||
3909 | |||
3910 | ctx = rcu_dereference(current->perf_event_ctxp[ctxn]); | ||
3911 | if (ctx) | ||
3912 | perf_event_comm_ctx(ctx, comm_event); | ||
3913 | next: | ||
3914 | put_cpu_ptr(pmu->pmu_cpu_context); | ||
3915 | } | ||
3697 | rcu_read_unlock(); | 3916 | rcu_read_unlock(); |
3698 | } | 3917 | } |
3699 | 3918 | ||
3700 | void perf_event_comm(struct task_struct *task) | 3919 | void perf_event_comm(struct task_struct *task) |
3701 | { | 3920 | { |
3702 | struct perf_comm_event comm_event; | 3921 | struct perf_comm_event comm_event; |
3922 | struct perf_event_context *ctx; | ||
3923 | int ctxn; | ||
3703 | 3924 | ||
3704 | if (task->perf_event_ctxp) | 3925 | for_each_task_context_nr(ctxn) { |
3705 | perf_event_enable_on_exec(task); | 3926 | ctx = task->perf_event_ctxp[ctxn]; |
3927 | if (!ctx) | ||
3928 | continue; | ||
3929 | |||
3930 | perf_event_enable_on_exec(ctx); | ||
3931 | } | ||
3706 | 3932 | ||
3707 | if (!atomic_read(&nr_comm_events)) | 3933 | if (!atomic_read(&nr_comm_events)) |
3708 | return; | 3934 | return; |
@@ -3766,7 +3992,8 @@ static void perf_event_mmap_output(struct perf_event *event, | |||
3766 | } | 3992 | } |
3767 | 3993 | ||
3768 | static int perf_event_mmap_match(struct perf_event *event, | 3994 | static int perf_event_mmap_match(struct perf_event *event, |
3769 | struct perf_mmap_event *mmap_event) | 3995 | struct perf_mmap_event *mmap_event, |
3996 | int executable) | ||
3770 | { | 3997 | { |
3771 | if (event->state < PERF_EVENT_STATE_INACTIVE) | 3998 | if (event->state < PERF_EVENT_STATE_INACTIVE) |
3772 | return 0; | 3999 | return 0; |
@@ -3774,19 +4001,21 @@ static int perf_event_mmap_match(struct perf_event *event, | |||
3774 | if (event->cpu != -1 && event->cpu != smp_processor_id()) | 4001 | if (event->cpu != -1 && event->cpu != smp_processor_id()) |
3775 | return 0; | 4002 | return 0; |
3776 | 4003 | ||
3777 | if (event->attr.mmap) | 4004 | if ((!executable && event->attr.mmap_data) || |
4005 | (executable && event->attr.mmap)) | ||
3778 | return 1; | 4006 | return 1; |
3779 | 4007 | ||
3780 | return 0; | 4008 | return 0; |
3781 | } | 4009 | } |
3782 | 4010 | ||
3783 | static void perf_event_mmap_ctx(struct perf_event_context *ctx, | 4011 | static void perf_event_mmap_ctx(struct perf_event_context *ctx, |
3784 | struct perf_mmap_event *mmap_event) | 4012 | struct perf_mmap_event *mmap_event, |
4013 | int executable) | ||
3785 | { | 4014 | { |
3786 | struct perf_event *event; | 4015 | struct perf_event *event; |
3787 | 4016 | ||
3788 | list_for_each_entry_rcu(event, &ctx->event_list, event_entry) { | 4017 | list_for_each_entry_rcu(event, &ctx->event_list, event_entry) { |
3789 | if (perf_event_mmap_match(event, mmap_event)) | 4018 | if (perf_event_mmap_match(event, mmap_event, executable)) |
3790 | perf_event_mmap_output(event, mmap_event); | 4019 | perf_event_mmap_output(event, mmap_event); |
3791 | } | 4020 | } |
3792 | } | 4021 | } |
@@ -3801,6 +4030,8 @@ static void perf_event_mmap_event(struct perf_mmap_event *mmap_event) | |||
3801 | char tmp[16]; | 4030 | char tmp[16]; |
3802 | char *buf = NULL; | 4031 | char *buf = NULL; |
3803 | const char *name; | 4032 | const char *name; |
4033 | struct pmu *pmu; | ||
4034 | int ctxn; | ||
3804 | 4035 | ||
3805 | memset(tmp, 0, sizeof(tmp)); | 4036 | memset(tmp, 0, sizeof(tmp)); |
3806 | 4037 | ||
@@ -3830,6 +4061,14 @@ static void perf_event_mmap_event(struct perf_mmap_event *mmap_event) | |||
3830 | if (!vma->vm_mm) { | 4061 | if (!vma->vm_mm) { |
3831 | name = strncpy(tmp, "[vdso]", sizeof(tmp)); | 4062 | name = strncpy(tmp, "[vdso]", sizeof(tmp)); |
3832 | goto got_name; | 4063 | goto got_name; |
4064 | } else if (vma->vm_start <= vma->vm_mm->start_brk && | ||
4065 | vma->vm_end >= vma->vm_mm->brk) { | ||
4066 | name = strncpy(tmp, "[heap]", sizeof(tmp)); | ||
4067 | goto got_name; | ||
4068 | } else if (vma->vm_start <= vma->vm_mm->start_stack && | ||
4069 | vma->vm_end >= vma->vm_mm->start_stack) { | ||
4070 | name = strncpy(tmp, "[stack]", sizeof(tmp)); | ||
4071 | goto got_name; | ||
3833 | } | 4072 | } |
3834 | 4073 | ||
3835 | name = strncpy(tmp, "//anon", sizeof(tmp)); | 4074 | name = strncpy(tmp, "//anon", sizeof(tmp)); |
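
The new branches above label anonymous mappings: a vma that covers the mm's brk range is reported as "[heap]", one containing the initial stack pointer as "[stack]", and everything else stays "//anon". A small self-contained sketch of that classification, with plain integers standing in for the vma and mm_struct fields:

    #include <stdio.h>

    /* Illustrative stand-ins for the fields the hunk consults. */
    struct mm_layout { unsigned long start_brk, brk, start_stack; };

    static const char *anon_vma_name(unsigned long vm_start, unsigned long vm_end,
                                     const struct mm_layout *mm)
    {
        /* A mapping spanning the whole brk area is reported as the heap. */
        if (vm_start <= mm->start_brk && vm_end >= mm->brk)
            return "[heap]";
        /* A mapping containing the initial stack pointer is the stack. */
        if (vm_start <= mm->start_stack && vm_end >= mm->start_stack)
            return "[stack]";
        return "//anon";
    }

    int main(void)
    {
        struct mm_layout mm = { 0x1000000, 0x1020000, 0x7fff0000 };

        printf("%s\n", anon_vma_name(0x1000000, 0x1040000, &mm));  /* [heap]  */
        printf("%s\n", anon_vma_name(0x7ffe0000, 0x80000000, &mm)); /* [stack] */
        printf("%s\n", anon_vma_name(0x2000000, 0x2001000, &mm));   /* //anon  */
        return 0;
    }
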
@@ -3845,18 +4084,29 @@ got_name: | |||
3845 | mmap_event->event_id.header.size = sizeof(mmap_event->event_id) + size; | 4084 | mmap_event->event_id.header.size = sizeof(mmap_event->event_id) + size; |
3846 | 4085 | ||
3847 | rcu_read_lock(); | 4086 | rcu_read_lock(); |
3848 | cpuctx = &get_cpu_var(perf_cpu_context); | 4087 | list_for_each_entry_rcu(pmu, &pmus, entry) { |
3849 | perf_event_mmap_ctx(&cpuctx->ctx, mmap_event); | 4088 | cpuctx = get_cpu_ptr(pmu->pmu_cpu_context); |
3850 | ctx = rcu_dereference(current->perf_event_ctxp); | 4089 | perf_event_mmap_ctx(&cpuctx->ctx, mmap_event, |
3851 | if (ctx) | 4090 | vma->vm_flags & VM_EXEC); |
3852 | perf_event_mmap_ctx(ctx, mmap_event); | 4091 | |
3853 | put_cpu_var(perf_cpu_context); | 4092 | ctxn = pmu->task_ctx_nr; |
4093 | if (ctxn < 0) | ||
4094 | goto next; | ||
4095 | |||
4096 | ctx = rcu_dereference(current->perf_event_ctxp[ctxn]); | ||
4097 | if (ctx) { | ||
4098 | perf_event_mmap_ctx(ctx, mmap_event, | ||
4099 | vma->vm_flags & VM_EXEC); | ||
4100 | } | ||
4101 | next: | ||
4102 | put_cpu_ptr(pmu->pmu_cpu_context); | ||
4103 | } | ||
3854 | rcu_read_unlock(); | 4104 | rcu_read_unlock(); |
3855 | 4105 | ||
3856 | kfree(buf); | 4106 | kfree(buf); |
3857 | } | 4107 | } |
3858 | 4108 | ||
3859 | void __perf_event_mmap(struct vm_area_struct *vma) | 4109 | void perf_event_mmap(struct vm_area_struct *vma) |
3860 | { | 4110 | { |
3861 | struct perf_mmap_event mmap_event; | 4111 | struct perf_mmap_event mmap_event; |
3862 | 4112 | ||
@@ -3932,8 +4182,6 @@ static int __perf_event_overflow(struct perf_event *event, int nmi, | |||
3932 | struct hw_perf_event *hwc = &event->hw; | 4182 | struct hw_perf_event *hwc = &event->hw; |
3933 | int ret = 0; | 4183 | int ret = 0; |
3934 | 4184 | ||
3935 | throttle = (throttle && event->pmu->unthrottle != NULL); | ||
3936 | |||
3937 | if (!throttle) { | 4185 | if (!throttle) { |
3938 | hwc->interrupts++; | 4186 | hwc->interrupts++; |
3939 | } else { | 4187 | } else { |
@@ -3976,8 +4224,7 @@ static int __perf_event_overflow(struct perf_event *event, int nmi, | |||
3976 | event->pending_kill = POLL_HUP; | 4224 | event->pending_kill = POLL_HUP; |
3977 | if (nmi) { | 4225 | if (nmi) { |
3978 | event->pending_disable = 1; | 4226 | event->pending_disable = 1; |
3979 | perf_pending_queue(&event->pending, | 4227 | irq_work_queue(&event->pending); |
3980 | perf_pending_event); | ||
3981 | } else | 4228 | } else |
3982 | perf_event_disable(event); | 4229 | perf_event_disable(event); |
3983 | } | 4230 | } |
@@ -4001,6 +4248,17 @@ int perf_event_overflow(struct perf_event *event, int nmi, | |||
4001 | * Generic software event infrastructure | 4248 | * Generic software event infrastructure |
4002 | */ | 4249 | */ |
4003 | 4250 | ||
4251 | struct swevent_htable { | ||
4252 | struct swevent_hlist *swevent_hlist; | ||
4253 | struct mutex hlist_mutex; | ||
4254 | int hlist_refcount; | ||
4255 | |||
4256 | /* Recursion avoidance in each context */ | ||
4257 | int recursion[PERF_NR_CONTEXTS]; | ||
4258 | }; | ||
4259 | |||
4260 | static DEFINE_PER_CPU(struct swevent_htable, swevent_htable); | ||
4261 | |||
4004 | /* | 4262 | /* |
4005 | * We directly increment event->count and keep a second value in | 4263 | * We directly increment event->count and keep a second value in |
4006 | * event->hw.period_left to count intervals. This period event | 4264 | * event->hw.period_left to count intervals. This period event |
@@ -4018,14 +4276,14 @@ static u64 perf_swevent_set_period(struct perf_event *event) | |||
4018 | hwc->last_period = hwc->sample_period; | 4276 | hwc->last_period = hwc->sample_period; |
4019 | 4277 | ||
4020 | again: | 4278 | again: |
4021 | old = val = atomic64_read(&hwc->period_left); | 4279 | old = val = local64_read(&hwc->period_left); |
4022 | if (val < 0) | 4280 | if (val < 0) |
4023 | return 0; | 4281 | return 0; |
4024 | 4282 | ||
4025 | nr = div64_u64(period + val, period); | 4283 | nr = div64_u64(period + val, period); |
4026 | offset = nr * period; | 4284 | offset = nr * period; |
4027 | val -= offset; | 4285 | val -= offset; |
4028 | if (atomic64_cmpxchg(&hwc->period_left, old, val) != old) | 4286 | if (local64_cmpxchg(&hwc->period_left, old, val) != old) |
4029 | goto again; | 4287 | goto again; |
4030 | 4288 | ||
4031 | return nr; | 4289 | return nr; |
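
This hunk only moves period_left from atomic64_t to local64_t; the arithmetic is unchanged. Given the accumulated value and the sample period, it computes how many whole periods elapsed and leaves a negative remainder behind. A hedged userspace sketch of that calculation, without the cmpxchg retry loop:

    #include <stdio.h>
    #include <stdint.h>

    /*
     * Given the current period_left value (*leftp) and the sample period,
     * return how many sample periods elapsed and store the new remainder,
     * mirroring the offset/val computation in the hunk.
     */
    static uint64_t set_period(int64_t *leftp, uint64_t period)
    {
        int64_t val = *leftp;
        uint64_t nr, offset;

        if (val < 0)                /* still inside the current period */
            return 0;

        nr = (uint64_t)(period + val) / period;   /* div64_u64(period + val, period) */
        offset = nr * period;
        *leftp = val - (int64_t)offset;           /* goes negative until next overflow */

        return nr;
    }

    int main(void)
    {
        int64_t left = 250;         /* accumulated count past the last overflow */
        uint64_t periods = set_period(&left, 100);

        printf("elapsed periods=%llu, new period_left=%lld\n",
               (unsigned long long)periods, (long long)left);   /* 3 and -50 */
        return 0;
    }
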
@@ -4058,13 +4316,13 @@ static void perf_swevent_overflow(struct perf_event *event, u64 overflow, | |||
4058 | } | 4316 | } |
4059 | } | 4317 | } |
4060 | 4318 | ||
4061 | static void perf_swevent_add(struct perf_event *event, u64 nr, | 4319 | static void perf_swevent_event(struct perf_event *event, u64 nr, |
4062 | int nmi, struct perf_sample_data *data, | 4320 | int nmi, struct perf_sample_data *data, |
4063 | struct pt_regs *regs) | 4321 | struct pt_regs *regs) |
4064 | { | 4322 | { |
4065 | struct hw_perf_event *hwc = &event->hw; | 4323 | struct hw_perf_event *hwc = &event->hw; |
4066 | 4324 | ||
4067 | atomic64_add(nr, &event->count); | 4325 | local64_add(nr, &event->count); |
4068 | 4326 | ||
4069 | if (!regs) | 4327 | if (!regs) |
4070 | return; | 4328 | return; |
@@ -4075,7 +4333,7 @@ static void perf_swevent_add(struct perf_event *event, u64 nr, | |||
4075 | if (nr == 1 && hwc->sample_period == 1 && !event->attr.freq) | 4333 | if (nr == 1 && hwc->sample_period == 1 && !event->attr.freq) |
4076 | return perf_swevent_overflow(event, 1, nmi, data, regs); | 4334 | return perf_swevent_overflow(event, 1, nmi, data, regs); |
4077 | 4335 | ||
4078 | if (atomic64_add_negative(nr, &hwc->period_left)) | 4336 | if (local64_add_negative(nr, &hwc->period_left)) |
4079 | return; | 4337 | return; |
4080 | 4338 | ||
4081 | perf_swevent_overflow(event, 0, nmi, data, regs); | 4339 | perf_swevent_overflow(event, 0, nmi, data, regs); |
@@ -4084,6 +4342,9 @@ static void perf_swevent_add(struct perf_event *event, u64 nr, | |||
4084 | static int perf_exclude_event(struct perf_event *event, | 4342 | static int perf_exclude_event(struct perf_event *event, |
4085 | struct pt_regs *regs) | 4343 | struct pt_regs *regs) |
4086 | { | 4344 | { |
4345 | if (event->hw.state & PERF_HES_STOPPED) | ||
4346 | return 0; | ||
4347 | |||
4087 | if (regs) { | 4348 | if (regs) { |
4088 | if (event->attr.exclude_user && user_mode(regs)) | 4349 | if (event->attr.exclude_user && user_mode(regs)) |
4089 | return 1; | 4350 | return 1; |
@@ -4130,11 +4391,11 @@ __find_swevent_head(struct swevent_hlist *hlist, u64 type, u32 event_id) | |||
4130 | 4391 | ||
4131 | /* For the read side: events when they trigger */ | 4392 | /* For the read side: events when they trigger */ |
4132 | static inline struct hlist_head * | 4393 | static inline struct hlist_head * |
4133 | find_swevent_head_rcu(struct perf_cpu_context *ctx, u64 type, u32 event_id) | 4394 | find_swevent_head_rcu(struct swevent_htable *swhash, u64 type, u32 event_id) |
4134 | { | 4395 | { |
4135 | struct swevent_hlist *hlist; | 4396 | struct swevent_hlist *hlist; |
4136 | 4397 | ||
4137 | hlist = rcu_dereference(ctx->swevent_hlist); | 4398 | hlist = rcu_dereference(swhash->swevent_hlist); |
4138 | if (!hlist) | 4399 | if (!hlist) |
4139 | return NULL; | 4400 | return NULL; |
4140 | 4401 | ||
@@ -4143,7 +4404,7 @@ find_swevent_head_rcu(struct perf_cpu_context *ctx, u64 type, u32 event_id) | |||
4143 | 4404 | ||
4144 | /* For the event head insertion and removal in the hlist */ | 4405 | /* For the event head insertion and removal in the hlist */ |
4145 | static inline struct hlist_head * | 4406 | static inline struct hlist_head * |
4146 | find_swevent_head(struct perf_cpu_context *ctx, struct perf_event *event) | 4407 | find_swevent_head(struct swevent_htable *swhash, struct perf_event *event) |
4147 | { | 4408 | { |
4148 | struct swevent_hlist *hlist; | 4409 | struct swevent_hlist *hlist; |
4149 | u32 event_id = event->attr.config; | 4410 | u32 event_id = event->attr.config; |
@@ -4154,7 +4415,7 @@ find_swevent_head(struct perf_cpu_context *ctx, struct perf_event *event) | |||
4154 | * and release. Which makes the protected version suitable here. | 4415 | * and release. Which makes the protected version suitable here. |
4155 | * The context lock guarantees that. | 4416 | * The context lock guarantees that. |
4156 | */ | 4417 | */ |
4157 | hlist = rcu_dereference_protected(ctx->swevent_hlist, | 4418 | hlist = rcu_dereference_protected(swhash->swevent_hlist, |
4158 | lockdep_is_held(&event->ctx->lock)); | 4419 | lockdep_is_held(&event->ctx->lock)); |
4159 | if (!hlist) | 4420 | if (!hlist) |
4160 | return NULL; | 4421 | return NULL; |
@@ -4167,23 +4428,19 @@ static void do_perf_sw_event(enum perf_type_id type, u32 event_id, | |||
4167 | struct perf_sample_data *data, | 4428 | struct perf_sample_data *data, |
4168 | struct pt_regs *regs) | 4429 | struct pt_regs *regs) |
4169 | { | 4430 | { |
4170 | struct perf_cpu_context *cpuctx; | 4431 | struct swevent_htable *swhash = &__get_cpu_var(swevent_htable); |
4171 | struct perf_event *event; | 4432 | struct perf_event *event; |
4172 | struct hlist_node *node; | 4433 | struct hlist_node *node; |
4173 | struct hlist_head *head; | 4434 | struct hlist_head *head; |
4174 | 4435 | ||
4175 | cpuctx = &__get_cpu_var(perf_cpu_context); | ||
4176 | |||
4177 | rcu_read_lock(); | 4436 | rcu_read_lock(); |
4178 | 4437 | head = find_swevent_head_rcu(swhash, type, event_id); | |
4179 | head = find_swevent_head_rcu(cpuctx, type, event_id); | ||
4180 | |||
4181 | if (!head) | 4438 | if (!head) |
4182 | goto end; | 4439 | goto end; |
4183 | 4440 | ||
4184 | hlist_for_each_entry_rcu(event, node, head, hlist_entry) { | 4441 | hlist_for_each_entry_rcu(event, node, head, hlist_entry) { |
4185 | if (perf_swevent_match(event, type, event_id, data, regs)) | 4442 | if (perf_swevent_match(event, type, event_id, data, regs)) |
4186 | perf_swevent_add(event, nr, nmi, data, regs); | 4443 | perf_swevent_event(event, nr, nmi, data, regs); |
4187 | } | 4444 | } |
4188 | end: | 4445 | end: |
4189 | rcu_read_unlock(); | 4446 | rcu_read_unlock(); |
@@ -4191,36 +4448,18 @@ end: | |||
4191 | 4448 | ||
4192 | int perf_swevent_get_recursion_context(void) | 4449 | int perf_swevent_get_recursion_context(void) |
4193 | { | 4450 | { |
4194 | struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); | 4451 | struct swevent_htable *swhash = &__get_cpu_var(swevent_htable); |
4195 | int rctx; | ||
4196 | |||
4197 | if (in_nmi()) | ||
4198 | rctx = 3; | ||
4199 | else if (in_irq()) | ||
4200 | rctx = 2; | ||
4201 | else if (in_softirq()) | ||
4202 | rctx = 1; | ||
4203 | else | ||
4204 | rctx = 0; | ||
4205 | 4452 | ||
4206 | if (cpuctx->recursion[rctx]) | 4453 | return get_recursion_context(swhash->recursion); |
4207 | return -1; | ||
4208 | |||
4209 | cpuctx->recursion[rctx]++; | ||
4210 | barrier(); | ||
4211 | |||
4212 | return rctx; | ||
4213 | } | 4454 | } |
4214 | EXPORT_SYMBOL_GPL(perf_swevent_get_recursion_context); | 4455 | EXPORT_SYMBOL_GPL(perf_swevent_get_recursion_context); |
4215 | 4456 | ||
4216 | void perf_swevent_put_recursion_context(int rctx) | 4457 | void inline perf_swevent_put_recursion_context(int rctx) |
4217 | { | 4458 | { |
4218 | struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); | 4459 | struct swevent_htable *swhash = &__get_cpu_var(swevent_htable); |
4219 | barrier(); | ||
4220 | cpuctx->recursion[rctx]--; | ||
4221 | } | ||
4222 | EXPORT_SYMBOL_GPL(perf_swevent_put_recursion_context); | ||
4223 | 4460 | ||
4461 | put_recursion_context(swhash->recursion, rctx); | ||
4462 | } | ||
4224 | 4463 | ||
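The open-coded in_nmi()/in_irq()/in_softirq() ladder is replaced by get_recursion_context()/put_recursion_context() working on the per-cpu recursion[PERF_NR_CONTEXTS] array of the new swevent_htable. A portable sketch of the idea (the helper bodies here are illustrative, not the kernel's):

    #include <stdio.h>

    enum { CTX_TASK, CTX_SOFTIRQ, CTX_IRQ, CTX_NMI, NR_CONTEXTS };

    /* One flag per context level; non-zero means a swevent is already in flight there. */
    static int recursion[NR_CONTEXTS];

    /* Stand-in for the in_nmi()/in_irq()/in_softirq() test of the removed code. */
    static int current_context(void) { return CTX_TASK; }

    static int get_recursion_context(void)
    {
        int rctx = current_context();

        if (recursion[rctx])
            return -1;              /* same level already active: drop the event */
        recursion[rctx]++;
        return rctx;
    }

    static void put_recursion_context(int rctx)
    {
        recursion[rctx]--;
    }

    int main(void)
    {
        int rctx = get_recursion_context();

        if (rctx >= 0) {
            printf("nested attempt -> %d\n", get_recursion_context()); /* -1 */
            put_recursion_context(rctx);
        }
        return 0;
    }
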
4225 | void __perf_sw_event(u32 event_id, u64 nr, int nmi, | 4464 | void __perf_sw_event(u32 event_id, u64 nr, int nmi, |
4226 | struct pt_regs *regs, u64 addr) | 4465 | struct pt_regs *regs, u64 addr) |
@@ -4245,20 +4484,20 @@ static void perf_swevent_read(struct perf_event *event) | |||
4245 | { | 4484 | { |
4246 | } | 4485 | } |
4247 | 4486 | ||
4248 | static int perf_swevent_enable(struct perf_event *event) | 4487 | static int perf_swevent_add(struct perf_event *event, int flags) |
4249 | { | 4488 | { |
4489 | struct swevent_htable *swhash = &__get_cpu_var(swevent_htable); | ||
4250 | struct hw_perf_event *hwc = &event->hw; | 4490 | struct hw_perf_event *hwc = &event->hw; |
4251 | struct perf_cpu_context *cpuctx; | ||
4252 | struct hlist_head *head; | 4491 | struct hlist_head *head; |
4253 | 4492 | ||
4254 | cpuctx = &__get_cpu_var(perf_cpu_context); | ||
4255 | |||
4256 | if (hwc->sample_period) { | 4493 | if (hwc->sample_period) { |
4257 | hwc->last_period = hwc->sample_period; | 4494 | hwc->last_period = hwc->sample_period; |
4258 | perf_swevent_set_period(event); | 4495 | perf_swevent_set_period(event); |
4259 | } | 4496 | } |
4260 | 4497 | ||
4261 | head = find_swevent_head(cpuctx, event); | 4498 | hwc->state = !(flags & PERF_EF_START); |
4499 | |||
4500 | head = find_swevent_head(swhash, event); | ||
4262 | if (WARN_ON_ONCE(!head)) | 4501 | if (WARN_ON_ONCE(!head)) |
4263 | return -EINVAL; | 4502 | return -EINVAL; |
4264 | 4503 | ||
@@ -4267,202 +4506,27 @@ static int perf_swevent_enable(struct perf_event *event) | |||
4267 | return 0; | 4506 | return 0; |
4268 | } | 4507 | } |
4269 | 4508 | ||
4270 | static void perf_swevent_disable(struct perf_event *event) | 4509 | static void perf_swevent_del(struct perf_event *event, int flags) |
4271 | { | 4510 | { |
4272 | hlist_del_rcu(&event->hlist_entry); | 4511 | hlist_del_rcu(&event->hlist_entry); |
4273 | } | 4512 | } |
4274 | 4513 | ||
4275 | static void perf_swevent_void(struct perf_event *event) | 4514 | static void perf_swevent_start(struct perf_event *event, int flags) |
4276 | { | ||
4277 | } | ||
4278 | |||
4279 | static int perf_swevent_int(struct perf_event *event) | ||
4280 | { | ||
4281 | return 0; | ||
4282 | } | ||
4283 | |||
4284 | static const struct pmu perf_ops_generic = { | ||
4285 | .enable = perf_swevent_enable, | ||
4286 | .disable = perf_swevent_disable, | ||
4287 | .start = perf_swevent_int, | ||
4288 | .stop = perf_swevent_void, | ||
4289 | .read = perf_swevent_read, | ||
4290 | .unthrottle = perf_swevent_void, /* hwc->interrupts already reset */ | ||
4291 | }; | ||
4292 | |||
4293 | /* | ||
4294 | * hrtimer based swevent callback | ||
4295 | */ | ||
4296 | |||
4297 | static enum hrtimer_restart perf_swevent_hrtimer(struct hrtimer *hrtimer) | ||
4298 | { | 4515 | { |
4299 | enum hrtimer_restart ret = HRTIMER_RESTART; | 4516 | event->hw.state = 0; |
4300 | struct perf_sample_data data; | ||
4301 | struct pt_regs *regs; | ||
4302 | struct perf_event *event; | ||
4303 | u64 period; | ||
4304 | |||
4305 | event = container_of(hrtimer, struct perf_event, hw.hrtimer); | ||
4306 | event->pmu->read(event); | ||
4307 | |||
4308 | perf_sample_data_init(&data, 0); | ||
4309 | data.period = event->hw.last_period; | ||
4310 | regs = get_irq_regs(); | ||
4311 | |||
4312 | if (regs && !perf_exclude_event(event, regs)) { | ||
4313 | if (!(event->attr.exclude_idle && current->pid == 0)) | ||
4314 | if (perf_event_overflow(event, 0, &data, regs)) | ||
4315 | ret = HRTIMER_NORESTART; | ||
4316 | } | ||
4317 | |||
4318 | period = max_t(u64, 10000, event->hw.sample_period); | ||
4319 | hrtimer_forward_now(hrtimer, ns_to_ktime(period)); | ||
4320 | |||
4321 | return ret; | ||
4322 | } | 4517 | } |
4323 | 4518 | ||
4324 | static void perf_swevent_start_hrtimer(struct perf_event *event) | 4519 | static void perf_swevent_stop(struct perf_event *event, int flags) |
4325 | { | 4520 | { |
4326 | struct hw_perf_event *hwc = &event->hw; | 4521 | event->hw.state = PERF_HES_STOPPED; |
4327 | |||
4328 | hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); | ||
4329 | hwc->hrtimer.function = perf_swevent_hrtimer; | ||
4330 | if (hwc->sample_period) { | ||
4331 | u64 period; | ||
4332 | |||
4333 | if (hwc->remaining) { | ||
4334 | if (hwc->remaining < 0) | ||
4335 | period = 10000; | ||
4336 | else | ||
4337 | period = hwc->remaining; | ||
4338 | hwc->remaining = 0; | ||
4339 | } else { | ||
4340 | period = max_t(u64, 10000, hwc->sample_period); | ||
4341 | } | ||
4342 | __hrtimer_start_range_ns(&hwc->hrtimer, | ||
4343 | ns_to_ktime(period), 0, | ||
4344 | HRTIMER_MODE_REL, 0); | ||
4345 | } | ||
4346 | } | ||
4347 | |||
4348 | static void perf_swevent_cancel_hrtimer(struct perf_event *event) | ||
4349 | { | ||
4350 | struct hw_perf_event *hwc = &event->hw; | ||
4351 | |||
4352 | if (hwc->sample_period) { | ||
4353 | ktime_t remaining = hrtimer_get_remaining(&hwc->hrtimer); | ||
4354 | hwc->remaining = ktime_to_ns(remaining); | ||
4355 | |||
4356 | hrtimer_cancel(&hwc->hrtimer); | ||
4357 | } | ||
4358 | } | ||
4359 | |||
4360 | /* | ||
4361 | * Software event: cpu wall time clock | ||
4362 | */ | ||
4363 | |||
4364 | static void cpu_clock_perf_event_update(struct perf_event *event) | ||
4365 | { | ||
4366 | int cpu = raw_smp_processor_id(); | ||
4367 | s64 prev; | ||
4368 | u64 now; | ||
4369 | |||
4370 | now = cpu_clock(cpu); | ||
4371 | prev = atomic64_xchg(&event->hw.prev_count, now); | ||
4372 | atomic64_add(now - prev, &event->count); | ||
4373 | } | ||
4374 | |||
4375 | static int cpu_clock_perf_event_enable(struct perf_event *event) | ||
4376 | { | ||
4377 | struct hw_perf_event *hwc = &event->hw; | ||
4378 | int cpu = raw_smp_processor_id(); | ||
4379 | |||
4380 | atomic64_set(&hwc->prev_count, cpu_clock(cpu)); | ||
4381 | perf_swevent_start_hrtimer(event); | ||
4382 | |||
4383 | return 0; | ||
4384 | } | ||
4385 | |||
4386 | static void cpu_clock_perf_event_disable(struct perf_event *event) | ||
4387 | { | ||
4388 | perf_swevent_cancel_hrtimer(event); | ||
4389 | cpu_clock_perf_event_update(event); | ||
4390 | } | ||
4391 | |||
4392 | static void cpu_clock_perf_event_read(struct perf_event *event) | ||
4393 | { | ||
4394 | cpu_clock_perf_event_update(event); | ||
4395 | } | ||
4396 | |||
4397 | static const struct pmu perf_ops_cpu_clock = { | ||
4398 | .enable = cpu_clock_perf_event_enable, | ||
4399 | .disable = cpu_clock_perf_event_disable, | ||
4400 | .read = cpu_clock_perf_event_read, | ||
4401 | }; | ||
4402 | |||
4403 | /* | ||
4404 | * Software event: task time clock | ||
4405 | */ | ||
4406 | |||
4407 | static void task_clock_perf_event_update(struct perf_event *event, u64 now) | ||
4408 | { | ||
4409 | u64 prev; | ||
4410 | s64 delta; | ||
4411 | |||
4412 | prev = atomic64_xchg(&event->hw.prev_count, now); | ||
4413 | delta = now - prev; | ||
4414 | atomic64_add(delta, &event->count); | ||
4415 | } | ||
4416 | |||
4417 | static int task_clock_perf_event_enable(struct perf_event *event) | ||
4418 | { | ||
4419 | struct hw_perf_event *hwc = &event->hw; | ||
4420 | u64 now; | ||
4421 | |||
4422 | now = event->ctx->time; | ||
4423 | |||
4424 | atomic64_set(&hwc->prev_count, now); | ||
4425 | |||
4426 | perf_swevent_start_hrtimer(event); | ||
4427 | |||
4428 | return 0; | ||
4429 | } | ||
4430 | |||
4431 | static void task_clock_perf_event_disable(struct perf_event *event) | ||
4432 | { | ||
4433 | perf_swevent_cancel_hrtimer(event); | ||
4434 | task_clock_perf_event_update(event, event->ctx->time); | ||
4435 | |||
4436 | } | ||
4437 | |||
4438 | static void task_clock_perf_event_read(struct perf_event *event) | ||
4439 | { | ||
4440 | u64 time; | ||
4441 | |||
4442 | if (!in_nmi()) { | ||
4443 | update_context_time(event->ctx); | ||
4444 | time = event->ctx->time; | ||
4445 | } else { | ||
4446 | u64 now = perf_clock(); | ||
4447 | u64 delta = now - event->ctx->timestamp; | ||
4448 | time = event->ctx->time + delta; | ||
4449 | } | ||
4450 | |||
4451 | task_clock_perf_event_update(event, time); | ||
4452 | } | 4522 | } |
4453 | 4523 | ||
4454 | static const struct pmu perf_ops_task_clock = { | ||
4455 | .enable = task_clock_perf_event_enable, | ||
4456 | .disable = task_clock_perf_event_disable, | ||
4457 | .read = task_clock_perf_event_read, | ||
4458 | }; | ||
4459 | |||
4460 | /* Deref the hlist from the update side */ | 4524 | /* Deref the hlist from the update side */ |
4461 | static inline struct swevent_hlist * | 4525 | static inline struct swevent_hlist * |
4462 | swevent_hlist_deref(struct perf_cpu_context *cpuctx) | 4526 | swevent_hlist_deref(struct swevent_htable *swhash) |
4463 | { | 4527 | { |
4464 | return rcu_dereference_protected(cpuctx->swevent_hlist, | 4528 | return rcu_dereference_protected(swhash->swevent_hlist, |
4465 | lockdep_is_held(&cpuctx->hlist_mutex)); | 4529 | lockdep_is_held(&swhash->hlist_mutex)); |
4466 | } | 4530 | } |
4467 | 4531 | ||
4468 | static void swevent_hlist_release_rcu(struct rcu_head *rcu_head) | 4532 | static void swevent_hlist_release_rcu(struct rcu_head *rcu_head) |
@@ -4473,27 +4537,27 @@ static void swevent_hlist_release_rcu(struct rcu_head *rcu_head) | |||
4473 | kfree(hlist); | 4537 | kfree(hlist); |
4474 | } | 4538 | } |
4475 | 4539 | ||
4476 | static void swevent_hlist_release(struct perf_cpu_context *cpuctx) | 4540 | static void swevent_hlist_release(struct swevent_htable *swhash) |
4477 | { | 4541 | { |
4478 | struct swevent_hlist *hlist = swevent_hlist_deref(cpuctx); | 4542 | struct swevent_hlist *hlist = swevent_hlist_deref(swhash); |
4479 | 4543 | ||
4480 | if (!hlist) | 4544 | if (!hlist) |
4481 | return; | 4545 | return; |
4482 | 4546 | ||
4483 | rcu_assign_pointer(cpuctx->swevent_hlist, NULL); | 4547 | rcu_assign_pointer(swhash->swevent_hlist, NULL); |
4484 | call_rcu(&hlist->rcu_head, swevent_hlist_release_rcu); | 4548 | call_rcu(&hlist->rcu_head, swevent_hlist_release_rcu); |
4485 | } | 4549 | } |
4486 | 4550 | ||
4487 | static void swevent_hlist_put_cpu(struct perf_event *event, int cpu) | 4551 | static void swevent_hlist_put_cpu(struct perf_event *event, int cpu) |
4488 | { | 4552 | { |
4489 | struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu); | 4553 | struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu); |
4490 | 4554 | ||
4491 | mutex_lock(&cpuctx->hlist_mutex); | 4555 | mutex_lock(&swhash->hlist_mutex); |
4492 | 4556 | ||
4493 | if (!--cpuctx->hlist_refcount) | 4557 | if (!--swhash->hlist_refcount) |
4494 | swevent_hlist_release(cpuctx); | 4558 | swevent_hlist_release(swhash); |
4495 | 4559 | ||
4496 | mutex_unlock(&cpuctx->hlist_mutex); | 4560 | mutex_unlock(&swhash->hlist_mutex); |
4497 | } | 4561 | } |
4498 | 4562 | ||
4499 | static void swevent_hlist_put(struct perf_event *event) | 4563 | static void swevent_hlist_put(struct perf_event *event) |
@@ -4511,12 +4575,12 @@ static void swevent_hlist_put(struct perf_event *event) | |||
4511 | 4575 | ||
4512 | static int swevent_hlist_get_cpu(struct perf_event *event, int cpu) | 4576 | static int swevent_hlist_get_cpu(struct perf_event *event, int cpu) |
4513 | { | 4577 | { |
4514 | struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu); | 4578 | struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu); |
4515 | int err = 0; | 4579 | int err = 0; |
4516 | 4580 | ||
4517 | mutex_lock(&cpuctx->hlist_mutex); | 4581 | mutex_lock(&swhash->hlist_mutex); |
4518 | 4582 | ||
4519 | if (!swevent_hlist_deref(cpuctx) && cpu_online(cpu)) { | 4583 | if (!swevent_hlist_deref(swhash) && cpu_online(cpu)) { |
4520 | struct swevent_hlist *hlist; | 4584 | struct swevent_hlist *hlist; |
4521 | 4585 | ||
4522 | hlist = kzalloc(sizeof(*hlist), GFP_KERNEL); | 4586 | hlist = kzalloc(sizeof(*hlist), GFP_KERNEL); |
@@ -4524,11 +4588,11 @@ static int swevent_hlist_get_cpu(struct perf_event *event, int cpu) | |||
4524 | err = -ENOMEM; | 4588 | err = -ENOMEM; |
4525 | goto exit; | 4589 | goto exit; |
4526 | } | 4590 | } |
4527 | rcu_assign_pointer(cpuctx->swevent_hlist, hlist); | 4591 | rcu_assign_pointer(swhash->swevent_hlist, hlist); |
4528 | } | 4592 | } |
4529 | cpuctx->hlist_refcount++; | 4593 | swhash->hlist_refcount++; |
4530 | exit: | 4594 | exit: |
4531 | mutex_unlock(&cpuctx->hlist_mutex); | 4595 | mutex_unlock(&swhash->hlist_mutex); |
4532 | 4596 | ||
4533 | return err; | 4597 | return err; |
4534 | } | 4598 | } |
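
The get/put pair above keeps one hash table per cpu alive by reference count: the first user allocates it, the last one releases it (deferred through RCU in the kernel). A condensed sketch of that lifetime rule, with the locking and RCU omitted:

    #include <stdio.h>
    #include <stdlib.h>

    /* Per-cpu software-event hash table head with a simple refcount, as a sketch. */
    struct swhash {
        int refcount;
        void *hlist;                /* stand-in for the RCU-managed hash array */
    };

    static int hlist_get(struct swhash *sw)
    {
        if (!sw->hlist) {
            sw->hlist = calloc(1, 256);   /* first user allocates the table */
            if (!sw->hlist)
                return -1;                /* -ENOMEM in the kernel */
        }
        sw->refcount++;
        return 0;
    }

    static void hlist_put(struct swhash *sw)
    {
        if (!--sw->refcount) {      /* last user frees it (kernel defers via RCU) */
            free(sw->hlist);
            sw->hlist = NULL;
        }
    }

    int main(void)
    {
        struct swhash sw = { 0, NULL };

        hlist_get(&sw);
        hlist_get(&sw);
        hlist_put(&sw);
        printf("refcount=%d allocated=%s\n", sw.refcount, sw.hlist ? "yes" : "no");
        hlist_put(&sw);
        printf("refcount=%d allocated=%s\n", sw.refcount, sw.hlist ? "yes" : "no");
        return 0;
    }
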
@@ -4552,7 +4616,7 @@ static int swevent_hlist_get(struct perf_event *event) | |||
4552 | put_online_cpus(); | 4616 | put_online_cpus(); |
4553 | 4617 | ||
4554 | return 0; | 4618 | return 0; |
4555 | fail: | 4619 | fail: |
4556 | for_each_possible_cpu(cpu) { | 4620 | for_each_possible_cpu(cpu) { |
4557 | if (cpu == failed_cpu) | 4621 | if (cpu == failed_cpu) |
4558 | break; | 4622 | break; |
@@ -4563,17 +4627,64 @@ static int swevent_hlist_get(struct perf_event *event) | |||
4563 | return err; | 4627 | return err; |
4564 | } | 4628 | } |
4565 | 4629 | ||
4566 | #ifdef CONFIG_EVENT_TRACING | 4630 | atomic_t perf_swevent_enabled[PERF_COUNT_SW_MAX]; |
4631 | |||
4632 | static void sw_perf_event_destroy(struct perf_event *event) | ||
4633 | { | ||
4634 | u64 event_id = event->attr.config; | ||
4635 | |||
4636 | WARN_ON(event->parent); | ||
4637 | |||
4638 | jump_label_dec(&perf_swevent_enabled[event_id]); | ||
4639 | swevent_hlist_put(event); | ||
4640 | } | ||
4641 | |||
4642 | static int perf_swevent_init(struct perf_event *event) | ||
4643 | { | ||
4644 | int event_id = event->attr.config; | ||
4645 | |||
4646 | if (event->attr.type != PERF_TYPE_SOFTWARE) | ||
4647 | return -ENOENT; | ||
4648 | |||
4649 | switch (event_id) { | ||
4650 | case PERF_COUNT_SW_CPU_CLOCK: | ||
4651 | case PERF_COUNT_SW_TASK_CLOCK: | ||
4652 | return -ENOENT; | ||
4653 | |||
4654 | default: | ||
4655 | break; | ||
4656 | } | ||
4657 | |||
4658 | if (event_id > PERF_COUNT_SW_MAX) | ||
4659 | return -ENOENT; | ||
4567 | 4660 | ||
4568 | static const struct pmu perf_ops_tracepoint = { | 4661 | if (!event->parent) { |
4569 | .enable = perf_trace_enable, | 4662 | int err; |
4570 | .disable = perf_trace_disable, | 4663 | |
4571 | .start = perf_swevent_int, | 4664 | err = swevent_hlist_get(event); |
4572 | .stop = perf_swevent_void, | 4665 | if (err) |
4666 | return err; | ||
4667 | |||
4668 | jump_label_inc(&perf_swevent_enabled[event_id]); | ||
4669 | event->destroy = sw_perf_event_destroy; | ||
4670 | } | ||
4671 | |||
4672 | return 0; | ||
4673 | } | ||
4674 | |||
4675 | static struct pmu perf_swevent = { | ||
4676 | .task_ctx_nr = perf_sw_context, | ||
4677 | |||
4678 | .event_init = perf_swevent_init, | ||
4679 | .add = perf_swevent_add, | ||
4680 | .del = perf_swevent_del, | ||
4681 | .start = perf_swevent_start, | ||
4682 | .stop = perf_swevent_stop, | ||
4573 | .read = perf_swevent_read, | 4683 | .read = perf_swevent_read, |
4574 | .unthrottle = perf_swevent_void, | ||
4575 | }; | 4684 | }; |
4576 | 4685 | ||
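With the new struct pmu above, the old enable/disable pair becomes add/del plus start/stop driven by flags: perf_swevent_add() leaves the event in PERF_HES_STOPPED unless PERF_EF_START is passed, start() clears the flag, and stop() sets it again. A minimal sketch of that state handling, using stand-in constant values:

    #include <stdio.h>

    #define PERF_EF_START    0x01   /* illustrative flag value */
    #define PERF_HES_STOPPED 0x01   /* illustrative state bit */

    struct hw_event { int state; };

    /* Mirrors hwc->state = !(flags & PERF_EF_START) in perf_swevent_add(). */
    static void sw_add(struct hw_event *hwc, int flags)
    {
        hwc->state = (flags & PERF_EF_START) ? 0 : PERF_HES_STOPPED;
    }

    static void sw_start(struct hw_event *hwc) { hwc->state = 0; }
    static void sw_stop(struct hw_event *hwc)  { hwc->state = PERF_HES_STOPPED; }

    int main(void)
    {
        struct hw_event hwc;

        sw_add(&hwc, 0);            /* scheduled in, but not yet counting */
        printf("after add:   stopped=%d\n", !!(hwc.state & PERF_HES_STOPPED));
        sw_start(&hwc);
        printf("after start: stopped=%d\n", !!(hwc.state & PERF_HES_STOPPED));
        sw_stop(&hwc);
        printf("after stop:  stopped=%d\n", !!(hwc.state & PERF_HES_STOPPED));
        return 0;
    }
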
4686 | #ifdef CONFIG_EVENT_TRACING | ||
4687 | |||
4577 | static int perf_tp_filter_match(struct perf_event *event, | 4688 | static int perf_tp_filter_match(struct perf_event *event, |
4578 | struct perf_sample_data *data) | 4689 | struct perf_sample_data *data) |
4579 | { | 4690 | { |
@@ -4601,7 +4712,7 @@ static int perf_tp_event_match(struct perf_event *event, | |||
4601 | } | 4712 | } |
4602 | 4713 | ||
4603 | void perf_tp_event(u64 addr, u64 count, void *record, int entry_size, | 4714 | void perf_tp_event(u64 addr, u64 count, void *record, int entry_size, |
4604 | struct pt_regs *regs, struct hlist_head *head) | 4715 | struct pt_regs *regs, struct hlist_head *head, int rctx) |
4605 | { | 4716 | { |
4606 | struct perf_sample_data data; | 4717 | struct perf_sample_data data; |
4607 | struct perf_event *event; | 4718 | struct perf_event *event; |
@@ -4615,12 +4726,12 @@ void perf_tp_event(u64 addr, u64 count, void *record, int entry_size, | |||
4615 | perf_sample_data_init(&data, addr); | 4726 | perf_sample_data_init(&data, addr); |
4616 | data.raw = &raw; | 4727 | data.raw = &raw; |
4617 | 4728 | ||
4618 | rcu_read_lock(); | ||
4619 | hlist_for_each_entry_rcu(event, node, head, hlist_entry) { | 4729 | hlist_for_each_entry_rcu(event, node, head, hlist_entry) { |
4620 | if (perf_tp_event_match(event, &data, regs)) | 4730 | if (perf_tp_event_match(event, &data, regs)) |
4621 | perf_swevent_add(event, count, 1, &data, regs); | 4731 | perf_swevent_event(event, count, 1, &data, regs); |
4622 | } | 4732 | } |
4623 | rcu_read_unlock(); | 4733 | |
4734 | perf_swevent_put_recursion_context(rctx); | ||
4624 | } | 4735 | } |
4625 | EXPORT_SYMBOL_GPL(perf_tp_event); | 4736 | EXPORT_SYMBOL_GPL(perf_tp_event); |
4626 | 4737 | ||
@@ -4629,10 +4740,13 @@ static void tp_perf_event_destroy(struct perf_event *event) | |||
4629 | perf_trace_destroy(event); | 4740 | perf_trace_destroy(event); |
4630 | } | 4741 | } |
4631 | 4742 | ||
4632 | static const struct pmu *tp_perf_event_init(struct perf_event *event) | 4743 | static int perf_tp_event_init(struct perf_event *event) |
4633 | { | 4744 | { |
4634 | int err; | 4745 | int err; |
4635 | 4746 | ||
4747 | if (event->attr.type != PERF_TYPE_TRACEPOINT) | ||
4748 | return -ENOENT; | ||
4749 | |||
4636 | /* | 4750 | /* |
4637 | * Raw tracepoint data is a severe data leak, only allow root to | 4751 | * Raw tracepoint data is a severe data leak, only allow root to |
4638 | * have these. | 4752 | * have these. |
@@ -4640,15 +4754,31 @@ static const struct pmu *tp_perf_event_init(struct perf_event *event) | |||
4640 | if ((event->attr.sample_type & PERF_SAMPLE_RAW) && | 4754 | if ((event->attr.sample_type & PERF_SAMPLE_RAW) && |
4641 | perf_paranoid_tracepoint_raw() && | 4755 | perf_paranoid_tracepoint_raw() && |
4642 | !capable(CAP_SYS_ADMIN)) | 4756 | !capable(CAP_SYS_ADMIN)) |
4643 | return ERR_PTR(-EPERM); | 4757 | return -EPERM; |
4644 | 4758 | ||
4645 | err = perf_trace_init(event); | 4759 | err = perf_trace_init(event); |
4646 | if (err) | 4760 | if (err) |
4647 | return NULL; | 4761 | return err; |
4648 | 4762 | ||
4649 | event->destroy = tp_perf_event_destroy; | 4763 | event->destroy = tp_perf_event_destroy; |
4650 | 4764 | ||
4651 | return &perf_ops_tracepoint; | 4765 | return 0; |
4766 | } | ||
4767 | |||
4768 | static struct pmu perf_tracepoint = { | ||
4769 | .task_ctx_nr = perf_sw_context, | ||
4770 | |||
4771 | .event_init = perf_tp_event_init, | ||
4772 | .add = perf_trace_add, | ||
4773 | .del = perf_trace_del, | ||
4774 | .start = perf_swevent_start, | ||
4775 | .stop = perf_swevent_stop, | ||
4776 | .read = perf_swevent_read, | ||
4777 | }; | ||
4778 | |||
4779 | static inline void perf_tp_register(void) | ||
4780 | { | ||
4781 | perf_pmu_register(&perf_tracepoint); | ||
4652 | } | 4782 | } |
4653 | 4783 | ||
4654 | static int perf_event_set_filter(struct perf_event *event, void __user *arg) | 4784 | static int perf_event_set_filter(struct perf_event *event, void __user *arg) |
@@ -4676,9 +4806,8 @@ static void perf_event_free_filter(struct perf_event *event) | |||
4676 | 4806 | ||
4677 | #else | 4807 | #else |
4678 | 4808 | ||
4679 | static const struct pmu *tp_perf_event_init(struct perf_event *event) | 4809 | static inline void perf_tp_register(void) |
4680 | { | 4810 | { |
4681 | return NULL; | ||
4682 | } | 4811 | } |
4683 | 4812 | ||
4684 | static int perf_event_set_filter(struct perf_event *event, void __user *arg) | 4813 | static int perf_event_set_filter(struct perf_event *event, void __user *arg) |
@@ -4693,105 +4822,389 @@ static void perf_event_free_filter(struct perf_event *event) | |||
4693 | #endif /* CONFIG_EVENT_TRACING */ | 4822 | #endif /* CONFIG_EVENT_TRACING */ |
4694 | 4823 | ||
4695 | #ifdef CONFIG_HAVE_HW_BREAKPOINT | 4824 | #ifdef CONFIG_HAVE_HW_BREAKPOINT |
4696 | static void bp_perf_event_destroy(struct perf_event *event) | 4825 | void perf_bp_event(struct perf_event *bp, void *data) |
4697 | { | 4826 | { |
4698 | release_bp_slot(event); | 4827 | struct perf_sample_data sample; |
4828 | struct pt_regs *regs = data; | ||
4829 | |||
4830 | perf_sample_data_init(&sample, bp->attr.bp_addr); | ||
4831 | |||
4832 | if (!bp->hw.state && !perf_exclude_event(bp, regs)) | ||
4833 | perf_swevent_event(bp, 1, 1, &sample, regs); | ||
4699 | } | 4834 | } |
4835 | #endif | ||
4700 | 4836 | ||
4701 | static const struct pmu *bp_perf_event_init(struct perf_event *bp) | 4837 | /* |
4838 | * hrtimer based swevent callback | ||
4839 | */ | ||
4840 | |||
4841 | static enum hrtimer_restart perf_swevent_hrtimer(struct hrtimer *hrtimer) | ||
4702 | { | 4842 | { |
4703 | int err; | 4843 | enum hrtimer_restart ret = HRTIMER_RESTART; |
4844 | struct perf_sample_data data; | ||
4845 | struct pt_regs *regs; | ||
4846 | struct perf_event *event; | ||
4847 | u64 period; | ||
4704 | 4848 | ||
4705 | err = register_perf_hw_breakpoint(bp); | 4849 | event = container_of(hrtimer, struct perf_event, hw.hrtimer); |
4706 | if (err) | 4850 | event->pmu->read(event); |
4707 | return ERR_PTR(err); | ||
4708 | 4851 | ||
4709 | bp->destroy = bp_perf_event_destroy; | 4852 | perf_sample_data_init(&data, 0); |
4853 | data.period = event->hw.last_period; | ||
4854 | regs = get_irq_regs(); | ||
4710 | 4855 | ||
4711 | return &perf_ops_bp; | 4856 | if (regs && !perf_exclude_event(event, regs)) { |
4857 | if (!(event->attr.exclude_idle && current->pid == 0)) | ||
4858 | if (perf_event_overflow(event, 0, &data, regs)) | ||
4859 | ret = HRTIMER_NORESTART; | ||
4860 | } | ||
4861 | |||
4862 | period = max_t(u64, 10000, event->hw.sample_period); | ||
4863 | hrtimer_forward_now(hrtimer, ns_to_ktime(period)); | ||
4864 | |||
4865 | return ret; | ||
4712 | } | 4866 | } |
4713 | 4867 | ||
4714 | void perf_bp_event(struct perf_event *bp, void *data) | 4868 | static void perf_swevent_start_hrtimer(struct perf_event *event) |
4715 | { | 4869 | { |
4716 | struct perf_sample_data sample; | 4870 | struct hw_perf_event *hwc = &event->hw; |
4717 | struct pt_regs *regs = data; | ||
4718 | 4871 | ||
4719 | perf_sample_data_init(&sample, bp->attr.bp_addr); | 4872 | hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); |
4873 | hwc->hrtimer.function = perf_swevent_hrtimer; | ||
4874 | if (hwc->sample_period) { | ||
4875 | s64 period = local64_read(&hwc->period_left); | ||
4720 | 4876 | ||
4721 | if (!perf_exclude_event(bp, regs)) | 4877 | if (period) { |
4722 | perf_swevent_add(bp, 1, 1, &sample, regs); | 4878 | if (period < 0) |
4879 | period = 10000; | ||
4880 | |||
4881 | local64_set(&hwc->period_left, 0); | ||
4882 | } else { | ||
4883 | period = max_t(u64, 10000, hwc->sample_period); | ||
4884 | } | ||
4885 | __hrtimer_start_range_ns(&hwc->hrtimer, | ||
4886 | ns_to_ktime(period), 0, | ||
4887 | HRTIMER_MODE_REL_PINNED, 0); | ||
4888 | } | ||
4723 | } | 4889 | } |
4724 | #else | 4890 | |
4725 | static const struct pmu *bp_perf_event_init(struct perf_event *bp) | 4891 | static void perf_swevent_cancel_hrtimer(struct perf_event *event) |
4726 | { | 4892 | { |
4727 | return NULL; | 4893 | struct hw_perf_event *hwc = &event->hw; |
4894 | |||
4895 | if (hwc->sample_period) { | ||
4896 | ktime_t remaining = hrtimer_get_remaining(&hwc->hrtimer); | ||
4897 | local64_set(&hwc->period_left, ktime_to_ns(remaining)); | ||
4898 | |||
4899 | hrtimer_cancel(&hwc->hrtimer); | ||
4900 | } | ||
4728 | } | 4901 | } |
4729 | 4902 | ||
4730 | void perf_bp_event(struct perf_event *bp, void *regs) | 4903 | /* |
4904 | * Software event: cpu wall time clock | ||
4905 | */ | ||
4906 | |||
4907 | static void cpu_clock_event_update(struct perf_event *event) | ||
4731 | { | 4908 | { |
4909 | s64 prev; | ||
4910 | u64 now; | ||
4911 | |||
4912 | now = local_clock(); | ||
4913 | prev = local64_xchg(&event->hw.prev_count, now); | ||
4914 | local64_add(now - prev, &event->count); | ||
4732 | } | 4915 | } |
4733 | #endif | ||
4734 | 4916 | ||
4735 | atomic_t perf_swevent_enabled[PERF_COUNT_SW_MAX]; | 4917 | static void cpu_clock_event_start(struct perf_event *event, int flags) |
4918 | { | ||
4919 | local64_set(&event->hw.prev_count, local_clock()); | ||
4920 | perf_swevent_start_hrtimer(event); | ||
4921 | } | ||
4736 | 4922 | ||
4737 | static void sw_perf_event_destroy(struct perf_event *event) | 4923 | static void cpu_clock_event_stop(struct perf_event *event, int flags) |
4738 | { | 4924 | { |
4739 | u64 event_id = event->attr.config; | 4925 | perf_swevent_cancel_hrtimer(event); |
4926 | cpu_clock_event_update(event); | ||
4927 | } | ||
4740 | 4928 | ||
4741 | WARN_ON(event->parent); | 4929 | static int cpu_clock_event_add(struct perf_event *event, int flags) |
4930 | { | ||
4931 | if (flags & PERF_EF_START) | ||
4932 | cpu_clock_event_start(event, flags); | ||
4742 | 4933 | ||
4743 | atomic_dec(&perf_swevent_enabled[event_id]); | 4934 | return 0; |
4744 | swevent_hlist_put(event); | ||
4745 | } | 4935 | } |
4746 | 4936 | ||
4747 | static const struct pmu *sw_perf_event_init(struct perf_event *event) | 4937 | static void cpu_clock_event_del(struct perf_event *event, int flags) |
4748 | { | 4938 | { |
4749 | const struct pmu *pmu = NULL; | 4939 | cpu_clock_event_stop(event, flags); |
4750 | u64 event_id = event->attr.config; | 4940 | } |
4941 | |||
4942 | static void cpu_clock_event_read(struct perf_event *event) | ||
4943 | { | ||
4944 | cpu_clock_event_update(event); | ||
4945 | } | ||
4946 | |||
4947 | static int cpu_clock_event_init(struct perf_event *event) | ||
4948 | { | ||
4949 | if (event->attr.type != PERF_TYPE_SOFTWARE) | ||
4950 | return -ENOENT; | ||
4951 | |||
4952 | if (event->attr.config != PERF_COUNT_SW_CPU_CLOCK) | ||
4953 | return -ENOENT; | ||
4954 | |||
4955 | return 0; | ||
4956 | } | ||
4957 | |||
4958 | static struct pmu perf_cpu_clock = { | ||
4959 | .task_ctx_nr = perf_sw_context, | ||
4960 | |||
4961 | .event_init = cpu_clock_event_init, | ||
4962 | .add = cpu_clock_event_add, | ||
4963 | .del = cpu_clock_event_del, | ||
4964 | .start = cpu_clock_event_start, | ||
4965 | .stop = cpu_clock_event_stop, | ||
4966 | .read = cpu_clock_event_read, | ||
4967 | }; | ||
4968 | |||
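cpu_clock_event_update() above follows the usual delta-accumulation pattern: remember the clock value at the previous read, exchange it for the current one, and add the difference to the count. A self-contained sketch with a fake clock standing in for local_clock():

    #include <stdio.h>
    #include <stdint.h>

    /* Stand-in clock source; the kernel reads local_clock() here. */
    static uint64_t now_ns;

    struct clock_event { uint64_t prev_count, count; };

    /* Mirrors cpu_clock_event_update(): accumulate the delta since the last read. */
    static void clock_event_update(struct clock_event *ev)
    {
        uint64_t prev = ev->prev_count;

        ev->prev_count = now_ns;
        ev->count += now_ns - prev;
    }

    int main(void)
    {
        struct clock_event ev = { 0, 0 };

        now_ns = 1000; ev.prev_count = now_ns;    /* like cpu_clock_event_start() */
        now_ns = 4500; clock_event_update(&ev);   /* read 3500 ns later */
        printf("counted %llu ns\n", (unsigned long long)ev.count);
        return 0;
    }
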
4969 | /* | ||
4970 | * Software event: task time clock | ||
4971 | */ | ||
4972 | |||
4973 | static void task_clock_event_update(struct perf_event *event, u64 now) | ||
4974 | { | ||
4975 | u64 prev; | ||
4976 | s64 delta; | ||
4977 | |||
4978 | prev = local64_xchg(&event->hw.prev_count, now); | ||
4979 | delta = now - prev; | ||
4980 | local64_add(delta, &event->count); | ||
4981 | } | ||
4982 | |||
4983 | static void task_clock_event_start(struct perf_event *event, int flags) | ||
4984 | { | ||
4985 | local64_set(&event->hw.prev_count, event->ctx->time); | ||
4986 | perf_swevent_start_hrtimer(event); | ||
4987 | } | ||
4988 | |||
4989 | static void task_clock_event_stop(struct perf_event *event, int flags) | ||
4990 | { | ||
4991 | perf_swevent_cancel_hrtimer(event); | ||
4992 | task_clock_event_update(event, event->ctx->time); | ||
4993 | } | ||
4994 | |||
4995 | static int task_clock_event_add(struct perf_event *event, int flags) | ||
4996 | { | ||
4997 | if (flags & PERF_EF_START) | ||
4998 | task_clock_event_start(event, flags); | ||
4999 | |||
5000 | return 0; | ||
5001 | } | ||
5002 | |||
5003 | static void task_clock_event_del(struct perf_event *event, int flags) | ||
5004 | { | ||
5005 | task_clock_event_stop(event, PERF_EF_UPDATE); | ||
5006 | } | ||
5007 | |||
5008 | static void task_clock_event_read(struct perf_event *event) | ||
5009 | { | ||
5010 | u64 time; | ||
5011 | |||
5012 | if (!in_nmi()) { | ||
5013 | update_context_time(event->ctx); | ||
5014 | time = event->ctx->time; | ||
5015 | } else { | ||
5016 | u64 now = perf_clock(); | ||
5017 | u64 delta = now - event->ctx->timestamp; | ||
5018 | time = event->ctx->time + delta; | ||
5019 | } | ||
5020 | |||
5021 | task_clock_event_update(event, time); | ||
5022 | } | ||
5023 | |||
5024 | static int task_clock_event_init(struct perf_event *event) | ||
5025 | { | ||
5026 | if (event->attr.type != PERF_TYPE_SOFTWARE) | ||
5027 | return -ENOENT; | ||
5028 | |||
5029 | if (event->attr.config != PERF_COUNT_SW_TASK_CLOCK) | ||
5030 | return -ENOENT; | ||
5031 | |||
5032 | return 0; | ||
5033 | } | ||
5034 | |||
5035 | static struct pmu perf_task_clock = { | ||
5036 | .task_ctx_nr = perf_sw_context, | ||
5037 | |||
5038 | .event_init = task_clock_event_init, | ||
5039 | .add = task_clock_event_add, | ||
5040 | .del = task_clock_event_del, | ||
5041 | .start = task_clock_event_start, | ||
5042 | .stop = task_clock_event_stop, | ||
5043 | .read = task_clock_event_read, | ||
5044 | }; | ||
5045 | |||
5046 | static void perf_pmu_nop_void(struct pmu *pmu) | ||
5047 | { | ||
5048 | } | ||
5049 | |||
5050 | static int perf_pmu_nop_int(struct pmu *pmu) | ||
5051 | { | ||
5052 | return 0; | ||
5053 | } | ||
5054 | |||
5055 | static void perf_pmu_start_txn(struct pmu *pmu) | ||
5056 | { | ||
5057 | perf_pmu_disable(pmu); | ||
5058 | } | ||
5059 | |||
5060 | static int perf_pmu_commit_txn(struct pmu *pmu) | ||
5061 | { | ||
5062 | perf_pmu_enable(pmu); | ||
5063 | return 0; | ||
5064 | } | ||
5065 | |||
5066 | static void perf_pmu_cancel_txn(struct pmu *pmu) | ||
5067 | { | ||
5068 | perf_pmu_enable(pmu); | ||
5069 | } | ||
5070 | |||
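These default transaction callbacks simply bracket a group scheduling attempt with perf_pmu_disable()/perf_pmu_enable(), so pmus that batch hardware writes behind pmu_enable get transactions for free. The sketch below assumes the usual nest-counted disable (the per-cpu pmu_disable_count allocated in perf_pmu_register() further down); the printed messages are illustrative only:

    #include <stdio.h>

    struct pmu { int disable_count; };  /* stand-in for per-cpu pmu_disable_count */

    static void pmu_disable(struct pmu *pmu)
    {
        if (!pmu->disable_count++)
            printf("hardware: stop counters\n");
    }

    static void pmu_enable(struct pmu *pmu)
    {
        if (!--pmu->disable_count)
            printf("hardware: reprogram and restart counters\n");
    }

    /* Default transaction helpers: a transaction is just a disable/enable bracket. */
    static void start_txn(struct pmu *pmu)  { pmu_disable(pmu); }
    static int  commit_txn(struct pmu *pmu) { pmu_enable(pmu); return 0; }
    static void cancel_txn(struct pmu *pmu) { pmu_enable(pmu); }

    int main(void)
    {
        struct pmu pmu = { 0 };

        start_txn(&pmu);
        printf("add event A\n");            /* add()s touch no hardware yet */
        printf("add event B\n");
        commit_txn(&pmu);                   /* one reprogram for the whole group */

        start_txn(&pmu);
        printf("add event C fails\n");
        cancel_txn(&pmu);                   /* roll back without committing */
        return 0;
    }
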
5071 | /* | ||
5072 | * Ensures all contexts with the same task_ctx_nr have the same | ||
5073 | * pmu_cpu_context too. | ||
5074 | */ | ||
5075 | static void *find_pmu_context(int ctxn) | ||
5076 | { | ||
5077 | struct pmu *pmu; | ||
5078 | |||
5079 | if (ctxn < 0) | ||
5080 | return NULL; | ||
5081 | |||
5082 | list_for_each_entry(pmu, &pmus, entry) { | ||
5083 | if (pmu->task_ctx_nr == ctxn) | ||
5084 | return pmu->pmu_cpu_context; | ||
5085 | } | ||
5086 | |||
5087 | return NULL; | ||
5088 | } | ||
5089 | |||
5090 | static void free_pmu_context(void * __percpu cpu_context) | ||
5091 | { | ||
5092 | struct pmu *pmu; | ||
4751 | 5093 | ||
5094 | mutex_lock(&pmus_lock); | ||
4752 | /* | 5095 | /* |
4753 | * Software events (currently) can't in general distinguish | 5096 | * Like a real lame refcount. |
4754 | * between user, kernel and hypervisor events. | ||
4755 | * However, context switches and cpu migrations are considered | ||
4756 | * to be kernel events, and page faults are never hypervisor | ||
4757 | * events. | ||
4758 | */ | 5097 | */ |
4759 | switch (event_id) { | 5098 | list_for_each_entry(pmu, &pmus, entry) { |
4760 | case PERF_COUNT_SW_CPU_CLOCK: | 5099 | if (pmu->pmu_cpu_context == cpu_context) |
4761 | pmu = &perf_ops_cpu_clock; | 5100 | goto out; |
5101 | } | ||
4762 | 5102 | ||
4763 | break; | 5103 | free_percpu(cpu_context); |
4764 | case PERF_COUNT_SW_TASK_CLOCK: | 5104 | out: |
4765 | /* | 5105 | mutex_unlock(&pmus_lock); |
4766 | * If the user instantiates this as a per-cpu event, | 5106 | } |
4767 | * use the cpu_clock event instead. | ||
4768 | */ | ||
4769 | if (event->ctx->task) | ||
4770 | pmu = &perf_ops_task_clock; | ||
4771 | else | ||
4772 | pmu = &perf_ops_cpu_clock; | ||
4773 | 5107 | ||
4774 | break; | 5108 | int perf_pmu_register(struct pmu *pmu) |
4775 | case PERF_COUNT_SW_PAGE_FAULTS: | 5109 | { |
4776 | case PERF_COUNT_SW_PAGE_FAULTS_MIN: | 5110 | int cpu, ret; |
4777 | case PERF_COUNT_SW_PAGE_FAULTS_MAJ: | ||
4778 | case PERF_COUNT_SW_CONTEXT_SWITCHES: | ||
4779 | case PERF_COUNT_SW_CPU_MIGRATIONS: | ||
4780 | case PERF_COUNT_SW_ALIGNMENT_FAULTS: | ||
4781 | case PERF_COUNT_SW_EMULATION_FAULTS: | ||
4782 | if (!event->parent) { | ||
4783 | int err; | ||
4784 | |||
4785 | err = swevent_hlist_get(event); | ||
4786 | if (err) | ||
4787 | return ERR_PTR(err); | ||
4788 | 5111 | ||
4789 | atomic_inc(&perf_swevent_enabled[event_id]); | 5112 | mutex_lock(&pmus_lock); |
4790 | event->destroy = sw_perf_event_destroy; | 5113 | ret = -ENOMEM; |
5114 | pmu->pmu_disable_count = alloc_percpu(int); | ||
5115 | if (!pmu->pmu_disable_count) | ||
5116 | goto unlock; | ||
5117 | |||
5118 | pmu->pmu_cpu_context = find_pmu_context(pmu->task_ctx_nr); | ||
5119 | if (pmu->pmu_cpu_context) | ||
5120 | goto got_cpu_context; | ||
5121 | |||
5122 | pmu->pmu_cpu_context = alloc_percpu(struct perf_cpu_context); | ||
5123 | if (!pmu->pmu_cpu_context) | ||
5124 | goto free_pdc; | ||
5125 | |||
5126 | for_each_possible_cpu(cpu) { | ||
5127 | struct perf_cpu_context *cpuctx; | ||
5128 | |||
5129 | cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu); | ||
5130 | __perf_event_init_context(&cpuctx->ctx); | ||
5131 | cpuctx->ctx.type = cpu_context; | ||
5132 | cpuctx->ctx.pmu = pmu; | ||
5133 | cpuctx->jiffies_interval = 1; | ||
5134 | INIT_LIST_HEAD(&cpuctx->rotation_list); | ||
5135 | } | ||
5136 | |||
5137 | got_cpu_context: | ||
5138 | if (!pmu->start_txn) { | ||
5139 | if (pmu->pmu_enable) { | ||
5140 | /* | ||
5141 | * If we have pmu_enable/pmu_disable calls, install | ||
5142 | * transaction stubs that use that to try and batch | ||
5143 | * hardware accesses. | ||
5144 | */ | ||
5145 | pmu->start_txn = perf_pmu_start_txn; | ||
5146 | pmu->commit_txn = perf_pmu_commit_txn; | ||
5147 | pmu->cancel_txn = perf_pmu_cancel_txn; | ||
5148 | } else { | ||
5149 | pmu->start_txn = perf_pmu_nop_void; | ||
5150 | pmu->commit_txn = perf_pmu_nop_int; | ||
5151 | pmu->cancel_txn = perf_pmu_nop_void; | ||
5152 | } | ||
5153 | } | ||
5154 | |||
5155 | if (!pmu->pmu_enable) { | ||
5156 | pmu->pmu_enable = perf_pmu_nop_void; | ||
5157 | pmu->pmu_disable = perf_pmu_nop_void; | ||
5158 | } | ||
5159 | |||
5160 | list_add_rcu(&pmu->entry, &pmus); | ||
5161 | ret = 0; | ||
5162 | unlock: | ||
5163 | mutex_unlock(&pmus_lock); | ||
5164 | |||
5165 | return ret; | ||
5166 | |||
5167 | free_pdc: | ||
5168 | free_percpu(pmu->pmu_disable_count); | ||
5169 | goto unlock; | ||
5170 | } | ||
5171 | |||
5172 | void perf_pmu_unregister(struct pmu *pmu) | ||
5173 | { | ||
5174 | mutex_lock(&pmus_lock); | ||
5175 | list_del_rcu(&pmu->entry); | ||
5176 | mutex_unlock(&pmus_lock); | ||
5177 | |||
5178 | /* | ||
5179 | * We dereference the pmu list under both SRCU and regular RCU, so | ||
5180 | * synchronize against both of those. | ||
5181 | */ | ||
5182 | synchronize_srcu(&pmus_srcu); | ||
5183 | synchronize_rcu(); | ||
5184 | |||
5185 | free_percpu(pmu->pmu_disable_count); | ||
5186 | free_pmu_context(pmu->pmu_cpu_context); | ||
5187 | } | ||
5188 | |||
5189 | struct pmu *perf_init_event(struct perf_event *event) | ||
5190 | { | ||
5191 | struct pmu *pmu = NULL; | ||
5192 | int idx; | ||
5193 | |||
5194 | idx = srcu_read_lock(&pmus_srcu); | ||
5195 | list_for_each_entry_rcu(pmu, &pmus, entry) { | ||
5196 | int ret = pmu->event_init(event); | ||
5197 | if (!ret) | ||
5198 | goto unlock; | ||
5199 | |||
5200 | if (ret != -ENOENT) { | ||
5201 | pmu = ERR_PTR(ret); | ||
5202 | goto unlock; | ||
4791 | } | 5203 | } |
4792 | pmu = &perf_ops_generic; | ||
4793 | break; | ||
4794 | } | 5204 | } |
5205 | pmu = ERR_PTR(-ENOENT); | ||
5206 | unlock: | ||
5207 | srcu_read_unlock(&pmus_srcu, idx); | ||
4795 | 5208 | ||
4796 | return pmu; | 5209 | return pmu; |
4797 | } | 5210 | } |
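
perf_init_event() above replaces the old switch on attr.type: every registered pmu gets to probe the event, -ENOENT means "not mine, try the next one", and any other error aborts the lookup. A hedged sketch of that claim-by-probe pattern, with a plain array instead of the SRCU-protected pmus list:

    #include <stdio.h>
    #include <errno.h>

    struct event { int type; };

    struct pmu {
        const char *name;
        int (*event_init)(struct event *ev);
    };

    /* Each pmu rejects events it does not recognise with -ENOENT. */
    static int sw_init(struct event *ev) { return ev->type == 1 ? 0 : -ENOENT; }
    static int tp_init(struct event *ev) { return ev->type == 2 ? 0 : -ENOENT; }

    static const struct pmu pmus[] = {
        { "software",   sw_init },
        { "tracepoint", tp_init },
    };

    /* First pmu that accepts the event claims it; any other error aborts. */
    static const struct pmu *init_event(struct event *ev)
    {
        for (unsigned i = 0; i < sizeof(pmus) / sizeof(pmus[0]); i++) {
            int ret = pmus[i].event_init(ev);

            if (!ret)
                return &pmus[i];
            if (ret != -ENOENT)
                return NULL;        /* hard error from the owning pmu */
        }
        return NULL;                /* nobody recognised the event type */
    }

    int main(void)
    {
        struct event ev = { 2 };
        const struct pmu *pmu = init_event(&ev);

        printf("event claimed by: %s\n", pmu ? pmu->name : "(none)");
        return 0;
    }
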
@@ -4800,20 +5213,18 @@ static const struct pmu *sw_perf_event_init(struct perf_event *event) | |||
4800 | * Allocate and initialize an event structure | 5213 | * Allocate and initialize an event structure |
4801 | */ | 5214 | */ |
4802 | static struct perf_event * | 5215 | static struct perf_event * |
4803 | perf_event_alloc(struct perf_event_attr *attr, | 5216 | perf_event_alloc(struct perf_event_attr *attr, int cpu, |
4804 | int cpu, | 5217 | struct task_struct *task, |
4805 | struct perf_event_context *ctx, | 5218 | struct perf_event *group_leader, |
4806 | struct perf_event *group_leader, | 5219 | struct perf_event *parent_event, |
4807 | struct perf_event *parent_event, | 5220 | perf_overflow_handler_t overflow_handler) |
4808 | perf_overflow_handler_t overflow_handler, | 5221 | { |
4809 | gfp_t gfpflags) | 5222 | struct pmu *pmu; |
4810 | { | ||
4811 | const struct pmu *pmu; | ||
4812 | struct perf_event *event; | 5223 | struct perf_event *event; |
4813 | struct hw_perf_event *hwc; | 5224 | struct hw_perf_event *hwc; |
4814 | long err; | 5225 | long err; |
4815 | 5226 | ||
4816 | event = kzalloc(sizeof(*event), gfpflags); | 5227 | event = kzalloc(sizeof(*event), GFP_KERNEL); |
4817 | if (!event) | 5228 | if (!event) |
4818 | return ERR_PTR(-ENOMEM); | 5229 | return ERR_PTR(-ENOMEM); |
4819 | 5230 | ||
@@ -4831,6 +5242,7 @@ perf_event_alloc(struct perf_event_attr *attr, | |||
4831 | INIT_LIST_HEAD(&event->event_entry); | 5242 | INIT_LIST_HEAD(&event->event_entry); |
4832 | INIT_LIST_HEAD(&event->sibling_list); | 5243 | INIT_LIST_HEAD(&event->sibling_list); |
4833 | init_waitqueue_head(&event->waitq); | 5244 | init_waitqueue_head(&event->waitq); |
5245 | init_irq_work(&event->pending, perf_pending_event); | ||
4834 | 5246 | ||
4835 | mutex_init(&event->mmap_mutex); | 5247 | mutex_init(&event->mmap_mutex); |
4836 | 5248 | ||
@@ -4838,7 +5250,6 @@ perf_event_alloc(struct perf_event_attr *attr, | |||
4838 | event->attr = *attr; | 5250 | event->attr = *attr; |
4839 | event->group_leader = group_leader; | 5251 | event->group_leader = group_leader; |
4840 | event->pmu = NULL; | 5252 | event->pmu = NULL; |
4841 | event->ctx = ctx; | ||
4842 | event->oncpu = -1; | 5253 | event->oncpu = -1; |
4843 | 5254 | ||
4844 | event->parent = parent_event; | 5255 | event->parent = parent_event; |
@@ -4848,6 +5259,17 @@ perf_event_alloc(struct perf_event_attr *attr, | |||
4848 | 5259 | ||
4849 | event->state = PERF_EVENT_STATE_INACTIVE; | 5260 | event->state = PERF_EVENT_STATE_INACTIVE; |
4850 | 5261 | ||
5262 | if (task) { | ||
5263 | event->attach_state = PERF_ATTACH_TASK; | ||
5264 | #ifdef CONFIG_HAVE_HW_BREAKPOINT | ||
5265 | /* | ||
5266 | * hw_breakpoint is a bit difficult here.. | ||
5267 | */ | ||
5268 | if (attr->type == PERF_TYPE_BREAKPOINT) | ||
5269 | event->hw.bp_target = task; | ||
5270 | #endif | ||
5271 | } | ||
5272 | |||
4851 | if (!overflow_handler && parent_event) | 5273 | if (!overflow_handler && parent_event) |
4852 | overflow_handler = parent_event->overflow_handler; | 5274 | overflow_handler = parent_event->overflow_handler; |
4853 | 5275 | ||
@@ -4864,7 +5286,7 @@ perf_event_alloc(struct perf_event_attr *attr, | |||
4864 | hwc->sample_period = 1; | 5286 | hwc->sample_period = 1; |
4865 | hwc->last_period = hwc->sample_period; | 5287 | hwc->last_period = hwc->sample_period; |
4866 | 5288 | ||
4867 | atomic64_set(&hwc->period_left, hwc->sample_period); | 5289 | local64_set(&hwc->period_left, hwc->sample_period); |
4868 | 5290 | ||
4869 | /* | 5291 | /* |
4870 | * we currently do not support PERF_FORMAT_GROUP on inherited events | 5292 | * we currently do not support PERF_FORMAT_GROUP on inherited events |
@@ -4872,29 +5294,8 @@ perf_event_alloc(struct perf_event_attr *attr, | |||
4872 | if (attr->inherit && (attr->read_format & PERF_FORMAT_GROUP)) | 5294 | if (attr->inherit && (attr->read_format & PERF_FORMAT_GROUP)) |
4873 | goto done; | 5295 | goto done; |
4874 | 5296 | ||
4875 | switch (attr->type) { | 5297 | pmu = perf_init_event(event); |
4876 | case PERF_TYPE_RAW: | ||
4877 | case PERF_TYPE_HARDWARE: | ||
4878 | case PERF_TYPE_HW_CACHE: | ||
4879 | pmu = hw_perf_event_init(event); | ||
4880 | break; | ||
4881 | |||
4882 | case PERF_TYPE_SOFTWARE: | ||
4883 | pmu = sw_perf_event_init(event); | ||
4884 | break; | ||
4885 | 5298 | ||
4886 | case PERF_TYPE_TRACEPOINT: | ||
4887 | pmu = tp_perf_event_init(event); | ||
4888 | break; | ||
4889 | |||
4890 | case PERF_TYPE_BREAKPOINT: | ||
4891 | pmu = bp_perf_event_init(event); | ||
4892 | break; | ||
4893 | |||
4894 | |||
4895 | default: | ||
4896 | break; | ||
4897 | } | ||
4898 | done: | 5299 | done: |
4899 | err = 0; | 5300 | err = 0; |
4900 | if (!pmu) | 5301 | if (!pmu) |
@@ -4912,13 +5313,21 @@ done: | |||
4912 | event->pmu = pmu; | 5313 | event->pmu = pmu; |
4913 | 5314 | ||
4914 | if (!event->parent) { | 5315 | if (!event->parent) { |
4915 | atomic_inc(&nr_events); | 5316 | if (event->attach_state & PERF_ATTACH_TASK) |
4916 | if (event->attr.mmap) | 5317 | jump_label_inc(&perf_task_events); |
5318 | if (event->attr.mmap || event->attr.mmap_data) | ||
4917 | atomic_inc(&nr_mmap_events); | 5319 | atomic_inc(&nr_mmap_events); |
4918 | if (event->attr.comm) | 5320 | if (event->attr.comm) |
4919 | atomic_inc(&nr_comm_events); | 5321 | atomic_inc(&nr_comm_events); |
4920 | if (event->attr.task) | 5322 | if (event->attr.task) |
4921 | atomic_inc(&nr_task_events); | 5323 | atomic_inc(&nr_task_events); |
5324 | if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) { | ||
5325 | err = get_callchain_buffers(); | ||
5326 | if (err) { | ||
5327 | free_event(event); | ||
5328 | return ERR_PTR(err); | ||
5329 | } | ||
5330 | } | ||
4922 | } | 5331 | } |
4923 | 5332 | ||
4924 | return event; | 5333 | return event; |
@@ -5007,7 +5416,7 @@ err_size: | |||
5007 | static int | 5416 | static int |
5008 | perf_event_set_output(struct perf_event *event, struct perf_event *output_event) | 5417 | perf_event_set_output(struct perf_event *event, struct perf_event *output_event) |
5009 | { | 5418 | { |
5010 | struct perf_mmap_data *data = NULL, *old_data = NULL; | 5419 | struct perf_buffer *buffer = NULL, *old_buffer = NULL; |
5011 | int ret = -EINVAL; | 5420 | int ret = -EINVAL; |
5012 | 5421 | ||
5013 | if (!output_event) | 5422 | if (!output_event) |
@@ -5037,19 +5446,19 @@ set: | |||
5037 | 5446 | ||
5038 | if (output_event) { | 5447 | if (output_event) { |
5039 | /* get the buffer we want to redirect to */ | 5448 | /* get the buffer we want to redirect to */ |
5040 | data = perf_mmap_data_get(output_event); | 5449 | buffer = perf_buffer_get(output_event); |
5041 | if (!data) | 5450 | if (!buffer) |
5042 | goto unlock; | 5451 | goto unlock; |
5043 | } | 5452 | } |
5044 | 5453 | ||
5045 | old_data = event->data; | 5454 | old_buffer = event->buffer; |
5046 | rcu_assign_pointer(event->data, data); | 5455 | rcu_assign_pointer(event->buffer, buffer); |
5047 | ret = 0; | 5456 | ret = 0; |
5048 | unlock: | 5457 | unlock: |
5049 | mutex_unlock(&event->mmap_mutex); | 5458 | mutex_unlock(&event->mmap_mutex); |
5050 | 5459 | ||
5051 | if (old_data) | 5460 | if (old_buffer) |
5052 | perf_mmap_data_put(old_data); | 5461 | perf_buffer_put(old_buffer); |
5053 | out: | 5462 | out: |
5054 | return ret; | 5463 | return ret; |
5055 | } | 5464 | } |
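
The perf_mmap_data -> perf_buffer rename above preserves the shape of perf_event_set_output(): grab a reference on the buffer being redirected to, publish it under mmap_mutex with rcu_assign_pointer(), and drop the reference on the previous buffer only after the mutex is released. A reduced user-space sketch of that swap, assuming a plain mutex and an explicit refcount in place of RCU and the real perf structures:

#include <pthread.h>
#include <stdatomic.h>
#include <stdlib.h>

struct buffer {
        atomic_int refs;
        /* ... ring-buffer pages would live here ... */
};

static struct buffer *buffer_get(struct buffer *b)
{
        if (b)
                atomic_fetch_add(&b->refs, 1);
        return b;
}

static void buffer_put(struct buffer *b)
{
        if (b && atomic_fetch_sub(&b->refs, 1) == 1)
                free(b);
}

struct event {
        pthread_mutex_t mmap_mutex;
        struct buffer *buffer;
};

/* Redirect 'event' to 'newbuf' (may be NULL); the old buffer is released
 * only after the mutex is dropped, mirroring perf_event_set_output(). */
static void set_output(struct event *event, struct buffer *newbuf)
{
        struct buffer *old;

        buffer_get(newbuf);
        pthread_mutex_lock(&event->mmap_mutex);
        old = event->buffer;
        event->buffer = newbuf;
        pthread_mutex_unlock(&event->mmap_mutex);
        buffer_put(old);
}

int main(void)
{
        struct event ev = { .mmap_mutex = PTHREAD_MUTEX_INITIALIZER, .buffer = NULL };
        struct buffer *b = calloc(1, sizeof(*b));

        if (!b)
                return 1;
        atomic_init(&b->refs, 1);       /* creator's reference */
        set_output(&ev, b);             /* the event takes its own reference */
        buffer_put(b);                  /* drop the creator's reference */
        set_output(&ev, NULL);          /* detach: the buffer is freed here */
        return 0;
}
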
@@ -5066,12 +5475,16 @@ SYSCALL_DEFINE5(perf_event_open, | |||
5066 | struct perf_event_attr __user *, attr_uptr, | 5475 | struct perf_event_attr __user *, attr_uptr, |
5067 | pid_t, pid, int, cpu, int, group_fd, unsigned long, flags) | 5476 | pid_t, pid, int, cpu, int, group_fd, unsigned long, flags) |
5068 | { | 5477 | { |
5069 | struct perf_event *event, *group_leader = NULL, *output_event = NULL; | 5478 | struct perf_event *group_leader = NULL, *output_event = NULL; |
5479 | struct perf_event *event, *sibling; | ||
5070 | struct perf_event_attr attr; | 5480 | struct perf_event_attr attr; |
5071 | struct perf_event_context *ctx; | 5481 | struct perf_event_context *ctx; |
5072 | struct file *event_file = NULL; | 5482 | struct file *event_file = NULL; |
5073 | struct file *group_file = NULL; | 5483 | struct file *group_file = NULL; |
5484 | struct task_struct *task = NULL; | ||
5485 | struct pmu *pmu; | ||
5074 | int event_fd; | 5486 | int event_fd; |
5487 | int move_group = 0; | ||
5075 | int fput_needed = 0; | 5488 | int fput_needed = 0; |
5076 | int err; | 5489 | int err; |
5077 | 5490 | ||
@@ -5097,20 +5510,11 @@ SYSCALL_DEFINE5(perf_event_open, | |||
5097 | if (event_fd < 0) | 5510 | if (event_fd < 0) |
5098 | return event_fd; | 5511 | return event_fd; |
5099 | 5512 | ||
5100 | /* | ||
5101 | * Get the target context (task or percpu): | ||
5102 | */ | ||
5103 | ctx = find_get_context(pid, cpu); | ||
5104 | if (IS_ERR(ctx)) { | ||
5105 | err = PTR_ERR(ctx); | ||
5106 | goto err_fd; | ||
5107 | } | ||
5108 | |||
5109 | if (group_fd != -1) { | 5513 | if (group_fd != -1) { |
5110 | group_leader = perf_fget_light(group_fd, &fput_needed); | 5514 | group_leader = perf_fget_light(group_fd, &fput_needed); |
5111 | if (IS_ERR(group_leader)) { | 5515 | if (IS_ERR(group_leader)) { |
5112 | err = PTR_ERR(group_leader); | 5516 | err = PTR_ERR(group_leader); |
5113 | goto err_put_context; | 5517 | goto err_fd; |
5114 | } | 5518 | } |
5115 | group_file = group_leader->filp; | 5519 | group_file = group_leader->filp; |
5116 | if (flags & PERF_FLAG_FD_OUTPUT) | 5520 | if (flags & PERF_FLAG_FD_OUTPUT) |
@@ -5119,6 +5523,58 @@ SYSCALL_DEFINE5(perf_event_open, | |||
5119 | group_leader = NULL; | 5523 | group_leader = NULL; |
5120 | } | 5524 | } |
5121 | 5525 | ||
5526 | if (pid != -1) { | ||
5527 | task = find_lively_task_by_vpid(pid); | ||
5528 | if (IS_ERR(task)) { | ||
5529 | err = PTR_ERR(task); | ||
5530 | goto err_group_fd; | ||
5531 | } | ||
5532 | } | ||
5533 | |||
5534 | event = perf_event_alloc(&attr, cpu, task, group_leader, NULL, NULL); | ||
5535 | if (IS_ERR(event)) { | ||
5536 | err = PTR_ERR(event); | ||
5537 | goto err_task; | ||
5538 | } | ||
5539 | |||
5540 | /* | ||
5541 | * Special case software events and allow them to be part of | ||
5542 | * any hardware group. | ||
5543 | */ | ||
5544 | pmu = event->pmu; | ||
5545 | |||
5546 | if (group_leader && | ||
5547 | (is_software_event(event) != is_software_event(group_leader))) { | ||
5548 | if (is_software_event(event)) { | ||
5549 | /* | ||
5550 | * If event and group_leader are not both a software | ||
5551 | * event, and event is, then group leader is not. | ||
5552 | * | ||
5553 | * Allow the addition of software events to !software | ||
5554 | * groups, this is safe because software events never | ||
5555 | * fail to schedule. | ||
5556 | */ | ||
5557 | pmu = group_leader->pmu; | ||
5558 | } else if (is_software_event(group_leader) && | ||
5559 | (group_leader->group_flags & PERF_GROUP_SOFTWARE)) { | ||
5560 | /* | ||
5561 | * In case the group is a pure software group, and we | ||
5562 | * try to add a hardware event, move the whole group to | ||
5563 | * the hardware context. | ||
5564 | */ | ||
5565 | move_group = 1; | ||
5566 | } | ||
5567 | } | ||
5568 | |||
5569 | /* | ||
5570 | * Get the target context (task or percpu): | ||
5571 | */ | ||
5572 | ctx = find_get_context(pmu, task, cpu); | ||
5573 | if (IS_ERR(ctx)) { | ||
5574 | err = PTR_ERR(ctx); | ||
5575 | goto err_alloc; | ||
5576 | } | ||
5577 | |||
5122 | /* | 5578 | /* |
5123 | * Look up the group leader (we will attach this event to it): | 5579 | * Look up the group leader (we will attach this event to it): |
5124 | */ | 5580 | */ |
@@ -5130,42 +5586,66 @@ SYSCALL_DEFINE5(perf_event_open, | |||
5130 | * becoming part of another group-sibling): | 5586 | * becoming part of another group-sibling): |
5131 | */ | 5587 | */ |
5132 | if (group_leader->group_leader != group_leader) | 5588 | if (group_leader->group_leader != group_leader) |
5133 | goto err_put_context; | 5589 | goto err_context; |
5134 | /* | 5590 | /* |
5135 | * Do not allow attaching to a group in a different | 5591 | * Do not allow attaching to a group in a different |
5136 | * task or CPU context: | 5592 | * task or CPU context: |
5137 | */ | 5593 | */ |
5138 | if (group_leader->ctx != ctx) | 5594 | if (move_group) { |
5139 | goto err_put_context; | 5595 | if (group_leader->ctx->type != ctx->type) |
5596 | goto err_context; | ||
5597 | } else { | ||
5598 | if (group_leader->ctx != ctx) | ||
5599 | goto err_context; | ||
5600 | } | ||
5601 | |||
5140 | /* | 5602 | /* |
5141 | * Only a group leader can be exclusive or pinned | 5603 | * Only a group leader can be exclusive or pinned |
5142 | */ | 5604 | */ |
5143 | if (attr.exclusive || attr.pinned) | 5605 | if (attr.exclusive || attr.pinned) |
5144 | goto err_put_context; | 5606 | goto err_context; |
5145 | } | ||
5146 | |||
5147 | event = perf_event_alloc(&attr, cpu, ctx, group_leader, | ||
5148 | NULL, NULL, GFP_KERNEL); | ||
5149 | if (IS_ERR(event)) { | ||
5150 | err = PTR_ERR(event); | ||
5151 | goto err_put_context; | ||
5152 | } | 5607 | } |
5153 | 5608 | ||
5154 | if (output_event) { | 5609 | if (output_event) { |
5155 | err = perf_event_set_output(event, output_event); | 5610 | err = perf_event_set_output(event, output_event); |
5156 | if (err) | 5611 | if (err) |
5157 | goto err_free_put_context; | 5612 | goto err_context; |
5158 | } | 5613 | } |
5159 | 5614 | ||
5160 | event_file = anon_inode_getfile("[perf_event]", &perf_fops, event, O_RDWR); | 5615 | event_file = anon_inode_getfile("[perf_event]", &perf_fops, event, O_RDWR); |
5161 | if (IS_ERR(event_file)) { | 5616 | if (IS_ERR(event_file)) { |
5162 | err = PTR_ERR(event_file); | 5617 | err = PTR_ERR(event_file); |
5163 | goto err_free_put_context; | 5618 | goto err_context; |
5619 | } | ||
5620 | |||
5621 | if (move_group) { | ||
5622 | struct perf_event_context *gctx = group_leader->ctx; | ||
5623 | |||
5624 | mutex_lock(&gctx->mutex); | ||
5625 | perf_event_remove_from_context(group_leader); | ||
5626 | list_for_each_entry(sibling, &group_leader->sibling_list, | ||
5627 | group_entry) { | ||
5628 | perf_event_remove_from_context(sibling); | ||
5629 | put_ctx(gctx); | ||
5630 | } | ||
5631 | mutex_unlock(&gctx->mutex); | ||
5632 | put_ctx(gctx); | ||
5164 | } | 5633 | } |
5165 | 5634 | ||
5166 | event->filp = event_file; | 5635 | event->filp = event_file; |
5167 | WARN_ON_ONCE(ctx->parent_ctx); | 5636 | WARN_ON_ONCE(ctx->parent_ctx); |
5168 | mutex_lock(&ctx->mutex); | 5637 | mutex_lock(&ctx->mutex); |
5638 | |||
5639 | if (move_group) { | ||
5640 | perf_install_in_context(ctx, group_leader, cpu); | ||
5641 | get_ctx(ctx); | ||
5642 | list_for_each_entry(sibling, &group_leader->sibling_list, | ||
5643 | group_entry) { | ||
5644 | perf_install_in_context(ctx, sibling, cpu); | ||
5645 | get_ctx(ctx); | ||
5646 | } | ||
5647 | } | ||
5648 | |||
5169 | perf_install_in_context(ctx, event, cpu); | 5649 | perf_install_in_context(ctx, event, cpu); |
5170 | ++ctx->generation; | 5650 | ++ctx->generation; |
5171 | mutex_unlock(&ctx->mutex); | 5651 | mutex_unlock(&ctx->mutex); |
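
In the move_group path above every event pins its context, so migrating a software-only group into the hardware context is reference bookkeeping: one put_ctx(gctx) per event removed from the old context, one get_ctx(ctx) per event installed in the new one. A toy, single-threaded model of that accounting (it ignores the two mutexes and the leader's extra reference drop in the real code):

#include <assert.h>
#include <stdio.h>

/* Each member of a context holds one reference on it. */
struct context {
        int refs;
        int members;
};

static void get_ctx(struct context *c) { c->refs++; }
static void put_ctx(struct context *c) { assert(c->refs > 0); c->refs--; }

static void move_member(struct context *from, struct context *to)
{
        from->members--;
        put_ctx(from);          /* the member no longer pins the old context */
        to->members++;
        get_ctx(to);            /* ...and now pins the new one */
}

int main(void)
{
        /* a leader plus two siblings, all currently in the software context */
        struct context gctx = { .refs = 3, .members = 3 };
        struct context ctx  = { .refs = 0, .members = 0 };
        int i;

        for (i = 0; i < 3; i++)
                move_member(&gctx, &ctx);

        printf("gctx: refs=%d  ctx: refs=%d\n", gctx.refs, ctx.refs);  /* 0 and 3 */
        return 0;
}
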
@@ -5186,11 +5666,15 @@ SYSCALL_DEFINE5(perf_event_open, | |||
5186 | fd_install(event_fd, event_file); | 5666 | fd_install(event_fd, event_file); |
5187 | return event_fd; | 5667 | return event_fd; |
5188 | 5668 | ||
5189 | err_free_put_context: | 5669 | err_context: |
5670 | put_ctx(ctx); | ||
5671 | err_alloc: | ||
5190 | free_event(event); | 5672 | free_event(event); |
5191 | err_put_context: | 5673 | err_task: |
5674 | if (task) | ||
5675 | put_task_struct(task); | ||
5676 | err_group_fd: | ||
5192 | fput_light(group_file, fput_needed); | 5677 | fput_light(group_file, fput_needed); |
5193 | put_ctx(ctx); | ||
5194 | err_fd: | 5678 | err_fd: |
5195 | put_unused_fd(event_fd); | 5679 | put_unused_fd(event_fd); |
5196 | return err; | 5680 | return err; |
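
The reworked exit path above gives every acquired resource its own label (err_context, err_alloc, err_task, err_group_fd, err_fd) and releases them strictly in reverse order of acquisition. A compilable sketch of that goto-unwind idiom with invented stand-in resources:

#include <errno.h>
#include <stdio.h>
#include <stdlib.h>

/* Stand-ins for the resources acquired in order: task, event, context. */
static void *get_res(int fail) { return fail ? NULL : malloc(1); }

static int open_thing(int fail_at)
{
        void *task, *event, *ctx;
        int err;

        task = get_res(fail_at == 1);
        if (!task) { err = -ESRCH; goto err_out; }

        event = get_res(fail_at == 2);
        if (!event) { err = -ENOMEM; goto err_task; }

        ctx = get_res(fail_at == 3);
        if (!ctx) { err = -ENOMEM; goto err_alloc; }

        /* the success path would install the event and return an fd here */
        free(ctx); free(event); free(task);
        return 0;

        /* error labels unwind strictly in reverse order of acquisition */
err_alloc:
        free(event);
err_task:
        free(task);
err_out:
        return err;
}

int main(void)
{
        printf("%d %d\n", open_thing(0), open_thing(3));       /* 0 and -ENOMEM */
        return 0;
}
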
@@ -5201,32 +5685,31 @@ err_fd: | |||
5201 | * | 5685 | * |
5202 | * @attr: attributes of the counter to create | 5686 | * @attr: attributes of the counter to create |
5203 | * @cpu: cpu in which the counter is bound | 5687 | * @cpu: cpu in which the counter is bound |
5204 | * @pid: task to profile | 5688 | * @task: task to profile (NULL for percpu) |
5205 | */ | 5689 | */ |
5206 | struct perf_event * | 5690 | struct perf_event * |
5207 | perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu, | 5691 | perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu, |
5208 | pid_t pid, | 5692 | struct task_struct *task, |
5209 | perf_overflow_handler_t overflow_handler) | 5693 | perf_overflow_handler_t overflow_handler) |
5210 | { | 5694 | { |
5211 | struct perf_event *event; | ||
5212 | struct perf_event_context *ctx; | 5695 | struct perf_event_context *ctx; |
5696 | struct perf_event *event; | ||
5213 | int err; | 5697 | int err; |
5214 | 5698 | ||
5215 | /* | 5699 | /* |
5216 | * Get the target context (task or percpu): | 5700 | * Get the target context (task or percpu): |
5217 | */ | 5701 | */ |
5218 | 5702 | ||
5219 | ctx = find_get_context(pid, cpu); | 5703 | event = perf_event_alloc(attr, cpu, task, NULL, NULL, overflow_handler); |
5220 | if (IS_ERR(ctx)) { | ||
5221 | err = PTR_ERR(ctx); | ||
5222 | goto err_exit; | ||
5223 | } | ||
5224 | |||
5225 | event = perf_event_alloc(attr, cpu, ctx, NULL, | ||
5226 | NULL, overflow_handler, GFP_KERNEL); | ||
5227 | if (IS_ERR(event)) { | 5704 | if (IS_ERR(event)) { |
5228 | err = PTR_ERR(event); | 5705 | err = PTR_ERR(event); |
5229 | goto err_put_context; | 5706 | goto err; |
5707 | } | ||
5708 | |||
5709 | ctx = find_get_context(event->pmu, task, cpu); | ||
5710 | if (IS_ERR(ctx)) { | ||
5711 | err = PTR_ERR(ctx); | ||
5712 | goto err_free; | ||
5230 | } | 5713 | } |
5231 | 5714 | ||
5232 | event->filp = NULL; | 5715 | event->filp = NULL; |
@@ -5244,112 +5727,13 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu, | |||
5244 | 5727 | ||
5245 | return event; | 5728 | return event; |
5246 | 5729 | ||
5247 | err_put_context: | 5730 | err_free: |
5248 | put_ctx(ctx); | 5731 | free_event(event); |
5249 | err_exit: | 5732 | err: |
5250 | return ERR_PTR(err); | 5733 | return ERR_PTR(err); |
5251 | } | 5734 | } |
5252 | EXPORT_SYMBOL_GPL(perf_event_create_kernel_counter); | 5735 | EXPORT_SYMBOL_GPL(perf_event_create_kernel_counter); |
5253 | 5736 | ||
5254 | /* | ||
5255 | * inherit an event from parent task to child task: | ||
5256 | */ | ||
5257 | static struct perf_event * | ||
5258 | inherit_event(struct perf_event *parent_event, | ||
5259 | struct task_struct *parent, | ||
5260 | struct perf_event_context *parent_ctx, | ||
5261 | struct task_struct *child, | ||
5262 | struct perf_event *group_leader, | ||
5263 | struct perf_event_context *child_ctx) | ||
5264 | { | ||
5265 | struct perf_event *child_event; | ||
5266 | |||
5267 | /* | ||
5268 | * Instead of creating recursive hierarchies of events, | ||
5269 | * we link inherited events back to the original parent, | ||
5270 | * which has a filp for sure, which we use as the reference | ||
5271 | * count: | ||
5272 | */ | ||
5273 | if (parent_event->parent) | ||
5274 | parent_event = parent_event->parent; | ||
5275 | |||
5276 | child_event = perf_event_alloc(&parent_event->attr, | ||
5277 | parent_event->cpu, child_ctx, | ||
5278 | group_leader, parent_event, | ||
5279 | NULL, GFP_KERNEL); | ||
5280 | if (IS_ERR(child_event)) | ||
5281 | return child_event; | ||
5282 | get_ctx(child_ctx); | ||
5283 | |||
5284 | /* | ||
5285 | * Make the child state follow the state of the parent event, | ||
5286 | * not its attr.disabled bit. We hold the parent's mutex, | ||
5287 | * so we won't race with perf_event_{en, dis}able_family. | ||
5288 | */ | ||
5289 | if (parent_event->state >= PERF_EVENT_STATE_INACTIVE) | ||
5290 | child_event->state = PERF_EVENT_STATE_INACTIVE; | ||
5291 | else | ||
5292 | child_event->state = PERF_EVENT_STATE_OFF; | ||
5293 | |||
5294 | if (parent_event->attr.freq) { | ||
5295 | u64 sample_period = parent_event->hw.sample_period; | ||
5296 | struct hw_perf_event *hwc = &child_event->hw; | ||
5297 | |||
5298 | hwc->sample_period = sample_period; | ||
5299 | hwc->last_period = sample_period; | ||
5300 | |||
5301 | atomic64_set(&hwc->period_left, sample_period); | ||
5302 | } | ||
5303 | |||
5304 | child_event->overflow_handler = parent_event->overflow_handler; | ||
5305 | |||
5306 | /* | ||
5307 | * Link it up in the child's context: | ||
5308 | */ | ||
5309 | add_event_to_ctx(child_event, child_ctx); | ||
5310 | |||
5311 | /* | ||
5312 | * Get a reference to the parent filp - we will fput it | ||
5313 | * when the child event exits. This is safe to do because | ||
5314 | * we are in the parent and we know that the filp still | ||
5315 | * exists and has a nonzero count: | ||
5316 | */ | ||
5317 | atomic_long_inc(&parent_event->filp->f_count); | ||
5318 | |||
5319 | /* | ||
5320 | * Link this into the parent event's child list | ||
5321 | */ | ||
5322 | WARN_ON_ONCE(parent_event->ctx->parent_ctx); | ||
5323 | mutex_lock(&parent_event->child_mutex); | ||
5324 | list_add_tail(&child_event->child_list, &parent_event->child_list); | ||
5325 | mutex_unlock(&parent_event->child_mutex); | ||
5326 | |||
5327 | return child_event; | ||
5328 | } | ||
5329 | |||
5330 | static int inherit_group(struct perf_event *parent_event, | ||
5331 | struct task_struct *parent, | ||
5332 | struct perf_event_context *parent_ctx, | ||
5333 | struct task_struct *child, | ||
5334 | struct perf_event_context *child_ctx) | ||
5335 | { | ||
5336 | struct perf_event *leader; | ||
5337 | struct perf_event *sub; | ||
5338 | struct perf_event *child_ctr; | ||
5339 | |||
5340 | leader = inherit_event(parent_event, parent, parent_ctx, | ||
5341 | child, NULL, child_ctx); | ||
5342 | if (IS_ERR(leader)) | ||
5343 | return PTR_ERR(leader); | ||
5344 | list_for_each_entry(sub, &parent_event->sibling_list, group_entry) { | ||
5345 | child_ctr = inherit_event(sub, parent, parent_ctx, | ||
5346 | child, leader, child_ctx); | ||
5347 | if (IS_ERR(child_ctr)) | ||
5348 | return PTR_ERR(child_ctr); | ||
5349 | } | ||
5350 | return 0; | ||
5351 | } | ||
5352 | |||
5353 | static void sync_child_event(struct perf_event *child_event, | 5737 | static void sync_child_event(struct perf_event *child_event, |
5354 | struct task_struct *child) | 5738 | struct task_struct *child) |
5355 | { | 5739 | { |
@@ -5359,12 +5743,12 @@ static void sync_child_event(struct perf_event *child_event, | |||
5359 | if (child_event->attr.inherit_stat) | 5743 | if (child_event->attr.inherit_stat) |
5360 | perf_event_read_event(child_event, child); | 5744 | perf_event_read_event(child_event, child); |
5361 | 5745 | ||
5362 | child_val = atomic64_read(&child_event->count); | 5746 | child_val = perf_event_count(child_event); |
5363 | 5747 | ||
5364 | /* | 5748 | /* |
5365 | * Add back the child's count to the parent's count: | 5749 | * Add back the child's count to the parent's count: |
5366 | */ | 5750 | */ |
5367 | atomic64_add(child_val, &parent_event->count); | 5751 | atomic64_add(child_val, &parent_event->child_count); |
5368 | atomic64_add(child_event->total_time_enabled, | 5752 | atomic64_add(child_event->total_time_enabled, |
5369 | &parent_event->child_total_time_enabled); | 5753 | &parent_event->child_total_time_enabled); |
5370 | atomic64_add(child_event->total_time_running, | 5754 | atomic64_add(child_event->total_time_running, |
@@ -5406,16 +5790,13 @@ __perf_event_exit_task(struct perf_event *child_event, | |||
5406 | } | 5790 | } |
5407 | } | 5791 | } |
5408 | 5792 | ||
5409 | /* | 5793 | static void perf_event_exit_task_context(struct task_struct *child, int ctxn) |
5410 | * When a child task exits, feed back event values to parent events. | ||
5411 | */ | ||
5412 | void perf_event_exit_task(struct task_struct *child) | ||
5413 | { | 5794 | { |
5414 | struct perf_event *child_event, *tmp; | 5795 | struct perf_event *child_event, *tmp; |
5415 | struct perf_event_context *child_ctx; | 5796 | struct perf_event_context *child_ctx; |
5416 | unsigned long flags; | 5797 | unsigned long flags; |
5417 | 5798 | ||
5418 | if (likely(!child->perf_event_ctxp)) { | 5799 | if (likely(!child->perf_event_ctxp[ctxn])) { |
5419 | perf_event_task(child, NULL, 0); | 5800 | perf_event_task(child, NULL, 0); |
5420 | return; | 5801 | return; |
5421 | } | 5802 | } |
@@ -5427,8 +5808,8 @@ void perf_event_exit_task(struct task_struct *child) | |||
5427 | * scheduled, so we are now safe from rescheduling changing | 5808 | * scheduled, so we are now safe from rescheduling changing |
5428 | * our context. | 5809 | * our context. |
5429 | */ | 5810 | */ |
5430 | child_ctx = child->perf_event_ctxp; | 5811 | child_ctx = child->perf_event_ctxp[ctxn]; |
5431 | __perf_event_task_sched_out(child_ctx); | 5812 | task_ctx_sched_out(child_ctx, EVENT_ALL); |
5432 | 5813 | ||
5433 | /* | 5814 | /* |
5434 | * Take the context lock here so that if find_get_context is | 5815 | * Take the context lock here so that if find_get_context is |
@@ -5436,7 +5817,7 @@ void perf_event_exit_task(struct task_struct *child) | |||
5436 | * incremented the context's refcount before we do put_ctx below. | 5817 | * incremented the context's refcount before we do put_ctx below. |
5437 | */ | 5818 | */ |
5438 | raw_spin_lock(&child_ctx->lock); | 5819 | raw_spin_lock(&child_ctx->lock); |
5439 | child->perf_event_ctxp = NULL; | 5820 | child->perf_event_ctxp[ctxn] = NULL; |
5440 | /* | 5821 | /* |
5441 | * If this context is a clone; unclone it so it can't get | 5822 | * If this context is a clone; unclone it so it can't get |
5442 | * swapped to another process while we're removing all | 5823 | * swapped to another process while we're removing all |
@@ -5489,6 +5870,17 @@ again: | |||
5489 | put_ctx(child_ctx); | 5870 | put_ctx(child_ctx); |
5490 | } | 5871 | } |
5491 | 5872 | ||
5873 | /* | ||
5874 | * When a child task exits, feed back event values to parent events. | ||
5875 | */ | ||
5876 | void perf_event_exit_task(struct task_struct *child) | ||
5877 | { | ||
5878 | int ctxn; | ||
5879 | |||
5880 | for_each_task_context_nr(ctxn) | ||
5881 | perf_event_exit_task_context(child, ctxn); | ||
5882 | } | ||
5883 | |||
5492 | static void perf_free_event(struct perf_event *event, | 5884 | static void perf_free_event(struct perf_event *event, |
5493 | struct perf_event_context *ctx) | 5885 | struct perf_event_context *ctx) |
5494 | { | 5886 | { |
@@ -5510,48 +5902,166 @@ static void perf_free_event(struct perf_event *event, | |||
5510 | 5902 | ||
5511 | /* | 5903 | /* |
5512 | * free an unexposed, unused context as created by inheritance by | 5904 | * free an unexposed, unused context as created by inheritance by |
5513 | * init_task below, used by fork() in case of fail. | 5905 | * perf_event_init_task below, used by fork() in case of fail. |
5514 | */ | 5906 | */ |
5515 | void perf_event_free_task(struct task_struct *task) | 5907 | void perf_event_free_task(struct task_struct *task) |
5516 | { | 5908 | { |
5517 | struct perf_event_context *ctx = task->perf_event_ctxp; | 5909 | struct perf_event_context *ctx; |
5518 | struct perf_event *event, *tmp; | 5910 | struct perf_event *event, *tmp; |
5911 | int ctxn; | ||
5519 | 5912 | ||
5520 | if (!ctx) | 5913 | for_each_task_context_nr(ctxn) { |
5521 | return; | 5914 | ctx = task->perf_event_ctxp[ctxn]; |
5915 | if (!ctx) | ||
5916 | continue; | ||
5522 | 5917 | ||
5523 | mutex_lock(&ctx->mutex); | 5918 | mutex_lock(&ctx->mutex); |
5524 | again: | 5919 | again: |
5525 | list_for_each_entry_safe(event, tmp, &ctx->pinned_groups, group_entry) | 5920 | list_for_each_entry_safe(event, tmp, &ctx->pinned_groups, |
5526 | perf_free_event(event, ctx); | 5921 | group_entry) |
5922 | perf_free_event(event, ctx); | ||
5527 | 5923 | ||
5528 | list_for_each_entry_safe(event, tmp, &ctx->flexible_groups, | 5924 | list_for_each_entry_safe(event, tmp, &ctx->flexible_groups, |
5529 | group_entry) | 5925 | group_entry) |
5530 | perf_free_event(event, ctx); | 5926 | perf_free_event(event, ctx); |
5531 | 5927 | ||
5532 | if (!list_empty(&ctx->pinned_groups) || | 5928 | if (!list_empty(&ctx->pinned_groups) || |
5533 | !list_empty(&ctx->flexible_groups)) | 5929 | !list_empty(&ctx->flexible_groups)) |
5534 | goto again; | 5930 | goto again; |
5535 | 5931 | ||
5536 | mutex_unlock(&ctx->mutex); | 5932 | mutex_unlock(&ctx->mutex); |
5537 | 5933 | ||
5538 | put_ctx(ctx); | 5934 | put_ctx(ctx); |
5935 | } | ||
5936 | } | ||
5937 | |||
5938 | void perf_event_delayed_put(struct task_struct *task) | ||
5939 | { | ||
5940 | int ctxn; | ||
5941 | |||
5942 | for_each_task_context_nr(ctxn) | ||
5943 | WARN_ON_ONCE(task->perf_event_ctxp[ctxn]); | ||
5944 | } | ||
5945 | |||
5946 | /* | ||
5947 | * inherit an event from parent task to child task: | ||
5948 | */ | ||
5949 | static struct perf_event * | ||
5950 | inherit_event(struct perf_event *parent_event, | ||
5951 | struct task_struct *parent, | ||
5952 | struct perf_event_context *parent_ctx, | ||
5953 | struct task_struct *child, | ||
5954 | struct perf_event *group_leader, | ||
5955 | struct perf_event_context *child_ctx) | ||
5956 | { | ||
5957 | struct perf_event *child_event; | ||
5958 | unsigned long flags; | ||
5959 | |||
5960 | /* | ||
5961 | * Instead of creating recursive hierarchies of events, | ||
5962 | * we link inherited events back to the original parent, | ||
5963 | * which has a filp for sure, which we use as the reference | ||
5964 | * count: | ||
5965 | */ | ||
5966 | if (parent_event->parent) | ||
5967 | parent_event = parent_event->parent; | ||
5968 | |||
5969 | child_event = perf_event_alloc(&parent_event->attr, | ||
5970 | parent_event->cpu, | ||
5971 | child, | ||
5972 | group_leader, parent_event, | ||
5973 | NULL); | ||
5974 | if (IS_ERR(child_event)) | ||
5975 | return child_event; | ||
5976 | get_ctx(child_ctx); | ||
5977 | |||
5978 | /* | ||
5979 | * Make the child state follow the state of the parent event, | ||
5980 | * not its attr.disabled bit. We hold the parent's mutex, | ||
5981 | * so we won't race with perf_event_{en, dis}able_family. | ||
5982 | */ | ||
5983 | if (parent_event->state >= PERF_EVENT_STATE_INACTIVE) | ||
5984 | child_event->state = PERF_EVENT_STATE_INACTIVE; | ||
5985 | else | ||
5986 | child_event->state = PERF_EVENT_STATE_OFF; | ||
5987 | |||
5988 | if (parent_event->attr.freq) { | ||
5989 | u64 sample_period = parent_event->hw.sample_period; | ||
5990 | struct hw_perf_event *hwc = &child_event->hw; | ||
5991 | |||
5992 | hwc->sample_period = sample_period; | ||
5993 | hwc->last_period = sample_period; | ||
5994 | |||
5995 | local64_set(&hwc->period_left, sample_period); | ||
5996 | } | ||
5997 | |||
5998 | child_event->ctx = child_ctx; | ||
5999 | child_event->overflow_handler = parent_event->overflow_handler; | ||
6000 | |||
6001 | /* | ||
6002 | * Link it up in the child's context: | ||
6003 | */ | ||
6004 | raw_spin_lock_irqsave(&child_ctx->lock, flags); | ||
6005 | add_event_to_ctx(child_event, child_ctx); | ||
6006 | raw_spin_unlock_irqrestore(&child_ctx->lock, flags); | ||
6007 | |||
6008 | /* | ||
6009 | * Get a reference to the parent filp - we will fput it | ||
6010 | * when the child event exits. This is safe to do because | ||
6011 | * we are in the parent and we know that the filp still | ||
6012 | * exists and has a nonzero count: | ||
6013 | */ | ||
6014 | atomic_long_inc(&parent_event->filp->f_count); | ||
6015 | |||
6016 | /* | ||
6017 | * Link this into the parent event's child list | ||
6018 | */ | ||
6019 | WARN_ON_ONCE(parent_event->ctx->parent_ctx); | ||
6020 | mutex_lock(&parent_event->child_mutex); | ||
6021 | list_add_tail(&child_event->child_list, &parent_event->child_list); | ||
6022 | mutex_unlock(&parent_event->child_mutex); | ||
6023 | |||
6024 | return child_event; | ||
6025 | } | ||
6026 | |||
6027 | static int inherit_group(struct perf_event *parent_event, | ||
6028 | struct task_struct *parent, | ||
6029 | struct perf_event_context *parent_ctx, | ||
6030 | struct task_struct *child, | ||
6031 | struct perf_event_context *child_ctx) | ||
6032 | { | ||
6033 | struct perf_event *leader; | ||
6034 | struct perf_event *sub; | ||
6035 | struct perf_event *child_ctr; | ||
6036 | |||
6037 | leader = inherit_event(parent_event, parent, parent_ctx, | ||
6038 | child, NULL, child_ctx); | ||
6039 | if (IS_ERR(leader)) | ||
6040 | return PTR_ERR(leader); | ||
6041 | list_for_each_entry(sub, &parent_event->sibling_list, group_entry) { | ||
6042 | child_ctr = inherit_event(sub, parent, parent_ctx, | ||
6043 | child, leader, child_ctx); | ||
6044 | if (IS_ERR(child_ctr)) | ||
6045 | return PTR_ERR(child_ctr); | ||
6046 | } | ||
6047 | return 0; | ||
5539 | } | 6048 | } |
5540 | 6049 | ||
5541 | static int | 6050 | static int |
5542 | inherit_task_group(struct perf_event *event, struct task_struct *parent, | 6051 | inherit_task_group(struct perf_event *event, struct task_struct *parent, |
5543 | struct perf_event_context *parent_ctx, | 6052 | struct perf_event_context *parent_ctx, |
5544 | struct task_struct *child, | 6053 | struct task_struct *child, int ctxn, |
5545 | int *inherited_all) | 6054 | int *inherited_all) |
5546 | { | 6055 | { |
5547 | int ret; | 6056 | int ret; |
5548 | struct perf_event_context *child_ctx = child->perf_event_ctxp; | 6057 | struct perf_event_context *child_ctx; |
5549 | 6058 | ||
5550 | if (!event->attr.inherit) { | 6059 | if (!event->attr.inherit) { |
5551 | *inherited_all = 0; | 6060 | *inherited_all = 0; |
5552 | return 0; | 6061 | return 0; |
5553 | } | 6062 | } |
5554 | 6063 | ||
6064 | child_ctx = child->perf_event_ctxp[ctxn]; | ||
5555 | if (!child_ctx) { | 6065 | if (!child_ctx) { |
5556 | /* | 6066 | /* |
5557 | * This is executed from the parent task context, so | 6067 | * This is executed from the parent task context, so |
@@ -5560,14 +6070,11 @@ inherit_task_group(struct perf_event *event, struct task_struct *parent, | |||
5560 | * child. | 6070 | * child. |
5561 | */ | 6071 | */ |
5562 | 6072 | ||
5563 | child_ctx = kzalloc(sizeof(struct perf_event_context), | 6073 | child_ctx = alloc_perf_context(event->pmu, child); |
5564 | GFP_KERNEL); | ||
5565 | if (!child_ctx) | 6074 | if (!child_ctx) |
5566 | return -ENOMEM; | 6075 | return -ENOMEM; |
5567 | 6076 | ||
5568 | __perf_event_init_context(child_ctx, child); | 6077 | child->perf_event_ctxp[ctxn] = child_ctx; |
5569 | child->perf_event_ctxp = child_ctx; | ||
5570 | get_task_struct(child); | ||
5571 | } | 6078 | } |
5572 | 6079 | ||
5573 | ret = inherit_group(event, parent, parent_ctx, | 6080 | ret = inherit_group(event, parent, parent_ctx, |
@@ -5579,11 +6086,10 @@ inherit_task_group(struct perf_event *event, struct task_struct *parent, | |||
5579 | return ret; | 6086 | return ret; |
5580 | } | 6087 | } |
5581 | 6088 | ||
5582 | |||
5583 | /* | 6089 | /* |
5584 | * Initialize the perf_event context in task_struct | 6090 | * Initialize the perf_event context in task_struct |
5585 | */ | 6091 | */ |
5586 | int perf_event_init_task(struct task_struct *child) | 6092 | int perf_event_init_context(struct task_struct *child, int ctxn) |
5587 | { | 6093 | { |
5588 | struct perf_event_context *child_ctx, *parent_ctx; | 6094 | struct perf_event_context *child_ctx, *parent_ctx; |
5589 | struct perf_event_context *cloned_ctx; | 6095 | struct perf_event_context *cloned_ctx; |
@@ -5592,19 +6098,19 @@ int perf_event_init_task(struct task_struct *child) | |||
5592 | int inherited_all = 1; | 6098 | int inherited_all = 1; |
5593 | int ret = 0; | 6099 | int ret = 0; |
5594 | 6100 | ||
5595 | child->perf_event_ctxp = NULL; | 6101 | child->perf_event_ctxp[ctxn] = NULL; |
5596 | 6102 | ||
5597 | mutex_init(&child->perf_event_mutex); | 6103 | mutex_init(&child->perf_event_mutex); |
5598 | INIT_LIST_HEAD(&child->perf_event_list); | 6104 | INIT_LIST_HEAD(&child->perf_event_list); |
5599 | 6105 | ||
5600 | if (likely(!parent->perf_event_ctxp)) | 6106 | if (likely(!parent->perf_event_ctxp[ctxn])) |
5601 | return 0; | 6107 | return 0; |
5602 | 6108 | ||
5603 | /* | 6109 | /* |
5604 | * If the parent's context is a clone, pin it so it won't get | 6110 | * If the parent's context is a clone, pin it so it won't get |
5605 | * swapped under us. | 6111 | * swapped under us. |
5606 | */ | 6112 | */ |
5607 | parent_ctx = perf_pin_task_context(parent); | 6113 | parent_ctx = perf_pin_task_context(parent, ctxn); |
5608 | 6114 | ||
5609 | /* | 6115 | /* |
5610 | * No need to check if parent_ctx != NULL here; since we saw | 6116 | * No need to check if parent_ctx != NULL here; since we saw |
@@ -5624,20 +6130,20 @@ int perf_event_init_task(struct task_struct *child) | |||
5624 | * the list, not manipulating it: | 6130 | * the list, not manipulating it: |
5625 | */ | 6131 | */ |
5626 | list_for_each_entry(event, &parent_ctx->pinned_groups, group_entry) { | 6132 | list_for_each_entry(event, &parent_ctx->pinned_groups, group_entry) { |
5627 | ret = inherit_task_group(event, parent, parent_ctx, child, | 6133 | ret = inherit_task_group(event, parent, parent_ctx, |
5628 | &inherited_all); | 6134 | child, ctxn, &inherited_all); |
5629 | if (ret) | 6135 | if (ret) |
5630 | break; | 6136 | break; |
5631 | } | 6137 | } |
5632 | 6138 | ||
5633 | list_for_each_entry(event, &parent_ctx->flexible_groups, group_entry) { | 6139 | list_for_each_entry(event, &parent_ctx->flexible_groups, group_entry) { |
5634 | ret = inherit_task_group(event, parent, parent_ctx, child, | 6140 | ret = inherit_task_group(event, parent, parent_ctx, |
5635 | &inherited_all); | 6141 | child, ctxn, &inherited_all); |
5636 | if (ret) | 6142 | if (ret) |
5637 | break; | 6143 | break; |
5638 | } | 6144 | } |
5639 | 6145 | ||
5640 | child_ctx = child->perf_event_ctxp; | 6146 | child_ctx = child->perf_event_ctxp[ctxn]; |
5641 | 6147 | ||
5642 | if (child_ctx && inherited_all) { | 6148 | if (child_ctx && inherited_all) { |
5643 | /* | 6149 | /* |
@@ -5666,63 +6172,98 @@ int perf_event_init_task(struct task_struct *child) | |||
5666 | return ret; | 6172 | return ret; |
5667 | } | 6173 | } |
5668 | 6174 | ||
6175 | /* | ||
6176 | * Initialize the perf_event context in task_struct | ||
6177 | */ | ||
6178 | int perf_event_init_task(struct task_struct *child) | ||
6179 | { | ||
6180 | int ctxn, ret; | ||
6181 | |||
6182 | for_each_task_context_nr(ctxn) { | ||
6183 | ret = perf_event_init_context(child, ctxn); | ||
6184 | if (ret) | ||
6185 | return ret; | ||
6186 | } | ||
6187 | |||
6188 | return 0; | ||
6189 | } | ||
6190 | |||
5669 | static void __init perf_event_init_all_cpus(void) | 6191 | static void __init perf_event_init_all_cpus(void) |
5670 | { | 6192 | { |
6193 | struct swevent_htable *swhash; | ||
5671 | int cpu; | 6194 | int cpu; |
5672 | struct perf_cpu_context *cpuctx; | ||
5673 | 6195 | ||
5674 | for_each_possible_cpu(cpu) { | 6196 | for_each_possible_cpu(cpu) { |
5675 | cpuctx = &per_cpu(perf_cpu_context, cpu); | 6197 | swhash = &per_cpu(swevent_htable, cpu); |
5676 | mutex_init(&cpuctx->hlist_mutex); | 6198 | mutex_init(&swhash->hlist_mutex); |
5677 | __perf_event_init_context(&cpuctx->ctx, NULL); | 6199 | INIT_LIST_HEAD(&per_cpu(rotation_list, cpu)); |
5678 | } | 6200 | } |
5679 | } | 6201 | } |
5680 | 6202 | ||
5681 | static void __cpuinit perf_event_init_cpu(int cpu) | 6203 | static void __cpuinit perf_event_init_cpu(int cpu) |
5682 | { | 6204 | { |
5683 | struct perf_cpu_context *cpuctx; | 6205 | struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu); |
5684 | |||
5685 | cpuctx = &per_cpu(perf_cpu_context, cpu); | ||
5686 | 6206 | ||
5687 | spin_lock(&perf_resource_lock); | 6207 | mutex_lock(&swhash->hlist_mutex); |
5688 | cpuctx->max_pertask = perf_max_events - perf_reserved_percpu; | 6208 | if (swhash->hlist_refcount > 0) { |
5689 | spin_unlock(&perf_resource_lock); | ||
5690 | |||
5691 | mutex_lock(&cpuctx->hlist_mutex); | ||
5692 | if (cpuctx->hlist_refcount > 0) { | ||
5693 | struct swevent_hlist *hlist; | 6209 | struct swevent_hlist *hlist; |
5694 | 6210 | ||
5695 | hlist = kzalloc(sizeof(*hlist), GFP_KERNEL); | 6211 | hlist = kzalloc_node(sizeof(*hlist), GFP_KERNEL, cpu_to_node(cpu)); |
5696 | WARN_ON_ONCE(!hlist); | 6212 | WARN_ON(!hlist); |
5697 | rcu_assign_pointer(cpuctx->swevent_hlist, hlist); | 6213 | rcu_assign_pointer(swhash->swevent_hlist, hlist); |
5698 | } | 6214 | } |
5699 | mutex_unlock(&cpuctx->hlist_mutex); | 6215 | mutex_unlock(&swhash->hlist_mutex); |
5700 | } | 6216 | } |
5701 | 6217 | ||
5702 | #ifdef CONFIG_HOTPLUG_CPU | 6218 | #ifdef CONFIG_HOTPLUG_CPU |
5703 | static void __perf_event_exit_cpu(void *info) | 6219 | static void perf_pmu_rotate_stop(struct pmu *pmu) |
5704 | { | 6220 | { |
5705 | struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); | 6221 | struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context); |
5706 | struct perf_event_context *ctx = &cpuctx->ctx; | 6222 | |
6223 | WARN_ON(!irqs_disabled()); | ||
6224 | |||
6225 | list_del_init(&cpuctx->rotation_list); | ||
6226 | } | ||
6227 | |||
6228 | static void __perf_event_exit_context(void *__info) | ||
6229 | { | ||
6230 | struct perf_event_context *ctx = __info; | ||
5707 | struct perf_event *event, *tmp; | 6231 | struct perf_event *event, *tmp; |
5708 | 6232 | ||
6233 | perf_pmu_rotate_stop(ctx->pmu); | ||
6234 | |||
5709 | list_for_each_entry_safe(event, tmp, &ctx->pinned_groups, group_entry) | 6235 | list_for_each_entry_safe(event, tmp, &ctx->pinned_groups, group_entry) |
5710 | __perf_event_remove_from_context(event); | 6236 | __perf_event_remove_from_context(event); |
5711 | list_for_each_entry_safe(event, tmp, &ctx->flexible_groups, group_entry) | 6237 | list_for_each_entry_safe(event, tmp, &ctx->flexible_groups, group_entry) |
5712 | __perf_event_remove_from_context(event); | 6238 | __perf_event_remove_from_context(event); |
5713 | } | 6239 | } |
6240 | |||
6241 | static void perf_event_exit_cpu_context(int cpu) | ||
6242 | { | ||
6243 | struct perf_event_context *ctx; | ||
6244 | struct pmu *pmu; | ||
6245 | int idx; | ||
6246 | |||
6247 | idx = srcu_read_lock(&pmus_srcu); | ||
6248 | list_for_each_entry_rcu(pmu, &pmus, entry) { | ||
6249 | ctx = &per_cpu_ptr(pmu->pmu_cpu_context, cpu)->ctx; | ||
6250 | |||
6251 | mutex_lock(&ctx->mutex); | ||
6252 | smp_call_function_single(cpu, __perf_event_exit_context, ctx, 1); | ||
6253 | mutex_unlock(&ctx->mutex); | ||
6254 | } | ||
6255 | srcu_read_unlock(&pmus_srcu, idx); | ||
6256 | } | ||
6257 | |||
5714 | static void perf_event_exit_cpu(int cpu) | 6258 | static void perf_event_exit_cpu(int cpu) |
5715 | { | 6259 | { |
5716 | struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu); | 6260 | struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu); |
5717 | struct perf_event_context *ctx = &cpuctx->ctx; | ||
5718 | 6261 | ||
5719 | mutex_lock(&cpuctx->hlist_mutex); | 6262 | mutex_lock(&swhash->hlist_mutex); |
5720 | swevent_hlist_release(cpuctx); | 6263 | swevent_hlist_release(swhash); |
5721 | mutex_unlock(&cpuctx->hlist_mutex); | 6264 | mutex_unlock(&swhash->hlist_mutex); |
5722 | 6265 | ||
5723 | mutex_lock(&ctx->mutex); | 6266 | perf_event_exit_cpu_context(cpu); |
5724 | smp_call_function_single(cpu, __perf_event_exit_cpu, NULL, 1); | ||
5725 | mutex_unlock(&ctx->mutex); | ||
5726 | } | 6267 | } |
5727 | #else | 6268 | #else |
5728 | static inline void perf_event_exit_cpu(int cpu) { } | 6269 | static inline void perf_event_exit_cpu(int cpu) { } |
@@ -5733,15 +6274,15 @@ perf_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu) | |||
5733 | { | 6274 | { |
5734 | unsigned int cpu = (long)hcpu; | 6275 | unsigned int cpu = (long)hcpu; |
5735 | 6276 | ||
5736 | switch (action) { | 6277 | switch (action & ~CPU_TASKS_FROZEN) { |
5737 | 6278 | ||
5738 | case CPU_UP_PREPARE: | 6279 | case CPU_UP_PREPARE: |
5739 | case CPU_UP_PREPARE_FROZEN: | 6280 | case CPU_DOWN_FAILED: |
5740 | perf_event_init_cpu(cpu); | 6281 | perf_event_init_cpu(cpu); |
5741 | break; | 6282 | break; |
5742 | 6283 | ||
6284 | case CPU_UP_CANCELED: | ||
5743 | case CPU_DOWN_PREPARE: | 6285 | case CPU_DOWN_PREPARE: |
5744 | case CPU_DOWN_PREPARE_FROZEN: | ||
5745 | perf_event_exit_cpu(cpu); | 6286 | perf_event_exit_cpu(cpu); |
5746 | break; | 6287 | break; |
5747 | 6288 | ||
@@ -5752,118 +6293,13 @@ perf_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu) | |||
5752 | return NOTIFY_OK; | 6293 | return NOTIFY_OK; |
5753 | } | 6294 | } |
5754 | 6295 | ||
5755 | /* | ||
5756 | * This has to have a higher priority than migration_notifier in sched.c. | ||
5757 | */ | ||
5758 | static struct notifier_block __cpuinitdata perf_cpu_nb = { | ||
5759 | .notifier_call = perf_cpu_notify, | ||
5760 | .priority = 20, | ||
5761 | }; | ||
5762 | |||
5763 | void __init perf_event_init(void) | 6296 | void __init perf_event_init(void) |
5764 | { | 6297 | { |
5765 | perf_event_init_all_cpus(); | 6298 | perf_event_init_all_cpus(); |
5766 | perf_cpu_notify(&perf_cpu_nb, (unsigned long)CPU_UP_PREPARE, | 6299 | init_srcu_struct(&pmus_srcu); |
5767 | (void *)(long)smp_processor_id()); | 6300 | perf_pmu_register(&perf_swevent); |
5768 | perf_cpu_notify(&perf_cpu_nb, (unsigned long)CPU_ONLINE, | 6301 | perf_pmu_register(&perf_cpu_clock); |
5769 | (void *)(long)smp_processor_id()); | 6302 | perf_pmu_register(&perf_task_clock); |
5770 | register_cpu_notifier(&perf_cpu_nb); | 6303 | perf_tp_register(); |
5771 | } | 6304 | perf_cpu_notifier(perf_cpu_notify); |
5772 | |||
5773 | static ssize_t perf_show_reserve_percpu(struct sysdev_class *class, | ||
5774 | struct sysdev_class_attribute *attr, | ||
5775 | char *buf) | ||
5776 | { | ||
5777 | return sprintf(buf, "%d\n", perf_reserved_percpu); | ||
5778 | } | ||
5779 | |||
5780 | static ssize_t | ||
5781 | perf_set_reserve_percpu(struct sysdev_class *class, | ||
5782 | struct sysdev_class_attribute *attr, | ||
5783 | const char *buf, | ||
5784 | size_t count) | ||
5785 | { | ||
5786 | struct perf_cpu_context *cpuctx; | ||
5787 | unsigned long val; | ||
5788 | int err, cpu, mpt; | ||
5789 | |||
5790 | err = strict_strtoul(buf, 10, &val); | ||
5791 | if (err) | ||
5792 | return err; | ||
5793 | if (val > perf_max_events) | ||
5794 | return -EINVAL; | ||
5795 | |||
5796 | spin_lock(&perf_resource_lock); | ||
5797 | perf_reserved_percpu = val; | ||
5798 | for_each_online_cpu(cpu) { | ||
5799 | cpuctx = &per_cpu(perf_cpu_context, cpu); | ||
5800 | raw_spin_lock_irq(&cpuctx->ctx.lock); | ||
5801 | mpt = min(perf_max_events - cpuctx->ctx.nr_events, | ||
5802 | perf_max_events - perf_reserved_percpu); | ||
5803 | cpuctx->max_pertask = mpt; | ||
5804 | raw_spin_unlock_irq(&cpuctx->ctx.lock); | ||
5805 | } | ||
5806 | spin_unlock(&perf_resource_lock); | ||
5807 | |||
5808 | return count; | ||
5809 | } | ||
5810 | |||
5811 | static ssize_t perf_show_overcommit(struct sysdev_class *class, | ||
5812 | struct sysdev_class_attribute *attr, | ||
5813 | char *buf) | ||
5814 | { | ||
5815 | return sprintf(buf, "%d\n", perf_overcommit); | ||
5816 | } | ||
5817 | |||
5818 | static ssize_t | ||
5819 | perf_set_overcommit(struct sysdev_class *class, | ||
5820 | struct sysdev_class_attribute *attr, | ||
5821 | const char *buf, size_t count) | ||
5822 | { | ||
5823 | unsigned long val; | ||
5824 | int err; | ||
5825 | |||
5826 | err = strict_strtoul(buf, 10, &val); | ||
5827 | if (err) | ||
5828 | return err; | ||
5829 | if (val > 1) | ||
5830 | return -EINVAL; | ||
5831 | |||
5832 | spin_lock(&perf_resource_lock); | ||
5833 | perf_overcommit = val; | ||
5834 | spin_unlock(&perf_resource_lock); | ||
5835 | |||
5836 | return count; | ||
5837 | } | ||
5838 | |||
5839 | static SYSDEV_CLASS_ATTR( | ||
5840 | reserve_percpu, | ||
5841 | 0644, | ||
5842 | perf_show_reserve_percpu, | ||
5843 | perf_set_reserve_percpu | ||
5844 | ); | ||
5845 | |||
5846 | static SYSDEV_CLASS_ATTR( | ||
5847 | overcommit, | ||
5848 | 0644, | ||
5849 | perf_show_overcommit, | ||
5850 | perf_set_overcommit | ||
5851 | ); | ||
5852 | |||
5853 | static struct attribute *perfclass_attrs[] = { | ||
5854 | &attr_reserve_percpu.attr, | ||
5855 | &attr_overcommit.attr, | ||
5856 | NULL | ||
5857 | }; | ||
5858 | |||
5859 | static struct attribute_group perfclass_attr_group = { | ||
5860 | .attrs = perfclass_attrs, | ||
5861 | .name = "perf_events", | ||
5862 | }; | ||
5863 | |||
5864 | static int __init perf_event_sysfs_init(void) | ||
5865 | { | ||
5866 | return sysfs_create_group(&cpu_sysdev_class.kset.kobj, | ||
5867 | &perfclass_attr_group); | ||
5868 | } | 6305 | } |
5869 | device_initcall(perf_event_sysfs_init); | ||
diff --git a/kernel/pid.c b/kernel/pid.c index e9fd8c132d26..39b65b69584f 100644 --- a/kernel/pid.c +++ b/kernel/pid.c | |||
@@ -122,6 +122,43 @@ static void free_pidmap(struct upid *upid) | |||
122 | atomic_inc(&map->nr_free); | 122 | atomic_inc(&map->nr_free); |
123 | } | 123 | } |
124 | 124 | ||
125 | /* | ||
126 | * If we started walking pids at 'base', is 'a' seen before 'b'? | ||
127 | */ | ||
128 | static int pid_before(int base, int a, int b) | ||
129 | { | ||
130 | /* | ||
131 | * This is the same as saying | ||
132 | * | ||
133 | * (a - base + MAXUINT) % MAXUINT < (b - base + MAXUINT) % MAXUINT | ||
134 | * and that mapping orders 'a' and 'b' with respect to 'base'. | ||
135 | */ | ||
136 | return (unsigned)(a - base) < (unsigned)(b - base); | ||
137 | } | ||
138 | |||
139 | /* | ||
140 | * We might be racing with someone else trying to set pid_ns->last_pid. | ||
141 | * We want the winner to have the "later" value, because if the | ||
142 | * "earlier" value prevails, then a pid may get reused immediately. | ||
143 | * | ||
144 | * Since pids rollover, it is not sufficient to just pick the bigger | ||
145 | * value. We have to consider where we started counting from. | ||
146 | * | ||
147 | * 'base' is the value of pid_ns->last_pid that we observed when | ||
148 | * we started looking for a pid. | ||
149 | * | ||
150 | * 'pid' is the pid that we eventually found. | ||
151 | */ | ||
152 | static void set_last_pid(struct pid_namespace *pid_ns, int base, int pid) | ||
153 | { | ||
154 | int prev; | ||
155 | int last_write = base; | ||
156 | do { | ||
157 | prev = last_write; | ||
158 | last_write = cmpxchg(&pid_ns->last_pid, prev, pid); | ||
159 | } while ((prev != last_write) && (pid_before(base, last_write, pid))); | ||
160 | } | ||
161 | |||
125 | static int alloc_pidmap(struct pid_namespace *pid_ns) | 162 | static int alloc_pidmap(struct pid_namespace *pid_ns) |
126 | { | 163 | { |
127 | int i, offset, max_scan, pid, last = pid_ns->last_pid; | 164 | int i, offset, max_scan, pid, last = pid_ns->last_pid; |
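
pid_before() and set_last_pid() above give last_pid a lock-free "only move forward" update: the unsigned subtraction orders candidate pids relative to the point where the walk started, so a value that wrapped past pid_max still counts as later, and the cmpxchg loop retries only while the value already stored is earlier than the one just allocated. A user-space equivalent using C11 atomics (simplified types; the kernel operates on pid_ns->last_pid with cmpxchg()):

#include <stdatomic.h>
#include <stdio.h>

/* Counting upward (with wraparound) from 'base', is 'a' reached before 'b'? */
static int pid_before(int base, int a, int b)
{
        return (unsigned)(a - base) < (unsigned)(b - base);
}

/*
 * Store 'pid' unless a racing allocator already stored something that,
 * seen from 'base', comes after it.  On CAS failure 'observed' is updated
 * to the current value, which matches the kernel's prev/last_write dance.
 */
static void set_last_pid(atomic_int *last_pid, int base, int pid)
{
        int observed = base;

        while (!atomic_compare_exchange_strong(last_pid, &observed, pid) &&
               pid_before(base, observed, pid))
                ;       /* lost to an *earlier* value: try again */
}

int main(void)
{
        atomic_int last = 30000;

        /* After wrapping past pid_max, 300 is later than 31000 when the
         * walk started at 30000, even though it is numerically smaller. */
        printf("%d\n", pid_before(30000, 31000, 300));  /* 1 */

        set_last_pid(&last, 30000, 300);
        printf("%d\n", atomic_load(&last));             /* 300 */
        return 0;
}
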
@@ -132,7 +169,12 @@ static int alloc_pidmap(struct pid_namespace *pid_ns) | |||
132 | pid = RESERVED_PIDS; | 169 | pid = RESERVED_PIDS; |
133 | offset = pid & BITS_PER_PAGE_MASK; | 170 | offset = pid & BITS_PER_PAGE_MASK; |
134 | map = &pid_ns->pidmap[pid/BITS_PER_PAGE]; | 171 | map = &pid_ns->pidmap[pid/BITS_PER_PAGE]; |
135 | max_scan = (pid_max + BITS_PER_PAGE - 1)/BITS_PER_PAGE - !offset; | 172 | /* |
173 | * If last_pid points into the middle of the map->page we | ||
174 | * want to scan this bitmap block twice, the second time | ||
175 | * we start with offset == 0 (or RESERVED_PIDS). | ||
176 | */ | ||
177 | max_scan = DIV_ROUND_UP(pid_max, BITS_PER_PAGE) - !offset; | ||
136 | for (i = 0; i <= max_scan; ++i) { | 178 | for (i = 0; i <= max_scan; ++i) { |
137 | if (unlikely(!map->page)) { | 179 | if (unlikely(!map->page)) { |
138 | void *page = kzalloc(PAGE_SIZE, GFP_KERNEL); | 180 | void *page = kzalloc(PAGE_SIZE, GFP_KERNEL); |
@@ -154,20 +196,12 @@ static int alloc_pidmap(struct pid_namespace *pid_ns) | |||
154 | do { | 196 | do { |
155 | if (!test_and_set_bit(offset, map->page)) { | 197 | if (!test_and_set_bit(offset, map->page)) { |
156 | atomic_dec(&map->nr_free); | 198 | atomic_dec(&map->nr_free); |
157 | pid_ns->last_pid = pid; | 199 | set_last_pid(pid_ns, last, pid); |
158 | return pid; | 200 | return pid; |
159 | } | 201 | } |
160 | offset = find_next_offset(map, offset); | 202 | offset = find_next_offset(map, offset); |
161 | pid = mk_pid(pid_ns, map, offset); | 203 | pid = mk_pid(pid_ns, map, offset); |
162 | /* | 204 | } while (offset < BITS_PER_PAGE && pid < pid_max); |
163 | * find_next_offset() found a bit, the pid from it | ||
164 | * is in-bounds, and if we fell back to the last | ||
165 | * bitmap block and the final block was the same | ||
166 | * as the starting point, pid is before last_pid. | ||
167 | */ | ||
168 | } while (offset < BITS_PER_PAGE && pid < pid_max && | ||
169 | (i != max_scan || pid < last || | ||
170 | !((last+1) & BITS_PER_PAGE_MASK))); | ||
171 | } | 205 | } |
172 | if (map < &pid_ns->pidmap[(pid_max-1)/BITS_PER_PAGE]) { | 206 | if (map < &pid_ns->pidmap[(pid_max-1)/BITS_PER_PAGE]) { |
173 | ++map; | 207 | ++map; |
@@ -367,7 +401,7 @@ struct task_struct *pid_task(struct pid *pid, enum pid_type type) | |||
367 | struct task_struct *result = NULL; | 401 | struct task_struct *result = NULL; |
368 | if (pid) { | 402 | if (pid) { |
369 | struct hlist_node *first; | 403 | struct hlist_node *first; |
370 | first = rcu_dereference_check(pid->tasks[type].first, | 404 | first = rcu_dereference_check(hlist_first_rcu(&pid->tasks[type]), |
371 | rcu_read_lock_held() || | 405 | rcu_read_lock_held() || |
372 | lockdep_tasklist_lock_is_held()); | 406 | lockdep_tasklist_lock_is_held()); |
373 | if (first) | 407 | if (first) |
@@ -382,6 +416,7 @@ EXPORT_SYMBOL(pid_task); | |||
382 | */ | 416 | */ |
383 | struct task_struct *find_task_by_pid_ns(pid_t nr, struct pid_namespace *ns) | 417 | struct task_struct *find_task_by_pid_ns(pid_t nr, struct pid_namespace *ns) |
384 | { | 418 | { |
419 | rcu_lockdep_assert(rcu_read_lock_held()); | ||
385 | return pid_task(find_pid_ns(nr, ns), PIDTYPE_PID); | 420 | return pid_task(find_pid_ns(nr, ns), PIDTYPE_PID); |
386 | } | 421 | } |
387 | 422 | ||
diff --git a/kernel/pm_qos_params.c b/kernel/pm_qos_params.c index f42d3f737a33..c7a8f453919e 100644 --- a/kernel/pm_qos_params.c +++ b/kernel/pm_qos_params.c | |||
@@ -48,59 +48,49 @@ | |||
48 | * or pm_qos_object list and pm_qos_objects need to happen with pm_qos_lock | 48 | * or pm_qos_object list and pm_qos_objects need to happen with pm_qos_lock |
49 | * held, taken with _irqsave. One lock to rule them all | 49 | * held, taken with _irqsave. One lock to rule them all |
50 | */ | 50 | */ |
51 | struct pm_qos_request_list { | 51 | enum pm_qos_type { |
52 | struct list_head list; | 52 | PM_QOS_MAX, /* return the largest value */ |
53 | union { | 53 | PM_QOS_MIN /* return the smallest value */ |
54 | s32 value; | ||
55 | s32 usec; | ||
56 | s32 kbps; | ||
57 | }; | ||
58 | int pm_qos_class; | ||
59 | }; | 54 | }; |
60 | 55 | ||
61 | static s32 max_compare(s32 v1, s32 v2); | ||
62 | static s32 min_compare(s32 v1, s32 v2); | ||
63 | |||
64 | struct pm_qos_object { | 56 | struct pm_qos_object { |
65 | struct pm_qos_request_list requests; | 57 | struct plist_head requests; |
66 | struct blocking_notifier_head *notifiers; | 58 | struct blocking_notifier_head *notifiers; |
67 | struct miscdevice pm_qos_power_miscdev; | 59 | struct miscdevice pm_qos_power_miscdev; |
68 | char *name; | 60 | char *name; |
69 | s32 default_value; | 61 | s32 default_value; |
70 | atomic_t target_value; | 62 | enum pm_qos_type type; |
71 | s32 (*comparitor)(s32, s32); | ||
72 | }; | 63 | }; |
73 | 64 | ||
65 | static DEFINE_SPINLOCK(pm_qos_lock); | ||
66 | |||
74 | static struct pm_qos_object null_pm_qos; | 67 | static struct pm_qos_object null_pm_qos; |
75 | static BLOCKING_NOTIFIER_HEAD(cpu_dma_lat_notifier); | 68 | static BLOCKING_NOTIFIER_HEAD(cpu_dma_lat_notifier); |
76 | static struct pm_qos_object cpu_dma_pm_qos = { | 69 | static struct pm_qos_object cpu_dma_pm_qos = { |
77 | .requests = {LIST_HEAD_INIT(cpu_dma_pm_qos.requests.list)}, | 70 | .requests = PLIST_HEAD_INIT(cpu_dma_pm_qos.requests, pm_qos_lock), |
78 | .notifiers = &cpu_dma_lat_notifier, | 71 | .notifiers = &cpu_dma_lat_notifier, |
79 | .name = "cpu_dma_latency", | 72 | .name = "cpu_dma_latency", |
80 | .default_value = 2000 * USEC_PER_SEC, | 73 | .default_value = 2000 * USEC_PER_SEC, |
81 | .target_value = ATOMIC_INIT(2000 * USEC_PER_SEC), | 74 | .type = PM_QOS_MIN, |
82 | .comparitor = min_compare | ||
83 | }; | 75 | }; |
84 | 76 | ||
85 | static BLOCKING_NOTIFIER_HEAD(network_lat_notifier); | 77 | static BLOCKING_NOTIFIER_HEAD(network_lat_notifier); |
86 | static struct pm_qos_object network_lat_pm_qos = { | 78 | static struct pm_qos_object network_lat_pm_qos = { |
87 | .requests = {LIST_HEAD_INIT(network_lat_pm_qos.requests.list)}, | 79 | .requests = PLIST_HEAD_INIT(network_lat_pm_qos.requests, pm_qos_lock), |
88 | .notifiers = &network_lat_notifier, | 80 | .notifiers = &network_lat_notifier, |
89 | .name = "network_latency", | 81 | .name = "network_latency", |
90 | .default_value = 2000 * USEC_PER_SEC, | 82 | .default_value = 2000 * USEC_PER_SEC, |
91 | .target_value = ATOMIC_INIT(2000 * USEC_PER_SEC), | 83 | .type = PM_QOS_MIN |
92 | .comparitor = min_compare | ||
93 | }; | 84 | }; |
94 | 85 | ||
95 | 86 | ||
96 | static BLOCKING_NOTIFIER_HEAD(network_throughput_notifier); | 87 | static BLOCKING_NOTIFIER_HEAD(network_throughput_notifier); |
97 | static struct pm_qos_object network_throughput_pm_qos = { | 88 | static struct pm_qos_object network_throughput_pm_qos = { |
98 | .requests = {LIST_HEAD_INIT(network_throughput_pm_qos.requests.list)}, | 89 | .requests = PLIST_HEAD_INIT(network_throughput_pm_qos.requests, pm_qos_lock), |
99 | .notifiers = &network_throughput_notifier, | 90 | .notifiers = &network_throughput_notifier, |
100 | .name = "network_throughput", | 91 | .name = "network_throughput", |
101 | .default_value = 0, | 92 | .default_value = 0, |
102 | .target_value = ATOMIC_INIT(0), | 93 | .type = PM_QOS_MAX, |
103 | .comparitor = max_compare | ||
104 | }; | 94 | }; |
105 | 95 | ||
106 | 96 | ||
@@ -111,8 +101,6 @@ static struct pm_qos_object *pm_qos_array[] = { | |||
111 | &network_throughput_pm_qos | 101 | &network_throughput_pm_qos |
112 | }; | 102 | }; |
113 | 103 | ||
114 | static DEFINE_SPINLOCK(pm_qos_lock); | ||
115 | |||
116 | static ssize_t pm_qos_power_write(struct file *filp, const char __user *buf, | 104 | static ssize_t pm_qos_power_write(struct file *filp, const char __user *buf, |
117 | size_t count, loff_t *f_pos); | 105 | size_t count, loff_t *f_pos); |
118 | static int pm_qos_power_open(struct inode *inode, struct file *filp); | 106 | static int pm_qos_power_open(struct inode *inode, struct file *filp); |
@@ -122,48 +110,58 @@ static const struct file_operations pm_qos_power_fops = { | |||
122 | .write = pm_qos_power_write, | 110 | .write = pm_qos_power_write, |
123 | .open = pm_qos_power_open, | 111 | .open = pm_qos_power_open, |
124 | .release = pm_qos_power_release, | 112 | .release = pm_qos_power_release, |
113 | .llseek = noop_llseek, | ||
125 | }; | 114 | }; |
126 | 115 | ||
127 | /* static helper functions */ | 116 | /* unlocked internal variant */ |
128 | static s32 max_compare(s32 v1, s32 v2) | 117 | static inline int pm_qos_get_value(struct pm_qos_object *o) |
129 | { | 118 | { |
130 | return max(v1, v2); | 119 | if (plist_head_empty(&o->requests)) |
131 | } | 120 | return o->default_value; |
132 | 121 | ||
133 | static s32 min_compare(s32 v1, s32 v2) | 122 | switch (o->type) { |
134 | { | 123 | case PM_QOS_MIN: |
135 | return min(v1, v2); | 124 | return plist_last(&o->requests)->prio; |
136 | } | 125 | |
126 | case PM_QOS_MAX: | ||
127 | return plist_first(&o->requests)->prio; | ||
137 | 128 | ||
129 | default: | ||
130 | /* runtime check for not using enum */ | ||
131 | BUG(); | ||
132 | } | ||
133 | } | ||
138 | 134 | ||
139 | static void update_target(int pm_qos_class) | 135 | static void update_target(struct pm_qos_object *o, struct plist_node *node, |
136 | int del, int value) | ||
140 | { | 137 | { |
141 | s32 extreme_value; | ||
142 | struct pm_qos_request_list *node; | ||
143 | unsigned long flags; | 138 | unsigned long flags; |
144 | int call_notifier = 0; | 139 | int prev_value, curr_value; |
145 | 140 | ||
146 | spin_lock_irqsave(&pm_qos_lock, flags); | 141 | spin_lock_irqsave(&pm_qos_lock, flags); |
147 | extreme_value = pm_qos_array[pm_qos_class]->default_value; | 142 | prev_value = pm_qos_get_value(o); |
148 | list_for_each_entry(node, | 143 | /* PM_QOS_DEFAULT_VALUE is a signal that the value is unchanged */ |
149 | &pm_qos_array[pm_qos_class]->requests.list, list) { | 144 | if (value != PM_QOS_DEFAULT_VALUE) { |
150 | extreme_value = pm_qos_array[pm_qos_class]->comparitor( | 145 | /* |
151 | extreme_value, node->value); | 146 | * to change the list, we atomically remove, reinit |
152 | } | 147 | * with new value and add, then see if the extremal |
153 | if (atomic_read(&pm_qos_array[pm_qos_class]->target_value) != | 148 | * changed |
154 | extreme_value) { | 149 | */ |
155 | call_notifier = 1; | 150 | plist_del(node, &o->requests); |
156 | atomic_set(&pm_qos_array[pm_qos_class]->target_value, | 151 | plist_node_init(node, value); |
157 | extreme_value); | 152 | plist_add(node, &o->requests); |
158 | pr_debug(KERN_ERR "new target for qos %d is %d\n", pm_qos_class, | 153 | } else if (del) { |
159 | atomic_read(&pm_qos_array[pm_qos_class]->target_value)); | 154 | plist_del(node, &o->requests); |
155 | } else { | ||
156 | plist_add(node, &o->requests); | ||
160 | } | 157 | } |
158 | curr_value = pm_qos_get_value(o); | ||
161 | spin_unlock_irqrestore(&pm_qos_lock, flags); | 159 | spin_unlock_irqrestore(&pm_qos_lock, flags); |
162 | 160 | ||
163 | if (call_notifier) | 161 | if (prev_value != curr_value) |
164 | blocking_notifier_call_chain( | 162 | blocking_notifier_call_chain(o->notifiers, |
165 | pm_qos_array[pm_qos_class]->notifiers, | 163 | (unsigned long)curr_value, |
166 | (unsigned long) extreme_value, NULL); | 164 | NULL); |
167 | } | 165 | } |
168 | 166 | ||
169 | static int register_pm_qos_misc(struct pm_qos_object *qos) | 167 | static int register_pm_qos_misc(struct pm_qos_object *qos) |
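The plist conversion above is the core of this change: instead of walking every request with a per-class comparator callback, each request is kept in a priority-sorted list keyed by its requested value, so the aggregate for a class is simply the node at one end of the list, and update_target() only has to delete, re-initialise and re-add the node that changed. A minimal userspace sketch of the same idea, using a plain sorted array rather than the kernel's plist (names below are illustrative only):

#include <stdio.h>

#define MAX_REQ 8

static int req[MAX_REQ];    /* requested values, kept sorted ascending */
static int nreq;

/* insert while keeping the array sorted; this plays the role of plist_add()
 * with the request value used as the priority */
static void req_add(int value)
{
    int i = nreq++;

    while (i > 0 && req[i - 1] > value) {
        req[i] = req[i - 1];
        i--;
    }
    req[i] = value;
}

/* with a value-sorted list the extremes sit at the ends, so reading the
 * aggregate is O(1) - this is what pm_qos_get_value() exploits */
static int req_min(int def) { return nreq ? req[0] : def; }
static int req_max(int def) { return nreq ? req[nreq - 1] : def; }

int main(void)
{
    req_add(100);
    req_add(25);
    req_add(400);
    printf("min=%d max=%d\n", req_min(-1), req_max(-1));   /* min=25 max=400 */
    return 0;
}

Which end of the kernel's plist corresponds to PM_QOS_MIN and which to PM_QOS_MAX follows from plist's own priority ordering; the sketch only illustrates why a sorted container makes both lookups constant time.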
@@ -196,42 +194,53 @@ static int find_pm_qos_object_by_minor(int minor) | |||
196 | */ | 194 | */ |
197 | int pm_qos_request(int pm_qos_class) | 195 | int pm_qos_request(int pm_qos_class) |
198 | { | 196 | { |
199 | return atomic_read(&pm_qos_array[pm_qos_class]->target_value); | 197 | unsigned long flags; |
198 | int value; | ||
199 | |||
200 | spin_lock_irqsave(&pm_qos_lock, flags); | ||
201 | value = pm_qos_get_value(pm_qos_array[pm_qos_class]); | ||
202 | spin_unlock_irqrestore(&pm_qos_lock, flags); | ||
203 | |||
204 | return value; | ||
200 | } | 205 | } |
201 | EXPORT_SYMBOL_GPL(pm_qos_request); | 206 | EXPORT_SYMBOL_GPL(pm_qos_request); |
202 | 207 | ||
208 | int pm_qos_request_active(struct pm_qos_request_list *req) | ||
209 | { | ||
210 | return req->pm_qos_class != 0; | ||
211 | } | ||
212 | EXPORT_SYMBOL_GPL(pm_qos_request_active); | ||
213 | |||
203 | /** | 214 | /** |
204 | * pm_qos_add_request - inserts new qos request into the list | 215 | * pm_qos_add_request - inserts new qos request into the list |
205 | * @pm_qos_class: identifies which list of qos request to us | 216 | * @dep: pointer to a preallocated handle |
217 | * @pm_qos_class: identifies which list of qos request to use | ||
206 | * @value: defines the qos request | 218 | * @value: defines the qos request |
207 | * | 219 | * |
208 | * This function inserts a new entry in the pm_qos_class list of requested qos | 220 | * This function inserts a new entry in the pm_qos_class list of requested qos |
209 | * performance characteristics. It recomputes the aggregate QoS expectations | 221 | * performance characteristics. It recomputes the aggregate QoS expectations |
210 | * for the pm_qos_class of parameters, and returns the pm_qos_request list | 222 | * for the pm_qos_class of parameters and initializes the pm_qos_request_list |
211 | * element as a handle for use in updating and removal. Call needs to save | 223 | * handle. Caller needs to save this handle for later use in updates and |
212 | * this handle for later use. | 224 | * removal. |
213 | */ | 225 | */ |
214 | struct pm_qos_request_list *pm_qos_add_request(int pm_qos_class, s32 value) | 226 | |
227 | void pm_qos_add_request(struct pm_qos_request_list *dep, | ||
228 | int pm_qos_class, s32 value) | ||
215 | { | 229 | { |
216 | struct pm_qos_request_list *dep; | 230 | struct pm_qos_object *o = pm_qos_array[pm_qos_class]; |
217 | unsigned long flags; | 231 | int new_value; |
218 | 232 | ||
219 | dep = kzalloc(sizeof(struct pm_qos_request_list), GFP_KERNEL); | 233 | if (pm_qos_request_active(dep)) { |
220 | if (dep) { | 234 | WARN(1, KERN_ERR "pm_qos_add_request() called for already added request\n"); |
221 | if (value == PM_QOS_DEFAULT_VALUE) | 235 | return; |
222 | dep->value = pm_qos_array[pm_qos_class]->default_value; | ||
223 | else | ||
224 | dep->value = value; | ||
225 | dep->pm_qos_class = pm_qos_class; | ||
226 | |||
227 | spin_lock_irqsave(&pm_qos_lock, flags); | ||
228 | list_add(&dep->list, | ||
229 | &pm_qos_array[pm_qos_class]->requests.list); | ||
230 | spin_unlock_irqrestore(&pm_qos_lock, flags); | ||
231 | update_target(pm_qos_class); | ||
232 | } | 236 | } |
233 | 237 | if (value == PM_QOS_DEFAULT_VALUE) | |
234 | return dep; | 238 | new_value = o->default_value; |
239 | else | ||
240 | new_value = value; | ||
241 | plist_node_init(&dep->list, new_value); | ||
242 | dep->pm_qos_class = pm_qos_class; | ||
243 | update_target(o, &dep->list, 0, PM_QOS_DEFAULT_VALUE); | ||
235 | } | 244 | } |
236 | EXPORT_SYMBOL_GPL(pm_qos_add_request); | 245 | EXPORT_SYMBOL_GPL(pm_qos_add_request); |
237 | 246 | ||
@@ -246,27 +255,28 @@ EXPORT_SYMBOL_GPL(pm_qos_add_request); | |||
246 | * Attempts are made to make this code callable on hot code paths. | 255 | * Attempts are made to make this code callable on hot code paths. |
247 | */ | 256 | */ |
248 | void pm_qos_update_request(struct pm_qos_request_list *pm_qos_req, | 257 | void pm_qos_update_request(struct pm_qos_request_list *pm_qos_req, |
249 | s32 new_value) | 258 | s32 new_value) |
250 | { | 259 | { |
251 | unsigned long flags; | ||
252 | int pending_update = 0; | ||
253 | s32 temp; | 260 | s32 temp; |
261 | struct pm_qos_object *o; | ||
262 | |||
263 | if (!pm_qos_req) /*guard against callers passing in null */ | ||
264 | return; | ||
254 | 265 | ||
255 | if (pm_qos_req) { /*guard against callers passing in null */ | 266 | if (!pm_qos_request_active(pm_qos_req)) { |
256 | spin_lock_irqsave(&pm_qos_lock, flags); | 267 | WARN(1, KERN_ERR "pm_qos_update_request() called for unknown object\n"); |
257 | if (new_value == PM_QOS_DEFAULT_VALUE) | 268 | return; |
258 | temp = pm_qos_array[pm_qos_req->pm_qos_class]->default_value; | ||
259 | else | ||
260 | temp = new_value; | ||
261 | |||
262 | if (temp != pm_qos_req->value) { | ||
263 | pending_update = 1; | ||
264 | pm_qos_req->value = temp; | ||
265 | } | ||
266 | spin_unlock_irqrestore(&pm_qos_lock, flags); | ||
267 | if (pending_update) | ||
268 | update_target(pm_qos_req->pm_qos_class); | ||
269 | } | 269 | } |
270 | |||
271 | o = pm_qos_array[pm_qos_req->pm_qos_class]; | ||
272 | |||
273 | if (new_value == PM_QOS_DEFAULT_VALUE) | ||
274 | temp = o->default_value; | ||
275 | else | ||
276 | temp = new_value; | ||
277 | |||
278 | if (temp != pm_qos_req->list.prio) | ||
279 | update_target(o, &pm_qos_req->list, 0, temp); | ||
270 | } | 280 | } |
271 | EXPORT_SYMBOL_GPL(pm_qos_update_request); | 281 | EXPORT_SYMBOL_GPL(pm_qos_update_request); |
272 | 282 | ||
@@ -280,19 +290,20 @@ EXPORT_SYMBOL_GPL(pm_qos_update_request); | |||
280 | */ | 290 | */ |
281 | void pm_qos_remove_request(struct pm_qos_request_list *pm_qos_req) | 291 | void pm_qos_remove_request(struct pm_qos_request_list *pm_qos_req) |
282 | { | 292 | { |
283 | unsigned long flags; | 293 | struct pm_qos_object *o; |
284 | int qos_class; | ||
285 | 294 | ||
286 | if (pm_qos_req == NULL) | 295 | if (pm_qos_req == NULL) |
287 | return; | 296 | return; |
288 | /* silent return to keep pcm code cleaner */ | 297 | /* silent return to keep pcm code cleaner */ |
289 | 298 | ||
290 | qos_class = pm_qos_req->pm_qos_class; | 299 | if (!pm_qos_request_active(pm_qos_req)) { |
291 | spin_lock_irqsave(&pm_qos_lock, flags); | 300 | WARN(1, KERN_ERR "pm_qos_remove_request() called for unknown object\n"); |
292 | list_del(&pm_qos_req->list); | 301 | return; |
293 | kfree(pm_qos_req); | 302 | } |
294 | spin_unlock_irqrestore(&pm_qos_lock, flags); | 303 | |
295 | update_target(qos_class); | 304 | o = pm_qos_array[pm_qos_req->pm_qos_class]; |
305 | update_target(o, &pm_qos_req->list, 1, PM_QOS_DEFAULT_VALUE); | ||
306 | memset(pm_qos_req, 0, sizeof(*pm_qos_req)); | ||
296 | } | 307 | } |
297 | EXPORT_SYMBOL_GPL(pm_qos_remove_request); | 308 | EXPORT_SYMBOL_GPL(pm_qos_remove_request); |
298 | 309 | ||
@@ -340,8 +351,12 @@ static int pm_qos_power_open(struct inode *inode, struct file *filp) | |||
340 | 351 | ||
341 | pm_qos_class = find_pm_qos_object_by_minor(iminor(inode)); | 352 | pm_qos_class = find_pm_qos_object_by_minor(iminor(inode)); |
342 | if (pm_qos_class >= 0) { | 353 | if (pm_qos_class >= 0) { |
343 | filp->private_data = (void *) pm_qos_add_request(pm_qos_class, | 354 | struct pm_qos_request_list *req = kzalloc(sizeof(*req), GFP_KERNEL); |
344 | PM_QOS_DEFAULT_VALUE); | 355 | if (!req) |
356 | return -ENOMEM; | ||
357 | |||
358 | pm_qos_add_request(req, pm_qos_class, PM_QOS_DEFAULT_VALUE); | ||
359 | filp->private_data = req; | ||
345 | 360 | ||
346 | if (filp->private_data) | 361 | if (filp->private_data) |
347 | return 0; | 362 | return 0; |
@@ -353,8 +368,9 @@ static int pm_qos_power_release(struct inode *inode, struct file *filp) | |||
353 | { | 368 | { |
354 | struct pm_qos_request_list *req; | 369 | struct pm_qos_request_list *req; |
355 | 370 | ||
356 | req = (struct pm_qos_request_list *)filp->private_data; | 371 | req = filp->private_data; |
357 | pm_qos_remove_request(req); | 372 | pm_qos_remove_request(req); |
373 | kfree(req); | ||
358 | 374 | ||
359 | return 0; | 375 | return 0; |
360 | } | 376 | } |
@@ -374,14 +390,16 @@ static ssize_t pm_qos_power_write(struct file *filp, const char __user *buf, | |||
374 | } else if (count == 11) { /* len('0x12345678/0') */ | 390 | } else if (count == 11) { /* len('0x12345678/0') */ |
375 | if (copy_from_user(ascii_value, buf, 11)) | 391 | if (copy_from_user(ascii_value, buf, 11)) |
376 | return -EFAULT; | 392 | return -EFAULT; |
393 | if (strlen(ascii_value) != 10) | ||
394 | return -EINVAL; | ||
377 | x = sscanf(ascii_value, "%x", &value); | 395 | x = sscanf(ascii_value, "%x", &value); |
378 | if (x != 1) | 396 | if (x != 1) |
379 | return -EINVAL; | 397 | return -EINVAL; |
380 | pr_debug(KERN_ERR "%s, %d, 0x%x\n", ascii_value, x, value); | 398 | pr_debug("%s, %d, 0x%x\n", ascii_value, x, value); |
381 | } else | 399 | } else |
382 | return -EINVAL; | 400 | return -EINVAL; |
383 | 401 | ||
384 | pm_qos_req = (struct pm_qos_request_list *)filp->private_data; | 402 | pm_qos_req = filp->private_data; |
385 | pm_qos_update_request(pm_qos_req, value); | 403 | pm_qos_update_request(pm_qos_req, value); |
386 | 404 | ||
387 | return count; | 405 | return count; |
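Taken together, pm_qos_power_open(), pm_qos_power_write() and pm_qos_power_release() above give each open file descriptor exactly one request: open adds it at PM_QOS_DEFAULT_VALUE, every write updates it, and close removes it again. A hedged userspace sketch of that lifecycle, assuming the misc device node carries the pm_qos object's .name (e.g. /dev/network_latency) and using the 11-byte hex form that the count == 11 branch above parses:

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
    /* opening the device registers a request at the default value */
    int fd = open("/dev/network_latency", O_RDWR);
    if (fd < 0) {
        perror("open");
        return 1;
    }

    /* ten hex characters plus the terminating NUL = 11 bytes, which is
     * exactly what the count == 11 / strlen() == 10 checks expect */
    static const char request[] = "0x000003e8";     /* 1000 usec */
    if (write(fd, request, sizeof(request)) < 0)
        perror("write");

    /* ... the constraint stays in effect while the fd is open ... */
    sleep(1);

    close(fd);      /* release drops the request again */
    return 0;
}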
diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c index 9829646d399c..6842eeba5879 100644 --- a/kernel/posix-cpu-timers.c +++ b/kernel/posix-cpu-timers.c | |||
@@ -16,13 +16,13 @@ | |||
16 | * siglock protection since other code may update expiration cache as | 16 | * siglock protection since other code may update expiration cache as |
17 | * well. | 17 | * well. |
18 | */ | 18 | */ |
19 | void update_rlimit_cpu(unsigned long rlim_new) | 19 | void update_rlimit_cpu(struct task_struct *task, unsigned long rlim_new) |
20 | { | 20 | { |
21 | cputime_t cputime = secs_to_cputime(rlim_new); | 21 | cputime_t cputime = secs_to_cputime(rlim_new); |
22 | 22 | ||
23 | spin_lock_irq(¤t->sighand->siglock); | 23 | spin_lock_irq(&task->sighand->siglock); |
24 | set_process_cpu_timer(current, CPUCLOCK_PROF, &cputime, NULL); | 24 | set_process_cpu_timer(task, CPUCLOCK_PROF, &cputime, NULL); |
25 | spin_unlock_irq(¤t->sighand->siglock); | 25 | spin_unlock_irq(&task->sighand->siglock); |
26 | } | 26 | } |
27 | 27 | ||
28 | static int check_clock(const clockid_t which_clock) | 28 | static int check_clock(const clockid_t which_clock) |
@@ -232,31 +232,24 @@ static int cpu_clock_sample(const clockid_t which_clock, struct task_struct *p, | |||
232 | 232 | ||
233 | void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times) | 233 | void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times) |
234 | { | 234 | { |
235 | struct sighand_struct *sighand; | 235 | struct signal_struct *sig = tsk->signal; |
236 | struct signal_struct *sig; | ||
237 | struct task_struct *t; | 236 | struct task_struct *t; |
238 | 237 | ||
239 | *times = INIT_CPUTIME; | 238 | times->utime = sig->utime; |
239 | times->stime = sig->stime; | ||
240 | times->sum_exec_runtime = sig->sum_sched_runtime; | ||
240 | 241 | ||
241 | rcu_read_lock(); | 242 | rcu_read_lock(); |
242 | sighand = rcu_dereference(tsk->sighand); | 243 | /* make sure we can trust tsk->thread_group list */ |
243 | if (!sighand) | 244 | if (!likely(pid_alive(tsk))) |
244 | goto out; | 245 | goto out; |
245 | 246 | ||
246 | sig = tsk->signal; | ||
247 | |||
248 | t = tsk; | 247 | t = tsk; |
249 | do { | 248 | do { |
250 | times->utime = cputime_add(times->utime, t->utime); | 249 | times->utime = cputime_add(times->utime, t->utime); |
251 | times->stime = cputime_add(times->stime, t->stime); | 250 | times->stime = cputime_add(times->stime, t->stime); |
252 | times->sum_exec_runtime += t->se.sum_exec_runtime; | 251 | times->sum_exec_runtime += t->se.sum_exec_runtime; |
253 | 252 | } while_each_thread(tsk, t); | |
254 | t = next_thread(t); | ||
255 | } while (t != tsk); | ||
256 | |||
257 | times->utime = cputime_add(times->utime, sig->utime); | ||
258 | times->stime = cputime_add(times->stime, sig->stime); | ||
259 | times->sum_exec_runtime += sig->sum_sched_runtime; | ||
260 | out: | 253 | out: |
261 | rcu_read_unlock(); | 254 | rcu_read_unlock(); |
262 | } | 255 | } |
@@ -1279,10 +1272,6 @@ static inline int fastpath_timer_check(struct task_struct *tsk) | |||
1279 | { | 1272 | { |
1280 | struct signal_struct *sig; | 1273 | struct signal_struct *sig; |
1281 | 1274 | ||
1282 | /* tsk == current, ensure it is safe to use ->signal/sighand */ | ||
1283 | if (unlikely(tsk->exit_state)) | ||
1284 | return 0; | ||
1285 | |||
1286 | if (!task_cputime_zero(&tsk->cputime_expires)) { | 1275 | if (!task_cputime_zero(&tsk->cputime_expires)) { |
1287 | struct task_cputime task_sample = { | 1276 | struct task_cputime task_sample = { |
1288 | .utime = tsk->utime, | 1277 | .utime = tsk->utime, |
@@ -1298,7 +1287,10 @@ static inline int fastpath_timer_check(struct task_struct *tsk) | |||
1298 | if (sig->cputimer.running) { | 1287 | if (sig->cputimer.running) { |
1299 | struct task_cputime group_sample; | 1288 | struct task_cputime group_sample; |
1300 | 1289 | ||
1301 | thread_group_cputimer(tsk, &group_sample); | 1290 | spin_lock(&sig->cputimer.lock); |
1291 | group_sample = sig->cputimer.cputime; | ||
1292 | spin_unlock(&sig->cputimer.lock); | ||
1293 | |||
1302 | if (task_cputime_expired(&group_sample, &sig->cputime_expires)) | 1294 | if (task_cputime_expired(&group_sample, &sig->cputime_expires)) |
1303 | return 1; | 1295 | return 1; |
1304 | } | 1296 | } |
@@ -1315,6 +1307,7 @@ void run_posix_cpu_timers(struct task_struct *tsk) | |||
1315 | { | 1307 | { |
1316 | LIST_HEAD(firing); | 1308 | LIST_HEAD(firing); |
1317 | struct k_itimer *timer, *next; | 1309 | struct k_itimer *timer, *next; |
1310 | unsigned long flags; | ||
1318 | 1311 | ||
1319 | BUG_ON(!irqs_disabled()); | 1312 | BUG_ON(!irqs_disabled()); |
1320 | 1313 | ||
@@ -1325,7 +1318,8 @@ void run_posix_cpu_timers(struct task_struct *tsk) | |||
1325 | if (!fastpath_timer_check(tsk)) | 1318 | if (!fastpath_timer_check(tsk)) |
1326 | return; | 1319 | return; |
1327 | 1320 | ||
1328 | spin_lock(&tsk->sighand->siglock); | 1321 | if (!lock_task_sighand(tsk, &flags)) |
1322 | return; | ||
1329 | /* | 1323 | /* |
1330 | * Here we take off tsk->signal->cpu_timers[N] and | 1324 | * Here we take off tsk->signal->cpu_timers[N] and |
1331 | * tsk->cpu_timers[N] all the timers that are firing, and | 1325 | * tsk->cpu_timers[N] all the timers that are firing, and |
@@ -1347,7 +1341,7 @@ void run_posix_cpu_timers(struct task_struct *tsk) | |||
1347 | * that gets the timer lock before we do will give it up and | 1341 | * that gets the timer lock before we do will give it up and |
1348 | * spin until we've taken care of that timer below. | 1342 | * spin until we've taken care of that timer below. |
1349 | */ | 1343 | */ |
1350 | spin_unlock(&tsk->sighand->siglock); | 1344 | unlock_task_sighand(tsk, &flags); |
1351 | 1345 | ||
1352 | /* | 1346 | /* |
1353 | * Now that all the timers on our list have the firing flag, | 1347 | * Now that all the timers on our list have the firing flag, |
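The rewritten thread_group_cputime() above starts from the totals that signal_struct already keeps for threads which have exited (sig->utime, sig->stime, sig->sum_sched_runtime) and then adds the still-running threads under the RCU read lock, rather than starting from INIT_CPUTIME and folding the signal totals in at the end. A tiny sketch of that accumulation model with made-up types (these are not the kernel structures):

#include <stdio.h>

struct thread_times { unsigned long utime, stime; };

/* totals already banked for threads that have exited, as signal_struct does */
struct group_times { unsigned long dead_utime, dead_stime; };

static void group_cputime(const struct group_times *g,
                          const struct thread_times *live, int nlive,
                          struct thread_times *out)
{
    /* seed with the exited threads' totals ... */
    out->utime = g->dead_utime;
    out->stime = g->dead_stime;

    /* ... then walk the live threads, as while_each_thread() does */
    for (int i = 0; i < nlive; i++) {
        out->utime += live[i].utime;
        out->stime += live[i].stime;
    }
}

int main(void)
{
    struct group_times g = { .dead_utime = 300, .dead_stime = 50 };
    struct thread_times live[] = { { 40, 10 }, { 25, 5 } };
    struct thread_times total;

    group_cputime(&g, live, 2, &total);
    printf("utime=%lu stime=%lu\n", total.utime, total.stime);  /* 365 65 */
    return 0;
}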
diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c index ad723420acc3..9ca4973f736d 100644 --- a/kernel/posix-timers.c +++ b/kernel/posix-timers.c | |||
@@ -560,11 +560,6 @@ SYSCALL_DEFINE3(timer_create, const clockid_t, which_clock, | |||
560 | new_timer->it_clock = which_clock; | 560 | new_timer->it_clock = which_clock; |
561 | new_timer->it_overrun = -1; | 561 | new_timer->it_overrun = -1; |
562 | 562 | ||
563 | if (copy_to_user(created_timer_id, | ||
564 | &new_timer_id, sizeof (new_timer_id))) { | ||
565 | error = -EFAULT; | ||
566 | goto out; | ||
567 | } | ||
568 | if (timer_event_spec) { | 563 | if (timer_event_spec) { |
569 | if (copy_from_user(&event, timer_event_spec, sizeof (event))) { | 564 | if (copy_from_user(&event, timer_event_spec, sizeof (event))) { |
570 | error = -EFAULT; | 565 | error = -EFAULT; |
@@ -590,6 +585,12 @@ SYSCALL_DEFINE3(timer_create, const clockid_t, which_clock, | |||
590 | new_timer->sigq->info.si_tid = new_timer->it_id; | 585 | new_timer->sigq->info.si_tid = new_timer->it_id; |
591 | new_timer->sigq->info.si_code = SI_TIMER; | 586 | new_timer->sigq->info.si_code = SI_TIMER; |
592 | 587 | ||
588 | if (copy_to_user(created_timer_id, | ||
589 | &new_timer_id, sizeof (new_timer_id))) { | ||
590 | error = -EFAULT; | ||
591 | goto out; | ||
592 | } | ||
593 | |||
593 | error = CLOCK_DISPATCH(which_clock, timer_create, (new_timer)); | 594 | error = CLOCK_DISPATCH(which_clock, timer_create, (new_timer)); |
594 | if (error) | 595 | if (error) |
595 | goto out; | 596 | goto out; |
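The only change to timer_create() is ordering: the new timer id is copied to user space after the sigqueue and its si_tid/si_code fields have been filled in, so user space never sees an id for a timer whose signal data has not been set up yet. From user space nothing changes; a minimal consumer still just hands timer_create() a timer_t to fill in and arms it afterwards (standard POSIX API, link with -lrt on older glibc):

#include <signal.h>
#include <stdio.h>
#include <time.h>
#include <unistd.h>

int main(void)
{
    timer_t id;
    struct sigevent sev = {
        .sigev_notify = SIGEV_SIGNAL,
        .sigev_signo  = SIGRTMIN,       /* delivered when the timer fires */
    };
    struct itimerspec its = {
        .it_value = { .tv_sec = 1 },    /* one-shot, fires after 1s */
    };

    /* the kernel writes the new id back here - the copy_to_user() above */
    if (timer_create(CLOCK_MONOTONIC, &sev, &id) != 0) {
        perror("timer_create");
        return 1;
    }
    if (timer_settime(id, 0, &its, NULL) != 0) {
        perror("timer_settime");
        return 1;
    }

    pause();    /* the unhandled real-time signal terminates the process */
    return 0;
}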
diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig index ca6066a6952e..29bff6117abc 100644 --- a/kernel/power/Kconfig +++ b/kernel/power/Kconfig | |||
@@ -86,6 +86,7 @@ config PM_SLEEP_SMP | |||
86 | depends on SMP | 86 | depends on SMP |
87 | depends on ARCH_SUSPEND_POSSIBLE || ARCH_HIBERNATION_POSSIBLE | 87 | depends on ARCH_SUSPEND_POSSIBLE || ARCH_HIBERNATION_POSSIBLE |
88 | depends on PM_SLEEP | 88 | depends on PM_SLEEP |
89 | select HOTPLUG | ||
89 | select HOTPLUG_CPU | 90 | select HOTPLUG_CPU |
90 | default y | 91 | default y |
91 | 92 | ||
@@ -137,6 +138,8 @@ config SUSPEND_FREEZER | |||
137 | config HIBERNATION | 138 | config HIBERNATION |
138 | bool "Hibernation (aka 'suspend to disk')" | 139 | bool "Hibernation (aka 'suspend to disk')" |
139 | depends on PM && SWAP && ARCH_HIBERNATION_POSSIBLE | 140 | depends on PM && SWAP && ARCH_HIBERNATION_POSSIBLE |
141 | select LZO_COMPRESS | ||
142 | select LZO_DECOMPRESS | ||
140 | select SUSPEND_NVS if HAS_IOMEM | 143 | select SUSPEND_NVS if HAS_IOMEM |
141 | ---help--- | 144 | ---help--- |
142 | Enable the suspend to disk (STD) functionality, which is usually | 145 | Enable the suspend to disk (STD) functionality, which is usually |
@@ -242,3 +245,17 @@ config PM_OPS | |||
242 | bool | 245 | bool |
243 | depends on PM_SLEEP || PM_RUNTIME | 246 | depends on PM_SLEEP || PM_RUNTIME |
244 | default y | 247 | default y |
248 | |||
249 | config PM_OPP | ||
250 | bool "Operating Performance Point (OPP) Layer library" | ||
251 | depends on PM | ||
252 | ---help--- | ||
253 | SOCs have a standard set of tuples consisting of frequency and | ||
254 | voltage pairs that the device will support per voltage domain. This | ||
255 | is called Operating Performance Point or OPP. The actual definitions | ||
256 | of OPP vary over silicon within the same family of devices. | ||
257 | |||
258 | OPP layer organizes the data internally using device pointers | ||
259 | representing individual voltage domains and provides SOC | ||
260 | implementations a ready to use framework to manage OPPs. | ||
261 | For more information, read <file:Documentation/power/opp.txt> | ||
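The help text describes an OPP as a (frequency, voltage) tuple attached to a voltage domain. Purely as a concept sketch - this is not the kernel's OPP API, and the table values are invented - a per-domain OPP table with a "lowest frequency that still satisfies the request" lookup could look like:

#include <stdio.h>

struct opp {
    unsigned long freq_khz;
    unsigned long u_volt;
};

/* hypothetical table for one voltage domain, sorted by frequency */
static const struct opp table[] = {
    {  300000,  975000 },
    {  600000, 1075000 },
    { 1000000, 1200000 },
};

/* pick the slowest OPP that still meets the requested frequency */
static const struct opp *opp_ceil(unsigned long req_khz)
{
    for (unsigned int i = 0; i < sizeof(table) / sizeof(table[0]); i++)
        if (table[i].freq_khz >= req_khz)
            return &table[i];
    return NULL;    /* request is above the fastest OPP */
}

int main(void)
{
    const struct opp *o = opp_ceil(450000);

    if (o)
        printf("%lu kHz @ %lu uV\n", o->freq_khz, o->u_volt);  /* 600000 kHz */
    return 0;
}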
diff --git a/kernel/power/block_io.c b/kernel/power/block_io.c index 97024fd40cd5..83bbc7c02df9 100644 --- a/kernel/power/block_io.c +++ b/kernel/power/block_io.c | |||
@@ -28,7 +28,7 @@ | |||
28 | static int submit(int rw, struct block_device *bdev, sector_t sector, | 28 | static int submit(int rw, struct block_device *bdev, sector_t sector, |
29 | struct page *page, struct bio **bio_chain) | 29 | struct page *page, struct bio **bio_chain) |
30 | { | 30 | { |
31 | const int bio_rw = rw | (1 << BIO_RW_SYNCIO) | (1 << BIO_RW_UNPLUG); | 31 | const int bio_rw = rw | REQ_SYNC | REQ_UNPLUG; |
32 | struct bio *bio; | 32 | struct bio *bio; |
33 | 33 | ||
34 | bio = bio_alloc(__GFP_WAIT | __GFP_HIGH, 1); | 34 | bio = bio_alloc(__GFP_WAIT | __GFP_HIGH, 1); |
diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index aa9e916da4d5..657272e91d0a 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c | |||
@@ -3,7 +3,7 @@ | |||
3 | * | 3 | * |
4 | * Copyright (c) 2003 Patrick Mochel | 4 | * Copyright (c) 2003 Patrick Mochel |
5 | * Copyright (c) 2003 Open Source Development Lab | 5 | * Copyright (c) 2003 Open Source Development Lab |
6 | * Copyright (c) 2004 Pavel Machek <pavel@suse.cz> | 6 | * Copyright (c) 2004 Pavel Machek <pavel@ucw.cz> |
7 | * Copyright (c) 2009 Rafael J. Wysocki, Novell Inc. | 7 | * Copyright (c) 2009 Rafael J. Wysocki, Novell Inc. |
8 | * | 8 | * |
9 | * This file is released under the GPLv2. | 9 | * This file is released under the GPLv2. |
@@ -29,6 +29,7 @@ | |||
29 | #include "power.h" | 29 | #include "power.h" |
30 | 30 | ||
31 | 31 | ||
32 | static int nocompress = 0; | ||
32 | static int noresume = 0; | 33 | static int noresume = 0; |
33 | static char resume_file[256] = CONFIG_PM_STD_PARTITION; | 34 | static char resume_file[256] = CONFIG_PM_STD_PARTITION; |
34 | dev_t swsusp_resume_device; | 35 | dev_t swsusp_resume_device; |
@@ -277,7 +278,7 @@ static int create_image(int platform_mode) | |||
277 | goto Enable_irqs; | 278 | goto Enable_irqs; |
278 | } | 279 | } |
279 | 280 | ||
280 | if (hibernation_test(TEST_CORE)) | 281 | if (hibernation_test(TEST_CORE) || !pm_check_wakeup_events()) |
281 | goto Power_up; | 282 | goto Power_up; |
282 | 283 | ||
283 | in_suspend = 1; | 284 | in_suspend = 1; |
@@ -288,8 +289,10 @@ static int create_image(int platform_mode) | |||
288 | error); | 289 | error); |
289 | /* Restore control flow magically appears here */ | 290 | /* Restore control flow magically appears here */ |
290 | restore_processor_state(); | 291 | restore_processor_state(); |
291 | if (!in_suspend) | 292 | if (!in_suspend) { |
293 | events_check_enabled = false; | ||
292 | platform_leave(platform_mode); | 294 | platform_leave(platform_mode); |
295 | } | ||
293 | 296 | ||
294 | Power_up: | 297 | Power_up: |
295 | sysdev_resume(); | 298 | sysdev_resume(); |
@@ -328,7 +331,7 @@ int hibernation_snapshot(int platform_mode) | |||
328 | 331 | ||
329 | error = platform_begin(platform_mode); | 332 | error = platform_begin(platform_mode); |
330 | if (error) | 333 | if (error) |
331 | return error; | 334 | goto Close; |
332 | 335 | ||
333 | /* Preallocate image memory before shutting down devices. */ | 336 | /* Preallocate image memory before shutting down devices. */ |
334 | error = hibernate_preallocate_memory(); | 337 | error = hibernate_preallocate_memory(); |
@@ -511,18 +514,24 @@ int hibernation_platform_enter(void) | |||
511 | 514 | ||
512 | local_irq_disable(); | 515 | local_irq_disable(); |
513 | sysdev_suspend(PMSG_HIBERNATE); | 516 | sysdev_suspend(PMSG_HIBERNATE); |
517 | if (!pm_check_wakeup_events()) { | ||
518 | error = -EAGAIN; | ||
519 | goto Power_up; | ||
520 | } | ||
521 | |||
514 | hibernation_ops->enter(); | 522 | hibernation_ops->enter(); |
515 | /* We should never get here */ | 523 | /* We should never get here */ |
516 | while (1); | 524 | while (1); |
517 | 525 | ||
518 | /* | 526 | Power_up: |
519 | * We don't need to reenable the nonboot CPUs or resume consoles, since | 527 | sysdev_resume(); |
520 | * the system is going to be halted anyway. | 528 | local_irq_enable(); |
521 | */ | 529 | enable_nonboot_cpus(); |
530 | |||
522 | Platform_finish: | 531 | Platform_finish: |
523 | hibernation_ops->finish(); | 532 | hibernation_ops->finish(); |
524 | 533 | ||
525 | dpm_suspend_noirq(PMSG_RESTORE); | 534 | dpm_resume_noirq(PMSG_RESTORE); |
526 | 535 | ||
527 | Resume_devices: | 536 | Resume_devices: |
528 | entering_platform_hibernation = false; | 537 | entering_platform_hibernation = false; |
@@ -630,6 +639,8 @@ int hibernate(void) | |||
630 | 639 | ||
631 | if (hibernation_mode == HIBERNATION_PLATFORM) | 640 | if (hibernation_mode == HIBERNATION_PLATFORM) |
632 | flags |= SF_PLATFORM_MODE; | 641 | flags |= SF_PLATFORM_MODE; |
642 | if (nocompress) | ||
643 | flags |= SF_NOCOMPRESS_MODE; | ||
633 | pr_debug("PM: writing image.\n"); | 644 | pr_debug("PM: writing image.\n"); |
634 | error = swsusp_write(flags); | 645 | error = swsusp_write(flags); |
635 | swsusp_free(); | 646 | swsusp_free(); |
@@ -697,7 +708,7 @@ static int software_resume(void) | |||
697 | goto Unlock; | 708 | goto Unlock; |
698 | } | 709 | } |
699 | 710 | ||
700 | pr_debug("PM: Checking image partition %s\n", resume_file); | 711 | pr_debug("PM: Checking hibernation image partition %s\n", resume_file); |
701 | 712 | ||
702 | /* Check if the device is there */ | 713 | /* Check if the device is there */ |
703 | swsusp_resume_device = name_to_dev_t(resume_file); | 714 | swsusp_resume_device = name_to_dev_t(resume_file); |
@@ -722,10 +733,10 @@ static int software_resume(void) | |||
722 | } | 733 | } |
723 | 734 | ||
724 | Check_image: | 735 | Check_image: |
725 | pr_debug("PM: Resume from partition %d:%d\n", | 736 | pr_debug("PM: Hibernation image partition %d:%d present\n", |
726 | MAJOR(swsusp_resume_device), MINOR(swsusp_resume_device)); | 737 | MAJOR(swsusp_resume_device), MINOR(swsusp_resume_device)); |
727 | 738 | ||
728 | pr_debug("PM: Checking hibernation image.\n"); | 739 | pr_debug("PM: Looking for hibernation image.\n"); |
729 | error = swsusp_check(); | 740 | error = swsusp_check(); |
730 | if (error) | 741 | if (error) |
731 | goto Unlock; | 742 | goto Unlock; |
@@ -757,14 +768,14 @@ static int software_resume(void) | |||
757 | goto Done; | 768 | goto Done; |
758 | } | 769 | } |
759 | 770 | ||
760 | pr_debug("PM: Reading hibernation image.\n"); | 771 | pr_debug("PM: Loading hibernation image.\n"); |
761 | 772 | ||
762 | error = swsusp_read(&flags); | 773 | error = swsusp_read(&flags); |
763 | swsusp_close(FMODE_READ); | 774 | swsusp_close(FMODE_READ); |
764 | if (!error) | 775 | if (!error) |
765 | hibernation_restore(flags & SF_PLATFORM_MODE); | 776 | hibernation_restore(flags & SF_PLATFORM_MODE); |
766 | 777 | ||
767 | printk(KERN_ERR "PM: Restore failed, recovering.\n"); | 778 | printk(KERN_ERR "PM: Failed to load hibernation image, recovering.\n"); |
768 | swsusp_free(); | 779 | swsusp_free(); |
769 | thaw_processes(); | 780 | thaw_processes(); |
770 | Done: | 781 | Done: |
@@ -777,7 +788,7 @@ static int software_resume(void) | |||
777 | /* For success case, the suspend path will release the lock */ | 788 | /* For success case, the suspend path will release the lock */ |
778 | Unlock: | 789 | Unlock: |
779 | mutex_unlock(&pm_mutex); | 790 | mutex_unlock(&pm_mutex); |
780 | pr_debug("PM: Resume from disk failed.\n"); | 791 | pr_debug("PM: Hibernation image not present or could not be loaded.\n"); |
781 | return error; | 792 | return error; |
782 | close_finish: | 793 | close_finish: |
783 | swsusp_close(FMODE_READ); | 794 | swsusp_close(FMODE_READ); |
@@ -996,6 +1007,15 @@ static int __init resume_offset_setup(char *str) | |||
996 | return 1; | 1007 | return 1; |
997 | } | 1008 | } |
998 | 1009 | ||
1010 | static int __init hibernate_setup(char *str) | ||
1011 | { | ||
1012 | if (!strncmp(str, "noresume", 8)) | ||
1013 | noresume = 1; | ||
1014 | else if (!strncmp(str, "nocompress", 10)) | ||
1015 | nocompress = 1; | ||
1016 | return 1; | ||
1017 | } | ||
1018 | |||
999 | static int __init noresume_setup(char *str) | 1019 | static int __init noresume_setup(char *str) |
1000 | { | 1020 | { |
1001 | noresume = 1; | 1021 | noresume = 1; |
@@ -1005,3 +1025,4 @@ static int __init noresume_setup(char *str) | |||
1005 | __setup("noresume", noresume_setup); | 1025 | __setup("noresume", noresume_setup); |
1006 | __setup("resume_offset=", resume_offset_setup); | 1026 | __setup("resume_offset=", resume_offset_setup); |
1007 | __setup("resume=", resume_setup); | 1027 | __setup("resume=", resume_setup); |
1028 | __setup("hibernate=", hibernate_setup); | ||
diff --git a/kernel/power/main.c b/kernel/power/main.c index b58800b21fc0..7b5db6a8561e 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c | |||
@@ -204,6 +204,60 @@ static ssize_t state_store(struct kobject *kobj, struct kobj_attribute *attr, | |||
204 | 204 | ||
205 | power_attr(state); | 205 | power_attr(state); |
206 | 206 | ||
207 | #ifdef CONFIG_PM_SLEEP | ||
208 | /* | ||
209 | * The 'wakeup_count' attribute, along with the functions defined in | ||
210 | * drivers/base/power/wakeup.c, provides a means by which wakeup events can be | ||
211 | * handled in a non-racy way. | ||
212 | * | ||
213 | * If a wakeup event occurs when the system is in a sleep state, it simply is | ||
214 | * woken up. In turn, if an event that would wake the system up from a sleep | ||
215 | * state occurs when it is undergoing a transition to that sleep state, the | ||
216 | * transition should be aborted. Moreover, if such an event occurs when the | ||
217 | * system is in the working state, an attempt to start a transition to the | ||
218 | * given sleep state should fail for a certain period after the detection of | ||
219 | * the event. Using the 'state' attribute alone is not sufficient to satisfy | ||
220 | * these requirements, because a wakeup event may occur exactly when 'state' | ||
221 | * is being written to and may be delivered to user space right before it is | ||
222 | * frozen, so the event will remain only partially processed until the system is | ||
223 | * woken up by another event. In particular, it won't cause the transition to | ||
224 | * a sleep state to be aborted. | ||
225 | * | ||
226 | * This difficulty may be overcome if user space uses 'wakeup_count' before | ||
227 | * writing to 'state'. It first should read from 'wakeup_count' and store | ||
228 | * the read value. Then, after carrying out its own preparations for the system | ||
229 | * transition to a sleep state, it should write the stored value to | ||
230 | * 'wakeup_count'. If that fails, at least one wakeup event has occurred since | ||
231 | * 'wakeup_count' was read and 'state' should not be written to. Otherwise, it | ||
232 | * is allowed to write to 'state', but the transition will be aborted if there | ||
233 | * are any wakeup events detected after 'wakeup_count' was written to. | ||
234 | */ | ||
235 | |||
236 | static ssize_t wakeup_count_show(struct kobject *kobj, | ||
237 | struct kobj_attribute *attr, | ||
238 | char *buf) | ||
239 | { | ||
240 | unsigned int val; | ||
241 | |||
242 | return pm_get_wakeup_count(&val) ? sprintf(buf, "%u\n", val) : -EINTR; | ||
243 | } | ||
244 | |||
245 | static ssize_t wakeup_count_store(struct kobject *kobj, | ||
246 | struct kobj_attribute *attr, | ||
247 | const char *buf, size_t n) | ||
248 | { | ||
249 | unsigned int val; | ||
250 | |||
251 | if (sscanf(buf, "%u", &val) == 1) { | ||
252 | if (pm_save_wakeup_count(val)) | ||
253 | return n; | ||
254 | } | ||
255 | return -EINVAL; | ||
256 | } | ||
257 | |||
258 | power_attr(wakeup_count); | ||
259 | #endif /* CONFIG_PM_SLEEP */ | ||
260 | |||
207 | #ifdef CONFIG_PM_TRACE | 261 | #ifdef CONFIG_PM_TRACE |
208 | int pm_trace_enabled; | 262 | int pm_trace_enabled; |
209 | 263 | ||
@@ -227,15 +281,34 @@ pm_trace_store(struct kobject *kobj, struct kobj_attribute *attr, | |||
227 | } | 281 | } |
228 | 282 | ||
229 | power_attr(pm_trace); | 283 | power_attr(pm_trace); |
284 | |||
285 | static ssize_t pm_trace_dev_match_show(struct kobject *kobj, | ||
286 | struct kobj_attribute *attr, | ||
287 | char *buf) | ||
288 | { | ||
289 | return show_trace_dev_match(buf, PAGE_SIZE); | ||
290 | } | ||
291 | |||
292 | static ssize_t | ||
293 | pm_trace_dev_match_store(struct kobject *kobj, struct kobj_attribute *attr, | ||
294 | const char *buf, size_t n) | ||
295 | { | ||
296 | return -EINVAL; | ||
297 | } | ||
298 | |||
299 | power_attr(pm_trace_dev_match); | ||
300 | |||
230 | #endif /* CONFIG_PM_TRACE */ | 301 | #endif /* CONFIG_PM_TRACE */ |
231 | 302 | ||
232 | static struct attribute * g[] = { | 303 | static struct attribute * g[] = { |
233 | &state_attr.attr, | 304 | &state_attr.attr, |
234 | #ifdef CONFIG_PM_TRACE | 305 | #ifdef CONFIG_PM_TRACE |
235 | &pm_trace_attr.attr, | 306 | &pm_trace_attr.attr, |
307 | &pm_trace_dev_match_attr.attr, | ||
236 | #endif | 308 | #endif |
237 | #ifdef CONFIG_PM_SLEEP | 309 | #ifdef CONFIG_PM_SLEEP |
238 | &pm_async_attr.attr, | 310 | &pm_async_attr.attr, |
311 | &wakeup_count_attr.attr, | ||
239 | #ifdef CONFIG_PM_DEBUG | 312 | #ifdef CONFIG_PM_DEBUG |
240 | &pm_test_attr.attr, | 313 | &pm_test_attr.attr, |
241 | #endif | 314 | #endif |
@@ -253,7 +326,7 @@ EXPORT_SYMBOL_GPL(pm_wq); | |||
253 | 326 | ||
254 | static int __init pm_start_workqueue(void) | 327 | static int __init pm_start_workqueue(void) |
255 | { | 328 | { |
256 | pm_wq = create_freezeable_workqueue("pm"); | 329 | pm_wq = alloc_workqueue("pm", WQ_FREEZEABLE, 0); |
257 | 330 | ||
258 | return pm_wq ? 0 : -ENOMEM; | 331 | return pm_wq ? 0 : -ENOMEM; |
259 | } | 332 | } |
@@ -266,6 +339,7 @@ static int __init pm_init(void) | |||
266 | int error = pm_start_workqueue(); | 339 | int error = pm_start_workqueue(); |
267 | if (error) | 340 | if (error) |
268 | return error; | 341 | return error; |
342 | hibernate_image_size_init(); | ||
269 | power_kobj = kobject_create_and_add("power", NULL); | 343 | power_kobj = kobject_create_and_add("power", NULL); |
270 | if (!power_kobj) | 344 | if (!power_kobj) |
271 | return -ENOMEM; | 345 | return -ENOMEM; |
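The comment block above spells out a read-then-write handshake for userspace suspend managers: read /sys/power/wakeup_count and remember the value, finish your own preparations, write the value back, and only if that write succeeds go on to write to /sys/power/state. A hedged userspace sketch of that sequence (error handling trimmed; a real manager would loop and retry when the write is rejected):

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
    char count[32];
    ssize_t n;

    /* 1. read and remember the current wakeup event count */
    int fd = open("/sys/power/wakeup_count", O_RDWR);
    if (fd < 0) { perror("wakeup_count"); return 1; }
    n = read(fd, count, sizeof(count) - 1);
    if (n <= 0) { close(fd); return 1; }
    count[n] = '\0';

    /* ... user space does its own pre-suspend work here ... */

    /* 2. write the saved count back; failure means a wakeup event came in
     *    meanwhile and the transition must not be started */
    if (write(fd, count, strlen(count)) < 0) {
        fprintf(stderr, "wakeup event occurred, not suspending\n");
        close(fd);
        return 1;
    }
    close(fd);

    /* 3. only now trigger the transition; it can still be aborted by
     *    events detected after step 2 */
    fd = open("/sys/power/state", O_WRONLY);
    if (fd < 0) { perror("state"); return 1; }
    if (write(fd, "mem", 3) < 0)
        perror("suspend");
    close(fd);
    return 0;
}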
diff --git a/kernel/power/power.h b/kernel/power/power.h index 006270fe382d..03634be55f62 100644 --- a/kernel/power/power.h +++ b/kernel/power/power.h | |||
@@ -14,6 +14,9 @@ struct swsusp_info { | |||
14 | } __attribute__((aligned(PAGE_SIZE))); | 14 | } __attribute__((aligned(PAGE_SIZE))); |
15 | 15 | ||
16 | #ifdef CONFIG_HIBERNATION | 16 | #ifdef CONFIG_HIBERNATION |
17 | /* kernel/power/snapshot.c */ | ||
18 | extern void __init hibernate_image_size_init(void); | ||
19 | |||
17 | #ifdef CONFIG_ARCH_HIBERNATION_HEADER | 20 | #ifdef CONFIG_ARCH_HIBERNATION_HEADER |
18 | /* Maximum size of architecture specific data in a hibernation header */ | 21 | /* Maximum size of architecture specific data in a hibernation header */ |
19 | #define MAX_ARCH_HEADER_SIZE (sizeof(struct new_utsname) + 4) | 22 | #define MAX_ARCH_HEADER_SIZE (sizeof(struct new_utsname) + 4) |
@@ -49,7 +52,11 @@ static inline char *check_image_kernel(struct swsusp_info *info) | |||
49 | extern int hibernation_snapshot(int platform_mode); | 52 | extern int hibernation_snapshot(int platform_mode); |
50 | extern int hibernation_restore(int platform_mode); | 53 | extern int hibernation_restore(int platform_mode); |
51 | extern int hibernation_platform_enter(void); | 54 | extern int hibernation_platform_enter(void); |
52 | #endif | 55 | |
56 | #else /* !CONFIG_HIBERNATION */ | ||
57 | |||
58 | static inline void hibernate_image_size_init(void) {} | ||
59 | #endif /* !CONFIG_HIBERNATION */ | ||
53 | 60 | ||
54 | extern int pfn_is_nosave(unsigned long); | 61 | extern int pfn_is_nosave(unsigned long); |
55 | 62 | ||
@@ -134,6 +141,7 @@ extern int swsusp_swap_in_use(void); | |||
134 | * the image header. | 141 | * the image header. |
135 | */ | 142 | */ |
136 | #define SF_PLATFORM_MODE 1 | 143 | #define SF_PLATFORM_MODE 1 |
144 | #define SF_NOCOMPRESS_MODE 2 | ||
137 | 145 | ||
138 | /* kernel/power/hibernate.c */ | 146 | /* kernel/power/hibernate.c */ |
139 | extern int swsusp_check(void); | 147 | extern int swsusp_check(void); |
diff --git a/kernel/power/poweroff.c b/kernel/power/poweroff.c index e8b337006276..d52359374e85 100644 --- a/kernel/power/poweroff.c +++ b/kernel/power/poweroff.c | |||
@@ -24,7 +24,7 @@ static void do_poweroff(struct work_struct *dummy) | |||
24 | 24 | ||
25 | static DECLARE_WORK(poweroff_work, do_poweroff); | 25 | static DECLARE_WORK(poweroff_work, do_poweroff); |
26 | 26 | ||
27 | static void handle_poweroff(int key, struct tty_struct *tty) | 27 | static void handle_poweroff(int key) |
28 | { | 28 | { |
29 | /* run sysrq poweroff on boot cpu */ | 29 | /* run sysrq poweroff on boot cpu */ |
30 | schedule_work_on(cpumask_first(cpu_online_mask), &poweroff_work); | 30 | schedule_work_on(cpumask_first(cpu_online_mask), &poweroff_work); |
diff --git a/kernel/power/process.c b/kernel/power/process.c index 71ae29052ab6..e50b4c1b2a0f 100644 --- a/kernel/power/process.c +++ b/kernel/power/process.c | |||
@@ -15,6 +15,7 @@ | |||
15 | #include <linux/syscalls.h> | 15 | #include <linux/syscalls.h> |
16 | #include <linux/freezer.h> | 16 | #include <linux/freezer.h> |
17 | #include <linux/delay.h> | 17 | #include <linux/delay.h> |
18 | #include <linux/workqueue.h> | ||
18 | 19 | ||
19 | /* | 20 | /* |
20 | * Timeout for stopping processes | 21 | * Timeout for stopping processes |
@@ -35,13 +36,19 @@ static int try_to_freeze_tasks(bool sig_only) | |||
35 | struct task_struct *g, *p; | 36 | struct task_struct *g, *p; |
36 | unsigned long end_time; | 37 | unsigned long end_time; |
37 | unsigned int todo; | 38 | unsigned int todo; |
39 | bool wq_busy = false; | ||
38 | struct timeval start, end; | 40 | struct timeval start, end; |
39 | u64 elapsed_csecs64; | 41 | u64 elapsed_csecs64; |
40 | unsigned int elapsed_csecs; | 42 | unsigned int elapsed_csecs; |
43 | bool wakeup = false; | ||
41 | 44 | ||
42 | do_gettimeofday(&start); | 45 | do_gettimeofday(&start); |
43 | 46 | ||
44 | end_time = jiffies + TIMEOUT; | 47 | end_time = jiffies + TIMEOUT; |
48 | |||
49 | if (!sig_only) | ||
50 | freeze_workqueues_begin(); | ||
51 | |||
45 | while (true) { | 52 | while (true) { |
46 | todo = 0; | 53 | todo = 0; |
47 | read_lock(&tasklist_lock); | 54 | read_lock(&tasklist_lock); |
@@ -63,9 +70,20 @@ static int try_to_freeze_tasks(bool sig_only) | |||
63 | todo++; | 70 | todo++; |
64 | } while_each_thread(g, p); | 71 | } while_each_thread(g, p); |
65 | read_unlock(&tasklist_lock); | 72 | read_unlock(&tasklist_lock); |
73 | |||
74 | if (!sig_only) { | ||
75 | wq_busy = freeze_workqueues_busy(); | ||
76 | todo += wq_busy; | ||
77 | } | ||
78 | |||
66 | if (!todo || time_after(jiffies, end_time)) | 79 | if (!todo || time_after(jiffies, end_time)) |
67 | break; | 80 | break; |
68 | 81 | ||
82 | if (!pm_check_wakeup_events()) { | ||
83 | wakeup = true; | ||
84 | break; | ||
85 | } | ||
86 | |||
69 | /* | 87 | /* |
70 | * We need to retry, but first give the freezing tasks some | 88 | * We need to retry, but first give the freezing tasks some |
71 | * time to enter the regrigerator. | 89 | * time to enter the regrigerator. |
@@ -85,13 +103,18 @@ static int try_to_freeze_tasks(bool sig_only) | |||
85 | * but it cleans up leftover PF_FREEZE requests. | 103 | * but it cleans up leftover PF_FREEZE requests. |
86 | */ | 104 | */ |
87 | printk("\n"); | 105 | printk("\n"); |
88 | printk(KERN_ERR "Freezing of tasks failed after %d.%02d seconds " | 106 | printk(KERN_ERR "Freezing of tasks %s after %d.%02d seconds " |
89 | "(%d tasks refusing to freeze):\n", | 107 | "(%d tasks refusing to freeze, wq_busy=%d):\n", |
90 | elapsed_csecs / 100, elapsed_csecs % 100, todo); | 108 | wakeup ? "aborted" : "failed", |
109 | elapsed_csecs / 100, elapsed_csecs % 100, | ||
110 | todo - wq_busy, wq_busy); | ||
111 | |||
112 | thaw_workqueues(); | ||
113 | |||
91 | read_lock(&tasklist_lock); | 114 | read_lock(&tasklist_lock); |
92 | do_each_thread(g, p) { | 115 | do_each_thread(g, p) { |
93 | task_lock(p); | 116 | task_lock(p); |
94 | if (freezing(p) && !freezer_should_skip(p)) | 117 | if (!wakeup && freezing(p) && !freezer_should_skip(p)) |
95 | sched_show_task(p); | 118 | sched_show_task(p); |
96 | cancel_freezing(p); | 119 | cancel_freezing(p); |
97 | task_unlock(p); | 120 | task_unlock(p); |
@@ -157,6 +180,7 @@ void thaw_processes(void) | |||
157 | oom_killer_enable(); | 180 | oom_killer_enable(); |
158 | 181 | ||
159 | printk("Restarting tasks ... "); | 182 | printk("Restarting tasks ... "); |
183 | thaw_workqueues(); | ||
160 | thaw_tasks(true); | 184 | thaw_tasks(true); |
161 | thaw_tasks(false); | 185 | thaw_tasks(false); |
162 | schedule(); | 186 | schedule(); |
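With these hunks the freezer treats freezable workqueues as one more source of pending work (wq_busy) and aborts the whole attempt early when a wakeup event is pending, instead of only giving up on timeout. Stripped of the kernel details, the retry loop now has the shape below; the three predicates are stand-ins, not kernel APIs:

#include <stdbool.h>
#include <stdio.h>
#include <time.h>

/* stand-ins for "tasks still unfrozen", "freezable workqueues still busy"
 * and "a wakeup event arrived" */
static int unfrozen_tasks(void)  { static int n = 3; return n ? n-- : 0; }
static int workqueues_busy(void) { return 0; }
static bool wakeup_pending(void) { return false; }

int main(void)
{
    time_t deadline = time(NULL) + 20;      /* the TIMEOUT in the kernel loop */
    bool wakeup = false;
    int todo;

    while (1) {
        todo = unfrozen_tasks() + workqueues_busy();
        if (!todo || time(NULL) > deadline)
            break;
        if (wakeup_pending()) {             /* abort the transition early */
            wakeup = true;
            break;
        }
        /* the kernel gives the remaining tasks a little time here
         * before recounting */
    }

    printf("%s (todo=%d)\n",
           wakeup ? "aborted" : todo ? "failed" : "frozen", todo);
    return 0;
}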
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index 25ce010e9f8b..0dac75ea4456 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c | |||
@@ -3,7 +3,7 @@ | |||
3 | * | 3 | * |
4 | * This file provides system snapshot/restore functionality for swsusp. | 4 | * This file provides system snapshot/restore functionality for swsusp. |
5 | * | 5 | * |
6 | * Copyright (C) 1998-2005 Pavel Machek <pavel@suse.cz> | 6 | * Copyright (C) 1998-2005 Pavel Machek <pavel@ucw.cz> |
7 | * Copyright (C) 2006 Rafael J. Wysocki <rjw@sisk.pl> | 7 | * Copyright (C) 2006 Rafael J. Wysocki <rjw@sisk.pl> |
8 | * | 8 | * |
9 | * This file is released under the GPLv2. | 9 | * This file is released under the GPLv2. |
@@ -46,7 +46,12 @@ static void swsusp_unset_page_forbidden(struct page *); | |||
46 | * size will not exceed N bytes, but if that is impossible, it will | 46 | * size will not exceed N bytes, but if that is impossible, it will |
47 | * try to create the smallest image possible. | 47 | * try to create the smallest image possible. |
48 | */ | 48 | */ |
49 | unsigned long image_size = 500 * 1024 * 1024; | 49 | unsigned long image_size; |
50 | |||
51 | void __init hibernate_image_size_init(void) | ||
52 | { | ||
53 | image_size = ((totalram_pages * 2) / 5) * PAGE_SIZE; | ||
54 | } | ||
50 | 55 | ||
51 | /* List of PBEs needed for restoring the pages that were allocated before | 56 | /* List of PBEs needed for restoring the pages that were allocated before |
52 | * the suspend and included in the suspend image, but have also been | 57 | * the suspend and included in the suspend image, but have also been |
@@ -979,8 +984,8 @@ static void copy_data_page(unsigned long dst_pfn, unsigned long src_pfn) | |||
979 | src = kmap_atomic(s_page, KM_USER0); | 984 | src = kmap_atomic(s_page, KM_USER0); |
980 | dst = kmap_atomic(d_page, KM_USER1); | 985 | dst = kmap_atomic(d_page, KM_USER1); |
981 | do_copy_page(dst, src); | 986 | do_copy_page(dst, src); |
982 | kunmap_atomic(src, KM_USER0); | ||
983 | kunmap_atomic(dst, KM_USER1); | 987 | kunmap_atomic(dst, KM_USER1); |
988 | kunmap_atomic(src, KM_USER0); | ||
984 | } else { | 989 | } else { |
985 | if (PageHighMem(d_page)) { | 990 | if (PageHighMem(d_page)) { |
986 | /* Page pointed to by src may contain some kernel | 991 | /* Page pointed to by src may contain some kernel |
@@ -988,7 +993,7 @@ static void copy_data_page(unsigned long dst_pfn, unsigned long src_pfn) | |||
988 | */ | 993 | */ |
989 | safe_copy_page(buffer, s_page); | 994 | safe_copy_page(buffer, s_page); |
990 | dst = kmap_atomic(d_page, KM_USER0); | 995 | dst = kmap_atomic(d_page, KM_USER0); |
991 | memcpy(dst, buffer, PAGE_SIZE); | 996 | copy_page(dst, buffer); |
992 | kunmap_atomic(dst, KM_USER0); | 997 | kunmap_atomic(dst, KM_USER0); |
993 | } else { | 998 | } else { |
994 | safe_copy_page(page_address(d_page), s_page); | 999 | safe_copy_page(page_address(d_page), s_page); |
@@ -1121,9 +1126,19 @@ static unsigned long preallocate_image_pages(unsigned long nr_pages, gfp_t mask) | |||
1121 | return nr_alloc; | 1126 | return nr_alloc; |
1122 | } | 1127 | } |
1123 | 1128 | ||
1124 | static unsigned long preallocate_image_memory(unsigned long nr_pages) | 1129 | static unsigned long preallocate_image_memory(unsigned long nr_pages, |
1130 | unsigned long avail_normal) | ||
1125 | { | 1131 | { |
1126 | return preallocate_image_pages(nr_pages, GFP_IMAGE); | 1132 | unsigned long alloc; |
1133 | |||
1134 | if (avail_normal <= alloc_normal) | ||
1135 | return 0; | ||
1136 | |||
1137 | alloc = avail_normal - alloc_normal; | ||
1138 | if (nr_pages < alloc) | ||
1139 | alloc = nr_pages; | ||
1140 | |||
1141 | return preallocate_image_pages(alloc, GFP_IMAGE); | ||
1127 | } | 1142 | } |
1128 | 1143 | ||
1129 | #ifdef CONFIG_HIGHMEM | 1144 | #ifdef CONFIG_HIGHMEM |
@@ -1169,15 +1184,22 @@ static inline unsigned long preallocate_highmem_fraction(unsigned long nr_pages, | |||
1169 | */ | 1184 | */ |
1170 | static void free_unnecessary_pages(void) | 1185 | static void free_unnecessary_pages(void) |
1171 | { | 1186 | { |
1172 | unsigned long save_highmem, to_free_normal, to_free_highmem; | 1187 | unsigned long save, to_free_normal, to_free_highmem; |
1173 | 1188 | ||
1174 | to_free_normal = alloc_normal - count_data_pages(); | 1189 | save = count_data_pages(); |
1175 | save_highmem = count_highmem_pages(); | 1190 | if (alloc_normal >= save) { |
1176 | if (alloc_highmem > save_highmem) { | 1191 | to_free_normal = alloc_normal - save; |
1177 | to_free_highmem = alloc_highmem - save_highmem; | 1192 | save = 0; |
1193 | } else { | ||
1194 | to_free_normal = 0; | ||
1195 | save -= alloc_normal; | ||
1196 | } | ||
1197 | save += count_highmem_pages(); | ||
1198 | if (alloc_highmem >= save) { | ||
1199 | to_free_highmem = alloc_highmem - save; | ||
1178 | } else { | 1200 | } else { |
1179 | to_free_highmem = 0; | 1201 | to_free_highmem = 0; |
1180 | to_free_normal -= save_highmem - alloc_highmem; | 1202 | to_free_normal -= save - alloc_highmem; |
1181 | } | 1203 | } |
1182 | 1204 | ||
1183 | memory_bm_position_reset(©_bm); | 1205 | memory_bm_position_reset(©_bm); |
@@ -1258,7 +1280,7 @@ int hibernate_preallocate_memory(void) | |||
1258 | { | 1280 | { |
1259 | struct zone *zone; | 1281 | struct zone *zone; |
1260 | unsigned long saveable, size, max_size, count, highmem, pages = 0; | 1282 | unsigned long saveable, size, max_size, count, highmem, pages = 0; |
1261 | unsigned long alloc, save_highmem, pages_highmem; | 1283 | unsigned long alloc, save_highmem, pages_highmem, avail_normal; |
1262 | struct timeval start, stop; | 1284 | struct timeval start, stop; |
1263 | int error; | 1285 | int error; |
1264 | 1286 | ||
@@ -1295,26 +1317,38 @@ int hibernate_preallocate_memory(void) | |||
1295 | else | 1317 | else |
1296 | count += zone_page_state(zone, NR_FREE_PAGES); | 1318 | count += zone_page_state(zone, NR_FREE_PAGES); |
1297 | } | 1319 | } |
1320 | avail_normal = count; | ||
1298 | count += highmem; | 1321 | count += highmem; |
1299 | count -= totalreserve_pages; | 1322 | count -= totalreserve_pages; |
1300 | 1323 | ||
1301 | /* Compute the maximum number of saveable pages to leave in memory. */ | 1324 | /* Compute the maximum number of saveable pages to leave in memory. */ |
1302 | max_size = (count - (size + PAGES_FOR_IO)) / 2 - 2 * SPARE_PAGES; | 1325 | max_size = (count - (size + PAGES_FOR_IO)) / 2 - 2 * SPARE_PAGES; |
1326 | /* Compute the desired number of image pages specified by image_size. */ | ||
1303 | size = DIV_ROUND_UP(image_size, PAGE_SIZE); | 1327 | size = DIV_ROUND_UP(image_size, PAGE_SIZE); |
1304 | if (size > max_size) | 1328 | if (size > max_size) |
1305 | size = max_size; | 1329 | size = max_size; |
1306 | /* | 1330 | /* |
1307 | * If the maximum is not less than the current number of saveable pages | 1331 | * If the desired number of image pages is at least as large as the |
1308 | * in memory, allocate page frames for the image and we're done. | 1332 | * current number of saveable pages in memory, allocate page frames for |
1333 | * the image and we're done. | ||
1309 | */ | 1334 | */ |
1310 | if (size >= saveable) { | 1335 | if (size >= saveable) { |
1311 | pages = preallocate_image_highmem(save_highmem); | 1336 | pages = preallocate_image_highmem(save_highmem); |
1312 | pages += preallocate_image_memory(saveable - pages); | 1337 | pages += preallocate_image_memory(saveable - pages, avail_normal); |
1313 | goto out; | 1338 | goto out; |
1314 | } | 1339 | } |
1315 | 1340 | ||
1316 | /* Estimate the minimum size of the image. */ | 1341 | /* Estimate the minimum size of the image. */ |
1317 | pages = minimum_image_size(saveable); | 1342 | pages = minimum_image_size(saveable); |
1343 | /* | ||
1344 | * To avoid excessive pressure on the normal zone, leave room in it to | ||
1345 | * accommodate an image of the minimum size (unless it's already too | ||
1346 | * small, in which case don't preallocate pages from it at all). | ||
1347 | */ | ||
1348 | if (avail_normal > pages) | ||
1349 | avail_normal -= pages; | ||
1350 | else | ||
1351 | avail_normal = 0; | ||
1318 | if (size < pages) | 1352 | if (size < pages) |
1319 | size = min_t(unsigned long, pages, max_size); | 1353 | size = min_t(unsigned long, pages, max_size); |
1320 | 1354 | ||
@@ -1335,16 +1369,34 @@ int hibernate_preallocate_memory(void) | |||
1335 | */ | 1369 | */ |
1336 | pages_highmem = preallocate_image_highmem(highmem / 2); | 1370 | pages_highmem = preallocate_image_highmem(highmem / 2); |
1337 | alloc = (count - max_size) - pages_highmem; | 1371 | alloc = (count - max_size) - pages_highmem; |
1338 | pages = preallocate_image_memory(alloc); | 1372 | pages = preallocate_image_memory(alloc, avail_normal); |
1339 | if (pages < alloc) | 1373 | if (pages < alloc) { |
1340 | goto err_out; | 1374 | /* We have exhausted non-highmem pages, try highmem. */ |
1341 | size = max_size - size; | 1375 | alloc -= pages; |
1342 | alloc = size; | 1376 | pages += pages_highmem; |
1343 | size = preallocate_highmem_fraction(size, highmem, count); | 1377 | pages_highmem = preallocate_image_highmem(alloc); |
1344 | pages_highmem += size; | 1378 | if (pages_highmem < alloc) |
1345 | alloc -= size; | 1379 | goto err_out; |
1346 | pages += preallocate_image_memory(alloc); | 1380 | pages += pages_highmem; |
1347 | pages += pages_highmem; | 1381 | /* |
1382 | * size is the desired number of saveable pages to leave in | ||
1383 | * memory, so try to preallocate (all memory - size) pages. | ||
1384 | */ | ||
1385 | alloc = (count - pages) - size; | ||
1386 | pages += preallocate_image_highmem(alloc); | ||
1387 | } else { | ||
1388 | /* | ||
1389 | * There are approximately max_size saveable pages at this point | ||
1390 | * and we want to reduce this number down to size. | ||
1391 | */ | ||
1392 | alloc = max_size - size; | ||
1393 | size = preallocate_highmem_fraction(alloc, highmem, count); | ||
1394 | pages_highmem += size; | ||
1395 | alloc -= size; | ||
1396 | size = preallocate_image_memory(alloc, avail_normal); | ||
1397 | pages_highmem += preallocate_image_highmem(alloc - size); | ||
1398 | pages += pages_highmem + size; | ||
1399 | } | ||
1348 | 1400 | ||
1349 | /* | 1401 | /* |
1350 | * We only need as many page frames for the image as there are saveable | 1402 | * We only need as many page frames for the image as there are saveable |
@@ -1635,7 +1687,7 @@ int snapshot_read_next(struct snapshot_handle *handle) | |||
1635 | memory_bm_position_reset(&orig_bm); | 1687 | memory_bm_position_reset(&orig_bm); |
1636 | memory_bm_position_reset(©_bm); | 1688 | memory_bm_position_reset(©_bm); |
1637 | } else if (handle->cur <= nr_meta_pages) { | 1689 | } else if (handle->cur <= nr_meta_pages) { |
1638 | memset(buffer, 0, PAGE_SIZE); | 1690 | clear_page(buffer); |
1639 | pack_pfns(buffer, &orig_bm); | 1691 | pack_pfns(buffer, &orig_bm); |
1640 | } else { | 1692 | } else { |
1641 | struct page *page; | 1693 | struct page *page; |
@@ -1649,7 +1701,7 @@ int snapshot_read_next(struct snapshot_handle *handle) | |||
1649 | void *kaddr; | 1701 | void *kaddr; |
1650 | 1702 | ||
1651 | kaddr = kmap_atomic(page, KM_USER0); | 1703 | kaddr = kmap_atomic(page, KM_USER0); |
1652 | memcpy(buffer, kaddr, PAGE_SIZE); | 1704 | copy_page(buffer, kaddr); |
1653 | kunmap_atomic(kaddr, KM_USER0); | 1705 | kunmap_atomic(kaddr, KM_USER0); |
1654 | handle->buffer = buffer; | 1706 | handle->buffer = buffer; |
1655 | } else { | 1707 | } else { |
@@ -1932,7 +1984,7 @@ static void copy_last_highmem_page(void) | |||
1932 | void *dst; | 1984 | void *dst; |
1933 | 1985 | ||
1934 | dst = kmap_atomic(last_highmem_page, KM_USER0); | 1986 | dst = kmap_atomic(last_highmem_page, KM_USER0); |
1935 | memcpy(dst, buffer, PAGE_SIZE); | 1987 | copy_page(dst, buffer); |
1936 | kunmap_atomic(dst, KM_USER0); | 1988 | kunmap_atomic(dst, KM_USER0); |
1937 | last_highmem_page = NULL; | 1989 | last_highmem_page = NULL; |
1938 | } | 1990 | } |
@@ -2218,11 +2270,11 @@ swap_two_pages_data(struct page *p1, struct page *p2, void *buf) | |||
2218 | 2270 | ||
2219 | kaddr1 = kmap_atomic(p1, KM_USER0); | 2271 | kaddr1 = kmap_atomic(p1, KM_USER0); |
2220 | kaddr2 = kmap_atomic(p2, KM_USER1); | 2272 | kaddr2 = kmap_atomic(p2, KM_USER1); |
2221 | memcpy(buf, kaddr1, PAGE_SIZE); | 2273 | copy_page(buf, kaddr1); |
2222 | memcpy(kaddr1, kaddr2, PAGE_SIZE); | 2274 | copy_page(kaddr1, kaddr2); |
2223 | memcpy(kaddr2, buf, PAGE_SIZE); | 2275 | copy_page(kaddr2, buf); |
2224 | kunmap_atomic(kaddr1, KM_USER0); | ||
2225 | kunmap_atomic(kaddr2, KM_USER1); | 2276 | kunmap_atomic(kaddr2, KM_USER1); |
2277 | kunmap_atomic(kaddr1, KM_USER0); | ||
2226 | } | 2278 | } |
2227 | 2279 | ||
2228 | /** | 2280 | /** |
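Two of the snapshot.c changes above are about sizing: image_size loses its fixed 500 MB default and is instead set at boot by hibernate_image_size_init() to two fifths of total RAM, and preallocate_image_memory() now refuses to take more normal-zone pages than avail_normal allows. The default-size arithmetic, checked in plain C with made-up machine parameters (a 4 GiB box with 4 KiB pages):

#include <stdio.h>

int main(void)
{
    unsigned long page_size      = 4096;        /* bytes */
    unsigned long totalram_pages = 1048576;     /* 4 GiB / 4 KiB */

    /* same formula as hibernate_image_size_init() */
    unsigned long image_size = ((totalram_pages * 2) / 5) * page_size;

    printf("default image_size = %lu MiB\n", image_size >> 20);    /* 1638 MiB */
    return 0;
}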
diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c index f37cb7dd4402..7335952ee473 100644 --- a/kernel/power/suspend.c +++ b/kernel/power/suspend.c | |||
@@ -136,19 +136,19 @@ static int suspend_enter(suspend_state_t state) | |||
136 | if (suspend_ops->prepare) { | 136 | if (suspend_ops->prepare) { |
137 | error = suspend_ops->prepare(); | 137 | error = suspend_ops->prepare(); |
138 | if (error) | 138 | if (error) |
139 | return error; | 139 | goto Platform_finish; |
140 | } | 140 | } |
141 | 141 | ||
142 | error = dpm_suspend_noirq(PMSG_SUSPEND); | 142 | error = dpm_suspend_noirq(PMSG_SUSPEND); |
143 | if (error) { | 143 | if (error) { |
144 | printk(KERN_ERR "PM: Some devices failed to power down\n"); | 144 | printk(KERN_ERR "PM: Some devices failed to power down\n"); |
145 | goto Platfrom_finish; | 145 | goto Platform_finish; |
146 | } | 146 | } |
147 | 147 | ||
148 | if (suspend_ops->prepare_late) { | 148 | if (suspend_ops->prepare_late) { |
149 | error = suspend_ops->prepare_late(); | 149 | error = suspend_ops->prepare_late(); |
150 | if (error) | 150 | if (error) |
151 | goto Power_up_devices; | 151 | goto Platform_wake; |
152 | } | 152 | } |
153 | 153 | ||
154 | if (suspend_test(TEST_PLATFORM)) | 154 | if (suspend_test(TEST_PLATFORM)) |
@@ -163,8 +163,10 @@ static int suspend_enter(suspend_state_t state) | |||
163 | 163 | ||
164 | error = sysdev_suspend(PMSG_SUSPEND); | 164 | error = sysdev_suspend(PMSG_SUSPEND); |
165 | if (!error) { | 165 | if (!error) { |
166 | if (!suspend_test(TEST_CORE)) | 166 | if (!suspend_test(TEST_CORE) && pm_check_wakeup_events()) { |
167 | error = suspend_ops->enter(state); | 167 | error = suspend_ops->enter(state); |
168 | events_check_enabled = false; | ||
169 | } | ||
168 | sysdev_resume(); | 170 | sysdev_resume(); |
169 | } | 171 | } |
170 | 172 | ||
@@ -178,10 +180,9 @@ static int suspend_enter(suspend_state_t state) | |||
178 | if (suspend_ops->wake) | 180 | if (suspend_ops->wake) |
179 | suspend_ops->wake(); | 181 | suspend_ops->wake(); |
180 | 182 | ||
181 | Power_up_devices: | ||
182 | dpm_resume_noirq(PMSG_RESUME); | 183 | dpm_resume_noirq(PMSG_RESUME); |
183 | 184 | ||
184 | Platfrom_finish: | 185 | Platform_finish: |
185 | if (suspend_ops->finish) | 186 | if (suspend_ops->finish) |
186 | suspend_ops->finish(); | 187 | suspend_ops->finish(); |
187 | 188 | ||
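Besides correcting the misspelled Platfrom_finish label, the suspend_enter() hunk routes the prepare_late() failure through the Platform_wake unwind path, so whatever has been set up by that point is undone in reverse order before the function returns. The labels implement the usual staged-unwind idiom in C; a minimal sketch with placeholder steps (step_a/step_b/undo_a are not kernel functions):

#include <stdio.h>

static int step_a(void)  { puts("a"); return 0; }
static int step_b(void)  { puts("b"); return -1; }  /* pretend this fails */
static void undo_a(void) { puts("undo a"); }

static int do_transition(void)
{
    int error;

    error = step_a();
    if (error)
        goto Out;           /* nothing to undo yet */

    error = step_b();
    if (error)
        goto Undo_a;        /* undo only what already succeeded */

    /* ... the actual low-level transition would happen here; on return the
     * same unwind path runs, which is why suspend_enter() simply falls
     * through its labels on success as well ... */

Undo_a:
    undo_a();
Out:
    return error;
}

int main(void)
{
    printf("result = %d\n", do_transition());   /* a, b, undo a, result = -1 */
    return 0;
}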
diff --git a/kernel/power/swap.c b/kernel/power/swap.c index b0bb21778391..a0e4a86ccf94 100644 --- a/kernel/power/swap.c +++ b/kernel/power/swap.c | |||
@@ -4,7 +4,7 @@ | |||
4 | * This file provides functions for reading the suspend image from | 4 | * This file provides functions for reading the suspend image from |
5 | * and writing it to a swap partition. | 5 | * and writing it to a swap partition. |
6 | * | 6 | * |
7 | * Copyright (C) 1998,2001-2005 Pavel Machek <pavel@suse.cz> | 7 | * Copyright (C) 1998,2001-2005 Pavel Machek <pavel@ucw.cz> |
8 | * Copyright (C) 2006 Rafael J. Wysocki <rjw@sisk.pl> | 8 | * Copyright (C) 2006 Rafael J. Wysocki <rjw@sisk.pl> |
9 | * | 9 | * |
10 | * This file is released under the GPLv2. | 10 | * This file is released under the GPLv2. |
@@ -24,15 +24,17 @@ | |||
24 | #include <linux/swapops.h> | 24 | #include <linux/swapops.h> |
25 | #include <linux/pm.h> | 25 | #include <linux/pm.h> |
26 | #include <linux/slab.h> | 26 | #include <linux/slab.h> |
27 | #include <linux/lzo.h> | ||
28 | #include <linux/vmalloc.h> | ||
27 | 29 | ||
28 | #include "power.h" | 30 | #include "power.h" |
29 | 31 | ||
30 | #define SWSUSP_SIG "S1SUSPEND" | 32 | #define HIBERNATE_SIG "LINHIB0001" |
31 | 33 | ||
32 | /* | 34 | /* |
33 | * The swap map is a data structure used for keeping track of each page | 35 | * The swap map is a data structure used for keeping track of each page |
34 | * written to a swap partition. It consists of many swap_map_page | 36 | * written to a swap partition. It consists of many swap_map_page |
35 | * structures that contain each an array of MAP_PAGE_SIZE swap entries. | 37 | * structures that contain each an array of MAP_PAGE_ENTRIES swap entries. |
36 | * These structures are stored on the swap and linked together with the | 38 | * These structures are stored on the swap and linked together with the |
37 | * help of the .next_swap member. | 39 | * help of the .next_swap member. |
38 | * | 40 | * |
@@ -148,7 +150,7 @@ sector_t alloc_swapdev_block(int swap) | |||
148 | 150 | ||
149 | /** | 151 | /** |
150 | * free_all_swap_pages - free swap pages allocated for saving image data. | 152 | * free_all_swap_pages - free swap pages allocated for saving image data. |
151 | * It also frees the extents used to register which swap entres had been | 153 | * It also frees the extents used to register which swap entries had been |
152 | * allocated. | 154 | * allocated. |
153 | */ | 155 | */ |
154 | 156 | ||
@@ -193,7 +195,7 @@ static int mark_swapfiles(struct swap_map_handle *handle, unsigned int flags) | |||
193 | if (!memcmp("SWAP-SPACE",swsusp_header->sig, 10) || | 195 | if (!memcmp("SWAP-SPACE",swsusp_header->sig, 10) || |
194 | !memcmp("SWAPSPACE2",swsusp_header->sig, 10)) { | 196 | !memcmp("SWAPSPACE2",swsusp_header->sig, 10)) { |
195 | memcpy(swsusp_header->orig_sig,swsusp_header->sig, 10); | 197 | memcpy(swsusp_header->orig_sig,swsusp_header->sig, 10); |
196 | memcpy(swsusp_header->sig,SWSUSP_SIG, 10); | 198 | memcpy(swsusp_header->sig, HIBERNATE_SIG, 10); |
197 | swsusp_header->image = handle->first_sector; | 199 | swsusp_header->image = handle->first_sector; |
198 | swsusp_header->flags = flags; | 200 | swsusp_header->flags = flags; |
199 | error = hib_bio_write_page(swsusp_resume_block, | 201 | error = hib_bio_write_page(swsusp_resume_block, |
@@ -249,7 +251,7 @@ static int write_page(void *buf, sector_t offset, struct bio **bio_chain) | |||
249 | if (bio_chain) { | 251 | if (bio_chain) { |
250 | src = (void *)__get_free_page(__GFP_WAIT | __GFP_HIGH); | 252 | src = (void *)__get_free_page(__GFP_WAIT | __GFP_HIGH); |
251 | if (src) { | 253 | if (src) { |
252 | memcpy(src, buf, PAGE_SIZE); | 254 | copy_page(src, buf); |
253 | } else { | 255 | } else { |
254 | WARN_ON_ONCE(1); | 256 | WARN_ON_ONCE(1); |
255 | bio_chain = NULL; /* Go synchronous */ | 257 | bio_chain = NULL; /* Go synchronous */ |
@@ -323,7 +325,7 @@ static int swap_write_page(struct swap_map_handle *handle, void *buf, | |||
323 | error = write_page(handle->cur, handle->cur_swap, NULL); | 325 | error = write_page(handle->cur, handle->cur_swap, NULL); |
324 | if (error) | 326 | if (error) |
325 | goto out; | 327 | goto out; |
326 | memset(handle->cur, 0, PAGE_SIZE); | 328 | clear_page(handle->cur); |
327 | handle->cur_swap = offset; | 329 | handle->cur_swap = offset; |
328 | handle->k = 0; | 330 | handle->k = 0; |
329 | } | 331 | } |
@@ -357,6 +359,18 @@ static int swap_writer_finish(struct swap_map_handle *handle, | |||
357 | return error; | 359 | return error; |
358 | } | 360 | } |
359 | 361 | ||
362 | /* We need to remember how much compressed data we need to read. */ | ||
363 | #define LZO_HEADER sizeof(size_t) | ||
364 | |||
365 | /* Number of pages/bytes we'll compress at one time. */ | ||
366 | #define LZO_UNC_PAGES 32 | ||
367 | #define LZO_UNC_SIZE (LZO_UNC_PAGES * PAGE_SIZE) | ||
368 | |||
369 | /* Number of pages/bytes we need for compressed data (worst case). */ | ||
370 | #define LZO_CMP_PAGES DIV_ROUND_UP(lzo1x_worst_compress(LZO_UNC_SIZE) + \ | ||
371 | LZO_HEADER, PAGE_SIZE) | ||
372 | #define LZO_CMP_SIZE (LZO_CMP_PAGES * PAGE_SIZE) | ||
373 | |||
360 | /** | 374 | /** |
361 | * save_image - save the suspend image data | 375 | * save_image - save the suspend image data |
362 | */ | 376 | */ |
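The sizing macros above can be made concrete. Assuming 4 KiB pages, a 64-bit size_t, and the usual <linux/lzo.h> bound lzo1x_worst_compress(x) = x + x/16 + 64 + 3, the arithmetic works out as follows (a hedged back-of-the-envelope, not text from the patch):

    /*
     * LZO_UNC_SIZE  = 32 * 4096                   = 131072 bytes per chunk
     * worst output  = 131072 + 131072/16 + 64 + 3 = 139331 bytes
     * + LZO_HEADER  = 139331 + 8                  = 139339 bytes
     * LZO_CMP_PAGES = DIV_ROUND_UP(139339, 4096)  = 35 pages
     *
     * i.e. a 32-page chunk can cost up to 35 pages of swap if the data is
     * incompressible; each chunk goes to disk as a size_t length header
     * followed by the compressed bytes, padded out to whole pages.
     */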
@@ -404,6 +418,137 @@ static int save_image(struct swap_map_handle *handle, | |||
404 | return ret; | 418 | return ret; |
405 | } | 419 | } |
406 | 420 | ||
421 | |||
422 | /** | ||
423 | * save_image_lzo - Save the suspend image data compressed with LZO. | ||
424 | * @handle: Swap map handle to use for saving the image. | ||
425 | * @snapshot: Image to read data from. | ||
426 | * @nr_to_write: Number of pages to save. | ||
427 | */ | ||
428 | static int save_image_lzo(struct swap_map_handle *handle, | ||
429 | struct snapshot_handle *snapshot, | ||
430 | unsigned int nr_to_write) | ||
431 | { | ||
432 | unsigned int m; | ||
433 | int ret = 0; | ||
434 | int nr_pages; | ||
435 | int err2; | ||
436 | struct bio *bio; | ||
437 | struct timeval start; | ||
438 | struct timeval stop; | ||
439 | size_t off, unc_len, cmp_len; | ||
440 | unsigned char *unc, *cmp, *wrk, *page; | ||
441 | |||
442 | page = (void *)__get_free_page(__GFP_WAIT | __GFP_HIGH); | ||
443 | if (!page) { | ||
444 | printk(KERN_ERR "PM: Failed to allocate LZO page\n"); | ||
445 | return -ENOMEM; | ||
446 | } | ||
447 | |||
448 | wrk = vmalloc(LZO1X_1_MEM_COMPRESS); | ||
449 | if (!wrk) { | ||
450 | printk(KERN_ERR "PM: Failed to allocate LZO workspace\n"); | ||
451 | free_page((unsigned long)page); | ||
452 | return -ENOMEM; | ||
453 | } | ||
454 | |||
455 | unc = vmalloc(LZO_UNC_SIZE); | ||
456 | if (!unc) { | ||
457 | printk(KERN_ERR "PM: Failed to allocate LZO uncompressed\n"); | ||
458 | vfree(wrk); | ||
459 | free_page((unsigned long)page); | ||
460 | return -ENOMEM; | ||
461 | } | ||
462 | |||
463 | cmp = vmalloc(LZO_CMP_SIZE); | ||
464 | if (!cmp) { | ||
465 | printk(KERN_ERR "PM: Failed to allocate LZO compressed\n"); | ||
466 | vfree(unc); | ||
467 | vfree(wrk); | ||
468 | free_page((unsigned long)page); | ||
469 | return -ENOMEM; | ||
470 | } | ||
471 | |||
472 | printk(KERN_INFO | ||
473 | "PM: Compressing and saving image data (%u pages) ... ", | ||
474 | nr_to_write); | ||
475 | m = nr_to_write / 100; | ||
476 | if (!m) | ||
477 | m = 1; | ||
478 | nr_pages = 0; | ||
479 | bio = NULL; | ||
480 | do_gettimeofday(&start); | ||
481 | for (;;) { | ||
482 | for (off = 0; off < LZO_UNC_SIZE; off += PAGE_SIZE) { | ||
483 | ret = snapshot_read_next(snapshot); | ||
484 | if (ret < 0) | ||
485 | goto out_finish; | ||
486 | |||
487 | if (!ret) | ||
488 | break; | ||
489 | |||
490 | memcpy(unc + off, data_of(*snapshot), PAGE_SIZE); | ||
491 | |||
492 | if (!(nr_pages % m)) | ||
493 | printk(KERN_CONT "\b\b\b\b%3d%%", nr_pages / m); | ||
494 | nr_pages++; | ||
495 | } | ||
496 | |||
497 | if (!off) | ||
498 | break; | ||
499 | |||
500 | unc_len = off; | ||
501 | ret = lzo1x_1_compress(unc, unc_len, | ||
502 | cmp + LZO_HEADER, &cmp_len, wrk); | ||
503 | if (ret < 0) { | ||
504 | printk(KERN_ERR "PM: LZO compression failed\n"); | ||
505 | break; | ||
506 | } | ||
507 | |||
508 | if (unlikely(!cmp_len || | ||
509 | cmp_len > lzo1x_worst_compress(unc_len))) { | ||
510 | printk(KERN_ERR "PM: Invalid LZO compressed length\n"); | ||
511 | ret = -1; | ||
512 | break; | ||
513 | } | ||
514 | |||
515 | *(size_t *)cmp = cmp_len; | ||
516 | |||
517 | /* | ||
518 | * Given we are writing one page at a time to disk, we copy | ||
519 | * that much from the buffer, although the last bit will likely | ||
520 | * be smaller than full page. This is OK - we saved the length | ||
521 | * of the compressed data, so any garbage at the end will be | ||
522 | * discarded when we read it. | ||
523 | */ | ||
524 | for (off = 0; off < LZO_HEADER + cmp_len; off += PAGE_SIZE) { | ||
525 | memcpy(page, cmp + off, PAGE_SIZE); | ||
526 | |||
527 | ret = swap_write_page(handle, page, &bio); | ||
528 | if (ret) | ||
529 | goto out_finish; | ||
530 | } | ||
531 | } | ||
532 | |||
533 | out_finish: | ||
534 | err2 = hib_wait_on_bio_chain(&bio); | ||
535 | do_gettimeofday(&stop); | ||
536 | if (!ret) | ||
537 | ret = err2; | ||
538 | if (!ret) | ||
539 | printk(KERN_CONT "\b\b\b\bdone\n"); | ||
540 | else | ||
541 | printk(KERN_CONT "\n"); | ||
542 | swsusp_show_speed(&start, &stop, nr_to_write, "Wrote"); | ||
543 | |||
544 | vfree(cmp); | ||
545 | vfree(unc); | ||
546 | vfree(wrk); | ||
547 | free_page((unsigned long)page); | ||
548 | |||
549 | return ret; | ||
550 | } | ||
551 | |||
407 | /** | 552 | /** |
408 | * enough_swap - Make sure we have enough swap to save the image. | 553 | * enough_swap - Make sure we have enough swap to save the image. |
409 | * | 554 | * |
@@ -411,12 +556,16 @@ static int save_image(struct swap_map_handle *handle, | |||
411 | * space available from the resume partition. | 556 |
412 | */ | 557 | */ |
413 | 558 | ||
414 | static int enough_swap(unsigned int nr_pages) | 559 | static int enough_swap(unsigned int nr_pages, unsigned int flags) |
415 | { | 560 | { |
416 | unsigned int free_swap = count_swap_pages(root_swap, 1); | 561 | unsigned int free_swap = count_swap_pages(root_swap, 1); |
562 | unsigned int required; | ||
417 | 563 | ||
418 | pr_debug("PM: Free swap pages: %u\n", free_swap); | 564 | pr_debug("PM: Free swap pages: %u\n", free_swap); |
419 | return free_swap > nr_pages + PAGES_FOR_IO; | 565 | |
566 | required = PAGES_FOR_IO + ((flags & SF_NOCOMPRESS_MODE) ? | ||
567 | nr_pages : (nr_pages * LZO_CMP_PAGES) / LZO_UNC_PAGES + 1); | ||
568 | return free_swap > required; | ||
420 | } | 569 | } |
421 | 570 | ||
422 | /** | 571 | /** |
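With those numbers, the compressed-mode branch of enough_swap() is deliberately pessimistic: it budgets for every 32-page chunk hitting the 35-page worst case, since the check must still hold if the image turns out to be incompressible. A worked example (hedged, reusing the figures derived above):

    /*
     * Image of 100000 pages, SF_NOCOMPRESS_MODE clear:
     *
     *   required = PAGES_FOR_IO + (100000 * 35) / 32 + 1
     *            = PAGES_FOR_IO + 109376 pages
     *
     * roughly 9% more free swap than the uncompressed image size, even
     * though LZO will normally shrink the data substantially.
     */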
@@ -443,7 +592,7 @@ int swsusp_write(unsigned int flags) | |||
443 | printk(KERN_ERR "PM: Cannot get swap writer\n"); | 592 | printk(KERN_ERR "PM: Cannot get swap writer\n"); |
444 | return error; | 593 | return error; |
445 | } | 594 | } |
446 | if (!enough_swap(pages)) { | 595 | if (!enough_swap(pages, flags)) { |
447 | printk(KERN_ERR "PM: Not enough free swap\n"); | 596 | printk(KERN_ERR "PM: Not enough free swap\n"); |
448 | error = -ENOSPC; | 597 | error = -ENOSPC; |
449 | goto out_finish; | 598 | goto out_finish; |
@@ -458,8 +607,11 @@ int swsusp_write(unsigned int flags) | |||
458 | } | 607 | } |
459 | header = (struct swsusp_info *)data_of(snapshot); | 608 | header = (struct swsusp_info *)data_of(snapshot); |
460 | error = swap_write_page(&handle, header, NULL); | 609 | error = swap_write_page(&handle, header, NULL); |
461 | if (!error) | 610 | if (!error) { |
462 | error = save_image(&handle, &snapshot, pages - 1); | 611 | error = (flags & SF_NOCOMPRESS_MODE) ? |
612 | save_image(&handle, &snapshot, pages - 1) : | ||
613 | save_image_lzo(&handle, &snapshot, pages - 1); | ||
614 | } | ||
463 | out_finish: | 615 | out_finish: |
464 | error = swap_writer_finish(&handle, flags, error); | 616 | error = swap_writer_finish(&handle, flags, error); |
465 | return error; | 617 | return error; |
@@ -590,6 +742,127 @@ static int load_image(struct swap_map_handle *handle, | |||
590 | } | 742 | } |
591 | 743 | ||
592 | /** | 744 | /** |
745 | * load_image_lzo - Load compressed image data and decompress them with LZO. | ||
746 | * @handle: Swap map handle to use for loading data. | ||
747 | * @snapshot: Image to copy uncompressed data into. | ||
748 | * @nr_to_read: Number of pages to load. | ||
749 | */ | ||
750 | static int load_image_lzo(struct swap_map_handle *handle, | ||
751 | struct snapshot_handle *snapshot, | ||
752 | unsigned int nr_to_read) | ||
753 | { | ||
754 | unsigned int m; | ||
755 | int error = 0; | ||
756 | struct timeval start; | ||
757 | struct timeval stop; | ||
758 | unsigned nr_pages; | ||
759 | size_t off, unc_len, cmp_len; | ||
760 | unsigned char *unc, *cmp, *page; | ||
761 | |||
762 | page = (void *)__get_free_page(__GFP_WAIT | __GFP_HIGH); | ||
763 | if (!page) { | ||
764 | printk(KERN_ERR "PM: Failed to allocate LZO page\n"); | ||
765 | return -ENOMEM; | ||
766 | } | ||
767 | |||
768 | unc = vmalloc(LZO_UNC_SIZE); | ||
769 | if (!unc) { | ||
770 | printk(KERN_ERR "PM: Failed to allocate LZO uncompressed\n"); | ||
771 | free_page((unsigned long)page); | ||
772 | return -ENOMEM; | ||
773 | } | ||
774 | |||
775 | cmp = vmalloc(LZO_CMP_SIZE); | ||
776 | if (!cmp) { | ||
777 | printk(KERN_ERR "PM: Failed to allocate LZO compressed\n"); | ||
778 | vfree(unc); | ||
779 | free_page((unsigned long)page); | ||
780 | return -ENOMEM; | ||
781 | } | ||
782 | |||
783 | printk(KERN_INFO | ||
784 | "PM: Loading and decompressing image data (%u pages) ... ", | ||
785 | nr_to_read); | ||
786 | m = nr_to_read / 100; | ||
787 | if (!m) | ||
788 | m = 1; | ||
789 | nr_pages = 0; | ||
790 | do_gettimeofday(&start); | ||
791 | |||
792 | error = snapshot_write_next(snapshot); | ||
793 | if (error <= 0) | ||
794 | goto out_finish; | ||
795 | |||
796 | for (;;) { | ||
797 | error = swap_read_page(handle, page, NULL); /* sync */ | ||
798 | if (error) | ||
799 | break; | ||
800 | |||
801 | cmp_len = *(size_t *)page; | ||
802 | if (unlikely(!cmp_len || | ||
803 | cmp_len > lzo1x_worst_compress(LZO_UNC_SIZE))) { | ||
804 | printk(KERN_ERR "PM: Invalid LZO compressed length\n"); | ||
805 | error = -1; | ||
806 | break; | ||
807 | } | ||
808 | |||
809 | memcpy(cmp, page, PAGE_SIZE); | ||
810 | for (off = PAGE_SIZE; off < LZO_HEADER + cmp_len; off += PAGE_SIZE) { | ||
811 | error = swap_read_page(handle, page, NULL); /* sync */ | ||
812 | if (error) | ||
813 | goto out_finish; | ||
814 | |||
815 | memcpy(cmp + off, page, PAGE_SIZE); | ||
816 | } | ||
817 | |||
818 | unc_len = LZO_UNC_SIZE; | ||
819 | error = lzo1x_decompress_safe(cmp + LZO_HEADER, cmp_len, | ||
820 | unc, &unc_len); | ||
821 | if (error < 0) { | ||
822 | printk(KERN_ERR "PM: LZO decompression failed\n"); | ||
823 | break; | ||
824 | } | ||
825 | |||
826 | if (unlikely(!unc_len || | ||
827 | unc_len > LZO_UNC_SIZE || | ||
828 | unc_len & (PAGE_SIZE - 1))) { | ||
829 | printk(KERN_ERR "PM: Invalid LZO uncompressed length\n"); | ||
830 | error = -1; | ||
831 | break; | ||
832 | } | ||
833 | |||
834 | for (off = 0; off < unc_len; off += PAGE_SIZE) { | ||
835 | memcpy(data_of(*snapshot), unc + off, PAGE_SIZE); | ||
836 | |||
837 | if (!(nr_pages % m)) | ||
838 | printk("\b\b\b\b%3d%%", nr_pages / m); | ||
839 | nr_pages++; | ||
840 | |||
841 | error = snapshot_write_next(snapshot); | ||
842 | if (error <= 0) | ||
843 | goto out_finish; | ||
844 | } | ||
845 | } | ||
846 | |||
847 | out_finish: | ||
848 | do_gettimeofday(&stop); | ||
849 | if (!error) { | ||
850 | printk("\b\b\b\bdone\n"); | ||
851 | snapshot_write_finalize(snapshot); | ||
852 | if (!snapshot_image_loaded(snapshot)) | ||
853 | error = -ENODATA; | ||
854 | } else | ||
855 | printk("\n"); | ||
856 | swsusp_show_speed(&start, &stop, nr_to_read, "Read"); | ||
857 | |||
858 | vfree(cmp); | ||
859 | vfree(unc); | ||
860 | free_page((unsigned long)page); | ||
861 | |||
862 | return error; | ||
863 | } | ||
864 | |||
865 | /** | ||
593 | * swsusp_read - read the hibernation image. | 866 | * swsusp_read - read the hibernation image. |
594 | * @flags_p: flags passed by the "frozen" kernel in the image header should | 867 | * @flags_p: flags passed by the "frozen" kernel in the image header should |
595 | * be written into this memory location | 868 |
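load_image_lzo() above reassembles each chunk (the size_t length header plus cmp_len compressed bytes, spread over whole pages) and feeds it to lzo1x_decompress_safe(), whose destination length is an in/out parameter. A self-contained hedged sketch of that call pattern (decompress_chunk_sketch is an illustrative helper, not part of the patch):

    #include <linux/lzo.h>

    static int decompress_chunk_sketch(unsigned char *cmp, size_t cmp_len,
                                       unsigned char *unc, size_t unc_size)
    {
            size_t unc_len = unc_size;      /* in: buffer size, out: bytes produced */

            if (lzo1x_decompress_safe(cmp, cmp_len, unc, &unc_len) != LZO_E_OK)
                    return -EIO;            /* corrupt or truncated input */
            if (!unc_len || unc_len > unc_size || (unc_len & (PAGE_SIZE - 1)))
                    return -EIO;            /* image data is always whole pages */
            return unc_len;                 /* bytes recovered, at most LZO_UNC_SIZE */
    }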
@@ -612,8 +885,11 @@ int swsusp_read(unsigned int *flags_p) | |||
612 | goto end; | 885 | goto end; |
613 | if (!error) | 886 | if (!error) |
614 | error = swap_read_page(&handle, header, NULL); | 887 | error = swap_read_page(&handle, header, NULL); |
615 | if (!error) | 888 | if (!error) { |
616 | error = load_image(&handle, &snapshot, header->pages - 1); | 889 | error = (*flags_p & SF_NOCOMPRESS_MODE) ? |
890 | load_image(&handle, &snapshot, header->pages - 1) : | ||
891 | load_image_lzo(&handle, &snapshot, header->pages - 1); | ||
892 | } | ||
617 | swap_reader_finish(&handle); | 893 | swap_reader_finish(&handle); |
618 | end: | 894 | end: |
619 | if (!error) | 895 | if (!error) |
@@ -634,13 +910,13 @@ int swsusp_check(void) | |||
634 | hib_resume_bdev = open_by_devnum(swsusp_resume_device, FMODE_READ); | 910 | hib_resume_bdev = open_by_devnum(swsusp_resume_device, FMODE_READ); |
635 | if (!IS_ERR(hib_resume_bdev)) { | 911 | if (!IS_ERR(hib_resume_bdev)) { |
636 | set_blocksize(hib_resume_bdev, PAGE_SIZE); | 912 | set_blocksize(hib_resume_bdev, PAGE_SIZE); |
637 | memset(swsusp_header, 0, PAGE_SIZE); | 913 | clear_page(swsusp_header); |
638 | error = hib_bio_read_page(swsusp_resume_block, | 914 | error = hib_bio_read_page(swsusp_resume_block, |
639 | swsusp_header, NULL); | 915 | swsusp_header, NULL); |
640 | if (error) | 916 | if (error) |
641 | goto put; | 917 | goto put; |
642 | 918 | ||
643 | if (!memcmp(SWSUSP_SIG, swsusp_header->sig, 10)) { | 919 | if (!memcmp(HIBERNATE_SIG, swsusp_header->sig, 10)) { |
644 | memcpy(swsusp_header->sig, swsusp_header->orig_sig, 10); | 920 | memcpy(swsusp_header->sig, swsusp_header->orig_sig, 10); |
645 | /* Reset swap signature now */ | 921 | /* Reset swap signature now */ |
646 | error = hib_bio_write_page(swsusp_resume_block, | 922 | error = hib_bio_write_page(swsusp_resume_block, |
@@ -653,13 +929,13 @@ put: | |||
653 | if (error) | 929 | if (error) |
654 | blkdev_put(hib_resume_bdev, FMODE_READ); | 930 | blkdev_put(hib_resume_bdev, FMODE_READ); |
655 | else | 931 | else |
656 | pr_debug("PM: Signature found, resuming\n"); | 932 | pr_debug("PM: Image signature found, resuming\n"); |
657 | } else { | 933 | } else { |
658 | error = PTR_ERR(hib_resume_bdev); | 934 | error = PTR_ERR(hib_resume_bdev); |
659 | } | 935 | } |
660 | 936 | ||
661 | if (error) | 937 | if (error) |
662 | pr_debug("PM: Error %d checking image file\n", error); | 938 | pr_debug("PM: Image not found (code %d)\n", error); |
663 | 939 | ||
664 | return error; | 940 | return error; |
665 | } | 941 | } |
diff --git a/kernel/printk.c b/kernel/printk.c index 444b770c9595..b2ebaee8c377 100644 --- a/kernel/printk.c +++ b/kernel/printk.c | |||
@@ -37,6 +37,8 @@ | |||
37 | #include <linux/ratelimit.h> | 37 | #include <linux/ratelimit.h> |
38 | #include <linux/kmsg_dump.h> | 38 | #include <linux/kmsg_dump.h> |
39 | #include <linux/syslog.h> | 39 | #include <linux/syslog.h> |
40 | #include <linux/cpu.h> | ||
41 | #include <linux/notifier.h> | ||
40 | 42 | ||
41 | #include <asm/uaccess.h> | 43 | #include <asm/uaccess.h> |
42 | 44 | ||
@@ -83,7 +85,7 @@ EXPORT_SYMBOL(oops_in_progress); | |||
83 | * provides serialisation for access to the entire console | 85 | * provides serialisation for access to the entire console |
84 | * driver system. | 86 | * driver system. |
85 | */ | 87 | */ |
86 | static DECLARE_MUTEX(console_sem); | 88 | static DEFINE_SEMAPHORE(console_sem); |
87 | struct console *console_drivers; | 89 | struct console *console_drivers; |
88 | EXPORT_SYMBOL_GPL(console_drivers); | 90 | EXPORT_SYMBOL_GPL(console_drivers); |
89 | 91 | ||
@@ -208,7 +210,7 @@ __setup("log_buf_len=", log_buf_len_setup); | |||
208 | 210 | ||
209 | #ifdef CONFIG_BOOT_PRINTK_DELAY | 211 | #ifdef CONFIG_BOOT_PRINTK_DELAY |
210 | 212 | ||
211 | static unsigned int boot_delay; /* msecs delay after each printk during bootup */ | 213 | static int boot_delay; /* msecs delay after each printk during bootup */ |
212 | static unsigned long long loops_per_msec; /* based on boot_delay */ | 214 | static unsigned long long loops_per_msec; /* based on boot_delay */ |
213 | 215 | ||
214 | static int __init boot_delay_setup(char *str) | 216 | static int __init boot_delay_setup(char *str) |
@@ -554,7 +556,7 @@ static void zap_locks(void) | |||
554 | /* If a crash is occurring, make sure we can't deadlock */ | 556 | /* If a crash is occurring, make sure we can't deadlock */ |
555 | spin_lock_init(&logbuf_lock); | 557 | spin_lock_init(&logbuf_lock); |
556 | /* And make sure that we print immediately */ | 558 | /* And make sure that we print immediately */ |
557 | init_MUTEX(&console_sem); | 559 | sema_init(&console_sem, 1); |
558 | } | 560 | } |
559 | 561 | ||
560 | #if defined(CONFIG_PRINTK_TIME) | 562 | #if defined(CONFIG_PRINTK_TIME) |
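The console_sem changes are part of retiring the old "semaphore pretending to be a mutex" helpers: DECLARE_MUTEX() and init_MUTEX() defined a counting semaphore with an initial count of one, which is exactly what DEFINE_SEMAPHORE() and sema_init(..., 1) now spell out. A hedged equivalence sketch (my_sem is an illustrative name):

    #include <linux/semaphore.h>

    static DEFINE_SEMAPHORE(my_sem);        /* binary semaphore, count starts at 1 */

    static void reinit_sketch(void)
    {
            sema_init(&my_sem, 1);          /* runtime (re)initialization, as zap_locks() does */
    }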
@@ -645,6 +647,7 @@ static inline int can_use_console(unsigned int cpu) | |||
645 | * released but interrupts still disabled. | 647 | * released but interrupts still disabled. |
646 | */ | 648 | */ |
647 | static int acquire_console_semaphore_for_printk(unsigned int cpu) | 649 | static int acquire_console_semaphore_for_printk(unsigned int cpu) |
650 | __releases(&logbuf_lock) | ||
648 | { | 651 | { |
649 | int retval = 0; | 652 | int retval = 0; |
650 | 653 | ||
@@ -985,6 +988,32 @@ void resume_console(void) | |||
985 | } | 988 | } |
986 | 989 | ||
987 | /** | 990 | /** |
991 | * console_cpu_notify - print deferred console messages after CPU hotplug | ||
992 | * @self: notifier struct | ||
993 | * @action: CPU hotplug event | ||
994 | * @hcpu: unused | ||
995 | * | ||
996 | * If printk() is called from a CPU that is not online yet, the messages | ||
997 | * will be spooled but will not show up on the console. This function is | ||
998 | * called when a new CPU comes online (or fails to come up), and ensures | ||
999 | * that any such output gets printed. | ||
1000 | */ | ||
1001 | static int __cpuinit console_cpu_notify(struct notifier_block *self, | ||
1002 | unsigned long action, void *hcpu) | ||
1003 | { | ||
1004 | switch (action) { | ||
1005 | case CPU_ONLINE: | ||
1006 | case CPU_DEAD: | ||
1007 | case CPU_DYING: | ||
1008 | case CPU_DOWN_FAILED: | ||
1009 | case CPU_UP_CANCELED: | ||
1010 | acquire_console_sem(); | ||
1011 | release_console_sem(); | ||
1012 | } | ||
1013 | return NOTIFY_OK; | ||
1014 | } | ||
1015 | |||
1016 | /** | ||
988 | * acquire_console_sem - lock the console system for exclusive use. | 1017 | * acquire_console_sem - lock the console system for exclusive use. |
989 | * | 1018 | * |
990 | * Acquires a semaphore which guarantees that the caller has | 1019 | * Acquires a semaphore which guarantees that the caller has |
@@ -1371,7 +1400,7 @@ int unregister_console(struct console *console) | |||
1371 | } | 1400 | } |
1372 | EXPORT_SYMBOL(unregister_console); | 1401 | EXPORT_SYMBOL(unregister_console); |
1373 | 1402 | ||
1374 | static int __init disable_boot_consoles(void) | 1403 | static int __init printk_late_init(void) |
1375 | { | 1404 | { |
1376 | struct console *con; | 1405 | struct console *con; |
1377 | 1406 | ||
@@ -1382,9 +1411,10 @@ static int __init disable_boot_consoles(void) | |||
1382 | unregister_console(con); | 1411 | unregister_console(con); |
1383 | } | 1412 | } |
1384 | } | 1413 | } |
1414 | hotcpu_notifier(console_cpu_notify, 0); | ||
1385 | return 0; | 1415 | return 0; |
1386 | } | 1416 | } |
1387 | late_initcall(disable_boot_consoles); | 1417 | late_initcall(printk_late_init); |
1388 | 1418 | ||
1389 | #if defined CONFIG_PRINTK | 1419 | #if defined CONFIG_PRINTK |
1390 | 1420 | ||
@@ -1482,7 +1512,7 @@ int kmsg_dump_unregister(struct kmsg_dumper *dumper) | |||
1482 | } | 1512 | } |
1483 | EXPORT_SYMBOL_GPL(kmsg_dump_unregister); | 1513 | EXPORT_SYMBOL_GPL(kmsg_dump_unregister); |
1484 | 1514 | ||
1485 | static const char const *kmsg_reasons[] = { | 1515 | static const char * const kmsg_reasons[] = { |
1486 | [KMSG_DUMP_OOPS] = "oops", | 1516 | [KMSG_DUMP_OOPS] = "oops", |
1487 | [KMSG_DUMP_PANIC] = "panic", | 1517 | [KMSG_DUMP_PANIC] = "panic", |
1488 | [KMSG_DUMP_KEXEC] = "kexec", | 1518 | [KMSG_DUMP_KEXEC] = "kexec", |
@@ -1520,9 +1550,9 @@ void kmsg_dump(enum kmsg_dump_reason reason) | |||
1520 | chars = logged_chars; | 1550 | chars = logged_chars; |
1521 | spin_unlock_irqrestore(&logbuf_lock, flags); | 1551 | spin_unlock_irqrestore(&logbuf_lock, flags); |
1522 | 1552 | ||
1523 | if (logged_chars > end) { | 1553 | if (chars > end) { |
1524 | s1 = log_buf + log_buf_len - logged_chars + end; | 1554 | s1 = log_buf + log_buf_len - chars + end; |
1525 | l1 = logged_chars - end; | 1555 | l1 = chars - end; |
1526 | 1556 | ||
1527 | s2 = log_buf; | 1557 | s2 = log_buf; |
1528 | l2 = end; | 1558 | l2 = end; |
@@ -1530,8 +1560,8 @@ void kmsg_dump(enum kmsg_dump_reason reason) | |||
1530 | s1 = ""; | 1560 | s1 = ""; |
1531 | l1 = 0; | 1561 | l1 = 0; |
1532 | 1562 | ||
1533 | s2 = log_buf + end - logged_chars; | 1563 | s2 = log_buf + end - chars; |
1534 | l2 = logged_chars; | 1564 | l2 = chars; |
1535 | } | 1565 | } |
1536 | 1566 | ||
1537 | if (!spin_trylock_irqsave(&dump_list_lock, flags)) { | 1567 | if (!spin_trylock_irqsave(&dump_list_lock, flags)) { |
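The kmsg_dump() hunks are a small consistency fix: end and chars are sampled together under logbuf_lock, but the old code went back to the global logged_chars after the lock was dropped, so a racing printk() could make the two values disagree and the computed window could describe more bytes than the snapshot covered. A worked example of the wrap-around math, hedged with small made-up numbers:

    /*
     * log_buf_len = 16; sampled under the lock: end = 5, chars = 9.
     * The nine valid bytes wrap around the end of the buffer, so the
     * dumpers are handed two segments:
     *
     *   s1 = log_buf + 16 - 9 + 5 = log_buf + 12,   l1 = 9 - 5 = 4
     *   s2 = log_buf,                               l2 = 5
     *
     * Had a racing printk() pushed logged_chars to 12 after the unlock,
     * the old code would have produced l1 = 7 and l2 = 5: twelve bytes
     * described by offsets taken from a nine-byte snapshot.
     */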
diff --git a/kernel/profile.c b/kernel/profile.c index b22a899934cc..66f841b7fbd3 100644 --- a/kernel/profile.c +++ b/kernel/profile.c | |||
@@ -555,6 +555,7 @@ static ssize_t write_profile(struct file *file, const char __user *buf, | |||
555 | static const struct file_operations proc_profile_operations = { | 555 | static const struct file_operations proc_profile_operations = { |
556 | .read = read_profile, | 556 | .read = read_profile, |
557 | .write = write_profile, | 557 | .write = write_profile, |
558 | .llseek = default_llseek, | ||
558 | }; | 559 | }; |
559 | 560 | ||
560 | #ifdef CONFIG_SMP | 561 | #ifdef CONFIG_SMP |
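The single profile.c line looks like part of the tree-wide sweep that made every file_operations spell out its .llseek method once the implicit default stopped being relied upon; seekable files get default_llseek(), while things like character devices typically use noop_llseek() or no_llseek(). A hedged sketch of the resulting pattern (example_read/example_write are placeholders):

    static const struct file_operations example_fops = {
            .owner  = THIS_MODULE,
            .read   = example_read,
            .write  = example_write,
            .llseek = default_llseek,       /* explicit: seeking adjusts f_pos normally */
    };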
diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 74a3d693c196..99bbaa3e5b0d 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c | |||
@@ -181,7 +181,7 @@ int ptrace_attach(struct task_struct *task) | |||
181 | * under ptrace. | 181 | * under ptrace. |
182 | */ | 182 | */ |
183 | retval = -ERESTARTNOINTR; | 183 | retval = -ERESTARTNOINTR; |
184 | if (mutex_lock_interruptible(&task->cred_guard_mutex)) | 184 | if (mutex_lock_interruptible(&task->signal->cred_guard_mutex)) |
185 | goto out; | 185 | goto out; |
186 | 186 | ||
187 | task_lock(task); | 187 | task_lock(task); |
@@ -208,7 +208,7 @@ int ptrace_attach(struct task_struct *task) | |||
208 | unlock_tasklist: | 208 | unlock_tasklist: |
209 | write_unlock_irq(&tasklist_lock); | 209 | write_unlock_irq(&tasklist_lock); |
210 | unlock_creds: | 210 | unlock_creds: |
211 | mutex_unlock(&task->cred_guard_mutex); | 211 | mutex_unlock(&task->signal->cred_guard_mutex); |
212 | out: | 212 | out: |
213 | return retval; | 213 | return retval; |
214 | } | 214 | } |
@@ -324,26 +324,34 @@ int ptrace_detach(struct task_struct *child, unsigned int data) | |||
324 | } | 324 | } |
325 | 325 | ||
326 | /* | 326 | /* |
327 | * Detach all tasks we were using ptrace on. | 327 | * Detach all tasks we were using ptrace on. Called with tasklist held |
328 | * for writing, and returns with it held too. But note it can release | ||
329 | * and reacquire the lock. | ||
328 | */ | 330 | */ |
329 | void exit_ptrace(struct task_struct *tracer) | 331 | void exit_ptrace(struct task_struct *tracer) |
332 | __releases(&tasklist_lock) | ||
333 | __acquires(&tasklist_lock) | ||
330 | { | 334 | { |
331 | struct task_struct *p, *n; | 335 | struct task_struct *p, *n; |
332 | LIST_HEAD(ptrace_dead); | 336 | LIST_HEAD(ptrace_dead); |
333 | 337 | ||
334 | write_lock_irq(&tasklist_lock); | 338 | if (likely(list_empty(&tracer->ptraced))) |
339 | return; | ||
340 | |||
335 | list_for_each_entry_safe(p, n, &tracer->ptraced, ptrace_entry) { | 341 | list_for_each_entry_safe(p, n, &tracer->ptraced, ptrace_entry) { |
336 | if (__ptrace_detach(tracer, p)) | 342 | if (__ptrace_detach(tracer, p)) |
337 | list_add(&p->ptrace_entry, &ptrace_dead); | 343 | list_add(&p->ptrace_entry, &ptrace_dead); |
338 | } | 344 | } |
339 | write_unlock_irq(&tasklist_lock); | ||
340 | 345 | ||
346 | write_unlock_irq(&tasklist_lock); | ||
341 | BUG_ON(!list_empty(&tracer->ptraced)); | 347 | BUG_ON(!list_empty(&tracer->ptraced)); |
342 | 348 | ||
343 | list_for_each_entry_safe(p, n, &ptrace_dead, ptrace_entry) { | 349 | list_for_each_entry_safe(p, n, &ptrace_dead, ptrace_entry) { |
344 | list_del_init(&p->ptrace_entry); | 350 | list_del_init(&p->ptrace_entry); |
345 | release_task(p); | 351 | release_task(p); |
346 | } | 352 | } |
353 | |||
354 | write_lock_irq(&tasklist_lock); | ||
347 | } | 355 | } |
348 | 356 | ||
349 | int ptrace_readdata(struct task_struct *tsk, unsigned long src, char __user *dst, int len) | 357 | int ptrace_readdata(struct task_struct *tsk, unsigned long src, char __user *dst, int len) |
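The new annotations on exit_ptrace() document an unusual contract: the caller holds tasklist_lock for writing, the function may drop that lock while it calls release_task() on the dead tracees, and it reacquires the lock before returning. Any state the caller derived under the lock before the call may therefore be stale afterwards. A hedged sketch of what a call site has to look like (the real caller sits in the exit path):

    write_lock_irq(&tasklist_lock);
    /* ... reparenting / exit bookkeeping ... */
    exit_ptrace(tsk);               /* may drop and retake tasklist_lock */
    /* do not trust pointers or list positions cached before the call */
    write_unlock_irq(&tasklist_lock);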
@@ -396,7 +404,7 @@ int ptrace_writedata(struct task_struct *tsk, char __user *src, unsigned long ds | |||
396 | return copied; | 404 | return copied; |
397 | } | 405 | } |
398 | 406 | ||
399 | static int ptrace_setoptions(struct task_struct *child, long data) | 407 | static int ptrace_setoptions(struct task_struct *child, unsigned long data) |
400 | { | 408 | { |
401 | child->ptrace &= ~PT_TRACE_MASK; | 409 | child->ptrace &= ~PT_TRACE_MASK; |
402 | 410 | ||
@@ -475,7 +483,8 @@ static int ptrace_setsiginfo(struct task_struct *child, const siginfo_t *info) | |||
475 | #define is_sysemu_singlestep(request) 0 | 483 | #define is_sysemu_singlestep(request) 0 |
476 | #endif | 484 | #endif |
477 | 485 | ||
478 | static int ptrace_resume(struct task_struct *child, long request, long data) | 486 | static int ptrace_resume(struct task_struct *child, long request, |
487 | unsigned long data) | ||
479 | { | 488 | { |
480 | if (!valid_signal(data)) | 489 | if (!valid_signal(data)) |
481 | return -EIO; | 490 | return -EIO; |
@@ -552,10 +561,12 @@ static int ptrace_regset(struct task_struct *task, int req, unsigned int type, | |||
552 | #endif | 561 | #endif |
553 | 562 | ||
554 | int ptrace_request(struct task_struct *child, long request, | 563 | int ptrace_request(struct task_struct *child, long request, |
555 | long addr, long data) | 564 | unsigned long addr, unsigned long data) |
556 | { | 565 | { |
557 | int ret = -EIO; | 566 | int ret = -EIO; |
558 | siginfo_t siginfo; | 567 | siginfo_t siginfo; |
568 | void __user *datavp = (void __user *) data; | ||
569 | unsigned long __user *datalp = datavp; | ||
559 | 570 | ||
560 | switch (request) { | 571 | switch (request) { |
561 | case PTRACE_PEEKTEXT: | 572 | case PTRACE_PEEKTEXT: |
@@ -572,19 +583,17 @@ int ptrace_request(struct task_struct *child, long request, | |||
572 | ret = ptrace_setoptions(child, data); | 583 | ret = ptrace_setoptions(child, data); |
573 | break; | 584 | break; |
574 | case PTRACE_GETEVENTMSG: | 585 | case PTRACE_GETEVENTMSG: |
575 | ret = put_user(child->ptrace_message, (unsigned long __user *) data); | 586 | ret = put_user(child->ptrace_message, datalp); |
576 | break; | 587 | break; |
577 | 588 | ||
578 | case PTRACE_GETSIGINFO: | 589 | case PTRACE_GETSIGINFO: |
579 | ret = ptrace_getsiginfo(child, &siginfo); | 590 | ret = ptrace_getsiginfo(child, &siginfo); |
580 | if (!ret) | 591 | if (!ret) |
581 | ret = copy_siginfo_to_user((siginfo_t __user *) data, | 592 | ret = copy_siginfo_to_user(datavp, &siginfo); |
582 | &siginfo); | ||
583 | break; | 593 | break; |
584 | 594 | ||
585 | case PTRACE_SETSIGINFO: | 595 | case PTRACE_SETSIGINFO: |
586 | if (copy_from_user(&siginfo, (siginfo_t __user *) data, | 596 | if (copy_from_user(&siginfo, datavp, sizeof siginfo)) |
587 | sizeof siginfo)) | ||
588 | ret = -EFAULT; | 597 | ret = -EFAULT; |
589 | else | 598 | else |
590 | ret = ptrace_setsiginfo(child, &siginfo); | 599 | ret = ptrace_setsiginfo(child, &siginfo); |
@@ -615,7 +624,7 @@ int ptrace_request(struct task_struct *child, long request, | |||
615 | } | 624 | } |
616 | mmput(mm); | 625 | mmput(mm); |
617 | 626 | ||
618 | ret = put_user(tmp, (unsigned long __user *) data); | 627 | ret = put_user(tmp, datalp); |
619 | break; | 628 | break; |
620 | } | 629 | } |
621 | #endif | 630 | #endif |
@@ -644,7 +653,7 @@ int ptrace_request(struct task_struct *child, long request, | |||
644 | case PTRACE_SETREGSET: | 653 | case PTRACE_SETREGSET: |
645 | { | 654 | { |
646 | struct iovec kiov; | 655 | struct iovec kiov; |
647 | struct iovec __user *uiov = (struct iovec __user *) data; | 656 | struct iovec __user *uiov = datavp; |
648 | 657 | ||
649 | if (!access_ok(VERIFY_WRITE, uiov, sizeof(*uiov))) | 658 | if (!access_ok(VERIFY_WRITE, uiov, sizeof(*uiov))) |
650 | return -EFAULT; | 659 | return -EFAULT; |
@@ -685,7 +694,8 @@ static struct task_struct *ptrace_get_task_struct(pid_t pid) | |||
685 | #define arch_ptrace_attach(child) do { } while (0) | 694 | #define arch_ptrace_attach(child) do { } while (0) |
686 | #endif | 695 | #endif |
687 | 696 | ||
688 | SYSCALL_DEFINE4(ptrace, long, request, long, pid, long, addr, long, data) | 697 | SYSCALL_DEFINE4(ptrace, long, request, long, pid, unsigned long, addr, |
698 | unsigned long, data) | ||
689 | { | 699 | { |
690 | struct task_struct *child; | 700 | struct task_struct *child; |
691 | long ret; | 701 | long ret; |
@@ -726,7 +736,8 @@ SYSCALL_DEFINE4(ptrace, long, request, long, pid, long, addr, long, data) | |||
726 | return ret; | 736 | return ret; |
727 | } | 737 | } |
728 | 738 | ||
729 | int generic_ptrace_peekdata(struct task_struct *tsk, long addr, long data) | 739 | int generic_ptrace_peekdata(struct task_struct *tsk, unsigned long addr, |
740 | unsigned long data) | ||
730 | { | 741 | { |
731 | unsigned long tmp; | 742 | unsigned long tmp; |
732 | int copied; | 743 | int copied; |
@@ -737,7 +748,8 @@ int generic_ptrace_peekdata(struct task_struct *tsk, long addr, long data) | |||
737 | return put_user(tmp, (unsigned long __user *)data); | 748 | return put_user(tmp, (unsigned long __user *)data); |
738 | } | 749 | } |
739 | 750 | ||
740 | int generic_ptrace_pokedata(struct task_struct *tsk, long addr, long data) | 751 | int generic_ptrace_pokedata(struct task_struct *tsk, unsigned long addr, |
752 | unsigned long data) | ||
741 | { | 753 | { |
742 | int copied; | 754 | int copied; |
743 | 755 | ||
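With addr and data switched to unsigned long throughout, the casts to user-space pointers happen exactly once (datavp/datalp above), presumably to avoid sign-extension surprises when values with the top bit set travel through a signed long. The generic peek/poke helpers reduce to access_process_vm() plus a put_user() or a direct write; the peek body is visible in the hunk above, and a hedged from-memory sketch of the matching poke looks like this (treat it as illustrative, not a quotation of the file):

    int generic_ptrace_pokedata(struct task_struct *tsk, unsigned long addr,
                                unsigned long data)
    {
            int copied;

            /* write one word into the tracee's address space */
            copied = access_process_vm(tsk, addr, &data, sizeof(data), 1);
            return (copied == sizeof(data)) ? 0 : -EIO;
    }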
diff --git a/kernel/range.c b/kernel/range.c index 74e2e6114927..471b66acabb5 100644 --- a/kernel/range.c +++ b/kernel/range.c | |||
@@ -7,10 +7,6 @@ | |||
7 | 7 | ||
8 | #include <linux/range.h> | 8 | #include <linux/range.h> |
9 | 9 | ||
10 | #ifndef ARRAY_SIZE | ||
11 | #define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0])) | ||
12 | #endif | ||
13 | |||
14 | int add_range(struct range *range, int az, int nr_range, u64 start, u64 end) | 10 | int add_range(struct range *range, int az, int nr_range, u64 start, u64 end) |
15 | { | 11 | { |
16 | if (start >= end) | 12 | if (start >= end) |
diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c index 72a8dc9567f5..a23a57a976d1 100644 --- a/kernel/rcupdate.c +++ b/kernel/rcupdate.c | |||
@@ -73,12 +73,14 @@ int debug_lockdep_rcu_enabled(void) | |||
73 | EXPORT_SYMBOL_GPL(debug_lockdep_rcu_enabled); | 73 | EXPORT_SYMBOL_GPL(debug_lockdep_rcu_enabled); |
74 | 74 | ||
75 | /** | 75 | /** |
76 | * rcu_read_lock_bh_held - might we be in RCU-bh read-side critical section? | 76 | * rcu_read_lock_bh_held() - might we be in RCU-bh read-side critical section? |
77 | * | 77 | * |
78 | * Check for bottom half being disabled, which covers both the | 78 | * Check for bottom half being disabled, which covers both the |
79 | * CONFIG_PROVE_RCU and not cases. Note that if someone uses | 79 | * CONFIG_PROVE_RCU and not cases. Note that if someone uses |
80 | * rcu_read_lock_bh(), but then later enables BH, lockdep (if enabled) | 80 | * rcu_read_lock_bh(), but then later enables BH, lockdep (if enabled) |
81 | * will show the situation. | 81 | * will show the situation. This is useful for debug checks in functions |
82 | * that require that they be called within an RCU read-side critical | ||
83 | * section. | ||
82 | * | 84 | * |
83 | * Check debug_lockdep_rcu_enabled() to prevent false positives during boot. | 85 | * Check debug_lockdep_rcu_enabled() to prevent false positives during boot. |
84 | */ | 86 | */ |
@@ -86,7 +88,7 @@ int rcu_read_lock_bh_held(void) | |||
86 | { | 88 | { |
87 | if (!debug_lockdep_rcu_enabled()) | 89 | if (!debug_lockdep_rcu_enabled()) |
88 | return 1; | 90 | return 1; |
89 | return in_softirq(); | 91 | return in_softirq() || irqs_disabled(); |
90 | } | 92 | } |
91 | EXPORT_SYMBOL_GPL(rcu_read_lock_bh_held); | 93 | EXPORT_SYMBOL_GPL(rcu_read_lock_bh_held); |
92 | 94 | ||
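The widened return condition and the expanded kerneldoc point at the intended use: functions that must run inside an RCU-bh read-side critical section can assert it, and lockdep-checked dereferences can name it as their justification. A hedged example (my_table/my_entry are illustrative types, not kernel ones):

    struct my_table {
            struct my_entry __rcu *first;
    };

    static struct my_entry *first_entry(struct my_table *t)
    {
            /* Complain once if the caller forgot rcu_read_lock_bh().  Note the
             * check only bites with CONFIG_PROVE_RCU; otherwise the helper
             * returns 1 unconditionally, as the code above shows. */
            WARN_ON_ONCE(!rcu_read_lock_bh_held());

            return rcu_dereference_check(t->first, rcu_read_lock_bh_held());
    }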
@@ -114,3 +116,163 @@ int rcu_my_thread_group_empty(void) | |||
114 | } | 116 | } |
115 | EXPORT_SYMBOL_GPL(rcu_my_thread_group_empty); | 117 | EXPORT_SYMBOL_GPL(rcu_my_thread_group_empty); |
116 | #endif /* #ifdef CONFIG_PROVE_RCU */ | 118 | #endif /* #ifdef CONFIG_PROVE_RCU */ |
119 | |||
120 | #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD | ||
121 | static inline void debug_init_rcu_head(struct rcu_head *head) | ||
122 | { | ||
123 | debug_object_init(head, &rcuhead_debug_descr); | ||
124 | } | ||
125 | |||
126 | static inline void debug_rcu_head_free(struct rcu_head *head) | ||
127 | { | ||
128 | debug_object_free(head, &rcuhead_debug_descr); | ||
129 | } | ||
130 | |||
131 | /* | ||
132 | * fixup_init is called when: | ||
133 | * - an active object is initialized | ||
134 | */ | ||
135 | static int rcuhead_fixup_init(void *addr, enum debug_obj_state state) | ||
136 | { | ||
137 | struct rcu_head *head = addr; | ||
138 | |||
139 | switch (state) { | ||
140 | case ODEBUG_STATE_ACTIVE: | ||
141 | /* | ||
142 | * Ensure that queued callbacks are all executed. | ||
143 | * If we detect that we are nested in a RCU read-side critical | ||
144 | * section, we should simply fail, otherwise we would deadlock. | ||
145 | */ | ||
146 | if (rcu_preempt_depth() != 0 || preempt_count() != 0 || | ||
147 | irqs_disabled()) { | ||
148 | WARN_ON(1); | ||
149 | return 0; | ||
150 | } | ||
151 | rcu_barrier(); | ||
152 | rcu_barrier_sched(); | ||
153 | rcu_barrier_bh(); | ||
154 | debug_object_init(head, &rcuhead_debug_descr); | ||
155 | return 1; | ||
156 | default: | ||
157 | return 0; | ||
158 | } | ||
159 | } | ||
160 | |||
161 | /* | ||
162 | * fixup_activate is called when: | ||
163 | * - an active object is activated | ||
164 | * - an unknown object is activated (might be a statically initialized object) | ||
165 | * Activation is performed internally by call_rcu(). | ||
166 | */ | ||
167 | static int rcuhead_fixup_activate(void *addr, enum debug_obj_state state) | ||
168 | { | ||
169 | struct rcu_head *head = addr; | ||
170 | |||
171 | switch (state) { | ||
172 | |||
173 | case ODEBUG_STATE_NOTAVAILABLE: | ||
174 | /* | ||
175 | * This is not really a fixup. We just make sure that it is | ||
176 | * tracked in the object tracker. | ||
177 | */ | ||
178 | debug_object_init(head, &rcuhead_debug_descr); | ||
179 | debug_object_activate(head, &rcuhead_debug_descr); | ||
180 | return 0; | ||
181 | |||
182 | case ODEBUG_STATE_ACTIVE: | ||
183 | /* | ||
184 | * Ensure that queued callbacks are all executed. | ||
185 | * If we detect that we are nested in a RCU read-side critical | ||
186 | * section, we should simply fail, otherwise we would deadlock. | ||
187 | */ | ||
188 | if (rcu_preempt_depth() != 0 || preempt_count() != 0 || | ||
189 | irqs_disabled()) { | ||
190 | WARN_ON(1); | ||
191 | return 0; | ||
192 | } | ||
193 | rcu_barrier(); | ||
194 | rcu_barrier_sched(); | ||
195 | rcu_barrier_bh(); | ||
196 | debug_object_activate(head, &rcuhead_debug_descr); | ||
197 | return 1; | ||
198 | default: | ||
199 | return 0; | ||
200 | } | ||
201 | } | ||
202 | |||
203 | /* | ||
204 | * fixup_free is called when: | ||
205 | * - an active object is freed | ||
206 | */ | ||
207 | static int rcuhead_fixup_free(void *addr, enum debug_obj_state state) | ||
208 | { | ||
209 | struct rcu_head *head = addr; | ||
210 | |||
211 | switch (state) { | ||
212 | case ODEBUG_STATE_ACTIVE: | ||
213 | /* | ||
214 | * Ensure that queued callbacks are all executed. | ||
215 | * If we detect that we are nested in a RCU read-side critical | ||
216 | * section, we should simply fail, otherwise we would deadlock. | ||
217 | */ | ||
218 | #ifndef CONFIG_PREEMPT | ||
219 | WARN_ON(1); | ||
220 | return 0; | ||
221 | #else | ||
222 | if (rcu_preempt_depth() != 0 || preempt_count() != 0 || | ||
223 | irqs_disabled()) { | ||
224 | WARN_ON(1); | ||
225 | return 0; | ||
226 | } | ||
227 | rcu_barrier(); | ||
228 | rcu_barrier_sched(); | ||
229 | rcu_barrier_bh(); | ||
230 | debug_object_free(head, &rcuhead_debug_descr); | ||
231 | return 1; | ||
232 | #endif | ||
233 | default: | ||
234 | return 0; | ||
235 | } | ||
236 | } | ||
237 | |||
238 | /** | ||
239 | * init_rcu_head_on_stack() - initialize on-stack rcu_head for debugobjects | ||
240 | * @head: pointer to rcu_head structure to be initialized | ||
241 | * | ||
242 | * This function informs debugobjects of a new rcu_head structure that | ||
243 | * has been allocated as an auto variable on the stack. This function | ||
244 | * is not required for rcu_head structures that are statically defined or | ||
245 | * that are dynamically allocated on the heap. This function has no | ||
246 | * effect for !CONFIG_DEBUG_OBJECTS_RCU_HEAD kernel builds. | ||
247 | */ | ||
248 | void init_rcu_head_on_stack(struct rcu_head *head) | ||
249 | { | ||
250 | debug_object_init_on_stack(head, &rcuhead_debug_descr); | ||
251 | } | ||
252 | EXPORT_SYMBOL_GPL(init_rcu_head_on_stack); | ||
253 | |||
254 | /** | ||
255 | * destroy_rcu_head_on_stack() - destroy on-stack rcu_head for debugobjects | ||
256 | * @head: pointer to rcu_head structure to be initialized | ||
257 | * | ||
258 | * This function informs debugobjects that an on-stack rcu_head structure | ||
259 | * is about to go out of scope. As with init_rcu_head_on_stack(), this | ||
260 | * function is not required for rcu_head structures that are statically | ||
261 | * defined or that are dynamically allocated on the heap. Also as with | ||
262 | * init_rcu_head_on_stack(), this function has no effect for | ||
263 | * !CONFIG_DEBUG_OBJECTS_RCU_HEAD kernel builds. | ||
264 | */ | ||
265 | void destroy_rcu_head_on_stack(struct rcu_head *head) | ||
266 | { | ||
267 | debug_object_free(head, &rcuhead_debug_descr); | ||
268 | } | ||
269 | EXPORT_SYMBOL_GPL(destroy_rcu_head_on_stack); | ||
270 | |||
271 | struct debug_obj_descr rcuhead_debug_descr = { | ||
272 | .name = "rcu_head", | ||
273 | .fixup_init = rcuhead_fixup_init, | ||
274 | .fixup_activate = rcuhead_fixup_activate, | ||
275 | .fixup_free = rcuhead_fixup_free, | ||
276 | }; | ||
277 | EXPORT_SYMBOL_GPL(rcuhead_debug_descr); | ||
278 | #endif /* #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD */ | ||
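The on-stack helpers exist for exactly the pattern the old rcutiny rcu_barrier() used (visible, being removed, further down): an rcu_head placed on the stack, a callback that completes a completion, a synchronous wait, then the debugobjects tracking torn down before the variable goes out of scope. Hedged sketch (struct sync_wait is an illustrative stand-in for the kernel's rcu_synchronize):

    struct sync_wait {
            struct rcu_head head;
            struct completion done;
    };

    static void sync_wait_cb(struct rcu_head *head)
    {
            complete(&container_of(head, struct sync_wait, head)->done);
    }

    static void wait_one_grace_period_sketch(void)
    {
            struct sync_wait w;

            init_rcu_head_on_stack(&w.head);        /* tell debugobjects it lives on the stack */
            init_completion(&w.done);
            call_rcu(&w.head, sync_wait_cb);        /* runs after a grace period elapses */
            wait_for_completion(&w.done);
            destroy_rcu_head_on_stack(&w.head);     /* about to go out of scope */
    }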
diff --git a/kernel/rcutiny.c b/kernel/rcutiny.c index 38729d3cd236..d806735342ac 100644 --- a/kernel/rcutiny.c +++ b/kernel/rcutiny.c | |||
@@ -59,6 +59,14 @@ int rcu_scheduler_active __read_mostly; | |||
59 | EXPORT_SYMBOL_GPL(rcu_scheduler_active); | 59 | EXPORT_SYMBOL_GPL(rcu_scheduler_active); |
60 | #endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */ | 60 | #endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */ |
61 | 61 | ||
62 | /* Forward declarations for rcutiny_plugin.h. */ | ||
63 | static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp); | ||
64 | static void __call_rcu(struct rcu_head *head, | ||
65 | void (*func)(struct rcu_head *rcu), | ||
66 | struct rcu_ctrlblk *rcp); | ||
67 | |||
68 | #include "rcutiny_plugin.h" | ||
69 | |||
62 | #ifdef CONFIG_NO_HZ | 70 | #ifdef CONFIG_NO_HZ |
63 | 71 | ||
64 | static long rcu_dynticks_nesting = 1; | 72 | static long rcu_dynticks_nesting = 1; |
@@ -140,6 +148,7 @@ void rcu_check_callbacks(int cpu, int user) | |||
140 | rcu_sched_qs(cpu); | 148 | rcu_sched_qs(cpu); |
141 | else if (!in_softirq()) | 149 | else if (!in_softirq()) |
142 | rcu_bh_qs(cpu); | 150 | rcu_bh_qs(cpu); |
151 | rcu_preempt_check_callbacks(); | ||
143 | } | 152 | } |
144 | 153 | ||
145 | /* | 154 | /* |
@@ -162,6 +171,7 @@ static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp) | |||
162 | *rcp->donetail = NULL; | 171 | *rcp->donetail = NULL; |
163 | if (rcp->curtail == rcp->donetail) | 172 | if (rcp->curtail == rcp->donetail) |
164 | rcp->curtail = &rcp->rcucblist; | 173 | rcp->curtail = &rcp->rcucblist; |
174 | rcu_preempt_remove_callbacks(rcp); | ||
165 | rcp->donetail = &rcp->rcucblist; | 175 | rcp->donetail = &rcp->rcucblist; |
166 | local_irq_restore(flags); | 176 | local_irq_restore(flags); |
167 | 177 | ||
@@ -169,6 +179,7 @@ static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp) | |||
169 | while (list) { | 179 | while (list) { |
170 | next = list->next; | 180 | next = list->next; |
171 | prefetch(next); | 181 | prefetch(next); |
182 | debug_rcu_head_unqueue(list); | ||
172 | list->func(list); | 183 | list->func(list); |
173 | list = next; | 184 | list = next; |
174 | } | 185 | } |
@@ -181,6 +192,7 @@ static void rcu_process_callbacks(struct softirq_action *unused) | |||
181 | { | 192 | { |
182 | __rcu_process_callbacks(&rcu_sched_ctrlblk); | 193 | __rcu_process_callbacks(&rcu_sched_ctrlblk); |
183 | __rcu_process_callbacks(&rcu_bh_ctrlblk); | 194 | __rcu_process_callbacks(&rcu_bh_ctrlblk); |
195 | rcu_preempt_process_callbacks(); | ||
184 | } | 196 | } |
185 | 197 | ||
186 | /* | 198 | /* |
@@ -211,6 +223,7 @@ static void __call_rcu(struct rcu_head *head, | |||
211 | { | 223 | { |
212 | unsigned long flags; | 224 | unsigned long flags; |
213 | 225 | ||
226 | debug_rcu_head_queue(head); | ||
214 | head->func = func; | 227 | head->func = func; |
215 | head->next = NULL; | 228 | head->next = NULL; |
216 | 229 | ||
@@ -221,15 +234,15 @@ static void __call_rcu(struct rcu_head *head, | |||
221 | } | 234 | } |
222 | 235 | ||
223 | /* | 236 | /* |
224 | * Post an RCU callback to be invoked after the end of an RCU grace | 237 | * Post an RCU callback to be invoked after the end of an RCU-sched grace |
225 | * period. But since we have but one CPU, that would be after any | 238 | * period. But since we have but one CPU, that would be after any |
226 | * quiescent state. | 239 | * quiescent state. |
227 | */ | 240 | */ |
228 | void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) | 241 | void call_rcu_sched(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) |
229 | { | 242 | { |
230 | __call_rcu(head, func, &rcu_sched_ctrlblk); | 243 | __call_rcu(head, func, &rcu_sched_ctrlblk); |
231 | } | 244 | } |
232 | EXPORT_SYMBOL_GPL(call_rcu); | 245 | EXPORT_SYMBOL_GPL(call_rcu_sched); |
233 | 246 | ||
234 | /* | 247 | /* |
235 | * Post an RCU bottom-half callback to be invoked after any subsequent | 248 | * Post an RCU bottom-half callback to be invoked after any subsequent |
@@ -241,20 +254,6 @@ void call_rcu_bh(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) | |||
241 | } | 254 | } |
242 | EXPORT_SYMBOL_GPL(call_rcu_bh); | 255 | EXPORT_SYMBOL_GPL(call_rcu_bh); |
243 | 256 | ||
244 | void rcu_barrier(void) | ||
245 | { | ||
246 | struct rcu_synchronize rcu; | ||
247 | |||
248 | init_rcu_head_on_stack(&rcu.head); | ||
249 | init_completion(&rcu.completion); | ||
250 | /* Will wake me after RCU finished. */ | ||
251 | call_rcu(&rcu.head, wakeme_after_rcu); | ||
252 | /* Wait for it. */ | ||
253 | wait_for_completion(&rcu.completion); | ||
254 | destroy_rcu_head_on_stack(&rcu.head); | ||
255 | } | ||
256 | EXPORT_SYMBOL_GPL(rcu_barrier); | ||
257 | |||
258 | void rcu_barrier_bh(void) | 257 | void rcu_barrier_bh(void) |
259 | { | 258 | { |
260 | struct rcu_synchronize rcu; | 259 | struct rcu_synchronize rcu; |
@@ -287,5 +286,3 @@ void __init rcu_init(void) | |||
287 | { | 286 | { |
288 | open_softirq(RCU_SOFTIRQ, rcu_process_callbacks); | 287 | open_softirq(RCU_SOFTIRQ, rcu_process_callbacks); |
289 | } | 288 | } |
290 | |||
291 | #include "rcutiny_plugin.h" | ||
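With the rename, the tiny flavour exports call_rcu_sched() directly; the plain call_rcu() entry point is presumably supplied elsewhere (by the preemptible plugin, or as an alias, depending on configuration). Callers that only need "free this once every CPU has scheduled" keep the usual shape; hedged example (struct item and its helpers are illustrative):

    struct item {
            struct list_head list;
            struct rcu_head rcu;
    };

    static void item_free_rcu(struct rcu_head *head)
    {
            kfree(container_of(head, struct item, rcu));
    }

    static void item_del(struct item *it)
    {
            list_del_rcu(&it->list);                 /* unpublish under the writer's lock */
            call_rcu_sched(&it->rcu, item_free_rcu); /* free once readers are done */
    }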
diff --git a/kernel/rcutiny_plugin.h b/kernel/rcutiny_plugin.h index d223a92bc742..6ceca4f745ff 100644 --- a/kernel/rcutiny_plugin.h +++ b/kernel/rcutiny_plugin.h | |||
@@ -1,7 +1,7 @@ | |||
1 | /* | 1 | /* |
2 | * Read-Copy Update mechanism for mutual exclusion (tree-based version) | 2 | * Read-Copy Update mechanism for mutual exclusion, the Bloatwatch edition |
3 | * Internal non-public definitions that provide either classic | 3 | * Internal non-public definitions that provide either classic |
4 | * or preemptable semantics. | 4 | * or preemptible semantics. |
5 | * | 5 | * |
6 | * This program is free software; you can redistribute it and/or modify | 6 | * This program is free software; you can redistribute it and/or modify |
7 | * it under the terms of the GNU General Public License as published by | 7 | * it under the terms of the GNU General Public License as published by |
@@ -17,11 +17,587 @@ | |||
17 | * along with this program; if not, write to the Free Software | 17 | * along with this program; if not, write to the Free Software |
18 | * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. | 18 | * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. |
19 | * | 19 | * |
20 | * Copyright IBM Corporation, 2009 | 20 | * Copyright (c) 2010 Linaro |
21 | * | 21 | * |
22 | * Author: Paul E. McKenney <paulmck@linux.vnet.ibm.com> | 22 | * Author: Paul E. McKenney <paulmck@linux.vnet.ibm.com> |
23 | */ | 23 | */ |
24 | 24 | ||
25 | #ifdef CONFIG_TINY_PREEMPT_RCU | ||
26 | |||
27 | #include <linux/delay.h> | ||
28 | |||
29 | /* Global control variables for preemptible RCU. */ | ||
30 | struct rcu_preempt_ctrlblk { | ||
31 | struct rcu_ctrlblk rcb; /* curtail: ->next ptr of last CB for GP. */ | ||
32 | struct rcu_head **nexttail; | ||
33 | /* Tasks blocked in a preemptible RCU */ | ||
34 | /* read-side critical section while a */ | ||
35 | /* preemptible-RCU grace period is in */ | ||
36 | /* progress must wait for a later grace */ | ||
37 | /* period. This pointer points to the */ | ||
38 | /* ->next pointer of the last task that */ | ||
39 | /* must wait for a later grace period, or */ | ||
40 | /* to &->rcb.rcucblist if there is no */ | ||
41 | /* such task. */ | ||
42 | struct list_head blkd_tasks; | ||
43 | /* Tasks blocked in RCU read-side critical */ | ||
44 | /* section. Tasks are placed at the head */ | ||
45 | /* of this list and age towards the tail. */ | ||
46 | struct list_head *gp_tasks; | ||
47 | /* Pointer to the first task blocking the */ | ||
48 | /* current grace period, or NULL if there */ | ||
49 | /* is no such task. */ | ||
50 | struct list_head *exp_tasks; | ||
51 | /* Pointer to first task blocking the */ | ||
52 | /* current expedited grace period, or NULL */ | ||
53 | /* if there is no such task. If there */ | ||
54 | /* is no current expedited grace period, */ | ||
55 | /* then there cannot be any such task. */ | ||
56 | u8 gpnum; /* Current grace period. */ | ||
57 | u8 gpcpu; /* Last grace period blocked by the CPU. */ | ||
58 | u8 completed; /* Last grace period completed. */ | ||
59 | /* If all three are equal, RCU is idle. */ | ||
60 | }; | ||
61 | |||
62 | static struct rcu_preempt_ctrlblk rcu_preempt_ctrlblk = { | ||
63 | .rcb.donetail = &rcu_preempt_ctrlblk.rcb.rcucblist, | ||
64 | .rcb.curtail = &rcu_preempt_ctrlblk.rcb.rcucblist, | ||
65 | .nexttail = &rcu_preempt_ctrlblk.rcb.rcucblist, | ||
66 | .blkd_tasks = LIST_HEAD_INIT(rcu_preempt_ctrlblk.blkd_tasks), | ||
67 | }; | ||
68 | |||
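The control block above keeps rcutiny's single, singly linked callback list, carved into segments by tail pointers; the preemptible flavour simply adds a third segment for callbacks that must wait for the next grace period. A hedged sketch of the invariant and of how a finished grace period advances it (mirrors rcu_preempt_cpu_qs() below):

    /*
     * Each tail pointer points at the ->next field of the last callback
     * in its segment, or at &rcucblist when the segment is empty:
     *
     *   rcucblist -> [ done ] -> [ waiting on current GP ] -> [ next GP ] -> NULL
     *                        ^donetail                     ^curtail        ^nexttail
     *
     * Advancing when the current grace period completes is only pointer
     * surgery:
     *
     *   rcb.donetail = rcb.curtail;    (current-GP callbacks are now done)
     *   rcb.curtail  = nexttail;       (next-GP callbacks start waiting)
     */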
69 | static int rcu_preempted_readers_exp(void); | ||
70 | static void rcu_report_exp_done(void); | ||
71 | |||
72 | /* | ||
73 | * Return true if the CPU has not yet responded to the current grace period. | ||
74 | */ | ||
75 | static int rcu_cpu_blocking_cur_gp(void) | ||
76 | { | ||
77 | return rcu_preempt_ctrlblk.gpcpu != rcu_preempt_ctrlblk.gpnum; | ||
78 | } | ||
79 | |||
80 | /* | ||
81 | * Check for a running RCU reader. Because there is only one CPU, | ||
82 | * there can be but one running RCU reader at a time. ;-) | ||
83 | */ | ||
84 | static int rcu_preempt_running_reader(void) | ||
85 | { | ||
86 | return current->rcu_read_lock_nesting; | ||
87 | } | ||
88 | |||
89 | /* | ||
90 | * Check for preempted RCU readers blocking any grace period. | ||
91 | * If the caller needs a reliable answer, it must disable hard irqs. | ||
92 | */ | ||
93 | static int rcu_preempt_blocked_readers_any(void) | ||
94 | { | ||
95 | return !list_empty(&rcu_preempt_ctrlblk.blkd_tasks); | ||
96 | } | ||
97 | |||
98 | /* | ||
99 | * Check for preempted RCU readers blocking the current grace period. | ||
100 | * If the caller needs a reliable answer, it must disable hard irqs. | ||
101 | */ | ||
102 | static int rcu_preempt_blocked_readers_cgp(void) | ||
103 | { | ||
104 | return rcu_preempt_ctrlblk.gp_tasks != NULL; | ||
105 | } | ||
106 | |||
107 | /* | ||
108 | * Return true if another preemptible-RCU grace period is needed. | ||
109 | */ | ||
110 | static int rcu_preempt_needs_another_gp(void) | ||
111 | { | ||
112 | return *rcu_preempt_ctrlblk.rcb.curtail != NULL; | ||
113 | } | ||
114 | |||
115 | /* | ||
116 | * Return true if a preemptible-RCU grace period is in progress. | ||
117 | * The caller must disable hardirqs. | ||
118 | */ | ||
119 | static int rcu_preempt_gp_in_progress(void) | ||
120 | { | ||
121 | return rcu_preempt_ctrlblk.completed != rcu_preempt_ctrlblk.gpnum; | ||
122 | } | ||
123 | |||
124 | /* | ||
125 | * Record a preemptible-RCU quiescent state for the specified CPU. Note | ||
126 | * that this just means that the task currently running on the CPU is | ||
127 | * in a quiescent state. There might be any number of tasks blocked | ||
128 | * while in an RCU read-side critical section. | ||
129 | * | ||
130 | * Unlike the other rcu_*_qs() functions, callers to this function | ||
131 | * must disable irqs in order to protect the assignment to | ||
132 | * ->rcu_read_unlock_special. | ||
133 | * | ||
134 | * Because this is a single-CPU implementation, the only way a grace | ||
135 | * period can end is if the CPU is in a quiescent state. The reason is | ||
136 | * that a blocked preemptible-RCU reader can exit its critical section | ||
137 | * only if the CPU is running it at the time. Therefore, when the | ||
138 | * last task blocking the current grace period exits its RCU read-side | ||
139 | * critical section, neither the CPU nor blocked tasks will be stopping | ||
140 | * the current grace period. (In contrast, SMP implementations | ||
141 | * might have CPUs running in RCU read-side critical sections that | ||
142 | * block later grace periods -- but this is not possible given only | ||
143 | * one CPU.) | ||
144 | */ | ||
145 | static void rcu_preempt_cpu_qs(void) | ||
146 | { | ||
147 | /* Record both CPU and task as having responded to current GP. */ | ||
148 | rcu_preempt_ctrlblk.gpcpu = rcu_preempt_ctrlblk.gpnum; | ||
149 | current->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_NEED_QS; | ||
150 | |||
151 | /* | ||
152 | * If there is no GP, or if blocked readers are still blocking GP, | ||
153 | * then there is nothing more to do. | ||
154 | */ | ||
155 | if (!rcu_preempt_gp_in_progress() || rcu_preempt_blocked_readers_cgp()) | ||
156 | return; | ||
157 | |||
158 | /* Advance callbacks. */ | ||
159 | rcu_preempt_ctrlblk.completed = rcu_preempt_ctrlblk.gpnum; | ||
160 | rcu_preempt_ctrlblk.rcb.donetail = rcu_preempt_ctrlblk.rcb.curtail; | ||
161 | rcu_preempt_ctrlblk.rcb.curtail = rcu_preempt_ctrlblk.nexttail; | ||
162 | |||
163 | /* If there are no blocked readers, next GP is done instantly. */ | ||
164 | if (!rcu_preempt_blocked_readers_any()) | ||
165 | rcu_preempt_ctrlblk.rcb.donetail = rcu_preempt_ctrlblk.nexttail; | ||
166 | |||
167 | /* If there are done callbacks, make RCU_SOFTIRQ process them. */ | ||
168 | if (*rcu_preempt_ctrlblk.rcb.donetail != NULL) | ||
169 | raise_softirq(RCU_SOFTIRQ); | ||
170 | } | ||
171 | |||
172 | /* | ||
173 | * Start a new RCU grace period if warranted. Hard irqs must be disabled. | ||
174 | */ | ||
175 | static void rcu_preempt_start_gp(void) | ||
176 | { | ||
177 | if (!rcu_preempt_gp_in_progress() && rcu_preempt_needs_another_gp()) { | ||
178 | |||
179 | /* Official start of GP. */ | ||
180 | rcu_preempt_ctrlblk.gpnum++; | ||
181 | |||
182 | /* Any blocked RCU readers block new GP. */ | ||
183 | if (rcu_preempt_blocked_readers_any()) | ||
184 | rcu_preempt_ctrlblk.gp_tasks = | ||
185 | rcu_preempt_ctrlblk.blkd_tasks.next; | ||
186 | |||
187 | /* If there is no running reader, CPU is done with GP. */ | ||
188 | if (!rcu_preempt_running_reader()) | ||
189 | rcu_preempt_cpu_qs(); | ||
190 | } | ||
191 | } | ||
192 | |||
193 | /* | ||
194 | * We have entered the scheduler, and the current task might soon be | ||
195 | * context-switched away from. If this task is in an RCU read-side | ||
196 | * critical section, we will no longer be able to rely on the CPU to | ||
197 | * record that fact, so we enqueue the task on the blkd_tasks list. | ||
198 | * If the task started after the current grace period began, as recorded | ||
199 | * by ->gpcpu, we enqueue it at the beginning of the list. Otherwise, we | ||
200 | * enqueue it before the element referenced by ->gp_tasks (or at the tail if | ||
201 | * ->gp_tasks is NULL) and point ->gp_tasks at the newly added element. | ||
202 | * The task will dequeue itself when it exits the outermost enclosing | ||
203 | * RCU read-side critical section. Therefore, the current grace period | ||
204 | * cannot be permitted to complete until the ->gp_tasks pointer becomes | ||
205 | * NULL. | ||
206 | * | ||
207 | * Caller must disable preemption. | ||
208 | */ | ||
209 | void rcu_preempt_note_context_switch(void) | ||
210 | { | ||
211 | struct task_struct *t = current; | ||
212 | unsigned long flags; | ||
213 | |||
214 | local_irq_save(flags); /* must exclude scheduler_tick(). */ | ||
215 | if (rcu_preempt_running_reader() && | ||
216 | (t->rcu_read_unlock_special & RCU_READ_UNLOCK_BLOCKED) == 0) { | ||
217 | |||
218 | /* Possibly blocking in an RCU read-side critical section. */ | ||
219 | t->rcu_read_unlock_special |= RCU_READ_UNLOCK_BLOCKED; | ||
220 | |||
221 | /* | ||
222 | * If this CPU has already checked in, then this task | ||
223 | * will hold up the next grace period rather than the | ||
224 | * current grace period. Queue the task accordingly. | ||
225 | * If the task is queued for the current grace period | ||
226 | * (i.e., this CPU has not yet passed through a quiescent | ||
227 | * state for the current grace period), then as long | ||
228 | * as that task remains queued, the current grace period | ||
229 | * cannot end. | ||
230 | */ | ||
231 | list_add(&t->rcu_node_entry, &rcu_preempt_ctrlblk.blkd_tasks); | ||
232 | if (rcu_cpu_blocking_cur_gp()) | ||
233 | rcu_preempt_ctrlblk.gp_tasks = &t->rcu_node_entry; | ||
234 | } | ||
235 | |||
236 | /* | ||
237 | * Either we were not in an RCU read-side critical section to | ||
238 | * begin with, or we have now recorded that critical section | ||
239 | * globally. Either way, we can now note a quiescent state | ||
240 | * for this CPU. Again, if we were in an RCU read-side critical | ||
241 | * section, and if that critical section was blocking the current | ||
242 | * grace period, then the fact that the task has been enqueued | ||
243 | * means that the current grace period continues to be blocked. | ||
244 | */ | ||
245 | rcu_preempt_cpu_qs(); | ||
246 | local_irq_restore(flags); | ||
247 | } | ||
248 | |||
249 | /* | ||
250 | * Tiny-preemptible RCU implementation for rcu_read_lock(). | ||
251 | * Just increment ->rcu_read_lock_nesting, shared state will be updated | ||
252 | * if we block. | ||
253 | */ | ||
254 | void __rcu_read_lock(void) | ||
255 | { | ||
256 | current->rcu_read_lock_nesting++; | ||
257 | barrier(); /* needed if we ever invoke rcu_read_lock in rcutiny.c */ | ||
258 | } | ||
259 | EXPORT_SYMBOL_GPL(__rcu_read_lock); | ||
260 | |||
261 | /* | ||
262 | * Handle special cases during rcu_read_unlock(), such as needing to | ||
263 | * notify RCU core processing or task having blocked during the RCU | ||
264 | * read-side critical section. | ||
265 | */ | ||
266 | static void rcu_read_unlock_special(struct task_struct *t) | ||
267 | { | ||
268 | int empty; | ||
269 | int empty_exp; | ||
270 | unsigned long flags; | ||
271 | struct list_head *np; | ||
272 | int special; | ||
273 | |||
274 | /* | ||
275 | * NMI handlers cannot block and cannot safely manipulate state. | ||
276 | * They therefore cannot possibly be special, so just leave. | ||
277 | */ | ||
278 | if (in_nmi()) | ||
279 | return; | ||
280 | |||
281 | local_irq_save(flags); | ||
282 | |||
283 | /* | ||
284 | * If RCU core is waiting for this CPU to exit its critical section, | ||
285 | * let it know that we have done so. | ||
286 | */ | ||
287 | special = t->rcu_read_unlock_special; | ||
288 | if (special & RCU_READ_UNLOCK_NEED_QS) | ||
289 | rcu_preempt_cpu_qs(); | ||
290 | |||
291 | /* Hardware IRQ handlers cannot block. */ | ||
292 | if (in_irq()) { | ||
293 | local_irq_restore(flags); | ||
294 | return; | ||
295 | } | ||
296 | |||
297 | /* Clean up if blocked during RCU read-side critical section. */ | ||
298 | if (special & RCU_READ_UNLOCK_BLOCKED) { | ||
299 | t->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_BLOCKED; | ||
300 | |||
301 | /* | ||
302 | * Remove this task from the ->blkd_tasks list and adjust | ||
303 | * any pointers that might have been referencing it. | ||
304 | */ | ||
305 | empty = !rcu_preempt_blocked_readers_cgp(); | ||
306 | empty_exp = rcu_preempt_ctrlblk.exp_tasks == NULL; | ||
307 | np = t->rcu_node_entry.next; | ||
308 | if (np == &rcu_preempt_ctrlblk.blkd_tasks) | ||
309 | np = NULL; | ||
310 | list_del(&t->rcu_node_entry); | ||
311 | if (&t->rcu_node_entry == rcu_preempt_ctrlblk.gp_tasks) | ||
312 | rcu_preempt_ctrlblk.gp_tasks = np; | ||
313 | if (&t->rcu_node_entry == rcu_preempt_ctrlblk.exp_tasks) | ||
314 | rcu_preempt_ctrlblk.exp_tasks = np; | ||
315 | INIT_LIST_HEAD(&t->rcu_node_entry); | ||
316 | |||
317 | /* | ||
318 | * If this was the last task on the current list, and if | ||
319 | * we aren't waiting on the CPU, report the quiescent state | ||
320 | * and start a new grace period if needed. | ||
321 | */ | ||
322 | if (!empty && !rcu_preempt_blocked_readers_cgp()) { | ||
323 | rcu_preempt_cpu_qs(); | ||
324 | rcu_preempt_start_gp(); | ||
325 | } | ||
326 | |||
327 | /* | ||
328 | * If this was the last task on the expedited lists, | ||
329 | * then we need to wake up the waiting task. | ||
330 | */ | ||
331 | if (!empty_exp && rcu_preempt_ctrlblk.exp_tasks == NULL) | ||
332 | rcu_report_exp_done(); | ||
333 | } | ||
334 | local_irq_restore(flags); | ||
335 | } | ||
336 | |||
337 | /* | ||
338 | * Tiny-preemptible RCU implementation for rcu_read_unlock(). | ||
339 | * Decrement ->rcu_read_lock_nesting. If the result is zero (outermost | ||
340 | * rcu_read_unlock()) and ->rcu_read_unlock_special is non-zero, then | ||
341 | * invoke rcu_read_unlock_special() to clean up after a context switch | ||
342 | * in an RCU read-side critical section and other special cases. | ||
343 | */ | ||
344 | void __rcu_read_unlock(void) | ||
345 | { | ||
346 | struct task_struct *t = current; | ||
347 | |||
348 | barrier(); /* needed if we ever invoke rcu_read_unlock in rcutiny.c */ | ||
349 | --t->rcu_read_lock_nesting; | ||
350 | barrier(); /* decrement before load of ->rcu_read_unlock_special */ | ||
351 | if (t->rcu_read_lock_nesting == 0 && | ||
352 | unlikely(ACCESS_ONCE(t->rcu_read_unlock_special))) | ||
353 | rcu_read_unlock_special(t); | ||
354 | #ifdef CONFIG_PROVE_LOCKING | ||
355 | WARN_ON_ONCE(t->rcu_read_lock_nesting < 0); | ||
356 | #endif /* #ifdef CONFIG_PROVE_LOCKING */ | ||
357 | } | ||
358 | EXPORT_SYMBOL_GPL(__rcu_read_unlock); | ||
359 | |||
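
For orientation, here is a minimal reader-side sketch of how these two primitives are normally paired with rcu_dereference(); the struct foo type and the gbl_foo pointer are hypothetical and not part of this file:

    #include <linux/rcupdate.h>

    struct foo {
            int a;
            struct rcu_head rcu;
    };

    static struct foo __rcu *gbl_foo;

    /* Reader: every dereference of gbl_foo stays inside the read-side section. */
    static int foo_read_a(void)
    {
            struct foo *p;
            int ret = -1;

            rcu_read_lock();                /* bumps ->rcu_read_lock_nesting */
            p = rcu_dereference(gbl_foo);   /* fetch the RCU-protected pointer */
            if (p)
                    ret = p->a;
            rcu_read_unlock();              /* may enter rcu_read_unlock_special() */
            return ret;
    }
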
360 | /* | ||
361 | * Check for a quiescent state from the current CPU. When a task blocks, | ||
362 | * the task is recorded in the rcu_preempt_ctrlblk structure, which is | ||
363 | * checked elsewhere. This is called from the scheduling-clock interrupt. | ||
364 | * | ||
365 | * Caller must disable hard irqs. | ||
366 | */ | ||
367 | static void rcu_preempt_check_callbacks(void) | ||
368 | { | ||
369 | struct task_struct *t = current; | ||
370 | |||
371 | if (rcu_preempt_gp_in_progress() && | ||
372 | (!rcu_preempt_running_reader() || | ||
373 | !rcu_cpu_blocking_cur_gp())) | ||
374 | rcu_preempt_cpu_qs(); | ||
375 | if (&rcu_preempt_ctrlblk.rcb.rcucblist != | ||
376 | rcu_preempt_ctrlblk.rcb.donetail) | ||
377 | raise_softirq(RCU_SOFTIRQ); | ||
378 | if (rcu_preempt_gp_in_progress() && | ||
379 | rcu_cpu_blocking_cur_gp() && | ||
380 | rcu_preempt_running_reader()) | ||
381 | t->rcu_read_unlock_special |= RCU_READ_UNLOCK_NEED_QS; | ||
382 | } | ||
383 | |||
384 | /* | ||
385 | * TINY_PREEMPT_RCU has an extra callback-list tail pointer to | ||
386 | * update, so this is invoked from __rcu_process_callbacks() to | ||
387 | * handle that case. Of course, it is invoked for all flavors of | ||
388 | * RCU, but RCU callbacks can appear only on one of the lists, and | ||
389 | * neither ->nexttail nor ->donetail can possibly be NULL, so there | ||
390 | * is no need for an explicit check. | ||
391 | */ | ||
392 | static void rcu_preempt_remove_callbacks(struct rcu_ctrlblk *rcp) | ||
393 | { | ||
394 | if (rcu_preempt_ctrlblk.nexttail == rcp->donetail) | ||
395 | rcu_preempt_ctrlblk.nexttail = &rcp->rcucblist; | ||
396 | } | ||
397 | |||
398 | /* | ||
399 | * Process callbacks for preemptible RCU. | ||
400 | */ | ||
401 | static void rcu_preempt_process_callbacks(void) | ||
402 | { | ||
403 | __rcu_process_callbacks(&rcu_preempt_ctrlblk.rcb); | ||
404 | } | ||
405 | |||
406 | /* | ||
406 | * Queue a preemptible-RCU callback for invocation after a grace period. | ||
408 | */ | ||
409 | void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) | ||
410 | { | ||
411 | unsigned long flags; | ||
412 | |||
413 | debug_rcu_head_queue(head); | ||
414 | head->func = func; | ||
415 | head->next = NULL; | ||
416 | |||
417 | local_irq_save(flags); | ||
418 | *rcu_preempt_ctrlblk.nexttail = head; | ||
419 | rcu_preempt_ctrlblk.nexttail = &head->next; | ||
420 | rcu_preempt_start_gp(); /* checks to see if GP needed. */ | ||
421 | local_irq_restore(flags); | ||
422 | } | ||
423 | EXPORT_SYMBOL_GPL(call_rcu); | ||
424 | |||
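
A hedged usage sketch of the deferred-free pattern call_rcu() enables; the list handling, struct names, and the foo_reclaim() callback are illustrative assumptions, not code from this patch:

    #include <linux/rculist.h>
    #include <linux/slab.h>

    struct foo {
            struct list_head list;
            struct rcu_head rcu;
    };

    static void foo_reclaim(struct rcu_head *head)
    {
            struct foo *p = container_of(head, struct foo, rcu);

            kfree(p);                       /* runs only after a full grace period */
    }

    static void foo_remove(struct foo *p)
    {
            list_del_rcu(&p->list);         /* unlink now; readers may still hold p */
            call_rcu(&p->rcu, foo_reclaim); /* defer the free past the grace period */
    }
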
425 | void rcu_barrier(void) | ||
426 | { | ||
427 | struct rcu_synchronize rcu; | ||
428 | |||
429 | init_rcu_head_on_stack(&rcu.head); | ||
430 | init_completion(&rcu.completion); | ||
431 | /* Will wake me after RCU finished. */ | ||
432 | call_rcu(&rcu.head, wakeme_after_rcu); | ||
433 | /* Wait for it. */ | ||
434 | wait_for_completion(&rcu.completion); | ||
435 | destroy_rcu_head_on_stack(&rcu.head); | ||
436 | } | ||
437 | EXPORT_SYMBOL_GPL(rcu_barrier); | ||
438 | |||
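
A common caller-side use of rcu_barrier() is draining outstanding callbacks on module unload before tearing down the memory they reference; the helper and cache names below are assumptions for illustration only:

    #include <linux/slab.h>

    extern void foo_unhash_all(void);       /* assumed helper: stops queueing callbacks */
    extern struct kmem_cache *foo_cache;    /* assumed cache backing RCU-freed objects */

    static void __exit foo_module_exit(void)
    {
            foo_unhash_all();               /* no new call_rcu() invocations after this */
            rcu_barrier();                  /* wait for already-queued callbacks to run */
            kmem_cache_destroy(foo_cache);  /* now safe: no callback can touch the cache */
    }
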
439 | /* | ||
440 | * synchronize_rcu - wait until a grace period has elapsed. | ||
441 | * | ||
442 | * Control will return to the caller some time after a full grace | ||
443 | * period has elapsed, in other words after all currently executing RCU | ||
444 | * read-side critical sections have completed. RCU read-side critical | ||
445 | * sections are delimited by rcu_read_lock() and rcu_read_unlock(), | ||
446 | * and may be nested. | ||
447 | */ | ||
448 | void synchronize_rcu(void) | ||
449 | { | ||
450 | #ifdef CONFIG_DEBUG_LOCK_ALLOC | ||
451 | if (!rcu_scheduler_active) | ||
452 | return; | ||
453 | #endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */ | ||
454 | |||
455 | WARN_ON_ONCE(rcu_preempt_running_reader()); | ||
456 | if (!rcu_preempt_blocked_readers_any()) | ||
457 | return; | ||
458 | |||
459 | /* Once we get past the fastpath checks, same code as rcu_barrier(). */ | ||
460 | rcu_barrier(); | ||
461 | } | ||
462 | EXPORT_SYMBOL_GPL(synchronize_rcu); | ||
463 | |||
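
For contrast with the reader sketch earlier, a hedged updater sketch of the publish-wait-free sequence built on synchronize_rcu(); it reuses the hypothetical struct foo and gbl_foo from above, and foo_lock is likewise an assumption:

    #include <linux/spinlock.h>
    #include <linux/slab.h>

    static DEFINE_SPINLOCK(foo_lock);

    static void foo_replace(struct foo *new_fp)
    {
            struct foo *old_fp;

            spin_lock(&foo_lock);
            old_fp = rcu_dereference_protected(gbl_foo,
                                               lockdep_is_held(&foo_lock));
            rcu_assign_pointer(gbl_foo, new_fp);    /* publish the new version */
            spin_unlock(&foo_lock);

            synchronize_rcu();      /* wait out all pre-existing readers */
            kfree(old_fp);          /* no reader can still reference old_fp */
    }
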
464 | static DECLARE_WAIT_QUEUE_HEAD(sync_rcu_preempt_exp_wq); | ||
465 | static unsigned long sync_rcu_preempt_exp_count; | ||
466 | static DEFINE_MUTEX(sync_rcu_preempt_exp_mutex); | ||
467 | |||
468 | /* | ||
469 | * Return non-zero if there are any tasks in RCU read-side critical | ||
470 | * sections blocking the current preemptible-RCU expedited grace period. | ||
471 | * If there is no preemptible-RCU expedited grace period currently in | ||
472 | * progress, returns zero unconditionally. | ||
473 | */ | ||
474 | static int rcu_preempted_readers_exp(void) | ||
475 | { | ||
476 | return rcu_preempt_ctrlblk.exp_tasks != NULL; | ||
477 | } | ||
478 | |||
479 | /* | ||
480 | * Report the exit from RCU read-side critical section for the last task | ||
481 | * that queued itself during or before the current expedited preemptible-RCU | ||
482 | * grace period. | ||
483 | */ | ||
484 | static void rcu_report_exp_done(void) | ||
485 | { | ||
486 | wake_up(&sync_rcu_preempt_exp_wq); | ||
487 | } | ||
488 | |||
489 | /* | ||
490 | * Wait for an rcu-preempt grace period, but expedite it. The basic idea | ||
491 | * is to rely on the fact that there is but one CPU, and that it is | ||
492 | * illegal for a task to invoke synchronize_rcu_expedited() while in a | ||
493 | * preemptible-RCU read-side critical section. Therefore, any such | ||
494 | * critical sections must correspond to blocked tasks, which must therefore | ||
495 | * be on the ->blkd_tasks list. So just record the current head of the | ||
496 | * list in the ->exp_tasks pointer, and wait for all tasks including and | ||
497 | * after the task pointed to by ->exp_tasks to drain. | ||
498 | */ | ||
499 | void synchronize_rcu_expedited(void) | ||
500 | { | ||
501 | unsigned long flags; | ||
502 | struct rcu_preempt_ctrlblk *rpcp = &rcu_preempt_ctrlblk; | ||
503 | unsigned long snap; | ||
504 | |||
505 | barrier(); /* ensure prior action seen before grace period. */ | ||
506 | |||
507 | WARN_ON_ONCE(rcu_preempt_running_reader()); | ||
508 | |||
509 | /* | ||
510 | * Acquire lock so that there is only one preemptible RCU grace | ||
511 | * period in flight. Of course, if someone does the expedited | ||
512 | * grace period for us while we are acquiring the lock, just leave. | ||
513 | */ | ||
514 | snap = sync_rcu_preempt_exp_count + 1; | ||
515 | mutex_lock(&sync_rcu_preempt_exp_mutex); | ||
516 | if (ULONG_CMP_LT(snap, sync_rcu_preempt_exp_count)) | ||
517 | goto unlock_mb_ret; /* Others did our work for us. */ | ||
518 | |||
519 | local_irq_save(flags); | ||
520 | |||
521 | /* | ||
522 | * All RCU readers have to already be on blkd_tasks because | ||
523 | * we cannot legally be executing in an RCU read-side critical | ||
524 | * section. | ||
525 | */ | ||
526 | |||
527 | /* Snapshot current head of ->blkd_tasks list. */ | ||
528 | rpcp->exp_tasks = rpcp->blkd_tasks.next; | ||
529 | if (rpcp->exp_tasks == &rpcp->blkd_tasks) | ||
530 | rpcp->exp_tasks = NULL; | ||
531 | local_irq_restore(flags); | ||
532 | |||
533 | /* Wait for tail of ->blkd_tasks list to drain. */ | ||
534 | if (rcu_preempted_readers_exp()) | ||
535 | wait_event(sync_rcu_preempt_exp_wq, | ||
536 | !rcu_preempted_readers_exp()); | ||
537 | |||
538 | /* Clean up and exit. */ | ||
539 | barrier(); /* ensure expedited GP seen before counter increment. */ | ||
540 | sync_rcu_preempt_exp_count++; | ||
541 | unlock_mb_ret: | ||
542 | mutex_unlock(&sync_rcu_preempt_exp_mutex); | ||
543 | barrier(); /* ensure subsequent action seen after grace period. */ | ||
544 | } | ||
545 | EXPORT_SYMBOL_GPL(synchronize_rcu_expedited); | ||
546 | |||
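
The "others did our work for us" early exit above depends on a wrap-safe unsigned comparison; the ULONG_CMP_LT() definition is shown being relocated later in this series, and a short worked example of why a counter snapshot survives wraparound follows:

    /* Same definition as elsewhere in this series; repeated only for the example. */
    #define ULONG_CMP_LT(a, b)      (ULONG_MAX / 2 < (a) - (b))

    /*
     * Snapshot taken just before the counter wraps:
     *      ULONG_CMP_LT(ULONG_MAX, 1UL)
     *              == (ULONG_MAX / 2 < ULONG_MAX - 1UL) == true
     * so a pre-wrap snapshot still compares as "earlier than" a post-wrap
     * counter value, and the early-exit test stays valid across wraparound.
     */
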
547 | /* | ||
548 | * Does preemptible RCU need the CPU to stay out of dynticks mode? | ||
549 | */ | ||
550 | int rcu_preempt_needs_cpu(void) | ||
551 | { | ||
552 | if (!rcu_preempt_running_reader()) | ||
553 | rcu_preempt_cpu_qs(); | ||
554 | return rcu_preempt_ctrlblk.rcb.rcucblist != NULL; | ||
555 | } | ||
556 | |||
557 | /* | ||
558 | * Check for a task exiting while in a preemptible-RCU read-side | ||
559 | * critical section; clean up if so. No need to issue warnings, | ||
560 | * as debug_check_no_locks_held() already does this if lockdep | ||
561 | * is enabled. | ||
562 | */ | ||
563 | void exit_rcu(void) | ||
564 | { | ||
565 | struct task_struct *t = current; | ||
566 | |||
567 | if (t->rcu_read_lock_nesting == 0) | ||
568 | return; | ||
569 | t->rcu_read_lock_nesting = 1; | ||
570 | rcu_read_unlock(); | ||
571 | } | ||
572 | |||
573 | #else /* #ifdef CONFIG_TINY_PREEMPT_RCU */ | ||
574 | |||
575 | /* | ||
576 | * Because preemptible RCU does not exist, it never has any callbacks | ||
577 | * to check. | ||
578 | */ | ||
579 | static void rcu_preempt_check_callbacks(void) | ||
580 | { | ||
581 | } | ||
582 | |||
583 | /* | ||
584 | * Because preemptible RCU does not exist, it never has any callbacks | ||
585 | * to remove. | ||
586 | */ | ||
587 | static void rcu_preempt_remove_callbacks(struct rcu_ctrlblk *rcp) | ||
588 | { | ||
589 | } | ||
590 | |||
591 | /* | ||
592 | * Because preemptible RCU does not exist, it never has any callbacks | ||
593 | * to process. | ||
594 | */ | ||
595 | static void rcu_preempt_process_callbacks(void) | ||
596 | { | ||
597 | } | ||
598 | |||
599 | #endif /* #else #ifdef CONFIG_TINY_PREEMPT_RCU */ | ||
600 | |||
25 | #ifdef CONFIG_DEBUG_LOCK_ALLOC | 601 | #ifdef CONFIG_DEBUG_LOCK_ALLOC |
26 | 602 | ||
27 | #include <linux/kernel_stat.h> | 603 | #include <linux/kernel_stat.h> |
diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c index 6535ac8bc6a5..9d8e8fb2515f 100644 --- a/kernel/rcutorture.c +++ b/kernel/rcutorture.c | |||
@@ -120,7 +120,7 @@ struct rcu_torture { | |||
120 | }; | 120 | }; |
121 | 121 | ||
122 | static LIST_HEAD(rcu_torture_freelist); | 122 | static LIST_HEAD(rcu_torture_freelist); |
123 | static struct rcu_torture *rcu_torture_current; | 123 | static struct rcu_torture __rcu *rcu_torture_current; |
124 | static long rcu_torture_current_version; | 124 | static long rcu_torture_current_version; |
125 | static struct rcu_torture rcu_tortures[10 * RCU_TORTURE_PIPE_LEN]; | 125 | static struct rcu_torture rcu_tortures[10 * RCU_TORTURE_PIPE_LEN]; |
126 | static DEFINE_SPINLOCK(rcu_torture_lock); | 126 | static DEFINE_SPINLOCK(rcu_torture_lock); |
@@ -153,8 +153,10 @@ int rcutorture_runnable = RCUTORTURE_RUNNABLE_INIT; | |||
153 | #define FULLSTOP_SHUTDOWN 1 /* System shutdown with rcutorture running. */ | 153 | #define FULLSTOP_SHUTDOWN 1 /* System shutdown with rcutorture running. */ |
154 | #define FULLSTOP_RMMOD 2 /* Normal rmmod of rcutorture. */ | 154 | #define FULLSTOP_RMMOD 2 /* Normal rmmod of rcutorture. */ |
155 | static int fullstop = FULLSTOP_RMMOD; | 155 | static int fullstop = FULLSTOP_RMMOD; |
156 | DEFINE_MUTEX(fullstop_mutex); /* Protect fullstop transitions and spawning */ | 156 | /* |
157 | /* of kthreads. */ | 157 | * Protect fullstop transitions and spawning of kthreads. |
158 | */ | ||
159 | static DEFINE_MUTEX(fullstop_mutex); | ||
158 | 160 | ||
159 | /* | 161 | /* |
160 | * Detect and respond to a system shutdown. | 162 | * Detect and respond to a system shutdown. |
@@ -239,8 +241,7 @@ static unsigned long | |||
239 | rcu_random(struct rcu_random_state *rrsp) | 241 | rcu_random(struct rcu_random_state *rrsp) |
240 | { | 242 | { |
241 | if (--rrsp->rrs_count < 0) { | 243 | if (--rrsp->rrs_count < 0) { |
242 | rrsp->rrs_state += | 244 | rrsp->rrs_state += (unsigned long)local_clock(); |
243 | (unsigned long)cpu_clock(raw_smp_processor_id()); | ||
244 | rrsp->rrs_count = RCU_RANDOM_REFRESH; | 245 | rrsp->rrs_count = RCU_RANDOM_REFRESH; |
245 | } | 246 | } |
246 | rrsp->rrs_state = rrsp->rrs_state * RCU_RANDOM_MULT + RCU_RANDOM_ADD; | 247 | rrsp->rrs_state = rrsp->rrs_state * RCU_RANDOM_MULT + RCU_RANDOM_ADD; |
@@ -304,6 +305,10 @@ static void rcu_read_delay(struct rcu_random_state *rrsp) | |||
304 | mdelay(longdelay_ms); | 305 | mdelay(longdelay_ms); |
305 | if (!(rcu_random(rrsp) % (nrealreaders * 2 * shortdelay_us))) | 306 | if (!(rcu_random(rrsp) % (nrealreaders * 2 * shortdelay_us))) |
306 | udelay(shortdelay_us); | 307 | udelay(shortdelay_us); |
308 | #ifdef CONFIG_PREEMPT | ||
309 | if (!preempt_count() && !(rcu_random(rrsp) % (nrealreaders * 20000))) | ||
310 | preempt_schedule(); /* No QS if preempt_disable() in effect */ | ||
311 | #endif | ||
307 | } | 312 | } |
308 | 313 | ||
309 | static void rcu_torture_read_unlock(int idx) __releases(RCU) | 314 | static void rcu_torture_read_unlock(int idx) __releases(RCU) |
@@ -537,6 +542,8 @@ static void srcu_read_delay(struct rcu_random_state *rrsp) | |||
537 | delay = rcu_random(rrsp) % (nrealreaders * 2 * longdelay * uspertick); | 542 | delay = rcu_random(rrsp) % (nrealreaders * 2 * longdelay * uspertick); |
538 | if (!delay) | 543 | if (!delay) |
539 | schedule_timeout_interruptible(longdelay); | 544 | schedule_timeout_interruptible(longdelay); |
545 | else | ||
546 | rcu_read_delay(rrsp); | ||
540 | } | 547 | } |
541 | 548 | ||
542 | static void srcu_torture_read_unlock(int idx) __releases(&srcu_ctl) | 549 | static void srcu_torture_read_unlock(int idx) __releases(&srcu_ctl) |
@@ -732,7 +739,8 @@ rcu_torture_writer(void *arg) | |||
732 | continue; | 739 | continue; |
733 | rp->rtort_pipe_count = 0; | 740 | rp->rtort_pipe_count = 0; |
734 | udelay(rcu_random(&rand) & 0x3ff); | 741 | udelay(rcu_random(&rand) & 0x3ff); |
735 | old_rp = rcu_torture_current; | 742 | old_rp = rcu_dereference_check(rcu_torture_current, |
743 | current == writer_task); | ||
736 | rp->rtort_mbtest = 1; | 744 | rp->rtort_mbtest = 1; |
737 | rcu_assign_pointer(rcu_torture_current, rp); | 745 | rcu_assign_pointer(rcu_torture_current, rp); |
738 | smp_wmb(); /* Mods to old_rp must follow rcu_assign_pointer() */ | 746 | smp_wmb(); /* Mods to old_rp must follow rcu_assign_pointer() */ |
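
The writer-side change just above swaps a plain load for rcu_dereference_check(), which takes a lockdep expression stating why the access is legal outside rcu_read_lock(). A hedged sketch of the general shape, with hypothetical names:

    #include <linux/rcupdate.h>
    #include <linux/spinlock.h>

    struct bar;                             /* hypothetical payload type */

    static DEFINE_SPINLOCK(bar_lock);
    static struct bar __rcu *bar_head;      /* hypothetical RCU-protected pointer */

    static struct bar *bar_peek(void)
    {
            /* Legal under rcu_read_lock() OR while holding bar_lock. */
            return rcu_dereference_check(bar_head, lockdep_is_held(&bar_lock));
    }
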
diff --git a/kernel/rcutree.c b/kernel/rcutree.c index d4437345706f..ccdc04c47981 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c | |||
@@ -143,6 +143,11 @@ module_param(blimit, int, 0); | |||
143 | module_param(qhimark, int, 0); | 143 | module_param(qhimark, int, 0); |
144 | module_param(qlowmark, int, 0); | 144 | module_param(qlowmark, int, 0); |
145 | 145 | ||
146 | #ifdef CONFIG_RCU_CPU_STALL_DETECTOR | ||
147 | int rcu_cpu_stall_suppress __read_mostly = RCU_CPU_STALL_SUPPRESS_INIT; | ||
148 | module_param(rcu_cpu_stall_suppress, int, 0644); | ||
149 | #endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ | ||
150 | |||
146 | static void force_quiescent_state(struct rcu_state *rsp, int relaxed); | 151 | static void force_quiescent_state(struct rcu_state *rsp, int relaxed); |
147 | static int rcu_pending(int cpu); | 152 | static int rcu_pending(int cpu); |
148 | 153 | ||
@@ -450,7 +455,7 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp) | |||
450 | 455 | ||
451 | #ifdef CONFIG_RCU_CPU_STALL_DETECTOR | 456 | #ifdef CONFIG_RCU_CPU_STALL_DETECTOR |
452 | 457 | ||
453 | int rcu_cpu_stall_panicking __read_mostly; | 458 | int rcu_cpu_stall_suppress __read_mostly; |
454 | 459 | ||
455 | static void record_gp_stall_check_time(struct rcu_state *rsp) | 460 | static void record_gp_stall_check_time(struct rcu_state *rsp) |
456 | { | 461 | { |
@@ -482,8 +487,11 @@ static void print_other_cpu_stall(struct rcu_state *rsp) | |||
482 | rcu_print_task_stall(rnp); | 487 | rcu_print_task_stall(rnp); |
483 | raw_spin_unlock_irqrestore(&rnp->lock, flags); | 488 | raw_spin_unlock_irqrestore(&rnp->lock, flags); |
484 | 489 | ||
485 | /* OK, time to rat on our buddy... */ | 490 | /* |
486 | 491 | * OK, time to rat on our buddy... | |
492 | * See Documentation/RCU/stallwarn.txt for info on how to debug | ||
493 | * RCU CPU stall warnings. | ||
494 | */ | ||
487 | printk(KERN_ERR "INFO: %s detected stalls on CPUs/tasks: {", | 495 | printk(KERN_ERR "INFO: %s detected stalls on CPUs/tasks: {", |
488 | rsp->name); | 496 | rsp->name); |
489 | rcu_for_each_leaf_node(rsp, rnp) { | 497 | rcu_for_each_leaf_node(rsp, rnp) { |
@@ -512,6 +520,11 @@ static void print_cpu_stall(struct rcu_state *rsp) | |||
512 | unsigned long flags; | 520 | unsigned long flags; |
513 | struct rcu_node *rnp = rcu_get_root(rsp); | 521 | struct rcu_node *rnp = rcu_get_root(rsp); |
514 | 522 | ||
523 | /* | ||
524 | * OK, time to rat on ourselves... | ||
525 | * See Documentation/RCU/stallwarn.txt for info on how to debug | ||
526 | * RCU CPU stall warnings. | ||
527 | */ | ||
515 | printk(KERN_ERR "INFO: %s detected stall on CPU %d (t=%lu jiffies)\n", | 528 | printk(KERN_ERR "INFO: %s detected stall on CPU %d (t=%lu jiffies)\n", |
516 | rsp->name, smp_processor_id(), jiffies - rsp->gp_start); | 529 | rsp->name, smp_processor_id(), jiffies - rsp->gp_start); |
517 | trigger_all_cpu_backtrace(); | 530 | trigger_all_cpu_backtrace(); |
@@ -530,11 +543,11 @@ static void check_cpu_stall(struct rcu_state *rsp, struct rcu_data *rdp) | |||
530 | long delta; | 543 | long delta; |
531 | struct rcu_node *rnp; | 544 | struct rcu_node *rnp; |
532 | 545 | ||
533 | if (rcu_cpu_stall_panicking) | 546 | if (rcu_cpu_stall_suppress) |
534 | return; | 547 | return; |
535 | delta = jiffies - rsp->jiffies_stall; | 548 | delta = jiffies - ACCESS_ONCE(rsp->jiffies_stall); |
536 | rnp = rdp->mynode; | 549 | rnp = rdp->mynode; |
537 | if ((rnp->qsmask & rdp->grpmask) && delta >= 0) { | 550 | if ((ACCESS_ONCE(rnp->qsmask) & rdp->grpmask) && delta >= 0) { |
538 | 551 | ||
539 | /* We haven't checked in, so go dump stack. */ | 552 | /* We haven't checked in, so go dump stack. */ |
540 | print_cpu_stall(rsp); | 553 | print_cpu_stall(rsp); |
@@ -548,10 +561,26 @@ static void check_cpu_stall(struct rcu_state *rsp, struct rcu_data *rdp) | |||
548 | 561 | ||
549 | static int rcu_panic(struct notifier_block *this, unsigned long ev, void *ptr) | 562 | static int rcu_panic(struct notifier_block *this, unsigned long ev, void *ptr) |
550 | { | 563 | { |
551 | rcu_cpu_stall_panicking = 1; | 564 | rcu_cpu_stall_suppress = 1; |
552 | return NOTIFY_DONE; | 565 | return NOTIFY_DONE; |
553 | } | 566 | } |
554 | 567 | ||
568 | /** | ||
569 | * rcu_cpu_stall_reset - prevent further stall warnings in current grace period | ||
570 | * | ||
571 | * Set the stall-warning timeout way off into the future, thus preventing | ||
572 | * any RCU CPU stall-warning messages from appearing in the current set of | ||
573 | * RCU grace periods. | ||
574 | * | ||
575 | * The caller must disable hard irqs. | ||
576 | */ | ||
577 | void rcu_cpu_stall_reset(void) | ||
578 | { | ||
579 | rcu_sched_state.jiffies_stall = jiffies + ULONG_MAX / 2; | ||
580 | rcu_bh_state.jiffies_stall = jiffies + ULONG_MAX / 2; | ||
581 | rcu_preempt_stall_reset(); | ||
582 | } | ||
583 | |||
555 | static struct notifier_block rcu_panic_block = { | 584 | static struct notifier_block rcu_panic_block = { |
556 | .notifier_call = rcu_panic, | 585 | .notifier_call = rcu_panic, |
557 | }; | 586 | }; |
@@ -571,6 +600,10 @@ static void check_cpu_stall(struct rcu_state *rsp, struct rcu_data *rdp) | |||
571 | { | 600 | { |
572 | } | 601 | } |
573 | 602 | ||
603 | void rcu_cpu_stall_reset(void) | ||
604 | { | ||
605 | } | ||
606 | |||
574 | static void __init check_cpu_stall_init(void) | 607 | static void __init check_cpu_stall_init(void) |
575 | { | 608 | { |
576 | } | 609 | } |
@@ -712,7 +745,7 @@ static void | |||
712 | rcu_start_gp(struct rcu_state *rsp, unsigned long flags) | 745 | rcu_start_gp(struct rcu_state *rsp, unsigned long flags) |
713 | __releases(rcu_get_root(rsp)->lock) | 746 | __releases(rcu_get_root(rsp)->lock) |
714 | { | 747 | { |
715 | struct rcu_data *rdp = rsp->rda[smp_processor_id()]; | 748 | struct rcu_data *rdp = this_cpu_ptr(rsp->rda); |
716 | struct rcu_node *rnp = rcu_get_root(rsp); | 749 | struct rcu_node *rnp = rcu_get_root(rsp); |
717 | 750 | ||
718 | if (!cpu_needs_another_gp(rsp, rdp) || rsp->fqs_active) { | 751 | if (!cpu_needs_another_gp(rsp, rdp) || rsp->fqs_active) { |
@@ -960,7 +993,7 @@ rcu_check_quiescent_state(struct rcu_state *rsp, struct rcu_data *rdp) | |||
960 | static void rcu_send_cbs_to_orphanage(struct rcu_state *rsp) | 993 | static void rcu_send_cbs_to_orphanage(struct rcu_state *rsp) |
961 | { | 994 | { |
962 | int i; | 995 | int i; |
963 | struct rcu_data *rdp = rsp->rda[smp_processor_id()]; | 996 | struct rcu_data *rdp = this_cpu_ptr(rsp->rda); |
964 | 997 | ||
965 | if (rdp->nxtlist == NULL) | 998 | if (rdp->nxtlist == NULL) |
966 | return; /* irqs disabled, so comparison is stable. */ | 999 | return; /* irqs disabled, so comparison is stable. */ |
@@ -971,6 +1004,7 @@ static void rcu_send_cbs_to_orphanage(struct rcu_state *rsp) | |||
971 | for (i = 0; i < RCU_NEXT_SIZE; i++) | 1004 | for (i = 0; i < RCU_NEXT_SIZE; i++) |
972 | rdp->nxttail[i] = &rdp->nxtlist; | 1005 | rdp->nxttail[i] = &rdp->nxtlist; |
973 | rsp->orphan_qlen += rdp->qlen; | 1006 | rsp->orphan_qlen += rdp->qlen; |
1007 | rdp->n_cbs_orphaned += rdp->qlen; | ||
974 | rdp->qlen = 0; | 1008 | rdp->qlen = 0; |
975 | raw_spin_unlock(&rsp->onofflock); /* irqs remain disabled. */ | 1009 | raw_spin_unlock(&rsp->onofflock); /* irqs remain disabled. */ |
976 | } | 1010 | } |
@@ -984,7 +1018,7 @@ static void rcu_adopt_orphan_cbs(struct rcu_state *rsp) | |||
984 | struct rcu_data *rdp; | 1018 | struct rcu_data *rdp; |
985 | 1019 | ||
986 | raw_spin_lock_irqsave(&rsp->onofflock, flags); | 1020 | raw_spin_lock_irqsave(&rsp->onofflock, flags); |
987 | rdp = rsp->rda[smp_processor_id()]; | 1021 | rdp = this_cpu_ptr(rsp->rda); |
988 | if (rsp->orphan_cbs_list == NULL) { | 1022 | if (rsp->orphan_cbs_list == NULL) { |
989 | raw_spin_unlock_irqrestore(&rsp->onofflock, flags); | 1023 | raw_spin_unlock_irqrestore(&rsp->onofflock, flags); |
990 | return; | 1024 | return; |
@@ -992,6 +1026,7 @@ static void rcu_adopt_orphan_cbs(struct rcu_state *rsp) | |||
992 | *rdp->nxttail[RCU_NEXT_TAIL] = rsp->orphan_cbs_list; | 1026 | *rdp->nxttail[RCU_NEXT_TAIL] = rsp->orphan_cbs_list; |
993 | rdp->nxttail[RCU_NEXT_TAIL] = rsp->orphan_cbs_tail; | 1027 | rdp->nxttail[RCU_NEXT_TAIL] = rsp->orphan_cbs_tail; |
994 | rdp->qlen += rsp->orphan_qlen; | 1028 | rdp->qlen += rsp->orphan_qlen; |
1029 | rdp->n_cbs_adopted += rsp->orphan_qlen; | ||
995 | rsp->orphan_cbs_list = NULL; | 1030 | rsp->orphan_cbs_list = NULL; |
996 | rsp->orphan_cbs_tail = &rsp->orphan_cbs_list; | 1031 | rsp->orphan_cbs_tail = &rsp->orphan_cbs_list; |
997 | rsp->orphan_qlen = 0; | 1032 | rsp->orphan_qlen = 0; |
@@ -1007,7 +1042,7 @@ static void __rcu_offline_cpu(int cpu, struct rcu_state *rsp) | |||
1007 | unsigned long flags; | 1042 | unsigned long flags; |
1008 | unsigned long mask; | 1043 | unsigned long mask; |
1009 | int need_report = 0; | 1044 | int need_report = 0; |
1010 | struct rcu_data *rdp = rsp->rda[cpu]; | 1045 | struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); |
1011 | struct rcu_node *rnp; | 1046 | struct rcu_node *rnp; |
1012 | 1047 | ||
1013 | /* Exclude any attempts to start a new grace period. */ | 1048 | /* Exclude any attempts to start a new grace period. */ |
@@ -1112,6 +1147,7 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp) | |||
1112 | while (list) { | 1147 | while (list) { |
1113 | next = list->next; | 1148 | next = list->next; |
1114 | prefetch(next); | 1149 | prefetch(next); |
1150 | debug_rcu_head_unqueue(list); | ||
1115 | list->func(list); | 1151 | list->func(list); |
1116 | list = next; | 1152 | list = next; |
1117 | if (++count >= rdp->blimit) | 1153 | if (++count >= rdp->blimit) |
@@ -1122,6 +1158,7 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp) | |||
1122 | 1158 | ||
1123 | /* Update count, and requeue any remaining callbacks. */ | 1159 | /* Update count, and requeue any remaining callbacks. */ |
1124 | rdp->qlen -= count; | 1160 | rdp->qlen -= count; |
1161 | rdp->n_cbs_invoked += count; | ||
1125 | if (list != NULL) { | 1162 | if (list != NULL) { |
1126 | *tail = rdp->nxtlist; | 1163 | *tail = rdp->nxtlist; |
1127 | rdp->nxtlist = list; | 1164 | rdp->nxtlist = list; |
@@ -1225,7 +1262,8 @@ static void force_qs_rnp(struct rcu_state *rsp, int (*f)(struct rcu_data *)) | |||
1225 | cpu = rnp->grplo; | 1262 | cpu = rnp->grplo; |
1226 | bit = 1; | 1263 | bit = 1; |
1227 | for (; cpu <= rnp->grphi; cpu++, bit <<= 1) { | 1264 | for (; cpu <= rnp->grphi; cpu++, bit <<= 1) { |
1228 | if ((rnp->qsmask & bit) != 0 && f(rsp->rda[cpu])) | 1265 | if ((rnp->qsmask & bit) != 0 && |
1266 | f(per_cpu_ptr(rsp->rda, cpu))) | ||
1229 | mask |= bit; | 1267 | mask |= bit; |
1230 | } | 1268 | } |
1231 | if (mask != 0) { | 1269 | if (mask != 0) { |
@@ -1388,6 +1426,7 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu), | |||
1388 | unsigned long flags; | 1426 | unsigned long flags; |
1389 | struct rcu_data *rdp; | 1427 | struct rcu_data *rdp; |
1390 | 1428 | ||
1429 | debug_rcu_head_queue(head); | ||
1391 | head->func = func; | 1430 | head->func = func; |
1392 | head->next = NULL; | 1431 | head->next = NULL; |
1393 | 1432 | ||
@@ -1400,7 +1439,7 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu), | |||
1400 | * a quiescent state betweentimes. | 1439 | * a quiescent state betweentimes. |
1401 | */ | 1440 | */ |
1402 | local_irq_save(flags); | 1441 | local_irq_save(flags); |
1403 | rdp = rsp->rda[smp_processor_id()]; | 1442 | rdp = this_cpu_ptr(rsp->rda); |
1404 | rcu_process_gp_end(rsp, rdp); | 1443 | rcu_process_gp_end(rsp, rdp); |
1405 | check_for_new_grace_period(rsp, rdp); | 1444 | check_for_new_grace_period(rsp, rdp); |
1406 | 1445 | ||
@@ -1699,7 +1738,7 @@ rcu_boot_init_percpu_data(int cpu, struct rcu_state *rsp) | |||
1699 | { | 1738 | { |
1700 | unsigned long flags; | 1739 | unsigned long flags; |
1701 | int i; | 1740 | int i; |
1702 | struct rcu_data *rdp = rsp->rda[cpu]; | 1741 | struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); |
1703 | struct rcu_node *rnp = rcu_get_root(rsp); | 1742 | struct rcu_node *rnp = rcu_get_root(rsp); |
1704 | 1743 | ||
1705 | /* Set up local state, ensuring consistent view of global state. */ | 1744 | /* Set up local state, ensuring consistent view of global state. */ |
@@ -1727,7 +1766,7 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptable) | |||
1727 | { | 1766 | { |
1728 | unsigned long flags; | 1767 | unsigned long flags; |
1729 | unsigned long mask; | 1768 | unsigned long mask; |
1730 | struct rcu_data *rdp = rsp->rda[cpu]; | 1769 | struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); |
1731 | struct rcu_node *rnp = rcu_get_root(rsp); | 1770 | struct rcu_node *rnp = rcu_get_root(rsp); |
1732 | 1771 | ||
1733 | /* Set up local state, ensuring consistent view of global state. */ | 1772 | /* Set up local state, ensuring consistent view of global state. */ |
@@ -1863,7 +1902,8 @@ static void __init rcu_init_levelspread(struct rcu_state *rsp) | |||
1863 | /* | 1902 | /* |
1864 | * Helper function for rcu_init() that initializes one rcu_state structure. | 1903 | * Helper function for rcu_init() that initializes one rcu_state structure. |
1865 | */ | 1904 | */ |
1866 | static void __init rcu_init_one(struct rcu_state *rsp) | 1905 | static void __init rcu_init_one(struct rcu_state *rsp, |
1906 | struct rcu_data __percpu *rda) | ||
1867 | { | 1907 | { |
1868 | static char *buf[] = { "rcu_node_level_0", | 1908 | static char *buf[] = { "rcu_node_level_0", |
1869 | "rcu_node_level_1", | 1909 | "rcu_node_level_1", |
@@ -1916,37 +1956,23 @@ static void __init rcu_init_one(struct rcu_state *rsp) | |||
1916 | } | 1956 | } |
1917 | } | 1957 | } |
1918 | 1958 | ||
1959 | rsp->rda = rda; | ||
1919 | rnp = rsp->level[NUM_RCU_LVLS - 1]; | 1960 | rnp = rsp->level[NUM_RCU_LVLS - 1]; |
1920 | for_each_possible_cpu(i) { | 1961 | for_each_possible_cpu(i) { |
1921 | while (i > rnp->grphi) | 1962 | while (i > rnp->grphi) |
1922 | rnp++; | 1963 | rnp++; |
1923 | rsp->rda[i]->mynode = rnp; | 1964 | per_cpu_ptr(rsp->rda, i)->mynode = rnp; |
1924 | rcu_boot_init_percpu_data(i, rsp); | 1965 | rcu_boot_init_percpu_data(i, rsp); |
1925 | } | 1966 | } |
1926 | } | 1967 | } |
1927 | 1968 | ||
1928 | /* | ||
1929 | * Helper macro for __rcu_init() and __rcu_init_preempt(). To be used | ||
1930 | * nowhere else! Assigns leaf node pointers into each CPU's rcu_data | ||
1931 | * structure. | ||
1932 | */ | ||
1933 | #define RCU_INIT_FLAVOR(rsp, rcu_data) \ | ||
1934 | do { \ | ||
1935 | int i; \ | ||
1936 | \ | ||
1937 | for_each_possible_cpu(i) { \ | ||
1938 | (rsp)->rda[i] = &per_cpu(rcu_data, i); \ | ||
1939 | } \ | ||
1940 | rcu_init_one(rsp); \ | ||
1941 | } while (0) | ||
1942 | |||
1943 | void __init rcu_init(void) | 1969 | void __init rcu_init(void) |
1944 | { | 1970 | { |
1945 | int cpu; | 1971 | int cpu; |
1946 | 1972 | ||
1947 | rcu_bootup_announce(); | 1973 | rcu_bootup_announce(); |
1948 | RCU_INIT_FLAVOR(&rcu_sched_state, rcu_sched_data); | 1974 | rcu_init_one(&rcu_sched_state, &rcu_sched_data); |
1949 | RCU_INIT_FLAVOR(&rcu_bh_state, rcu_bh_data); | 1975 | rcu_init_one(&rcu_bh_state, &rcu_bh_data); |
1950 | __rcu_init_preempt(); | 1976 | __rcu_init_preempt(); |
1951 | open_softirq(RCU_SOFTIRQ, rcu_process_callbacks); | 1977 | open_softirq(RCU_SOFTIRQ, rcu_process_callbacks); |
1952 | 1978 | ||
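
The recurring change in this file replaces the rda[NR_CPUS] pointer array with a true per-CPU allocation addressed through per_cpu_ptr()/this_cpu_ptr(). A hedged stand-alone sketch of that pattern, with hypothetical names:

    #include <linux/percpu.h>

    struct my_stats {
            unsigned long events;
    };

    static struct my_stats __percpu *stats;

    static int __init my_stats_init(void)
    {
            stats = alloc_percpu(struct my_stats);
            return stats ? 0 : -ENOMEM;
    }

    static void my_stats_add(int cpu)
    {
            per_cpu_ptr(stats, cpu)->events++;      /* a specific CPU's instance */
    }

    static void my_stats_add_local(void)
    {
            /* Caller is assumed to have preemption disabled. */
            this_cpu_ptr(stats)->events++;          /* the running CPU's instance */
    }
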
diff --git a/kernel/rcutree.h b/kernel/rcutree.h index 14c040b18ed0..91d4170c5c13 100644 --- a/kernel/rcutree.h +++ b/kernel/rcutree.h | |||
@@ -202,6 +202,9 @@ struct rcu_data { | |||
202 | long qlen; /* # of queued callbacks */ | 202 | long qlen; /* # of queued callbacks */ |
203 | long qlen_last_fqs_check; | 203 | long qlen_last_fqs_check; |
204 | /* qlen at last check for QS forcing */ | 204 | /* qlen at last check for QS forcing */ |
205 | unsigned long n_cbs_invoked; /* count of RCU cbs invoked. */ | ||
206 | unsigned long n_cbs_orphaned; /* RCU cbs sent to orphanage. */ | ||
207 | unsigned long n_cbs_adopted; /* RCU cbs adopted from orphanage. */ | ||
205 | unsigned long n_force_qs_snap; | 208 | unsigned long n_force_qs_snap; |
206 | /* did other CPU force QS recently? */ | 209 | /* did other CPU force QS recently? */ |
207 | long blimit; /* Upper limit on a processed batch */ | 210 | long blimit; /* Upper limit on a processed batch */ |
@@ -254,19 +257,23 @@ struct rcu_data { | |||
254 | #define RCU_STALL_DELAY_DELTA 0 | 257 | #define RCU_STALL_DELAY_DELTA 0 |
255 | #endif | 258 | #endif |
256 | 259 | ||
257 | #define RCU_SECONDS_TILL_STALL_CHECK (10 * HZ + RCU_STALL_DELAY_DELTA) | 260 | #define RCU_SECONDS_TILL_STALL_CHECK (CONFIG_RCU_CPU_STALL_TIMEOUT * HZ + \ |
261 | RCU_STALL_DELAY_DELTA) | ||
258 | /* for rsp->jiffies_stall */ | 262 | /* for rsp->jiffies_stall */ |
259 | #define RCU_SECONDS_TILL_STALL_RECHECK (30 * HZ + RCU_STALL_DELAY_DELTA) | 263 | #define RCU_SECONDS_TILL_STALL_RECHECK (3 * RCU_SECONDS_TILL_STALL_CHECK + 30) |
260 | /* for rsp->jiffies_stall */ | 264 | /* for rsp->jiffies_stall */ |
261 | #define RCU_STALL_RAT_DELAY 2 /* Allow other CPUs time */ | 265 | #define RCU_STALL_RAT_DELAY 2 /* Allow other CPUs time */ |
262 | /* to take at least one */ | 266 | /* to take at least one */ |
263 | /* scheduling clock irq */ | 267 | /* scheduling clock irq */ |
264 | /* before ratting on them. */ | 268 | /* before ratting on them. */ |
265 | 269 | ||
266 | #endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ | 270 | #ifdef CONFIG_RCU_CPU_STALL_DETECTOR_RUNNABLE |
271 | #define RCU_CPU_STALL_SUPPRESS_INIT 0 | ||
272 | #else | ||
273 | #define RCU_CPU_STALL_SUPPRESS_INIT 1 | ||
274 | #endif | ||
267 | 275 | ||
268 | #define ULONG_CMP_GE(a, b) (ULONG_MAX / 2 >= (a) - (b)) | 276 | #endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ |
269 | #define ULONG_CMP_LT(a, b) (ULONG_MAX / 2 < (a) - (b)) | ||
270 | 277 | ||
271 | /* | 278 | /* |
272 | * RCU global state, including node hierarchy. This hierarchy is | 279 | * RCU global state, including node hierarchy. This hierarchy is |
@@ -283,7 +290,7 @@ struct rcu_state { | |||
283 | struct rcu_node *level[NUM_RCU_LVLS]; /* Hierarchy levels. */ | 290 | struct rcu_node *level[NUM_RCU_LVLS]; /* Hierarchy levels. */ |
284 | u32 levelcnt[MAX_RCU_LVLS + 1]; /* # nodes in each level. */ | 291 | u32 levelcnt[MAX_RCU_LVLS + 1]; /* # nodes in each level. */ |
285 | u8 levelspread[NUM_RCU_LVLS]; /* kids/node in each level. */ | 292 | u8 levelspread[NUM_RCU_LVLS]; /* kids/node in each level. */ |
286 | struct rcu_data *rda[NR_CPUS]; /* array of rdp pointers. */ | 293 | struct rcu_data __percpu *rda; /* pointer to per-CPU rcu_data. */
287 | 294 | ||
288 | /* The following fields are guarded by the root rcu_node's lock. */ | 295 | /* The following fields are guarded by the root rcu_node's lock. */ |
289 | 296 | ||
@@ -365,6 +372,7 @@ static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp, | |||
365 | #ifdef CONFIG_RCU_CPU_STALL_DETECTOR | 372 | #ifdef CONFIG_RCU_CPU_STALL_DETECTOR |
366 | static void rcu_print_detail_task_stall(struct rcu_state *rsp); | 373 | static void rcu_print_detail_task_stall(struct rcu_state *rsp); |
367 | static void rcu_print_task_stall(struct rcu_node *rnp); | 374 | static void rcu_print_task_stall(struct rcu_node *rnp); |
375 | static void rcu_preempt_stall_reset(void); | ||
368 | #endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ | 376 | #endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ |
369 | static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp); | 377 | static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp); |
370 | #ifdef CONFIG_HOTPLUG_CPU | 378 | #ifdef CONFIG_HOTPLUG_CPU |
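
To make the new stall-timeout macros concrete, a worked example under stated assumptions (CONFIG_RCU_CPU_STALL_TIMEOUT left at what is believed to be its default of 60, HZ=1000, and RCU_STALL_DELAY_DELTA=0 because CONFIG_PROVE_RCU is off); these values are illustrative, not mandated by the patch:

    /* RCU_SECONDS_TILL_STALL_CHECK   = 60 * 1000 + 0  =  60000 jiffies (~60 s)  */
    /* RCU_SECONDS_TILL_STALL_RECHECK = 3 * 60000 + 30 = 180030 jiffies (~180 s) */

So the first warning for a given grace period fires after roughly the configured timeout, and repeats about three timeouts later if the stall persists.
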
diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h index 0e4f420245d9..71a4147473f9 100644 --- a/kernel/rcutree_plugin.h +++ b/kernel/rcutree_plugin.h | |||
@@ -57,7 +57,7 @@ static void __init rcu_bootup_announce_oddness(void) | |||
57 | printk(KERN_INFO | 57 | printk(KERN_INFO |
58 | "\tRCU-based detection of stalled CPUs is disabled.\n"); | 58 | "\tRCU-based detection of stalled CPUs is disabled.\n"); |
59 | #endif | 59 | #endif |
60 | #ifndef CONFIG_RCU_CPU_STALL_VERBOSE | 60 | #if defined(CONFIG_TREE_PREEMPT_RCU) && !defined(CONFIG_RCU_CPU_STALL_VERBOSE) |
61 | printk(KERN_INFO "\tVerbose stalled-CPUs detection is disabled.\n"); | 61 | printk(KERN_INFO "\tVerbose stalled-CPUs detection is disabled.\n"); |
62 | #endif | 62 | #endif |
63 | #if NUM_RCU_LVL_4 != 0 | 63 | #if NUM_RCU_LVL_4 != 0 |
@@ -154,7 +154,7 @@ static void rcu_preempt_note_context_switch(int cpu) | |||
154 | (t->rcu_read_unlock_special & RCU_READ_UNLOCK_BLOCKED) == 0) { | 154 | (t->rcu_read_unlock_special & RCU_READ_UNLOCK_BLOCKED) == 0) { |
155 | 155 | ||
156 | /* Possibly blocking in an RCU read-side critical section. */ | 156 | /* Possibly blocking in an RCU read-side critical section. */ |
157 | rdp = rcu_preempt_state.rda[cpu]; | 157 | rdp = per_cpu_ptr(rcu_preempt_state.rda, cpu); |
158 | rnp = rdp->mynode; | 158 | rnp = rdp->mynode; |
159 | raw_spin_lock_irqsave(&rnp->lock, flags); | 159 | raw_spin_lock_irqsave(&rnp->lock, flags); |
160 | t->rcu_read_unlock_special |= RCU_READ_UNLOCK_BLOCKED; | 160 | t->rcu_read_unlock_special |= RCU_READ_UNLOCK_BLOCKED; |
@@ -201,7 +201,7 @@ static void rcu_preempt_note_context_switch(int cpu) | |||
201 | */ | 201 | */ |
202 | void __rcu_read_lock(void) | 202 | void __rcu_read_lock(void) |
203 | { | 203 | { |
204 | ACCESS_ONCE(current->rcu_read_lock_nesting)++; | 204 | current->rcu_read_lock_nesting++; |
205 | barrier(); /* needed if we ever invoke rcu_read_lock in rcutree.c */ | 205 | barrier(); /* needed if we ever invoke rcu_read_lock in rcutree.c */ |
206 | } | 206 | } |
207 | EXPORT_SYMBOL_GPL(__rcu_read_lock); | 207 | EXPORT_SYMBOL_GPL(__rcu_read_lock); |
@@ -344,7 +344,9 @@ void __rcu_read_unlock(void) | |||
344 | struct task_struct *t = current; | 344 | struct task_struct *t = current; |
345 | 345 | ||
346 | barrier(); /* needed if we ever invoke rcu_read_unlock in rcutree.c */ | 346 | barrier(); /* needed if we ever invoke rcu_read_unlock in rcutree.c */ |
347 | if (--ACCESS_ONCE(t->rcu_read_lock_nesting) == 0 && | 347 | --t->rcu_read_lock_nesting; |
348 | barrier(); /* decrement before load of ->rcu_read_unlock_special */ | ||
349 | if (t->rcu_read_lock_nesting == 0 && | ||
348 | unlikely(ACCESS_ONCE(t->rcu_read_unlock_special))) | 350 | unlikely(ACCESS_ONCE(t->rcu_read_unlock_special))) |
349 | rcu_read_unlock_special(t); | 351 | rcu_read_unlock_special(t); |
350 | #ifdef CONFIG_PROVE_LOCKING | 352 | #ifdef CONFIG_PROVE_LOCKING |
@@ -417,6 +419,16 @@ static void rcu_print_task_stall(struct rcu_node *rnp) | |||
417 | } | 419 | } |
418 | } | 420 | } |
419 | 421 | ||
422 | /* | ||
423 | * Suppress preemptible RCU's CPU stall warnings by pushing the | ||
424 | * time of the next stall-warning message comfortably far into the | ||
425 | * future. | ||
426 | */ | ||
427 | static void rcu_preempt_stall_reset(void) | ||
428 | { | ||
429 | rcu_preempt_state.jiffies_stall = jiffies + ULONG_MAX / 2; | ||
430 | } | ||
431 | |||
420 | #endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ | 432 | #endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ |
421 | 433 | ||
422 | /* | 434 | /* |
@@ -546,9 +558,11 @@ EXPORT_SYMBOL_GPL(call_rcu); | |||
546 | * | 558 | * |
547 | * Control will return to the caller some time after a full grace | 559 | * Control will return to the caller some time after a full grace |
548 | * period has elapsed, in other words after all currently executing RCU | 560 | * period has elapsed, in other words after all currently executing RCU |
549 | * read-side critical sections have completed. RCU read-side critical | 561 | * read-side critical sections have completed. Note, however, that |
550 | * sections are delimited by rcu_read_lock() and rcu_read_unlock(), | 562 | * upon return from synchronize_rcu(), the caller might well be executing |
551 | * and may be nested. | 563 | * concurrently with new RCU read-side critical sections that began while |
564 | * synchronize_rcu() was waiting. RCU read-side critical sections are | ||
565 | * delimited by rcu_read_lock() and rcu_read_unlock(), and may be nested. | ||
552 | */ | 566 | */ |
553 | void synchronize_rcu(void) | 567 | void synchronize_rcu(void) |
554 | { | 568 | { |
@@ -771,7 +785,7 @@ static void rcu_preempt_send_cbs_to_orphanage(void) | |||
771 | */ | 785 | */ |
772 | static void __init __rcu_init_preempt(void) | 786 | static void __init __rcu_init_preempt(void) |
773 | { | 787 | { |
774 | RCU_INIT_FLAVOR(&rcu_preempt_state, rcu_preempt_data); | 788 | rcu_init_one(&rcu_preempt_state, &rcu_preempt_data); |
775 | } | 789 | } |
776 | 790 | ||
777 | /* | 791 | /* |
@@ -865,6 +879,14 @@ static void rcu_print_task_stall(struct rcu_node *rnp) | |||
865 | { | 879 | { |
866 | } | 880 | } |
867 | 881 | ||
882 | /* | ||
883 | * Because preemptible RCU does not exist, there is no need to suppress | ||
884 | * its CPU stall warnings. | ||
885 | */ | ||
886 | static void rcu_preempt_stall_reset(void) | ||
887 | { | ||
888 | } | ||
889 | |||
868 | #endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ | 890 | #endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ |
869 | 891 | ||
870 | /* | 892 | /* |
@@ -919,15 +941,6 @@ static void rcu_preempt_process_callbacks(void) | |||
919 | } | 941 | } |
920 | 942 | ||
921 | /* | 943 | /* |
922 | * In classic RCU, call_rcu() is just call_rcu_sched(). | ||
923 | */ | ||
924 | void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) | ||
925 | { | ||
926 | call_rcu_sched(head, func); | ||
927 | } | ||
928 | EXPORT_SYMBOL_GPL(call_rcu); | ||
929 | |||
930 | /* | ||
931 | * Wait for an rcu-preempt grace period, but make it happen quickly. | 944 | * Wait for an rcu-preempt grace period, but make it happen quickly. |
932 | * But because preemptable RCU does not exist, map to rcu-sched. | 945 | * But because preemptable RCU does not exist, map to rcu-sched. |
933 | */ | 946 | */ |
diff --git a/kernel/rcutree_trace.c b/kernel/rcutree_trace.c index 36c95b45738e..d15430b9d122 100644 --- a/kernel/rcutree_trace.c +++ b/kernel/rcutree_trace.c | |||
@@ -64,7 +64,9 @@ static void print_one_rcu_data(struct seq_file *m, struct rcu_data *rdp) | |||
64 | rdp->dynticks_fqs); | 64 | rdp->dynticks_fqs); |
65 | #endif /* #ifdef CONFIG_NO_HZ */ | 65 | #endif /* #ifdef CONFIG_NO_HZ */ |
66 | seq_printf(m, " of=%lu ri=%lu", rdp->offline_fqs, rdp->resched_ipi); | 66 | seq_printf(m, " of=%lu ri=%lu", rdp->offline_fqs, rdp->resched_ipi); |
67 | seq_printf(m, " ql=%ld b=%ld\n", rdp->qlen, rdp->blimit); | 67 | seq_printf(m, " ql=%ld b=%ld", rdp->qlen, rdp->blimit); |
68 | seq_printf(m, " ci=%lu co=%lu ca=%lu\n", | ||
69 | rdp->n_cbs_invoked, rdp->n_cbs_orphaned, rdp->n_cbs_adopted); | ||
68 | } | 70 | } |
69 | 71 | ||
70 | #define PRINT_RCU_DATA(name, func, m) \ | 72 | #define PRINT_RCU_DATA(name, func, m) \ |
@@ -119,7 +121,9 @@ static void print_one_rcu_data_csv(struct seq_file *m, struct rcu_data *rdp) | |||
119 | rdp->dynticks_fqs); | 121 | rdp->dynticks_fqs); |
120 | #endif /* #ifdef CONFIG_NO_HZ */ | 122 | #endif /* #ifdef CONFIG_NO_HZ */ |
121 | seq_printf(m, ",%lu,%lu", rdp->offline_fqs, rdp->resched_ipi); | 123 | seq_printf(m, ",%lu,%lu", rdp->offline_fqs, rdp->resched_ipi); |
122 | seq_printf(m, ",%ld,%ld\n", rdp->qlen, rdp->blimit); | 124 | seq_printf(m, ",%ld,%ld", rdp->qlen, rdp->blimit); |
125 | seq_printf(m, ",%lu,%lu,%lu\n", | ||
126 | rdp->n_cbs_invoked, rdp->n_cbs_orphaned, rdp->n_cbs_adopted); | ||
123 | } | 127 | } |
124 | 128 | ||
125 | static int show_rcudata_csv(struct seq_file *m, void *unused) | 129 | static int show_rcudata_csv(struct seq_file *m, void *unused) |
@@ -128,7 +132,7 @@ static int show_rcudata_csv(struct seq_file *m, void *unused) | |||
128 | #ifdef CONFIG_NO_HZ | 132 | #ifdef CONFIG_NO_HZ |
129 | seq_puts(m, "\"dt\",\"dt nesting\",\"dn\",\"df\","); | 133 | seq_puts(m, "\"dt\",\"dt nesting\",\"dn\",\"df\","); |
130 | #endif /* #ifdef CONFIG_NO_HZ */ | 134 | #endif /* #ifdef CONFIG_NO_HZ */ |
131 | seq_puts(m, "\"of\",\"ri\",\"ql\",\"b\"\n"); | 135 | seq_puts(m, "\"of\",\"ri\",\"ql\",\"b\",\"ci\",\"co\",\"ca\"\n"); |
132 | #ifdef CONFIG_TREE_PREEMPT_RCU | 136 | #ifdef CONFIG_TREE_PREEMPT_RCU |
133 | seq_puts(m, "\"rcu_preempt:\"\n"); | 137 | seq_puts(m, "\"rcu_preempt:\"\n"); |
134 | PRINT_RCU_DATA(rcu_preempt_data, print_one_rcu_data_csv, m); | 138 | PRINT_RCU_DATA(rcu_preempt_data, print_one_rcu_data_csv, m); |
@@ -262,7 +266,7 @@ static void print_rcu_pendings(struct seq_file *m, struct rcu_state *rsp) | |||
262 | struct rcu_data *rdp; | 266 | struct rcu_data *rdp; |
263 | 267 | ||
264 | for_each_possible_cpu(cpu) { | 268 | for_each_possible_cpu(cpu) { |
265 | rdp = rsp->rda[cpu]; | 269 | rdp = per_cpu_ptr(rsp->rda, cpu); |
266 | if (rdp->beenonline) | 270 | if (rdp->beenonline) |
267 | print_one_rcu_pending(m, rdp); | 271 | print_one_rcu_pending(m, rdp); |
268 | } | 272 | } |
diff --git a/kernel/resource.c b/kernel/resource.c index 7b36976e5dea..9fad33efd0db 100644 --- a/kernel/resource.c +++ b/kernel/resource.c | |||
@@ -40,6 +40,23 @@ EXPORT_SYMBOL(iomem_resource); | |||
40 | 40 | ||
41 | static DEFINE_RWLOCK(resource_lock); | 41 | static DEFINE_RWLOCK(resource_lock); |
42 | 42 | ||
43 | /* | ||
44 | * By default, we allocate free space bottom-up. The architecture can request | ||
45 | * top-down by clearing this flag. The user can override the architecture's | ||
46 | * choice with the "resource_alloc_from_bottom" kernel boot option, but that | ||
47 | * should only be a debugging tool. | ||
48 | */ | ||
49 | int resource_alloc_from_bottom = 1; | ||
50 | |||
51 | static __init int setup_alloc_from_bottom(char *s) | ||
52 | { | ||
53 | printk(KERN_INFO | ||
54 | "resource: allocating from bottom-up; please report a bug\n"); | ||
55 | resource_alloc_from_bottom = 1; | ||
56 | return 0; | ||
57 | } | ||
58 | early_param("resource_alloc_from_bottom", setup_alloc_from_bottom); | ||
59 | |||
43 | static void *r_next(struct seq_file *m, void *v, loff_t *pos) | 60 | static void *r_next(struct seq_file *m, void *v, loff_t *pos) |
44 | { | 61 | { |
45 | struct resource *p = v; | 62 | struct resource *p = v; |
@@ -357,8 +374,97 @@ int __weak page_is_ram(unsigned long pfn) | |||
357 | return walk_system_ram_range(pfn, 1, NULL, __is_ram) == 1; | 374 | return walk_system_ram_range(pfn, 1, NULL, __is_ram) == 1; |
358 | } | 375 | } |
359 | 376 | ||
377 | static resource_size_t simple_align_resource(void *data, | ||
378 | const struct resource *avail, | ||
379 | resource_size_t size, | ||
380 | resource_size_t align) | ||
381 | { | ||
382 | return avail->start; | ||
383 | } | ||
384 | |||
385 | static void resource_clip(struct resource *res, resource_size_t min, | ||
386 | resource_size_t max) | ||
387 | { | ||
388 | if (res->start < min) | ||
389 | res->start = min; | ||
390 | if (res->end > max) | ||
391 | res->end = max; | ||
392 | } | ||
393 | |||
394 | static bool resource_contains(struct resource *res1, struct resource *res2) | ||
395 | { | ||
396 | return res1->start <= res2->start && res1->end >= res2->end; | ||
397 | } | ||
398 | |||
399 | /* | ||
400 | * Find the resource before "child" in the sibling list of "root" children. | ||
401 | */ | ||
402 | static struct resource *find_sibling_prev(struct resource *root, struct resource *child) | ||
403 | { | ||
404 | struct resource *this; | ||
405 | |||
406 | for (this = root->child; this; this = this->sibling) | ||
407 | if (this->sibling == child) | ||
408 | return this; | ||
409 | |||
410 | return NULL; | ||
411 | } | ||
412 | |||
413 | /* | ||
414 | * Find empty slot in the resource tree given range and alignment. | ||
415 | * This version allocates from the end of the root resource first. | ||
416 | */ | ||
417 | static int find_resource_from_top(struct resource *root, struct resource *new, | ||
418 | resource_size_t size, resource_size_t min, | ||
419 | resource_size_t max, resource_size_t align, | ||
420 | resource_size_t (*alignf)(void *, | ||
421 | const struct resource *, | ||
422 | resource_size_t, | ||
423 | resource_size_t), | ||
424 | void *alignf_data) | ||
425 | { | ||
426 | struct resource *this; | ||
427 | struct resource tmp, avail, alloc; | ||
428 | |||
429 | tmp.start = root->end; | ||
430 | tmp.end = root->end; | ||
431 | |||
432 | this = find_sibling_prev(root, NULL); | ||
433 | for (;;) { | ||
434 | if (this) { | ||
435 | if (this->end < root->end) | ||
436 | tmp.start = this->end + 1; | ||
437 | } else | ||
438 | tmp.start = root->start; | ||
439 | |||
440 | resource_clip(&tmp, min, max); | ||
441 | |||
442 | /* Check for overflow after ALIGN() */ | ||
443 | avail = *new; | ||
444 | avail.start = ALIGN(tmp.start, align); | ||
445 | avail.end = tmp.end; | ||
446 | if (avail.start >= tmp.start) { | ||
447 | alloc.start = alignf(alignf_data, &avail, size, align); | ||
448 | alloc.end = alloc.start + size - 1; | ||
449 | if (resource_contains(&avail, &alloc)) { | ||
450 | new->start = alloc.start; | ||
451 | new->end = alloc.end; | ||
452 | return 0; | ||
453 | } | ||
454 | } | ||
455 | |||
456 | if (!this || this->start == root->start) | ||
457 | break; | ||
458 | |||
459 | tmp.end = this->start - 1; | ||
460 | this = find_sibling_prev(root, this); | ||
461 | } | ||
462 | return -EBUSY; | ||
463 | } | ||
464 | |||
360 | /* | 465 | /* |
361 | * Find empty slot in the resource tree given range and alignment. | 466 | * Find empty slot in the resource tree given range and alignment. |
467 | * This version allocates from the beginning of the root resource first. | ||
362 | */ | 468 | */ |
363 | static int find_resource(struct resource *root, struct resource *new, | 469 | static int find_resource(struct resource *root, struct resource *new, |
364 | resource_size_t size, resource_size_t min, | 470 | resource_size_t size, resource_size_t min, |
@@ -370,36 +476,43 @@ static int find_resource(struct resource *root, struct resource *new, | |||
370 | void *alignf_data) | 476 | void *alignf_data) |
371 | { | 477 | { |
372 | struct resource *this = root->child; | 478 | struct resource *this = root->child; |
373 | struct resource tmp = *new; | 479 | struct resource tmp = *new, avail, alloc; |
374 | 480 | ||
375 | tmp.start = root->start; | 481 | tmp.start = root->start; |
376 | /* | 482 | /* |
377 | * Skip past an allocated resource that starts at 0, since the assignment | 483 | * Skip past an allocated resource that starts at 0, since the |
378 | * of this->start - 1 to tmp->end below would cause an underflow. | 484 | * assignment of this->start - 1 to tmp->end below would cause an |
485 | * underflow. | ||
379 | */ | 486 | */ |
380 | if (this && this->start == 0) { | 487 | if (this && this->start == 0) { |
381 | tmp.start = this->end + 1; | 488 | tmp.start = this->end + 1; |
382 | this = this->sibling; | 489 | this = this->sibling; |
383 | } | 490 | } |
384 | for(;;) { | 491 | for (;;) { |
385 | if (this) | 492 | if (this) |
386 | tmp.end = this->start - 1; | 493 | tmp.end = this->start - 1; |
387 | else | 494 | else |
388 | tmp.end = root->end; | 495 | tmp.end = root->end; |
389 | if (tmp.start < min) | 496 | |
390 | tmp.start = min; | 497 | resource_clip(&tmp, min, max); |
391 | if (tmp.end > max) | 498 | |
392 | tmp.end = max; | 499 | /* Check for overflow after ALIGN() */ |
393 | tmp.start = ALIGN(tmp.start, align); | 500 | avail = *new; |
394 | if (alignf) | 501 | avail.start = ALIGN(tmp.start, align); |
395 | tmp.start = alignf(alignf_data, &tmp, size, align); | 502 | avail.end = tmp.end; |
396 | if (tmp.start < tmp.end && tmp.end - tmp.start >= size - 1) { | 503 | if (avail.start >= tmp.start) { |
397 | new->start = tmp.start; | 504 | alloc.start = alignf(alignf_data, &avail, size, align); |
398 | new->end = tmp.start + size - 1; | 505 | alloc.end = alloc.start + size - 1; |
399 | return 0; | 506 | if (resource_contains(&avail, &alloc)) { |
507 | new->start = alloc.start; | ||
508 | new->end = alloc.end; | ||
509 | return 0; | ||
510 | } | ||
400 | } | 511 | } |
512 | |||
401 | if (!this) | 513 | if (!this) |
402 | break; | 514 | break; |
515 | |||
403 | tmp.start = this->end + 1; | 516 | tmp.start = this->end + 1; |
404 | this = this->sibling; | 517 | this = this->sibling; |
405 | } | 518 | } |
@@ -428,8 +541,14 @@ int allocate_resource(struct resource *root, struct resource *new, | |||
428 | { | 541 | { |
429 | int err; | 542 | int err; |
430 | 543 | ||
544 | if (!alignf) | ||
545 | alignf = simple_align_resource; | ||
546 | |||
431 | write_lock(&resource_lock); | 547 | write_lock(&resource_lock); |
432 | err = find_resource(root, new, size, min, max, align, alignf, alignf_data); | 548 | if (resource_alloc_from_bottom) |
549 | err = find_resource(root, new, size, min, max, align, alignf, alignf_data); | ||
550 | else | ||
551 | err = find_resource_from_top(root, new, size, min, max, align, alignf, alignf_data); | ||
433 | if (err >= 0 && __request_resource(root, new)) | 552 | if (err >= 0 && __request_resource(root, new)) |
434 | err = -EBUSY; | 553 | err = -EBUSY; |
435 | write_unlock(&resource_lock); | 554 | write_unlock(&resource_lock); |
@@ -453,6 +572,8 @@ static struct resource * __insert_resource(struct resource *parent, struct resou | |||
453 | 572 | ||
454 | if (first == parent) | 573 | if (first == parent) |
455 | return first; | 574 | return first; |
575 | if (WARN_ON(first == new)) /* duplicated insertion */ | ||
576 | return first; | ||
456 | 577 | ||
457 | if ((first->start > new->start) || (first->end < new->end)) | 578 | if ((first->start > new->start) || (first->end < new->end)) |
458 | break; | 579 | break; |
diff --git a/kernel/rtmutex-tester.c b/kernel/rtmutex-tester.c index a56f629b057a..66cb89bc5ef1 100644 --- a/kernel/rtmutex-tester.c +++ b/kernel/rtmutex-tester.c | |||
@@ -76,7 +76,9 @@ static int handle_op(struct test_thread_data *td, int lockwakeup) | |||
76 | } | 76 | } |
77 | 77 | ||
78 | if (!lockwakeup && td->bkl == 4) { | 78 | if (!lockwakeup && td->bkl == 4) { |
79 | #ifdef CONFIG_LOCK_KERNEL | ||
79 | unlock_kernel(); | 80 | unlock_kernel(); |
81 | #endif | ||
80 | td->bkl = 0; | 82 | td->bkl = 0; |
81 | } | 83 | } |
82 | return 0; | 84 | return 0; |
@@ -133,14 +135,18 @@ static int handle_op(struct test_thread_data *td, int lockwakeup) | |||
133 | if (td->bkl) | 135 | if (td->bkl) |
134 | return 0; | 136 | return 0; |
135 | td->bkl = 1; | 137 | td->bkl = 1; |
138 | #ifdef CONFIG_LOCK_KERNEL | ||
136 | lock_kernel(); | 139 | lock_kernel(); |
140 | #endif | ||
137 | td->bkl = 4; | 141 | td->bkl = 4; |
138 | return 0; | 142 | return 0; |
139 | 143 | ||
140 | case RTTEST_UNLOCKBKL: | 144 | case RTTEST_UNLOCKBKL: |
141 | if (td->bkl != 4) | 145 | if (td->bkl != 4) |
142 | break; | 146 | break; |
147 | #ifdef CONFIG_LOCK_KERNEL | ||
143 | unlock_kernel(); | 148 | unlock_kernel(); |
149 | #endif | ||
144 | td->bkl = 0; | 150 | td->bkl = 0; |
145 | return 0; | 151 | return 0; |
146 | 152 | ||
diff --git a/kernel/sched.c b/kernel/sched.c index f52a8801b7a2..d42992bccdfa 100644 --- a/kernel/sched.c +++ b/kernel/sched.c | |||
@@ -77,6 +77,7 @@ | |||
77 | #include <asm/irq_regs.h> | 77 | #include <asm/irq_regs.h> |
78 | 78 | ||
79 | #include "sched_cpupri.h" | 79 | #include "sched_cpupri.h" |
80 | #include "workqueue_sched.h" | ||
80 | 81 | ||
81 | #define CREATE_TRACE_POINTS | 82 | #define CREATE_TRACE_POINTS |
82 | #include <trace/events/sched.h> | 83 | #include <trace/events/sched.h> |
@@ -425,9 +426,7 @@ struct root_domain { | |||
425 | */ | 426 | */ |
426 | cpumask_var_t rto_mask; | 427 | cpumask_var_t rto_mask; |
427 | atomic_t rto_count; | 428 | atomic_t rto_count; |
428 | #ifdef CONFIG_SMP | ||
429 | struct cpupri cpupri; | 429 | struct cpupri cpupri; |
430 | #endif | ||
431 | }; | 430 | }; |
432 | 431 | ||
433 | /* | 432 | /* |
@@ -436,7 +435,7 @@ struct root_domain { | |||
436 | */ | 435 | */ |
437 | static struct root_domain def_root_domain; | 436 | static struct root_domain def_root_domain; |
438 | 437 | ||
439 | #endif | 438 | #endif /* CONFIG_SMP */ |
440 | 439 | ||
441 | /* | 440 | /* |
442 | * This is the main, per-CPU runqueue data structure. | 441 | * This is the main, per-CPU runqueue data structure. |
@@ -456,9 +455,10 @@ struct rq { | |||
456 | unsigned long nr_running; | 455 | unsigned long nr_running; |
457 | #define CPU_LOAD_IDX_MAX 5 | 456 | #define CPU_LOAD_IDX_MAX 5 |
458 | unsigned long cpu_load[CPU_LOAD_IDX_MAX]; | 457 | unsigned long cpu_load[CPU_LOAD_IDX_MAX]; |
458 | unsigned long last_load_update_tick; | ||
459 | #ifdef CONFIG_NO_HZ | 459 | #ifdef CONFIG_NO_HZ |
460 | u64 nohz_stamp; | 460 | u64 nohz_stamp; |
461 | unsigned char in_nohz_recently; | 461 | unsigned char nohz_balance_kick; |
462 | #endif | 462 | #endif |
463 | unsigned int skip_clock_update; | 463 | unsigned int skip_clock_update; |
464 | 464 | ||
@@ -486,11 +486,12 @@ struct rq { | |||
486 | */ | 486 | */ |
487 | unsigned long nr_uninterruptible; | 487 | unsigned long nr_uninterruptible; |
488 | 488 | ||
489 | struct task_struct *curr, *idle; | 489 | struct task_struct *curr, *idle, *stop; |
490 | unsigned long next_balance; | 490 | unsigned long next_balance; |
491 | struct mm_struct *prev_mm; | 491 | struct mm_struct *prev_mm; |
492 | 492 | ||
493 | u64 clock; | 493 | u64 clock; |
494 | u64 clock_task; | ||
494 | 495 | ||
495 | atomic_t nr_iowait; | 496 | atomic_t nr_iowait; |
496 | 497 | ||
@@ -518,6 +519,10 @@ struct rq { | |||
518 | u64 avg_idle; | 519 | u64 avg_idle; |
519 | #endif | 520 | #endif |
520 | 521 | ||
522 | #ifdef CONFIG_IRQ_TIME_ACCOUNTING | ||
523 | u64 prev_irq_time; | ||
524 | #endif | ||
525 | |||
521 | /* calc_load related fields */ | 526 | /* calc_load related fields */ |
522 | unsigned long calc_load_update; | 527 | unsigned long calc_load_update; |
523 | long calc_load_active; | 528 | long calc_load_active; |
@@ -641,10 +646,22 @@ static inline struct task_group *task_group(struct task_struct *p) | |||
641 | 646 | ||
642 | #endif /* CONFIG_CGROUP_SCHED */ | 647 | #endif /* CONFIG_CGROUP_SCHED */ |
643 | 648 | ||
649 | static u64 irq_time_cpu(int cpu); | ||
650 | static void sched_irq_time_avg_update(struct rq *rq, u64 irq_time); | ||
651 | |||
644 | inline void update_rq_clock(struct rq *rq) | 652 | inline void update_rq_clock(struct rq *rq) |
645 | { | 653 | { |
646 | if (!rq->skip_clock_update) | 654 | if (!rq->skip_clock_update) { |
647 | rq->clock = sched_clock_cpu(cpu_of(rq)); | 655 | int cpu = cpu_of(rq); |
656 | u64 irq_time; | ||
657 | |||
658 | rq->clock = sched_clock_cpu(cpu); | ||
659 | irq_time = irq_time_cpu(cpu); | ||
660 | if (rq->clock - irq_time > rq->clock_task) | ||
661 | rq->clock_task = rq->clock - irq_time; | ||
662 | |||
663 | sched_irq_time_avg_update(rq, irq_time); | ||
664 | } | ||
648 | } | 665 | } |
649 | 666 | ||
650 | /* | 667 | /* |
@@ -721,7 +738,7 @@ sched_feat_write(struct file *filp, const char __user *ubuf, | |||
721 | size_t cnt, loff_t *ppos) | 738 | size_t cnt, loff_t *ppos) |
722 | { | 739 | { |
723 | char buf[64]; | 740 | char buf[64]; |
724 | char *cmp = buf; | 741 | char *cmp; |
725 | int neg = 0; | 742 | int neg = 0; |
726 | int i; | 743 | int i; |
727 | 744 | ||
@@ -732,6 +749,7 @@ sched_feat_write(struct file *filp, const char __user *ubuf, | |||
732 | return -EFAULT; | 749 | return -EFAULT; |
733 | 750 | ||
734 | buf[cnt] = 0; | 751 | buf[cnt] = 0; |
752 | cmp = strstrip(buf); | ||
735 | 753 | ||
736 | if (strncmp(buf, "NO_", 3) == 0) { | 754 | if (strncmp(buf, "NO_", 3) == 0) { |
737 | neg = 1; | 755 | neg = 1; |
@@ -739,9 +757,7 @@ sched_feat_write(struct file *filp, const char __user *ubuf, | |||
739 | } | 757 | } |
740 | 758 | ||
741 | for (i = 0; sched_feat_names[i]; i++) { | 759 | for (i = 0; sched_feat_names[i]; i++) { |
742 | int len = strlen(sched_feat_names[i]); | 760 | if (strcmp(cmp, sched_feat_names[i]) == 0) { |
743 | |||
744 | if (strncmp(cmp, sched_feat_names[i], len) == 0) { | ||
745 | if (neg) | 761 | if (neg) |
746 | sysctl_sched_features &= ~(1UL << i); | 762 | sysctl_sched_features &= ~(1UL << i); |
747 | else | 763 | else |
@@ -1193,6 +1209,27 @@ static void resched_cpu(int cpu) | |||
1193 | 1209 | ||
1194 | #ifdef CONFIG_NO_HZ | 1210 | #ifdef CONFIG_NO_HZ |
1195 | /* | 1211 | /* |
1212 | * In the semi idle case, use the nearest busy cpu for migrating timers | ||
1213 | * from an idle cpu. This is good for power-savings. | ||
1214 | * | ||
1215 | * We don't do a similar optimization for a completely idle system, as | ||
1216 | * selecting an idle cpu would add more delay to the timers than intended | ||
1217 | * (as that cpu's timer base may not be up to date with respect to jiffies etc). | ||
1218 | */ | ||
1219 | int get_nohz_timer_target(void) | ||
1220 | { | ||
1221 | int cpu = smp_processor_id(); | ||
1222 | int i; | ||
1223 | struct sched_domain *sd; | ||
1224 | |||
1225 | for_each_domain(cpu, sd) { | ||
1226 | for_each_cpu(i, sched_domain_span(sd)) | ||
1227 | if (!idle_cpu(i)) | ||
1228 | return i; | ||
1229 | } | ||
1230 | return cpu; | ||
1231 | } | ||
1232 | /* | ||
1196 | * When add_timer_on() enqueues a timer into the timer wheel of an | 1233 | * When add_timer_on() enqueues a timer into the timer wheel of an |
1197 | * idle CPU then this timer might expire before the next timer event | 1234 | * idle CPU then this timer might expire before the next timer event |
1198 | * which is scheduled to wake up that CPU. In case of a completely | 1235 | * which is scheduled to wake up that CPU. In case of a completely |
@@ -1232,16 +1269,6 @@ void wake_up_idle_cpu(int cpu) | |||
1232 | smp_send_reschedule(cpu); | 1269 | smp_send_reschedule(cpu); |
1233 | } | 1270 | } |
1234 | 1271 | ||
1235 | int nohz_ratelimit(int cpu) | ||
1236 | { | ||
1237 | struct rq *rq = cpu_rq(cpu); | ||
1238 | u64 diff = rq->clock - rq->nohz_stamp; | ||
1239 | |||
1240 | rq->nohz_stamp = rq->clock; | ||
1241 | |||
1242 | return diff < (NSEC_PER_SEC / HZ) >> 1; | ||
1243 | } | ||
1244 | |||
1245 | #endif /* CONFIG_NO_HZ */ | 1272 | #endif /* CONFIG_NO_HZ */ |
1246 | 1273 | ||
1247 | static u64 sched_avg_period(void) | 1274 | static u64 sched_avg_period(void) |
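The get_nohz_timer_target() helper added above is meant for timer placement; a hedged sketch of how a caller might use it (the surrounding timer-migration policy is assumed, not shown in this hunk):

    /*
     * Hedged sketch: pick a CPU for a new timer.  When the current CPU is
     * idle and NO_HZ is in use, prefer the nearest busy CPU so the timer
     * does not land on a CPU whose timer base may be behind on jiffies.
     */
    static int example_pick_timer_cpu(void)
    {
    	int cpu = smp_processor_id();

    #if defined(CONFIG_NO_HZ) && defined(CONFIG_SMP)
    	if (idle_cpu(cpu))
    		cpu = get_nohz_timer_target();
    #endif
    	return cpu;
    }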
@@ -1281,6 +1308,10 @@ static void resched_task(struct task_struct *p) | |||
1281 | static void sched_rt_avg_update(struct rq *rq, u64 rt_delta) | 1308 | static void sched_rt_avg_update(struct rq *rq, u64 rt_delta) |
1282 | { | 1309 | { |
1283 | } | 1310 | } |
1311 | |||
1312 | static void sched_avg_update(struct rq *rq) | ||
1313 | { | ||
1314 | } | ||
1284 | #endif /* CONFIG_SMP */ | 1315 | #endif /* CONFIG_SMP */ |
1285 | 1316 | ||
1286 | #if BITS_PER_LONG == 32 | 1317 | #if BITS_PER_LONG == 32 |
@@ -1652,7 +1683,7 @@ static void update_shares(struct sched_domain *sd) | |||
1652 | if (root_task_group_empty()) | 1683 | if (root_task_group_empty()) |
1653 | return; | 1684 | return; |
1654 | 1685 | ||
1655 | now = cpu_clock(raw_smp_processor_id()); | 1686 | now = local_clock(); |
1656 | elapsed = now - sd->last_update; | 1687 | elapsed = now - sd->last_update; |
1657 | 1688 | ||
1658 | if (elapsed >= (s64)(u64)sysctl_sched_shares_ratelimit) { | 1689 | if (elapsed >= (s64)(u64)sysctl_sched_shares_ratelimit) { |
@@ -1805,6 +1836,7 @@ static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares) | |||
1805 | static void calc_load_account_idle(struct rq *this_rq); | 1836 | static void calc_load_account_idle(struct rq *this_rq); |
1806 | static void update_sysctl(void); | 1837 | static void update_sysctl(void); |
1807 | static int get_update_sysctl_factor(void); | 1838 | static int get_update_sysctl_factor(void); |
1839 | static void update_cpu_load(struct rq *this_rq); | ||
1808 | 1840 | ||
1809 | static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu) | 1841 | static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu) |
1810 | { | 1842 | { |
@@ -1822,7 +1854,7 @@ static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu) | |||
1822 | 1854 | ||
1823 | static const struct sched_class rt_sched_class; | 1855 | static const struct sched_class rt_sched_class; |
1824 | 1856 | ||
1825 | #define sched_class_highest (&rt_sched_class) | 1857 | #define sched_class_highest (&stop_sched_class) |
1826 | #define for_each_class(class) \ | 1858 | #define for_each_class(class) \ |
1827 | for (class = sched_class_highest; class; class = class->next) | 1859 | for (class = sched_class_highest; class; class = class->next) |
1828 | 1860 | ||
@@ -1840,12 +1872,6 @@ static void dec_nr_running(struct rq *rq) | |||
1840 | 1872 | ||
1841 | static void set_load_weight(struct task_struct *p) | 1873 | static void set_load_weight(struct task_struct *p) |
1842 | { | 1874 | { |
1843 | if (task_has_rt_policy(p)) { | ||
1844 | p->se.load.weight = 0; | ||
1845 | p->se.load.inv_weight = WMULT_CONST; | ||
1846 | return; | ||
1847 | } | ||
1848 | |||
1849 | /* | 1875 | /* |
1850 | * SCHED_IDLE tasks get minimal weight: | 1876 | * SCHED_IDLE tasks get minimal weight: |
1851 | */ | 1877 | */ |
@@ -1899,13 +1925,132 @@ static void deactivate_task(struct rq *rq, struct task_struct *p, int flags) | |||
1899 | dec_nr_running(rq); | 1925 | dec_nr_running(rq); |
1900 | } | 1926 | } |
1901 | 1927 | ||
1928 | #ifdef CONFIG_IRQ_TIME_ACCOUNTING | ||
1929 | |||
1930 | /* | ||
1931 | * There are no locks covering percpu hardirq/softirq time. | ||
1932 | * They are only modified in account_system_vtime, on the corresponding CPU | ||
1933 | * with interrupts disabled. So, writes are safe. | ||
1934 | * They are read and saved off onto struct rq in update_rq_clock(). | ||
1935 | * This may result in another CPU reading this CPU's irq time and racing | ||
1936 | * with irq/account_system_vtime on this CPU. We would either get the old | ||
1937 | * or the new value (or a semi-updated value on 32 bit) with a side effect of | ||
1938 | * accounting a slice of irq time to the wrong task when an irq is in progress | ||
1939 | * while we read rq->clock. That is a worthy compromise in place of having | ||
1940 | * locks on each irq in account_system_time. | ||
1941 | */ | ||
1942 | static DEFINE_PER_CPU(u64, cpu_hardirq_time); | ||
1943 | static DEFINE_PER_CPU(u64, cpu_softirq_time); | ||
1944 | |||
1945 | static DEFINE_PER_CPU(u64, irq_start_time); | ||
1946 | static int sched_clock_irqtime; | ||
1947 | |||
1948 | void enable_sched_clock_irqtime(void) | ||
1949 | { | ||
1950 | sched_clock_irqtime = 1; | ||
1951 | } | ||
1952 | |||
1953 | void disable_sched_clock_irqtime(void) | ||
1954 | { | ||
1955 | sched_clock_irqtime = 0; | ||
1956 | } | ||
1957 | |||
1958 | static u64 irq_time_cpu(int cpu) | ||
1959 | { | ||
1960 | if (!sched_clock_irqtime) | ||
1961 | return 0; | ||
1962 | |||
1963 | return per_cpu(cpu_softirq_time, cpu) + per_cpu(cpu_hardirq_time, cpu); | ||
1964 | } | ||
1965 | |||
1966 | void account_system_vtime(struct task_struct *curr) | ||
1967 | { | ||
1968 | unsigned long flags; | ||
1969 | int cpu; | ||
1970 | u64 now, delta; | ||
1971 | |||
1972 | if (!sched_clock_irqtime) | ||
1973 | return; | ||
1974 | |||
1975 | local_irq_save(flags); | ||
1976 | |||
1977 | cpu = smp_processor_id(); | ||
1978 | now = sched_clock_cpu(cpu); | ||
1979 | delta = now - per_cpu(irq_start_time, cpu); | ||
1980 | per_cpu(irq_start_time, cpu) = now; | ||
1981 | /* | ||
1982 | * We do not account for softirq time from ksoftirqd here. | ||
1983 | * We want to continue accounting softirq time to ksoftirqd thread | ||
1984 | * in that case, so as not to confuse the scheduler with a special task | ||
1985 | * that does not consume any time, but still wants to run. | ||
1986 | */ | ||
1987 | if (hardirq_count()) | ||
1988 | per_cpu(cpu_hardirq_time, cpu) += delta; | ||
1989 | else if (in_serving_softirq() && !(curr->flags & PF_KSOFTIRQD)) | ||
1990 | per_cpu(cpu_softirq_time, cpu) += delta; | ||
1991 | |||
1992 | local_irq_restore(flags); | ||
1993 | } | ||
1994 | EXPORT_SYMBOL_GPL(account_system_vtime); | ||
1995 | |||
1996 | static void sched_irq_time_avg_update(struct rq *rq, u64 curr_irq_time) | ||
1997 | { | ||
1998 | if (sched_clock_irqtime && sched_feat(NONIRQ_POWER)) { | ||
1999 | u64 delta_irq = curr_irq_time - rq->prev_irq_time; | ||
2000 | rq->prev_irq_time = curr_irq_time; | ||
2001 | sched_rt_avg_update(rq, delta_irq); | ||
2002 | } | ||
2003 | } | ||
2004 | |||
2005 | #else | ||
2006 | |||
2007 | static u64 irq_time_cpu(int cpu) | ||
2008 | { | ||
2009 | return 0; | ||
2010 | } | ||
2011 | |||
2012 | static void sched_irq_time_avg_update(struct rq *rq, u64 curr_irq_time) { } | ||
2013 | |||
2014 | #endif | ||
2015 | |||
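To make the consumer side of the block above concrete, a hypothetical helper (not part of the patch) showing how the per-cpu irq time feeds rq->clock_task, mirroring the update_rq_clock() change earlier in this diff:

    /* Hypothetical helper, not in the patch: derive the task clock. */
    static u64 example_task_clock(struct rq *rq, int cpu)
    {
    	u64 clock    = sched_clock_cpu(cpu);
    	u64 irq_time = irq_time_cpu(cpu);	/* hardirq + softirq time */

    	/* clock_task only moves forward; irq time is carved out of it */
    	if (clock - irq_time > rq->clock_task)
    		rq->clock_task = clock - irq_time;

    	return rq->clock_task;
    }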
1902 | #include "sched_idletask.c" | 2016 | #include "sched_idletask.c" |
1903 | #include "sched_fair.c" | 2017 | #include "sched_fair.c" |
1904 | #include "sched_rt.c" | 2018 | #include "sched_rt.c" |
2019 | #include "sched_stoptask.c" | ||
1905 | #ifdef CONFIG_SCHED_DEBUG | 2020 | #ifdef CONFIG_SCHED_DEBUG |
1906 | # include "sched_debug.c" | 2021 | # include "sched_debug.c" |
1907 | #endif | 2022 | #endif |
1908 | 2023 | ||
2024 | void sched_set_stop_task(int cpu, struct task_struct *stop) | ||
2025 | { | ||
2026 | struct sched_param param = { .sched_priority = MAX_RT_PRIO - 1 }; | ||
2027 | struct task_struct *old_stop = cpu_rq(cpu)->stop; | ||
2028 | |||
2029 | if (stop) { | ||
2030 | /* | ||
2031 | * Make it appear like a SCHED_FIFO task; it's something | ||
2032 | * userspace knows about and won't get confused about. | ||
2033 | * | ||
2034 | * Also, it will make PI more or less work without too | ||
2035 | * much confusion -- but then, stop work should not | ||
2036 | * rely on PI working anyway. | ||
2037 | */ | ||
2038 | sched_setscheduler_nocheck(stop, SCHED_FIFO, ¶m); | ||
2039 | |||
2040 | stop->sched_class = &stop_sched_class; | ||
2041 | } | ||
2042 | |||
2043 | cpu_rq(cpu)->stop = stop; | ||
2044 | |||
2045 | if (old_stop) { | ||
2046 | /* | ||
2047 | * Reset it back to a normal scheduling class so that | ||
2048 | * it can die in pieces. | ||
2049 | */ | ||
2050 | old_stop->sched_class = &rt_sched_class; | ||
2051 | } | ||
2052 | } | ||
2053 | |||
1909 | /* | 2054 | /* |
1910 | * __normal_prio - return the priority that is based on the static prio | 2055 | * __normal_prio - return the priority that is based on the static prio |
1911 | */ | 2056 | */ |
@@ -1985,6 +2130,9 @@ task_hot(struct task_struct *p, u64 now, struct sched_domain *sd) | |||
1985 | if (p->sched_class != &fair_sched_class) | 2130 | if (p->sched_class != &fair_sched_class) |
1986 | return 0; | 2131 | return 0; |
1987 | 2132 | ||
2133 | if (unlikely(p->policy == SCHED_IDLE)) | ||
2134 | return 0; | ||
2135 | |||
1988 | /* | 2136 | /* |
1989 | * Buddy candidates are cache hot: | 2137 | * Buddy candidates are cache hot: |
1990 | */ | 2138 | */ |
@@ -2267,11 +2415,55 @@ static void update_avg(u64 *avg, u64 sample) | |||
2267 | } | 2415 | } |
2268 | #endif | 2416 | #endif |
2269 | 2417 | ||
2270 | /*** | 2418 | static inline void ttwu_activate(struct task_struct *p, struct rq *rq, |
2419 | bool is_sync, bool is_migrate, bool is_local, | ||
2420 | unsigned long en_flags) | ||
2421 | { | ||
2422 | schedstat_inc(p, se.statistics.nr_wakeups); | ||
2423 | if (is_sync) | ||
2424 | schedstat_inc(p, se.statistics.nr_wakeups_sync); | ||
2425 | if (is_migrate) | ||
2426 | schedstat_inc(p, se.statistics.nr_wakeups_migrate); | ||
2427 | if (is_local) | ||
2428 | schedstat_inc(p, se.statistics.nr_wakeups_local); | ||
2429 | else | ||
2430 | schedstat_inc(p, se.statistics.nr_wakeups_remote); | ||
2431 | |||
2432 | activate_task(rq, p, en_flags); | ||
2433 | } | ||
2434 | |||
2435 | static inline void ttwu_post_activation(struct task_struct *p, struct rq *rq, | ||
2436 | int wake_flags, bool success) | ||
2437 | { | ||
2438 | trace_sched_wakeup(p, success); | ||
2439 | check_preempt_curr(rq, p, wake_flags); | ||
2440 | |||
2441 | p->state = TASK_RUNNING; | ||
2442 | #ifdef CONFIG_SMP | ||
2443 | if (p->sched_class->task_woken) | ||
2444 | p->sched_class->task_woken(rq, p); | ||
2445 | |||
2446 | if (unlikely(rq->idle_stamp)) { | ||
2447 | u64 delta = rq->clock - rq->idle_stamp; | ||
2448 | u64 max = 2*sysctl_sched_migration_cost; | ||
2449 | |||
2450 | if (delta > max) | ||
2451 | rq->avg_idle = max; | ||
2452 | else | ||
2453 | update_avg(&rq->avg_idle, delta); | ||
2454 | rq->idle_stamp = 0; | ||
2455 | } | ||
2456 | #endif | ||
2457 | /* if a worker is waking up, notify workqueue */ | ||
2458 | if ((p->flags & PF_WQ_WORKER) && success) | ||
2459 | wq_worker_waking_up(p, cpu_of(rq)); | ||
2460 | } | ||
2461 | |||
2462 | /** | ||
2271 | * try_to_wake_up - wake up a thread | 2463 | * try_to_wake_up - wake up a thread |
2272 | * @p: the to-be-woken-up thread | 2464 | * @p: the thread to be awakened |
2273 | * @state: the mask of task states that can be woken | 2465 | * @state: the mask of task states that can be woken |
2274 | * @sync: do a synchronous wakeup? | 2466 | * @wake_flags: wake modifier flags (WF_*) |
2275 | * | 2467 | * |
2276 | * Put it on the run-queue if it's not already there. The "current" | 2468 | * Put it on the run-queue if it's not already there. The "current" |
2277 | * thread is always on the run-queue (except when the actual | 2469 | * thread is always on the run-queue (except when the actual |
@@ -2279,7 +2471,8 @@ static void update_avg(u64 *avg, u64 sample) | |||
2279 | * the simpler "current->state = TASK_RUNNING" to mark yourself | 2471 | * the simpler "current->state = TASK_RUNNING" to mark yourself |
2280 | * runnable without the overhead of this. | 2472 | * runnable without the overhead of this. |
2281 | * | 2473 | * |
2282 | * returns failure only if the task is already active. | 2474 | * Returns %true if @p was woken up, %false if it was already running |
2475 | * or @state didn't match @p's state. | ||
2283 | */ | 2476 | */ |
2284 | static int try_to_wake_up(struct task_struct *p, unsigned int state, | 2477 | static int try_to_wake_up(struct task_struct *p, unsigned int state, |
2285 | int wake_flags) | 2478 | int wake_flags) |
@@ -2359,38 +2552,11 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, | |||
2359 | 2552 | ||
2360 | out_activate: | 2553 | out_activate: |
2361 | #endif /* CONFIG_SMP */ | 2554 | #endif /* CONFIG_SMP */ |
2362 | schedstat_inc(p, se.statistics.nr_wakeups); | 2555 | ttwu_activate(p, rq, wake_flags & WF_SYNC, orig_cpu != cpu, |
2363 | if (wake_flags & WF_SYNC) | 2556 | cpu == this_cpu, en_flags); |
2364 | schedstat_inc(p, se.statistics.nr_wakeups_sync); | ||
2365 | if (orig_cpu != cpu) | ||
2366 | schedstat_inc(p, se.statistics.nr_wakeups_migrate); | ||
2367 | if (cpu == this_cpu) | ||
2368 | schedstat_inc(p, se.statistics.nr_wakeups_local); | ||
2369 | else | ||
2370 | schedstat_inc(p, se.statistics.nr_wakeups_remote); | ||
2371 | activate_task(rq, p, en_flags); | ||
2372 | success = 1; | 2557 | success = 1; |
2373 | |||
2374 | out_running: | 2558 | out_running: |
2375 | trace_sched_wakeup(p, success); | 2559 | ttwu_post_activation(p, rq, wake_flags, success); |
2376 | check_preempt_curr(rq, p, wake_flags); | ||
2377 | |||
2378 | p->state = TASK_RUNNING; | ||
2379 | #ifdef CONFIG_SMP | ||
2380 | if (p->sched_class->task_woken) | ||
2381 | p->sched_class->task_woken(rq, p); | ||
2382 | |||
2383 | if (unlikely(rq->idle_stamp)) { | ||
2384 | u64 delta = rq->clock - rq->idle_stamp; | ||
2385 | u64 max = 2*sysctl_sched_migration_cost; | ||
2386 | |||
2387 | if (delta > max) | ||
2388 | rq->avg_idle = max; | ||
2389 | else | ||
2390 | update_avg(&rq->avg_idle, delta); | ||
2391 | rq->idle_stamp = 0; | ||
2392 | } | ||
2393 | #endif | ||
2394 | out: | 2560 | out: |
2395 | task_rq_unlock(rq, &flags); | 2561 | task_rq_unlock(rq, &flags); |
2396 | put_cpu(); | 2562 | put_cpu(); |
@@ -2399,6 +2565,37 @@ out: | |||
2399 | } | 2565 | } |
2400 | 2566 | ||
2401 | /** | 2567 | /** |
2568 | * try_to_wake_up_local - try to wake up a local task with rq lock held | ||
2569 | * @p: the thread to be awakened | ||
2570 | * | ||
2571 | * Put @p on the run-queue if it's not already there. The caller must | ||
2572 | * ensure that this_rq() is locked, @p is bound to this_rq() and not | ||
2573 | * the current task. this_rq() stays locked over invocation. | ||
2574 | */ | ||
2575 | static void try_to_wake_up_local(struct task_struct *p) | ||
2576 | { | ||
2577 | struct rq *rq = task_rq(p); | ||
2578 | bool success = false; | ||
2579 | |||
2580 | BUG_ON(rq != this_rq()); | ||
2581 | BUG_ON(p == current); | ||
2582 | lockdep_assert_held(&rq->lock); | ||
2583 | |||
2584 | if (!(p->state & TASK_NORMAL)) | ||
2585 | return; | ||
2586 | |||
2587 | if (!p->se.on_rq) { | ||
2588 | if (likely(!task_running(rq, p))) { | ||
2589 | schedstat_inc(rq, ttwu_count); | ||
2590 | schedstat_inc(rq, ttwu_local); | ||
2591 | } | ||
2592 | ttwu_activate(p, rq, false, false, true, ENQUEUE_WAKEUP); | ||
2593 | success = true; | ||
2594 | } | ||
2595 | ttwu_post_activation(p, rq, 0, success); | ||
2596 | } | ||
2597 | |||
2598 | /** | ||
2402 | * wake_up_process - Wake up a specific process | 2599 | * wake_up_process - Wake up a specific process |
2403 | * @p: The process to be woken up. | 2600 | * @p: The process to be woken up. |
2404 | * | 2601 | * |
@@ -2785,14 +2982,14 @@ context_switch(struct rq *rq, struct task_struct *prev, | |||
2785 | */ | 2982 | */ |
2786 | arch_start_context_switch(prev); | 2983 | arch_start_context_switch(prev); |
2787 | 2984 | ||
2788 | if (likely(!mm)) { | 2985 | if (!mm) { |
2789 | next->active_mm = oldmm; | 2986 | next->active_mm = oldmm; |
2790 | atomic_inc(&oldmm->mm_count); | 2987 | atomic_inc(&oldmm->mm_count); |
2791 | enter_lazy_tlb(oldmm, next); | 2988 | enter_lazy_tlb(oldmm, next); |
2792 | } else | 2989 | } else |
2793 | switch_mm(oldmm, mm, next); | 2990 | switch_mm(oldmm, mm, next); |
2794 | 2991 | ||
2795 | if (likely(!prev->mm)) { | 2992 | if (!prev->mm) { |
2796 | prev->active_mm = NULL; | 2993 | prev->active_mm = NULL; |
2797 | rq->prev_mm = oldmm; | 2994 | rq->prev_mm = oldmm; |
2798 | } | 2995 | } |
@@ -3012,23 +3209,102 @@ static void calc_load_account_active(struct rq *this_rq) | |||
3012 | } | 3209 | } |
3013 | 3210 | ||
3014 | /* | 3211 | /* |
3212 | * The exact cpuload at various idx values, calculated at every tick would be | ||
3213 | * load = (2^idx - 1) / 2^idx * load + 1 / 2^idx * cur_load | ||
3214 | * | ||
3215 | * If a cpu misses updates for n-1 ticks (as it was idle) and update gets called | ||
3216 | * on the nth tick when the cpu may be busy, then we have: | ||
3217 | * load = ((2^idx - 1) / 2^idx)^(n-1) * load | ||
3218 | * load = ((2^idx - 1) / 2^idx) * load + 1 / 2^idx * cur_load | ||
3219 | * | ||
3220 | * decay_load_missed() below does efficient calculation of | ||
3221 | * load = ((2^idx - 1) / 2^idx)^(n-1) * load | ||
3222 | * avoiding 0..n-1 loop doing load = ((2^idx - 1) / 2^idx) * load | ||
3223 | * | ||
3224 | * The calculation is approximated on a 128 point scale. | ||
3225 | * degrade_zero_ticks is the number of ticks after which load at any | ||
3226 | * particular idx is approximated to be zero. | ||
3227 | * degrade_factor is a precomputed table, a row for each load idx. | ||
3228 | * Each column corresponds to degradation factor for a power of two ticks, | ||
3229 | * based on 128 point scale. | ||
3230 | * Example: | ||
3231 | * row 2, col 3 (=12) says that the degradation at load idx 2 after | ||
3232 | * 8 ticks is 12/128 (which is an approximation of exact factor 3^8/4^8). | ||
3233 | * | ||
3234 | * With these power-of-2 load factors, we can degrade the load n times | ||
3235 | * by looking at the 1 bits in n and doing as many mult/shifts instead of | ||
3236 | * n mult/shifts needed by the exact degradation. | ||
3237 | */ | ||
3238 | #define DEGRADE_SHIFT 7 | ||
3239 | static const unsigned char | ||
3240 | degrade_zero_ticks[CPU_LOAD_IDX_MAX] = {0, 8, 32, 64, 128}; | ||
3241 | static const unsigned char | ||
3242 | degrade_factor[CPU_LOAD_IDX_MAX][DEGRADE_SHIFT + 1] = { | ||
3243 | {0, 0, 0, 0, 0, 0, 0, 0}, | ||
3244 | {64, 32, 8, 0, 0, 0, 0, 0}, | ||
3245 | {96, 72, 40, 12, 1, 0, 0}, | ||
3246 | {112, 98, 75, 43, 15, 1, 0}, | ||
3247 | {120, 112, 98, 76, 45, 16, 2} }; | ||
3248 | |||
3249 | /* | ||
3250 | * Update cpu_load for any missed ticks due to tickless idle. The backlog | ||
3251 | * accumulates while the CPU is idle, so we just decay the old load without | ||
3252 | * adding any new load. | ||
3253 | */ | ||
3254 | static unsigned long | ||
3255 | decay_load_missed(unsigned long load, unsigned long missed_updates, int idx) | ||
3256 | { | ||
3257 | int j = 0; | ||
3258 | |||
3259 | if (!missed_updates) | ||
3260 | return load; | ||
3261 | |||
3262 | if (missed_updates >= degrade_zero_ticks[idx]) | ||
3263 | return 0; | ||
3264 | |||
3265 | if (idx == 1) | ||
3266 | return load >> missed_updates; | ||
3267 | |||
3268 | while (missed_updates) { | ||
3269 | if (missed_updates % 2) | ||
3270 | load = (load * degrade_factor[idx][j]) >> DEGRADE_SHIFT; | ||
3271 | |||
3272 | missed_updates >>= 1; | ||
3273 | j++; | ||
3274 | } | ||
3275 | return load; | ||
3276 | } | ||
3277 | |||
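A worked example of decay_load_missed(), using the degrade_factor[] values above: decay a load of 1024 at idx 2 after 5 missed ticks. 5 is 101 in binary, so only the j = 0 and j = 2 factors apply:

    1024 * 96 >> 7 = 768	/* j = 0, factor 96/128 */
     768 * 40 >> 7 = 240	/* j = 2, factor 40/128 */

The exact value would be 1024 * (3/4)^5, roughly 243, so two multiply/shift steps closely approximate five exact ones.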
3278 | /* | ||
3015 | * Update rq->cpu_load[] statistics. This function is usually called every | 3279 | * Update rq->cpu_load[] statistics. This function is usually called every |
3016 | * scheduler tick (TICK_NSEC). | 3280 | * scheduler tick (TICK_NSEC). With tickless idle this will not be called |
3281 | * every tick. We fix it up based on jiffies. | ||
3017 | */ | 3282 | */ |
3018 | static void update_cpu_load(struct rq *this_rq) | 3283 | static void update_cpu_load(struct rq *this_rq) |
3019 | { | 3284 | { |
3020 | unsigned long this_load = this_rq->load.weight; | 3285 | unsigned long this_load = this_rq->load.weight; |
3286 | unsigned long curr_jiffies = jiffies; | ||
3287 | unsigned long pending_updates; | ||
3021 | int i, scale; | 3288 | int i, scale; |
3022 | 3289 | ||
3023 | this_rq->nr_load_updates++; | 3290 | this_rq->nr_load_updates++; |
3024 | 3291 | ||
3292 | /* Avoid repeated calls on same jiffy, when moving in and out of idle */ | ||
3293 | if (curr_jiffies == this_rq->last_load_update_tick) | ||
3294 | return; | ||
3295 | |||
3296 | pending_updates = curr_jiffies - this_rq->last_load_update_tick; | ||
3297 | this_rq->last_load_update_tick = curr_jiffies; | ||
3298 | |||
3025 | /* Update our load: */ | 3299 | /* Update our load: */ |
3026 | for (i = 0, scale = 1; i < CPU_LOAD_IDX_MAX; i++, scale += scale) { | 3300 | this_rq->cpu_load[0] = this_load; /* Fasttrack for idx 0 */ |
3301 | for (i = 1, scale = 2; i < CPU_LOAD_IDX_MAX; i++, scale += scale) { | ||
3027 | unsigned long old_load, new_load; | 3302 | unsigned long old_load, new_load; |
3028 | 3303 | ||
3029 | /* scale is effectively 1 << i now, and >> i divides by scale */ | 3304 | /* scale is effectively 1 << i now, and >> i divides by scale */ |
3030 | 3305 | ||
3031 | old_load = this_rq->cpu_load[i]; | 3306 | old_load = this_rq->cpu_load[i]; |
3307 | old_load = decay_load_missed(old_load, pending_updates - 1, i); | ||
3032 | new_load = this_load; | 3308 | new_load = this_load; |
3033 | /* | 3309 | /* |
3034 | * Round up the averaging division if load is increasing. This | 3310 | * Round up the averaging division if load is increasing. This |
@@ -3036,10 +3312,18 @@ static void update_cpu_load(struct rq *this_rq) | |||
3036 | * example. | 3312 | * example. |
3037 | */ | 3313 | */ |
3038 | if (new_load > old_load) | 3314 | if (new_load > old_load) |
3039 | new_load += scale-1; | 3315 | new_load += scale - 1; |
3040 | this_rq->cpu_load[i] = (old_load*(scale-1) + new_load) >> i; | 3316 | |
3317 | this_rq->cpu_load[i] = (old_load * (scale - 1) + new_load) >> i; | ||
3041 | } | 3318 | } |
3042 | 3319 | ||
3320 | sched_avg_update(this_rq); | ||
3321 | } | ||
3322 | |||
3323 | static void update_cpu_load_active(struct rq *this_rq) | ||
3324 | { | ||
3325 | update_cpu_load(this_rq); | ||
3326 | |||
3043 | calc_load_account_active(this_rq); | 3327 | calc_load_account_active(this_rq); |
3044 | } | 3328 | } |
3045 | 3329 | ||
@@ -3094,7 +3378,7 @@ static u64 do_task_delta_exec(struct task_struct *p, struct rq *rq) | |||
3094 | 3378 | ||
3095 | if (task_current(rq, p)) { | 3379 | if (task_current(rq, p)) { |
3096 | update_rq_clock(rq); | 3380 | update_rq_clock(rq); |
3097 | ns = rq->clock - p->se.exec_start; | 3381 | ns = rq->clock_task - p->se.exec_start; |
3098 | if ((s64)ns < 0) | 3382 | if ((s64)ns < 0) |
3099 | ns = 0; | 3383 | ns = 0; |
3100 | } | 3384 | } |
@@ -3243,7 +3527,7 @@ void account_system_time(struct task_struct *p, int hardirq_offset, | |||
3243 | tmp = cputime_to_cputime64(cputime); | 3527 | tmp = cputime_to_cputime64(cputime); |
3244 | if (hardirq_count() - hardirq_offset) | 3528 | if (hardirq_count() - hardirq_offset) |
3245 | cpustat->irq = cputime64_add(cpustat->irq, tmp); | 3529 | cpustat->irq = cputime64_add(cpustat->irq, tmp); |
3246 | else if (softirq_count()) | 3530 | else if (in_serving_softirq()) |
3247 | cpustat->softirq = cputime64_add(cpustat->softirq, tmp); | 3531 | cpustat->softirq = cputime64_add(cpustat->softirq, tmp); |
3248 | else | 3532 | else |
3249 | cpustat->system = cputime64_add(cpustat->system, tmp); | 3533 | cpustat->system = cputime64_add(cpustat->system, tmp); |
@@ -3359,9 +3643,9 @@ void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st) | |||
3359 | rtime = nsecs_to_cputime(p->se.sum_exec_runtime); | 3643 | rtime = nsecs_to_cputime(p->se.sum_exec_runtime); |
3360 | 3644 | ||
3361 | if (total) { | 3645 | if (total) { |
3362 | u64 temp; | 3646 | u64 temp = rtime; |
3363 | 3647 | ||
3364 | temp = (u64)(rtime * utime); | 3648 | temp *= utime; |
3365 | do_div(temp, total); | 3649 | do_div(temp, total); |
3366 | utime = (cputime_t)temp; | 3650 | utime = (cputime_t)temp; |
3367 | } else | 3651 | } else |
@@ -3392,9 +3676,9 @@ void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st) | |||
3392 | rtime = nsecs_to_cputime(cputime.sum_exec_runtime); | 3676 | rtime = nsecs_to_cputime(cputime.sum_exec_runtime); |
3393 | 3677 | ||
3394 | if (total) { | 3678 | if (total) { |
3395 | u64 temp; | 3679 | u64 temp = rtime; |
3396 | 3680 | ||
3397 | temp = (u64)(rtime * cputime.utime); | 3681 | temp *= cputime.utime; |
3398 | do_div(temp, total); | 3682 | do_div(temp, total); |
3399 | utime = (cputime_t)temp; | 3683 | utime = (cputime_t)temp; |
3400 | } else | 3684 | } else |
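The two hunks above change how rtime is scaled by utime. A hedged sketch of the difference (assuming cputime_t is a 32-bit type on the affected configurations, which is what made the old form overflow):

    static void example_scale_overflow(void)
    {
    	cputime_t rtime = 3000000, utime = 2000000;	/* illustrative tick counts */
    	u64 bad, good;

    	bad = (u64)(rtime * utime);	/* product computed in 32 bits, truncated,
    					 * and only then widened to u64 */

    	good = rtime;			/* widen first ... */
    	good *= utime;			/* ... then multiply in 64 bits */
    }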
@@ -3426,11 +3710,11 @@ void scheduler_tick(void) | |||
3426 | 3710 | ||
3427 | raw_spin_lock(&rq->lock); | 3711 | raw_spin_lock(&rq->lock); |
3428 | update_rq_clock(rq); | 3712 | update_rq_clock(rq); |
3429 | update_cpu_load(rq); | 3713 | update_cpu_load_active(rq); |
3430 | curr->sched_class->task_tick(rq, curr, 0); | 3714 | curr->sched_class->task_tick(rq, curr, 0); |
3431 | raw_spin_unlock(&rq->lock); | 3715 | raw_spin_unlock(&rq->lock); |
3432 | 3716 | ||
3433 | perf_event_task_tick(curr); | 3717 | perf_event_task_tick(); |
3434 | 3718 | ||
3435 | #ifdef CONFIG_SMP | 3719 | #ifdef CONFIG_SMP |
3436 | rq->idle_at_tick = idle_cpu(cpu); | 3720 | rq->idle_at_tick = idle_cpu(cpu); |
@@ -3569,17 +3853,13 @@ pick_next_task(struct rq *rq) | |||
3569 | return p; | 3853 | return p; |
3570 | } | 3854 | } |
3571 | 3855 | ||
3572 | class = sched_class_highest; | 3856 | for_each_class(class) { |
3573 | for ( ; ; ) { | ||
3574 | p = class->pick_next_task(rq); | 3857 | p = class->pick_next_task(rq); |
3575 | if (p) | 3858 | if (p) |
3576 | return p; | 3859 | return p; |
3577 | /* | ||
3578 | * Will never be NULL as the idle class always | ||
3579 | * returns a non-NULL p: | ||
3580 | */ | ||
3581 | class = class->next; | ||
3582 | } | 3860 | } |
3861 | |||
3862 | BUG(); /* the idle class will always have a runnable task */ | ||
3583 | } | 3863 | } |
3584 | 3864 | ||
3585 | /* | 3865 | /* |
@@ -3598,7 +3878,6 @@ need_resched: | |||
3598 | rq = cpu_rq(cpu); | 3878 | rq = cpu_rq(cpu); |
3599 | rcu_note_context_switch(cpu); | 3879 | rcu_note_context_switch(cpu); |
3600 | prev = rq->curr; | 3880 | prev = rq->curr; |
3601 | switch_count = &prev->nivcsw; | ||
3602 | 3881 | ||
3603 | release_kernel_lock(prev); | 3882 | release_kernel_lock(prev); |
3604 | need_resched_nonpreemptible: | 3883 | need_resched_nonpreemptible: |
@@ -3611,11 +3890,26 @@ need_resched_nonpreemptible: | |||
3611 | raw_spin_lock_irq(&rq->lock); | 3890 | raw_spin_lock_irq(&rq->lock); |
3612 | clear_tsk_need_resched(prev); | 3891 | clear_tsk_need_resched(prev); |
3613 | 3892 | ||
3893 | switch_count = &prev->nivcsw; | ||
3614 | if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) { | 3894 | if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) { |
3615 | if (unlikely(signal_pending_state(prev->state, prev))) | 3895 | if (unlikely(signal_pending_state(prev->state, prev))) { |
3616 | prev->state = TASK_RUNNING; | 3896 | prev->state = TASK_RUNNING; |
3617 | else | 3897 | } else { |
3898 | /* | ||
3899 | * If a worker is going to sleep, notify and | ||
3900 | * ask workqueue whether it wants to wake up a | ||
3901 | * task to maintain concurrency. If so, wake | ||
3902 | * up the task. | ||
3903 | */ | ||
3904 | if (prev->flags & PF_WQ_WORKER) { | ||
3905 | struct task_struct *to_wakeup; | ||
3906 | |||
3907 | to_wakeup = wq_worker_sleeping(prev, cpu); | ||
3908 | if (to_wakeup) | ||
3909 | try_to_wake_up_local(to_wakeup); | ||
3910 | } | ||
3618 | deactivate_task(rq, prev, DEQUEUE_SLEEP); | 3911 | deactivate_task(rq, prev, DEQUEUE_SLEEP); |
3912 | } | ||
3619 | switch_count = &prev->nvcsw; | 3913 | switch_count = &prev->nvcsw; |
3620 | } | 3914 | } |
3621 | 3915 | ||
@@ -3637,8 +3931,10 @@ need_resched_nonpreemptible: | |||
3637 | 3931 | ||
3638 | context_switch(rq, prev, next); /* unlocks the rq */ | 3932 | context_switch(rq, prev, next); /* unlocks the rq */ |
3639 | /* | 3933 | /* |
3640 | * the context switch might have flipped the stack from under | 3934 | * The context switch have flipped the stack from under us |
3641 | * us, hence refresh the local variables. | 3935 | * and restored the local variables which were saved when |
3936 | * this task called schedule() in the past. prev == current | ||
3937 | * is still correct, but it can be moved to another cpu/rq. | ||
3642 | */ | 3938 | */ |
3643 | cpu = smp_processor_id(); | 3939 | cpu = smp_processor_id(); |
3644 | rq = cpu_rq(cpu); | 3940 | rq = cpu_rq(cpu); |
@@ -3647,11 +3943,8 @@ need_resched_nonpreemptible: | |||
3647 | 3943 | ||
3648 | post_schedule(rq); | 3944 | post_schedule(rq); |
3649 | 3945 | ||
3650 | if (unlikely(reacquire_kernel_lock(current) < 0)) { | 3946 | if (unlikely(reacquire_kernel_lock(prev))) |
3651 | prev = rq->curr; | ||
3652 | switch_count = &prev->nivcsw; | ||
3653 | goto need_resched_nonpreemptible; | 3947 | goto need_resched_nonpreemptible; |
3654 | } | ||
3655 | 3948 | ||
3656 | preempt_enable_no_resched(); | 3949 | preempt_enable_no_resched(); |
3657 | if (need_resched()) | 3950 | if (need_resched()) |
@@ -3704,8 +3997,16 @@ int mutex_spin_on_owner(struct mutex *lock, struct thread_info *owner) | |||
3704 | /* | 3997 | /* |
3705 | * Owner changed, break to re-assess state. | 3998 | * Owner changed, break to re-assess state. |
3706 | */ | 3999 | */ |
3707 | if (lock->owner != owner) | 4000 | if (lock->owner != owner) { |
4001 | /* | ||
4002 | * If the lock has switched to a different owner, | ||
4003 | * we likely have heavy contention. Return 0 to quit | ||
4004 | * optimistic spinning and not contend further: | ||
4005 | */ | ||
4006 | if (lock->owner) | ||
4007 | return 0; | ||
3708 | break; | 4008 | break; |
4009 | } | ||
3709 | 4010 | ||
3710 | /* | 4011 | /* |
3711 | * Is that owner really running on that cpu? | 4012 | * Is that owner really running on that cpu? |
@@ -3726,7 +4027,7 @@ int mutex_spin_on_owner(struct mutex *lock, struct thread_info *owner) | |||
3726 | * off of preempt_enable. Kernel preemptions off return from interrupt | 4027 | * off of preempt_enable. Kernel preemptions off return from interrupt |
3727 | * occur there and call schedule directly. | 4028 | * occur there and call schedule directly. |
3728 | */ | 4029 | */ |
3729 | asmlinkage void __sched preempt_schedule(void) | 4030 | asmlinkage void __sched notrace preempt_schedule(void) |
3730 | { | 4031 | { |
3731 | struct thread_info *ti = current_thread_info(); | 4032 | struct thread_info *ti = current_thread_info(); |
3732 | 4033 | ||
@@ -3738,9 +4039,9 @@ asmlinkage void __sched preempt_schedule(void) | |||
3738 | return; | 4039 | return; |
3739 | 4040 | ||
3740 | do { | 4041 | do { |
3741 | add_preempt_count(PREEMPT_ACTIVE); | 4042 | add_preempt_count_notrace(PREEMPT_ACTIVE); |
3742 | schedule(); | 4043 | schedule(); |
3743 | sub_preempt_count(PREEMPT_ACTIVE); | 4044 | sub_preempt_count_notrace(PREEMPT_ACTIVE); |
3744 | 4045 | ||
3745 | /* | 4046 | /* |
3746 | * Check again in case we missed a preemption opportunity | 4047 | * Check again in case we missed a preemption opportunity |
@@ -4183,6 +4484,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio) | |||
4183 | 4484 | ||
4184 | rq = task_rq_lock(p, &flags); | 4485 | rq = task_rq_lock(p, &flags); |
4185 | 4486 | ||
4487 | trace_sched_pi_setprio(p, prio); | ||
4186 | oldprio = p->prio; | 4488 | oldprio = p->prio; |
4187 | prev_class = p->sched_class; | 4489 | prev_class = p->sched_class; |
4188 | on_rq = p->se.on_rq; | 4490 | on_rq = p->se.on_rq; |
@@ -4441,12 +4743,8 @@ recheck: | |||
4441 | */ | 4743 | */ |
4442 | if (user && !capable(CAP_SYS_NICE)) { | 4744 | if (user && !capable(CAP_SYS_NICE)) { |
4443 | if (rt_policy(policy)) { | 4745 | if (rt_policy(policy)) { |
4444 | unsigned long rlim_rtprio; | 4746 | unsigned long rlim_rtprio = |
4445 | 4747 | task_rlimit(p, RLIMIT_RTPRIO); | |
4446 | if (!lock_task_sighand(p, &flags)) | ||
4447 | return -ESRCH; | ||
4448 | rlim_rtprio = task_rlimit(p, RLIMIT_RTPRIO); | ||
4449 | unlock_task_sighand(p, &flags); | ||
4450 | 4748 | ||
4451 | /* can't set/change the rt policy */ | 4749 | /* can't set/change the rt policy */ |
4452 | if (policy != p->policy && !rlim_rtprio) | 4750 | if (policy != p->policy && !rlim_rtprio) |
@@ -4474,7 +4772,7 @@ recheck: | |||
4474 | } | 4772 | } |
4475 | 4773 | ||
4476 | if (user) { | 4774 | if (user) { |
4477 | retval = security_task_setscheduler(p, policy, param); | 4775 | retval = security_task_setscheduler(p); |
4478 | if (retval) | 4776 | if (retval) |
4479 | return retval; | 4777 | return retval; |
4480 | } | 4778 | } |
@@ -4490,6 +4788,15 @@ recheck: | |||
4490 | */ | 4788 | */ |
4491 | rq = __task_rq_lock(p); | 4789 | rq = __task_rq_lock(p); |
4492 | 4790 | ||
4791 | /* | ||
4792 | * Changing the policy of the stop threads its a very bad idea | ||
4793 | */ | ||
4794 | if (p == rq->stop) { | ||
4795 | __task_rq_unlock(rq); | ||
4796 | raw_spin_unlock_irqrestore(&p->pi_lock, flags); | ||
4797 | return -EINVAL; | ||
4798 | } | ||
4799 | |||
4493 | #ifdef CONFIG_RT_GROUP_SCHED | 4800 | #ifdef CONFIG_RT_GROUP_SCHED |
4494 | if (user) { | 4801 | if (user) { |
4495 | /* | 4802 | /* |
@@ -4716,13 +5023,13 @@ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask) | |||
4716 | if (!check_same_owner(p) && !capable(CAP_SYS_NICE)) | 5023 | if (!check_same_owner(p) && !capable(CAP_SYS_NICE)) |
4717 | goto out_unlock; | 5024 | goto out_unlock; |
4718 | 5025 | ||
4719 | retval = security_task_setscheduler(p, 0, NULL); | 5026 | retval = security_task_setscheduler(p); |
4720 | if (retval) | 5027 | if (retval) |
4721 | goto out_unlock; | 5028 | goto out_unlock; |
4722 | 5029 | ||
4723 | cpuset_cpus_allowed(p, cpus_allowed); | 5030 | cpuset_cpus_allowed(p, cpus_allowed); |
4724 | cpumask_and(new_mask, in_mask, cpus_allowed); | 5031 | cpumask_and(new_mask, in_mask, cpus_allowed); |
4725 | again: | 5032 | again: |
4726 | retval = set_cpus_allowed_ptr(p, new_mask); | 5033 | retval = set_cpus_allowed_ptr(p, new_mask); |
4727 | 5034 | ||
4728 | if (!retval) { | 5035 | if (!retval) { |
@@ -5166,7 +5473,19 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu) | |||
5166 | idle->se.exec_start = sched_clock(); | 5473 | idle->se.exec_start = sched_clock(); |
5167 | 5474 | ||
5168 | cpumask_copy(&idle->cpus_allowed, cpumask_of(cpu)); | 5475 | cpumask_copy(&idle->cpus_allowed, cpumask_of(cpu)); |
5476 | /* | ||
5477 | * We have a chicken-and-egg problem: even though we are | ||
5478 | * holding rq->lock, the cpu isn't yet set to this cpu so the | ||
5479 | * lockdep check in task_group() will fail. | ||
5480 | * | ||
5481 | * Similar case to sched_fork(); alternatively we could | ||
5482 | * use task_rq_lock() here and obtain the other rq->lock. | ||
5483 | * | ||
5484 | * Silence PROVE_RCU | ||
5485 | */ | ||
5486 | rcu_read_lock(); | ||
5169 | __set_task_cpu(idle, cpu); | 5487 | __set_task_cpu(idle, cpu); |
5488 | rcu_read_unlock(); | ||
5170 | 5489 | ||
5171 | rq->curr = rq->idle = idle; | 5490 | rq->curr = rq->idle = idle; |
5172 | #if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW) | 5491 | #if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW) |
@@ -5816,20 +6135,49 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) | |||
5816 | */ | 6135 | */ |
5817 | static struct notifier_block __cpuinitdata migration_notifier = { | 6136 | static struct notifier_block __cpuinitdata migration_notifier = { |
5818 | .notifier_call = migration_call, | 6137 | .notifier_call = migration_call, |
5819 | .priority = 10 | 6138 | .priority = CPU_PRI_MIGRATION, |
5820 | }; | 6139 | }; |
5821 | 6140 | ||
6141 | static int __cpuinit sched_cpu_active(struct notifier_block *nfb, | ||
6142 | unsigned long action, void *hcpu) | ||
6143 | { | ||
6144 | switch (action & ~CPU_TASKS_FROZEN) { | ||
6145 | case CPU_ONLINE: | ||
6146 | case CPU_DOWN_FAILED: | ||
6147 | set_cpu_active((long)hcpu, true); | ||
6148 | return NOTIFY_OK; | ||
6149 | default: | ||
6150 | return NOTIFY_DONE; | ||
6151 | } | ||
6152 | } | ||
6153 | |||
6154 | static int __cpuinit sched_cpu_inactive(struct notifier_block *nfb, | ||
6155 | unsigned long action, void *hcpu) | ||
6156 | { | ||
6157 | switch (action & ~CPU_TASKS_FROZEN) { | ||
6158 | case CPU_DOWN_PREPARE: | ||
6159 | set_cpu_active((long)hcpu, false); | ||
6160 | return NOTIFY_OK; | ||
6161 | default: | ||
6162 | return NOTIFY_DONE; | ||
6163 | } | ||
6164 | } | ||
6165 | |||
5822 | static int __init migration_init(void) | 6166 | static int __init migration_init(void) |
5823 | { | 6167 | { |
5824 | void *cpu = (void *)(long)smp_processor_id(); | 6168 | void *cpu = (void *)(long)smp_processor_id(); |
5825 | int err; | 6169 | int err; |
5826 | 6170 | ||
5827 | /* Start one for the boot CPU: */ | 6171 | /* Initialize migration for the boot CPU */ |
5828 | err = migration_call(&migration_notifier, CPU_UP_PREPARE, cpu); | 6172 | err = migration_call(&migration_notifier, CPU_UP_PREPARE, cpu); |
5829 | BUG_ON(err == NOTIFY_BAD); | 6173 | BUG_ON(err == NOTIFY_BAD); |
5830 | migration_call(&migration_notifier, CPU_ONLINE, cpu); | 6174 | migration_call(&migration_notifier, CPU_ONLINE, cpu); |
5831 | register_cpu_notifier(&migration_notifier); | 6175 | register_cpu_notifier(&migration_notifier); |
5832 | 6176 | ||
6177 | /* Register cpu active notifiers */ | ||
6178 | cpu_notifier(sched_cpu_active, CPU_PRI_SCHED_ACTIVE); | ||
6179 | cpu_notifier(sched_cpu_inactive, CPU_PRI_SCHED_INACTIVE); | ||
6180 | |||
5833 | return 0; | 6181 | return 0; |
5834 | } | 6182 | } |
5835 | early_initcall(migration_init); | 6183 | early_initcall(migration_init); |
@@ -6064,23 +6412,18 @@ static void rq_attach_root(struct rq *rq, struct root_domain *rd) | |||
6064 | free_rootdomain(old_rd); | 6412 | free_rootdomain(old_rd); |
6065 | } | 6413 | } |
6066 | 6414 | ||
6067 | static int init_rootdomain(struct root_domain *rd, bool bootmem) | 6415 | static int init_rootdomain(struct root_domain *rd) |
6068 | { | 6416 | { |
6069 | gfp_t gfp = GFP_KERNEL; | ||
6070 | |||
6071 | memset(rd, 0, sizeof(*rd)); | 6417 | memset(rd, 0, sizeof(*rd)); |
6072 | 6418 | ||
6073 | if (bootmem) | 6419 | if (!alloc_cpumask_var(&rd->span, GFP_KERNEL)) |
6074 | gfp = GFP_NOWAIT; | ||
6075 | |||
6076 | if (!alloc_cpumask_var(&rd->span, gfp)) | ||
6077 | goto out; | 6420 | goto out; |
6078 | if (!alloc_cpumask_var(&rd->online, gfp)) | 6421 | if (!alloc_cpumask_var(&rd->online, GFP_KERNEL)) |
6079 | goto free_span; | 6422 | goto free_span; |
6080 | if (!alloc_cpumask_var(&rd->rto_mask, gfp)) | 6423 | if (!alloc_cpumask_var(&rd->rto_mask, GFP_KERNEL)) |
6081 | goto free_online; | 6424 | goto free_online; |
6082 | 6425 | ||
6083 | if (cpupri_init(&rd->cpupri, bootmem) != 0) | 6426 | if (cpupri_init(&rd->cpupri) != 0) |
6084 | goto free_rto_mask; | 6427 | goto free_rto_mask; |
6085 | return 0; | 6428 | return 0; |
6086 | 6429 | ||
@@ -6096,7 +6439,7 @@ out: | |||
6096 | 6439 | ||
6097 | static void init_defrootdomain(void) | 6440 | static void init_defrootdomain(void) |
6098 | { | 6441 | { |
6099 | init_rootdomain(&def_root_domain, true); | 6442 | init_rootdomain(&def_root_domain); |
6100 | 6443 | ||
6101 | atomic_set(&def_root_domain.refcount, 1); | 6444 | atomic_set(&def_root_domain.refcount, 1); |
6102 | } | 6445 | } |
@@ -6109,7 +6452,7 @@ static struct root_domain *alloc_rootdomain(void) | |||
6109 | if (!rd) | 6452 | if (!rd) |
6110 | return NULL; | 6453 | return NULL; |
6111 | 6454 | ||
6112 | if (init_rootdomain(rd, false) != 0) { | 6455 | if (init_rootdomain(rd) != 0) { |
6113 | kfree(rd); | 6456 | kfree(rd); |
6114 | return NULL; | 6457 | return NULL; |
6115 | } | 6458 | } |
@@ -6319,6 +6662,7 @@ struct s_data { | |||
6319 | cpumask_var_t nodemask; | 6662 | cpumask_var_t nodemask; |
6320 | cpumask_var_t this_sibling_map; | 6663 | cpumask_var_t this_sibling_map; |
6321 | cpumask_var_t this_core_map; | 6664 | cpumask_var_t this_core_map; |
6665 | cpumask_var_t this_book_map; | ||
6322 | cpumask_var_t send_covered; | 6666 | cpumask_var_t send_covered; |
6323 | cpumask_var_t tmpmask; | 6667 | cpumask_var_t tmpmask; |
6324 | struct sched_group **sched_group_nodes; | 6668 | struct sched_group **sched_group_nodes; |
@@ -6330,6 +6674,7 @@ enum s_alloc { | |||
6330 | sa_rootdomain, | 6674 | sa_rootdomain, |
6331 | sa_tmpmask, | 6675 | sa_tmpmask, |
6332 | sa_send_covered, | 6676 | sa_send_covered, |
6677 | sa_this_book_map, | ||
6333 | sa_this_core_map, | 6678 | sa_this_core_map, |
6334 | sa_this_sibling_map, | 6679 | sa_this_sibling_map, |
6335 | sa_nodemask, | 6680 | sa_nodemask, |
@@ -6365,31 +6710,48 @@ cpu_to_cpu_group(int cpu, const struct cpumask *cpu_map, | |||
6365 | #ifdef CONFIG_SCHED_MC | 6710 | #ifdef CONFIG_SCHED_MC |
6366 | static DEFINE_PER_CPU(struct static_sched_domain, core_domains); | 6711 | static DEFINE_PER_CPU(struct static_sched_domain, core_domains); |
6367 | static DEFINE_PER_CPU(struct static_sched_group, sched_group_core); | 6712 | static DEFINE_PER_CPU(struct static_sched_group, sched_group_core); |
6368 | #endif /* CONFIG_SCHED_MC */ | ||
6369 | 6713 | ||
6370 | #if defined(CONFIG_SCHED_MC) && defined(CONFIG_SCHED_SMT) | ||
6371 | static int | 6714 | static int |
6372 | cpu_to_core_group(int cpu, const struct cpumask *cpu_map, | 6715 | cpu_to_core_group(int cpu, const struct cpumask *cpu_map, |
6373 | struct sched_group **sg, struct cpumask *mask) | 6716 | struct sched_group **sg, struct cpumask *mask) |
6374 | { | 6717 | { |
6375 | int group; | 6718 | int group; |
6376 | 6719 | #ifdef CONFIG_SCHED_SMT | |
6377 | cpumask_and(mask, topology_thread_cpumask(cpu), cpu_map); | 6720 | cpumask_and(mask, topology_thread_cpumask(cpu), cpu_map); |
6378 | group = cpumask_first(mask); | 6721 | group = cpumask_first(mask); |
6722 | #else | ||
6723 | group = cpu; | ||
6724 | #endif | ||
6379 | if (sg) | 6725 | if (sg) |
6380 | *sg = &per_cpu(sched_group_core, group).sg; | 6726 | *sg = &per_cpu(sched_group_core, group).sg; |
6381 | return group; | 6727 | return group; |
6382 | } | 6728 | } |
6383 | #elif defined(CONFIG_SCHED_MC) | 6729 | #endif /* CONFIG_SCHED_MC */ |
6730 | |||
6731 | /* | ||
6732 | * book sched-domains: | ||
6733 | */ | ||
6734 | #ifdef CONFIG_SCHED_BOOK | ||
6735 | static DEFINE_PER_CPU(struct static_sched_domain, book_domains); | ||
6736 | static DEFINE_PER_CPU(struct static_sched_group, sched_group_book); | ||
6737 | |||
6384 | static int | 6738 | static int |
6385 | cpu_to_core_group(int cpu, const struct cpumask *cpu_map, | 6739 | cpu_to_book_group(int cpu, const struct cpumask *cpu_map, |
6386 | struct sched_group **sg, struct cpumask *unused) | 6740 | struct sched_group **sg, struct cpumask *mask) |
6387 | { | 6741 | { |
6742 | int group = cpu; | ||
6743 | #ifdef CONFIG_SCHED_MC | ||
6744 | cpumask_and(mask, cpu_coregroup_mask(cpu), cpu_map); | ||
6745 | group = cpumask_first(mask); | ||
6746 | #elif defined(CONFIG_SCHED_SMT) | ||
6747 | cpumask_and(mask, topology_thread_cpumask(cpu), cpu_map); | ||
6748 | group = cpumask_first(mask); | ||
6749 | #endif | ||
6388 | if (sg) | 6750 | if (sg) |
6389 | *sg = &per_cpu(sched_group_core, cpu).sg; | 6751 | *sg = &per_cpu(sched_group_book, group).sg; |
6390 | return cpu; | 6752 | return group; |
6391 | } | 6753 | } |
6392 | #endif | 6754 | #endif /* CONFIG_SCHED_BOOK */ |
6393 | 6755 | ||
6394 | static DEFINE_PER_CPU(struct static_sched_domain, phys_domains); | 6756 | static DEFINE_PER_CPU(struct static_sched_domain, phys_domains); |
6395 | static DEFINE_PER_CPU(struct static_sched_group, sched_group_phys); | 6757 | static DEFINE_PER_CPU(struct static_sched_group, sched_group_phys); |
@@ -6399,7 +6761,10 @@ cpu_to_phys_group(int cpu, const struct cpumask *cpu_map, | |||
6399 | struct sched_group **sg, struct cpumask *mask) | 6761 | struct sched_group **sg, struct cpumask *mask) |
6400 | { | 6762 | { |
6401 | int group; | 6763 | int group; |
6402 | #ifdef CONFIG_SCHED_MC | 6764 | #ifdef CONFIG_SCHED_BOOK |
6765 | cpumask_and(mask, cpu_book_mask(cpu), cpu_map); | ||
6766 | group = cpumask_first(mask); | ||
6767 | #elif defined(CONFIG_SCHED_MC) | ||
6403 | cpumask_and(mask, cpu_coregroup_mask(cpu), cpu_map); | 6768 | cpumask_and(mask, cpu_coregroup_mask(cpu), cpu_map); |
6404 | group = cpumask_first(mask); | 6769 | group = cpumask_first(mask); |
6405 | #elif defined(CONFIG_SCHED_SMT) | 6770 | #elif defined(CONFIG_SCHED_SMT) |
@@ -6660,6 +7025,9 @@ SD_INIT_FUNC(CPU) | |||
6660 | #ifdef CONFIG_SCHED_MC | 7025 | #ifdef CONFIG_SCHED_MC |
6661 | SD_INIT_FUNC(MC) | 7026 | SD_INIT_FUNC(MC) |
6662 | #endif | 7027 | #endif |
7028 | #ifdef CONFIG_SCHED_BOOK | ||
7029 | SD_INIT_FUNC(BOOK) | ||
7030 | #endif | ||
6663 | 7031 | ||
6664 | static int default_relax_domain_level = -1; | 7032 | static int default_relax_domain_level = -1; |
6665 | 7033 | ||
@@ -6709,6 +7077,8 @@ static void __free_domain_allocs(struct s_data *d, enum s_alloc what, | |||
6709 | free_cpumask_var(d->tmpmask); /* fall through */ | 7077 | free_cpumask_var(d->tmpmask); /* fall through */ |
6710 | case sa_send_covered: | 7078 | case sa_send_covered: |
6711 | free_cpumask_var(d->send_covered); /* fall through */ | 7079 | free_cpumask_var(d->send_covered); /* fall through */ |
7080 | case sa_this_book_map: | ||
7081 | free_cpumask_var(d->this_book_map); /* fall through */ | ||
6712 | case sa_this_core_map: | 7082 | case sa_this_core_map: |
6713 | free_cpumask_var(d->this_core_map); /* fall through */ | 7083 | free_cpumask_var(d->this_core_map); /* fall through */ |
6714 | case sa_this_sibling_map: | 7084 | case sa_this_sibling_map: |
@@ -6755,8 +7125,10 @@ static enum s_alloc __visit_domain_allocation_hell(struct s_data *d, | |||
6755 | return sa_nodemask; | 7125 | return sa_nodemask; |
6756 | if (!alloc_cpumask_var(&d->this_core_map, GFP_KERNEL)) | 7126 | if (!alloc_cpumask_var(&d->this_core_map, GFP_KERNEL)) |
6757 | return sa_this_sibling_map; | 7127 | return sa_this_sibling_map; |
6758 | if (!alloc_cpumask_var(&d->send_covered, GFP_KERNEL)) | 7128 | if (!alloc_cpumask_var(&d->this_book_map, GFP_KERNEL)) |
6759 | return sa_this_core_map; | 7129 | return sa_this_core_map; |
7130 | if (!alloc_cpumask_var(&d->send_covered, GFP_KERNEL)) | ||
7131 | return sa_this_book_map; | ||
6760 | if (!alloc_cpumask_var(&d->tmpmask, GFP_KERNEL)) | 7132 | if (!alloc_cpumask_var(&d->tmpmask, GFP_KERNEL)) |
6761 | return sa_send_covered; | 7133 | return sa_send_covered; |
6762 | d->rd = alloc_rootdomain(); | 7134 | d->rd = alloc_rootdomain(); |
@@ -6814,6 +7186,23 @@ static struct sched_domain *__build_cpu_sched_domain(struct s_data *d, | |||
6814 | return sd; | 7186 | return sd; |
6815 | } | 7187 | } |
6816 | 7188 | ||
7189 | static struct sched_domain *__build_book_sched_domain(struct s_data *d, | ||
7190 | const struct cpumask *cpu_map, struct sched_domain_attr *attr, | ||
7191 | struct sched_domain *parent, int i) | ||
7192 | { | ||
7193 | struct sched_domain *sd = parent; | ||
7194 | #ifdef CONFIG_SCHED_BOOK | ||
7195 | sd = &per_cpu(book_domains, i).sd; | ||
7196 | SD_INIT(sd, BOOK); | ||
7197 | set_domain_attribute(sd, attr); | ||
7198 | cpumask_and(sched_domain_span(sd), cpu_map, cpu_book_mask(i)); | ||
7199 | sd->parent = parent; | ||
7200 | parent->child = sd; | ||
7201 | cpu_to_book_group(i, cpu_map, &sd->groups, d->tmpmask); | ||
7202 | #endif | ||
7203 | return sd; | ||
7204 | } | ||
7205 | |||
6817 | static struct sched_domain *__build_mc_sched_domain(struct s_data *d, | 7206 | static struct sched_domain *__build_mc_sched_domain(struct s_data *d, |
6818 | const struct cpumask *cpu_map, struct sched_domain_attr *attr, | 7207 | const struct cpumask *cpu_map, struct sched_domain_attr *attr, |
6819 | struct sched_domain *parent, int i) | 7208 | struct sched_domain *parent, int i) |
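With CONFIG_SCHED_BOOK, __build_book_sched_domain() links the new level into the per-CPU domain chain between the physical (CPU) level and the multi-core (MC) level, as the updated call order in __build_sched_domains() further down shows. A small userspace sketch of that parent/child chaining (struct and helper names are illustrative only, not the kernel's types):

    #include <stdio.h>

    struct dom {
        const char *name;
        struct dom *parent;
        struct dom *child;
    };

    /* Attach a new, narrower level below 'parent', as each
     * __build_*_sched_domain() helper does for its level. */
    static struct dom *attach_level(struct dom *d, struct dom *parent)
    {
        d->parent = parent;
        if (parent)
            parent->child = d;
        return d;
    }

    int main(void)
    {
        struct dom node = { "NODE" }, cpu = { "CPU" }, book = { "BOOK" },
                   mc = { "MC" }, smt = { "SIBLING" };
        struct dom *sd = NULL;
        struct dom *d;

        /* Same order as __build_sched_domains(): widest to narrowest. */
        sd = attach_level(&node, sd);
        sd = attach_level(&cpu, sd);
        sd = attach_level(&book, sd);   /* new CONFIG_SCHED_BOOK level */
        sd = attach_level(&mc, sd);
        sd = attach_level(&smt, sd);

        /* Walk up from the narrowest level: SIBLING -> MC -> BOOK -> CPU -> NODE */
        for (d = sd; d; d = d->parent)
            printf("%s%s", d->name, d->parent ? " -> " : "\n");
        return 0;
    }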
@@ -6871,6 +7260,15 @@ static void build_sched_groups(struct s_data *d, enum sched_domain_level l, | |||
6871 | d->send_covered, d->tmpmask); | 7260 | d->send_covered, d->tmpmask); |
6872 | break; | 7261 | break; |
6873 | #endif | 7262 | #endif |
7263 | #ifdef CONFIG_SCHED_BOOK | ||
7264 | case SD_LV_BOOK: /* set up book groups */ | ||
7265 | cpumask_and(d->this_book_map, cpu_map, cpu_book_mask(cpu)); | ||
7266 | if (cpu == cpumask_first(d->this_book_map)) | ||
7267 | init_sched_build_groups(d->this_book_map, cpu_map, | ||
7268 | &cpu_to_book_group, | ||
7269 | d->send_covered, d->tmpmask); | ||
7270 | break; | ||
7271 | #endif | ||
6874 | case SD_LV_CPU: /* set up physical groups */ | 7272 | case SD_LV_CPU: /* set up physical groups */ |
6875 | cpumask_and(d->nodemask, cpumask_of_node(cpu), cpu_map); | 7273 | cpumask_and(d->nodemask, cpumask_of_node(cpu), cpu_map); |
6876 | if (!cpumask_empty(d->nodemask)) | 7274 | if (!cpumask_empty(d->nodemask)) |
@@ -6918,12 +7316,14 @@ static int __build_sched_domains(const struct cpumask *cpu_map, | |||
6918 | 7316 | ||
6919 | sd = __build_numa_sched_domains(&d, cpu_map, attr, i); | 7317 | sd = __build_numa_sched_domains(&d, cpu_map, attr, i); |
6920 | sd = __build_cpu_sched_domain(&d, cpu_map, attr, sd, i); | 7318 | sd = __build_cpu_sched_domain(&d, cpu_map, attr, sd, i); |
7319 | sd = __build_book_sched_domain(&d, cpu_map, attr, sd, i); | ||
6921 | sd = __build_mc_sched_domain(&d, cpu_map, attr, sd, i); | 7320 | sd = __build_mc_sched_domain(&d, cpu_map, attr, sd, i); |
6922 | sd = __build_smt_sched_domain(&d, cpu_map, attr, sd, i); | 7321 | sd = __build_smt_sched_domain(&d, cpu_map, attr, sd, i); |
6923 | } | 7322 | } |
6924 | 7323 | ||
6925 | for_each_cpu(i, cpu_map) { | 7324 | for_each_cpu(i, cpu_map) { |
6926 | build_sched_groups(&d, SD_LV_SIBLING, cpu_map, i); | 7325 | build_sched_groups(&d, SD_LV_SIBLING, cpu_map, i); |
7326 | build_sched_groups(&d, SD_LV_BOOK, cpu_map, i); | ||
6927 | build_sched_groups(&d, SD_LV_MC, cpu_map, i); | 7327 | build_sched_groups(&d, SD_LV_MC, cpu_map, i); |
6928 | } | 7328 | } |
6929 | 7329 | ||
@@ -6954,6 +7354,12 @@ static int __build_sched_domains(const struct cpumask *cpu_map, | |||
6954 | init_sched_groups_power(i, sd); | 7354 | init_sched_groups_power(i, sd); |
6955 | } | 7355 | } |
6956 | #endif | 7356 | #endif |
7357 | #ifdef CONFIG_SCHED_BOOK | ||
7358 | for_each_cpu(i, cpu_map) { | ||
7359 | sd = &per_cpu(book_domains, i).sd; | ||
7360 | init_sched_groups_power(i, sd); | ||
7361 | } | ||
7362 | #endif | ||
6957 | 7363 | ||
6958 | for_each_cpu(i, cpu_map) { | 7364 | for_each_cpu(i, cpu_map) { |
6959 | sd = &per_cpu(phys_domains, i).sd; | 7365 | sd = &per_cpu(phys_domains, i).sd; |
@@ -6979,6 +7385,8 @@ static int __build_sched_domains(const struct cpumask *cpu_map, | |||
6979 | sd = &per_cpu(cpu_domains, i).sd; | 7385 | sd = &per_cpu(cpu_domains, i).sd; |
6980 | #elif defined(CONFIG_SCHED_MC) | 7386 | #elif defined(CONFIG_SCHED_MC) |
6981 | sd = &per_cpu(core_domains, i).sd; | 7387 | sd = &per_cpu(core_domains, i).sd; |
7388 | #elif defined(CONFIG_SCHED_BOOK) | ||
7389 | sd = &per_cpu(book_domains, i).sd; | ||
6982 | #else | 7390 | #else |
6983 | sd = &per_cpu(phys_domains, i).sd; | 7391 | sd = &per_cpu(phys_domains, i).sd; |
6984 | #endif | 7392 | #endif |
@@ -7288,29 +7696,35 @@ int __init sched_create_sysfs_power_savings_entries(struct sysdev_class *cls) | |||
7288 | } | 7696 | } |
7289 | #endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */ | 7697 | #endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */ |
7290 | 7698 | ||
7291 | #ifndef CONFIG_CPUSETS | ||
7292 | /* | 7699 | /* |
7293 | * Add online and remove offline CPUs from the scheduler domains. | 7700 | * Update cpusets according to cpu_active mask. If cpusets are |
7294 | * When cpusets are enabled they take over this function. | 7701 | * disabled, cpuset_update_active_cpus() becomes a simple wrapper |
7702 | * around partition_sched_domains(). | ||
7295 | */ | 7703 | */ |
7296 | static int update_sched_domains(struct notifier_block *nfb, | 7704 | static int cpuset_cpu_active(struct notifier_block *nfb, unsigned long action, |
7297 | unsigned long action, void *hcpu) | 7705 | void *hcpu) |
7298 | { | 7706 | { |
7299 | switch (action) { | 7707 | switch (action & ~CPU_TASKS_FROZEN) { |
7300 | case CPU_ONLINE: | 7708 | case CPU_ONLINE: |
7301 | case CPU_ONLINE_FROZEN: | ||
7302 | case CPU_DOWN_PREPARE: | ||
7303 | case CPU_DOWN_PREPARE_FROZEN: | ||
7304 | case CPU_DOWN_FAILED: | 7709 | case CPU_DOWN_FAILED: |
7305 | case CPU_DOWN_FAILED_FROZEN: | 7710 | cpuset_update_active_cpus(); |
7306 | partition_sched_domains(1, NULL, NULL); | ||
7307 | return NOTIFY_OK; | 7711 | return NOTIFY_OK; |
7712 | default: | ||
7713 | return NOTIFY_DONE; | ||
7714 | } | ||
7715 | } | ||
7308 | 7716 | ||
7717 | static int cpuset_cpu_inactive(struct notifier_block *nfb, unsigned long action, | ||
7718 | void *hcpu) | ||
7719 | { | ||
7720 | switch (action & ~CPU_TASKS_FROZEN) { | ||
7721 | case CPU_DOWN_PREPARE: | ||
7722 | cpuset_update_active_cpus(); | ||
7723 | return NOTIFY_OK; | ||
7309 | default: | 7724 | default: |
7310 | return NOTIFY_DONE; | 7725 | return NOTIFY_DONE; |
7311 | } | 7726 | } |
7312 | } | 7727 | } |
7313 | #endif | ||
7314 | 7728 | ||
7315 | static int update_runtime(struct notifier_block *nfb, | 7729 | static int update_runtime(struct notifier_block *nfb, |
7316 | unsigned long action, void *hcpu) | 7730 | unsigned long action, void *hcpu) |
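Splitting the old update_sched_domains() into cpuset_cpu_active()/cpuset_cpu_inactive() also switches to masking CPU_TASKS_FROZEN out of the action, so a single case label covers both the normal and the suspend-time *_FROZEN variant of each event. A small userspace model of that masking (the numeric values are stand-ins; only the FROZEN-bit trick matters):

    #include <stdio.h>

    /* Stand-in values modeled loosely on the kernel's notifier actions. */
    #define CPU_TASKS_FROZEN  0x10
    #define CPU_ONLINE        0x02
    #define CPU_DOWN_FAILED   0x06
    #define CPU_ONLINE_FROZEN (CPU_ONLINE | CPU_TASKS_FROZEN)

    static const char *classify(unsigned long action)
    {
        switch (action & ~CPU_TASKS_FROZEN) {
        case CPU_ONLINE:
        case CPU_DOWN_FAILED:
            return "cpuset_update_active_cpus()";
        default:
            return "NOTIFY_DONE";
        }
    }

    int main(void)
    {
        /* Both the normal and the frozen variant hit the same case. */
        printf("ONLINE        -> %s\n", classify(CPU_ONLINE));
        printf("ONLINE_FROZEN -> %s\n", classify(CPU_ONLINE_FROZEN));
        return 0;
    }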
@@ -7356,10 +7770,8 @@ void __init sched_init_smp(void) | |||
7356 | mutex_unlock(&sched_domains_mutex); | 7770 | mutex_unlock(&sched_domains_mutex); |
7357 | put_online_cpus(); | 7771 | put_online_cpus(); |
7358 | 7772 | ||
7359 | #ifndef CONFIG_CPUSETS | 7773 | hotcpu_notifier(cpuset_cpu_active, CPU_PRI_CPUSET_ACTIVE); |
7360 | /* XXX: Theoretical race here - CPU may be hotplugged now */ | 7774 | hotcpu_notifier(cpuset_cpu_inactive, CPU_PRI_CPUSET_INACTIVE); |
7361 | hotcpu_notifier(update_sched_domains, 0); | ||
7362 | #endif | ||
7363 | 7775 | ||
7364 | /* RT runtime code needs to handle some hotplug events */ | 7776 | /* RT runtime code needs to handle some hotplug events */ |
7365 | hotcpu_notifier(update_runtime, 0); | 7777 | hotcpu_notifier(update_runtime, 0); |
@@ -7604,6 +8016,9 @@ void __init sched_init(void) | |||
7604 | 8016 | ||
7605 | for (j = 0; j < CPU_LOAD_IDX_MAX; j++) | 8017 | for (j = 0; j < CPU_LOAD_IDX_MAX; j++) |
7606 | rq->cpu_load[j] = 0; | 8018 | rq->cpu_load[j] = 0; |
8019 | |||
8020 | rq->last_load_update_tick = jiffies; | ||
8021 | |||
7607 | #ifdef CONFIG_SMP | 8022 | #ifdef CONFIG_SMP |
7608 | rq->sd = NULL; | 8023 | rq->sd = NULL; |
7609 | rq->rd = NULL; | 8024 | rq->rd = NULL; |
@@ -7617,6 +8032,10 @@ void __init sched_init(void) | |||
7617 | rq->idle_stamp = 0; | 8032 | rq->idle_stamp = 0; |
7618 | rq->avg_idle = 2*sysctl_sched_migration_cost; | 8033 | rq->avg_idle = 2*sysctl_sched_migration_cost; |
7619 | rq_attach_root(rq, &def_root_domain); | 8034 | rq_attach_root(rq, &def_root_domain); |
8035 | #ifdef CONFIG_NO_HZ | ||
8036 | rq->nohz_balance_kick = 0; | ||
8037 | init_sched_softirq_csd(&per_cpu(remote_sched_softirq_cb, i)); | ||
8038 | #endif | ||
7620 | #endif | 8039 | #endif |
7621 | init_rq_hrtick(rq); | 8040 | init_rq_hrtick(rq); |
7622 | atomic_set(&rq->nr_iowait, 0); | 8041 | atomic_set(&rq->nr_iowait, 0); |
@@ -7661,8 +8080,11 @@ void __init sched_init(void) | |||
7661 | zalloc_cpumask_var(&nohz_cpu_mask, GFP_NOWAIT); | 8080 | zalloc_cpumask_var(&nohz_cpu_mask, GFP_NOWAIT); |
7662 | #ifdef CONFIG_SMP | 8081 | #ifdef CONFIG_SMP |
7663 | #ifdef CONFIG_NO_HZ | 8082 | #ifdef CONFIG_NO_HZ |
7664 | zalloc_cpumask_var(&nohz.cpu_mask, GFP_NOWAIT); | 8083 | zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT); |
7665 | alloc_cpumask_var(&nohz.ilb_grp_nohz_mask, GFP_NOWAIT); | 8084 | alloc_cpumask_var(&nohz.grp_idle_mask, GFP_NOWAIT); |
8085 | atomic_set(&nohz.load_balancer, nr_cpu_ids); | ||
8086 | atomic_set(&nohz.first_pick_cpu, nr_cpu_ids); | ||
8087 | atomic_set(&nohz.second_pick_cpu, nr_cpu_ids); | ||
7666 | #endif | 8088 | #endif |
7667 | /* May be allocated at isolcpus cmdline parse time */ | 8089 | /* May be allocated at isolcpus cmdline parse time */ |
7668 | if (cpu_isolated_map == NULL) | 8090 | if (cpu_isolated_map == NULL) |
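The nohz bookkeeping initialised here uses nr_cpu_ids as the "nobody" value for load_balancer, first_pick_cpu and second_pick_cpu (the old code used -1). A CPU then claims a role with a compare-and-exchange against that sentinel, as select_nohz_load_balancer() does later in this diff. A minimal userspace model under that assumption (NR_CPU_IDS and the claim helper are illustrative, not kernel code):

    #include <stdatomic.h>
    #include <stdio.h>

    #define NR_CPU_IDS 8 /* stand-in for the kernel's nr_cpu_ids */

    /* "No owner" is encoded as NR_CPU_IDS rather than -1. */
    static atomic_int load_balancer = ATOMIC_VAR_INIT(NR_CPU_IDS);

    /* Succeeds only if no CPU currently owns idle load balancing. */
    static int try_claim_ilb(int cpu)
    {
        int expected = NR_CPU_IDS;

        return atomic_compare_exchange_strong(&load_balancer, &expected, cpu);
    }

    int main(void)
    {
        printf("cpu 2 claims: %d\n", try_claim_ilb(2)); /* 1: success */
        printf("cpu 5 claims: %d\n", try_claim_ilb(5)); /* 0: already owned */
        printf("owner: %d\n", atomic_load(&load_balancer));
        return 0;
    }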
@@ -7869,9 +8291,9 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) | |||
7869 | 8291 | ||
7870 | return 1; | 8292 | return 1; |
7871 | 8293 | ||
7872 | err_free_rq: | 8294 | err_free_rq: |
7873 | kfree(cfs_rq); | 8295 | kfree(cfs_rq); |
7874 | err: | 8296 | err: |
7875 | return 0; | 8297 | return 0; |
7876 | } | 8298 | } |
7877 | 8299 | ||
@@ -7959,9 +8381,9 @@ int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent) | |||
7959 | 8381 | ||
7960 | return 1; | 8382 | return 1; |
7961 | 8383 | ||
7962 | err_free_rq: | 8384 | err_free_rq: |
7963 | kfree(rt_rq); | 8385 | kfree(rt_rq); |
7964 | err: | 8386 | err: |
7965 | return 0; | 8387 | return 0; |
7966 | } | 8388 | } |
7967 | 8389 | ||
@@ -8319,7 +8741,7 @@ static int tg_set_bandwidth(struct task_group *tg, | |||
8319 | raw_spin_unlock(&rt_rq->rt_runtime_lock); | 8741 | raw_spin_unlock(&rt_rq->rt_runtime_lock); |
8320 | } | 8742 | } |
8321 | raw_spin_unlock_irq(&tg->rt_bandwidth.rt_runtime_lock); | 8743 | raw_spin_unlock_irq(&tg->rt_bandwidth.rt_runtime_lock); |
8322 | unlock: | 8744 | unlock: |
8323 | read_unlock(&tasklist_lock); | 8745 | read_unlock(&tasklist_lock); |
8324 | mutex_unlock(&rt_constraints_mutex); | 8746 | mutex_unlock(&rt_constraints_mutex); |
8325 | 8747 | ||
diff --git a/kernel/sched_clock.c b/kernel/sched_clock.c index 906a0f718cb3..52f1a149bfb1 100644 --- a/kernel/sched_clock.c +++ b/kernel/sched_clock.c | |||
@@ -10,19 +10,55 @@ | |||
10 | * Ingo Molnar <mingo@redhat.com> | 10 | * Ingo Molnar <mingo@redhat.com> |
11 | * Guillaume Chazarain <guichaz@gmail.com> | 11 | * Guillaume Chazarain <guichaz@gmail.com> |
12 | * | 12 | * |
13 | * Create a semi stable clock from a mixture of other events, including: | 13 | * |
14 | * - gtod | 14 | * What: |
15 | * | ||
16 | * cpu_clock(i) provides a fast (execution time) high resolution | ||
17 | * clock with bounded drift between CPUs. The value of cpu_clock(i) | ||
18 | * is monotonic for constant i. The timestamp returned is in nanoseconds. | ||
19 | * | ||
20 | * ######################### BIG FAT WARNING ########################## | ||
21 | * # when comparing cpu_clock(i) to cpu_clock(j) for i != j, time can # | ||
22 | * # go backwards !! # | ||
23 | * #################################################################### | ||
24 | * | ||
25 | * There is no strict promise about the base, although it tends to start | ||
26 | * at 0 on boot (but people really shouldn't rely on that). | ||
27 | * | ||
28 | * cpu_clock(i) -- can be used from any context, including NMI. | ||
29 | * sched_clock_cpu(i) -- must be used with local IRQs disabled (implied by NMI) | ||
30 | * local_clock() -- is cpu_clock() on the current cpu. | ||
31 | * | ||
32 | * How: | ||
33 | * | ||
34 | * The implementation either uses sched_clock() when | ||
35 | * !CONFIG_HAVE_UNSTABLE_SCHED_CLOCK, which means in that case the | ||
36 | * sched_clock() is assumed to provide these properties (mostly it means | ||
37 | * the architecture provides a globally synchronized highres time source). | ||
38 | * | ||
39 | * Otherwise it tries to create a semi stable clock from a mixture of other | ||
40 | * clocks, including: | ||
41 | * | ||
42 | * - GTOD (clock monotonic) | ||
15 | * - sched_clock() | 43 | * - sched_clock() |
16 | * - explicit idle events | 44 | * - explicit idle events |
17 | * | 45 | * |
18 | * We use gtod as base and the unstable clock deltas. The deltas are filtered, | 46 | * We use GTOD as base and use sched_clock() deltas to improve resolution. The |
19 | making it monotonic and keeping it within an expected window. | 47 | * deltas are filtered to provide monotonicity and keep it within an |
48 | * expected window. | ||
20 | * | 49 | * |
21 | * Furthermore, explicit sleep and wakeup hooks allow us to account for time | 50 | * Furthermore, explicit sleep and wakeup hooks allow us to account for time |
22 | * that is otherwise invisible (TSC gets stopped). | 51 | * that is otherwise invisible (TSC gets stopped). |
23 | * | 52 | * |
24 | * The clock: sched_clock_cpu() is monotonic per cpu, and should be somewhat | 53 | * |
25 | * consistent between cpus (never more than 2 jiffies difference). | 54 | * Notes: |
55 | * | ||
56 | * The !IRQ-safety of sched_clock() and sched_clock_cpu() comes from things | ||
57 | * like cpufreq interrupts that can change the base clock (TSC) multiplier | ||
58 | * and cause funny jumps in time -- although the filtering provided by | ||
59 | * sched_clock_cpu() should mitigate serious artifacts, we cannot rely on it | ||
60 | * in general since for !CONFIG_HAVE_UNSTABLE_SCHED_CLOCK we fully rely on | ||
61 | * sched_clock(). | ||
26 | */ | 62 | */ |
27 | #include <linux/spinlock.h> | 63 | #include <linux/spinlock.h> |
28 | #include <linux/hardirq.h> | 64 | #include <linux/hardirq.h> |
@@ -170,6 +206,11 @@ again: | |||
170 | return val; | 206 | return val; |
171 | } | 207 | } |
172 | 208 | ||
209 | /* | ||
210 | * Similar to cpu_clock(), but requires local IRQs to be disabled. | ||
211 | * | ||
212 | * See cpu_clock(). | ||
213 | */ | ||
173 | u64 sched_clock_cpu(int cpu) | 214 | u64 sched_clock_cpu(int cpu) |
174 | { | 215 | { |
175 | struct sched_clock_data *scd; | 216 | struct sched_clock_data *scd; |
@@ -237,9 +278,19 @@ void sched_clock_idle_wakeup_event(u64 delta_ns) | |||
237 | } | 278 | } |
238 | EXPORT_SYMBOL_GPL(sched_clock_idle_wakeup_event); | 279 | EXPORT_SYMBOL_GPL(sched_clock_idle_wakeup_event); |
239 | 280 | ||
240 | unsigned long long cpu_clock(int cpu) | 281 | /* |
282 | * As outlined at the top, provides a fast, high resolution, nanosecond | ||
283 | * time source that is monotonic per cpu argument and has bounded drift | ||
284 | * between cpus. | ||
285 | * | ||
286 | * ######################### BIG FAT WARNING ########################## | ||
287 | * # when comparing cpu_clock(i) to cpu_clock(j) for i != j, time can # | ||
288 | * # go backwards !! # | ||
289 | * #################################################################### | ||
290 | */ | ||
291 | u64 cpu_clock(int cpu) | ||
241 | { | 292 | { |
242 | unsigned long long clock; | 293 | u64 clock; |
243 | unsigned long flags; | 294 | unsigned long flags; |
244 | 295 | ||
245 | local_irq_save(flags); | 296 | local_irq_save(flags); |
@@ -249,6 +300,25 @@ unsigned long long cpu_clock(int cpu) | |||
249 | return clock; | 300 | return clock; |
250 | } | 301 | } |
251 | 302 | ||
303 | /* | ||
304 | * Similar to cpu_clock() for the current cpu. Time will only be observed | ||
305 | * to be monotonic if care is taken to only compare timestamps taken on the | ||
306 | * same CPU. | ||
307 | * | ||
308 | * See cpu_clock(). | ||
309 | */ | ||
310 | u64 local_clock(void) | ||
311 | { | ||
312 | u64 clock; | ||
313 | unsigned long flags; | ||
314 | |||
315 | local_irq_save(flags); | ||
316 | clock = sched_clock_cpu(smp_processor_id()); | ||
317 | local_irq_restore(flags); | ||
318 | |||
319 | return clock; | ||
320 | } | ||
321 | |||
252 | #else /* CONFIG_HAVE_UNSTABLE_SCHED_CLOCK */ | 322 | #else /* CONFIG_HAVE_UNSTABLE_SCHED_CLOCK */ |
253 | 323 | ||
254 | void sched_clock_init(void) | 324 | void sched_clock_init(void) |
@@ -264,12 +334,17 @@ u64 sched_clock_cpu(int cpu) | |||
264 | return sched_clock(); | 334 | return sched_clock(); |
265 | } | 335 | } |
266 | 336 | ||
267 | 337 | u64 cpu_clock(int cpu) | |
268 | unsigned long long cpu_clock(int cpu) | ||
269 | { | 338 | { |
270 | return sched_clock_cpu(cpu); | 339 | return sched_clock_cpu(cpu); |
271 | } | 340 | } |
272 | 341 | ||
342 | u64 local_clock(void) | ||
343 | { | ||
344 | return sched_clock_cpu(0); | ||
345 | } | ||
346 | |||
273 | #endif /* CONFIG_HAVE_UNSTABLE_SCHED_CLOCK */ | 347 | #endif /* CONFIG_HAVE_UNSTABLE_SCHED_CLOCK */ |
274 | 348 | ||
275 | EXPORT_SYMBOL_GPL(cpu_clock); | 349 | EXPORT_SYMBOL_GPL(cpu_clock); |
350 | EXPORT_SYMBOL_GPL(local_clock); | ||
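A brief kernel-context sketch of how the documented API is meant to be used (not a standalone program; do_some_work() is a hypothetical placeholder):

    #include <linux/sched.h>  /* cpu_clock(), local_clock() */

    extern void do_some_work(void);  /* hypothetical workload */

    static u64 time_some_work(void)
    {
        u64 t0, t1;

        /*
         * Both timestamps come from local_clock(); per the warning above
         * they are only safely comparable if taken on the same CPU, so
         * assume the caller prevents migration here (e.g. preemption
         * disabled).
         */
        t0 = local_clock();
        do_some_work();
        t1 = local_clock();

        return t1 - t0;  /* duration in nanoseconds */
    }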
diff --git a/kernel/sched_cpupri.c b/kernel/sched_cpupri.c index e6871cb3fc83..2722dc1b4138 100644 --- a/kernel/sched_cpupri.c +++ b/kernel/sched_cpupri.c | |||
@@ -166,14 +166,10 @@ void cpupri_set(struct cpupri *cp, int cpu, int newpri) | |||
166 | * | 166 | * |
167 | * Returns: -ENOMEM if memory fails. | 167 | * Returns: -ENOMEM if memory fails. |
168 | */ | 168 | */ |
169 | int cpupri_init(struct cpupri *cp, bool bootmem) | 169 | int cpupri_init(struct cpupri *cp) |
170 | { | 170 | { |
171 | gfp_t gfp = GFP_KERNEL; | ||
172 | int i; | 171 | int i; |
173 | 172 | ||
174 | if (bootmem) | ||
175 | gfp = GFP_NOWAIT; | ||
176 | |||
177 | memset(cp, 0, sizeof(*cp)); | 173 | memset(cp, 0, sizeof(*cp)); |
178 | 174 | ||
179 | for (i = 0; i < CPUPRI_NR_PRIORITIES; i++) { | 175 | for (i = 0; i < CPUPRI_NR_PRIORITIES; i++) { |
@@ -181,7 +177,7 @@ int cpupri_init(struct cpupri *cp, bool bootmem) | |||
181 | 177 | ||
182 | raw_spin_lock_init(&vec->lock); | 178 | raw_spin_lock_init(&vec->lock); |
183 | vec->count = 0; | 179 | vec->count = 0; |
184 | if (!zalloc_cpumask_var(&vec->mask, gfp)) | 180 | if (!zalloc_cpumask_var(&vec->mask, GFP_KERNEL)) |
185 | goto cleanup; | 181 | goto cleanup; |
186 | } | 182 | } |
187 | 183 | ||
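With the bootmem flag gone, cpupri_init() always allocates its per-priority cpumasks with GFP_KERNEL and keeps the -ENOMEM contract, so a caller only has to propagate the error. An illustrative kernel-context caller (the real call site is the root-domain initialisation in sched.c):

    static int example_rootdomain_init(struct cpupri *cp)
    {
        int err;

        err = cpupri_init(cp); /* no bootmem argument any more */
        if (err)
            return err;    /* -ENOMEM; cpupri_init() already cleaned up */

        return 0;
    }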
diff --git a/kernel/sched_cpupri.h b/kernel/sched_cpupri.h index 7cb5bb6b95be..9fc7d386fea4 100644 --- a/kernel/sched_cpupri.h +++ b/kernel/sched_cpupri.h | |||
@@ -27,7 +27,7 @@ struct cpupri { | |||
27 | int cpupri_find(struct cpupri *cp, | 27 | int cpupri_find(struct cpupri *cp, |
28 | struct task_struct *p, struct cpumask *lowest_mask); | 28 | struct task_struct *p, struct cpumask *lowest_mask); |
29 | void cpupri_set(struct cpupri *cp, int cpu, int pri); | 29 | void cpupri_set(struct cpupri *cp, int cpu, int pri); |
30 | int cpupri_init(struct cpupri *cp, bool bootmem); | 30 | int cpupri_init(struct cpupri *cp); |
31 | void cpupri_cleanup(struct cpupri *cp); | 31 | void cpupri_cleanup(struct cpupri *cp); |
32 | #else | 32 | #else |
33 | #define cpupri_set(cp, cpu, pri) do { } while (0) | 33 | #define cpupri_set(cp, cpu, pri) do { } while (0) |
diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c index 35565395d00d..2e1b0d17dd9b 100644 --- a/kernel/sched_debug.c +++ b/kernel/sched_debug.c | |||
@@ -332,7 +332,7 @@ static int sched_debug_show(struct seq_file *m, void *v) | |||
332 | PN(sysctl_sched_latency); | 332 | PN(sysctl_sched_latency); |
333 | PN(sysctl_sched_min_granularity); | 333 | PN(sysctl_sched_min_granularity); |
334 | PN(sysctl_sched_wakeup_granularity); | 334 | PN(sysctl_sched_wakeup_granularity); |
335 | PN(sysctl_sched_child_runs_first); | 335 | P(sysctl_sched_child_runs_first); |
336 | P(sysctl_sched_features); | 336 | P(sysctl_sched_features); |
337 | #undef PN | 337 | #undef PN |
338 | #undef P | 338 | #undef P |
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index a878b5332daa..933f3d1b62ea 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c | |||
@@ -25,7 +25,7 @@ | |||
25 | 25 | ||
26 | /* | 26 | /* |
27 | * Targeted preemption latency for CPU-bound tasks: | 27 | * Targeted preemption latency for CPU-bound tasks: |
28 | * (default: 5ms * (1 + ilog(ncpus)), units: nanoseconds) | 28 | * (default: 6ms * (1 + ilog(ncpus)), units: nanoseconds) |
29 | * | 29 | * |
30 | * NOTE: this latency value is not the same as the concept of | 30 | * NOTE: this latency value is not the same as the concept of |
31 | * 'timeslice length' - timeslices in CFS are of variable length | 31 | * 'timeslice length' - timeslices in CFS are of variable length |
@@ -52,15 +52,15 @@ enum sched_tunable_scaling sysctl_sched_tunable_scaling | |||
52 | 52 | ||
53 | /* | 53 | /* |
54 | * Minimal preemption granularity for CPU-bound tasks: | 54 | * Minimal preemption granularity for CPU-bound tasks: |
55 | * (default: 2 msec * (1 + ilog(ncpus)), units: nanoseconds) | 55 | * (default: 0.75 msec * (1 + ilog(ncpus)), units: nanoseconds) |
56 | */ | 56 | */ |
57 | unsigned int sysctl_sched_min_granularity = 2000000ULL; | 57 | unsigned int sysctl_sched_min_granularity = 750000ULL; |
58 | unsigned int normalized_sysctl_sched_min_granularity = 2000000ULL; | 58 | unsigned int normalized_sysctl_sched_min_granularity = 750000ULL; |
59 | 59 | ||
60 | /* | 60 | /* |
61 | * is kept at sysctl_sched_latency / sysctl_sched_min_granularity | 61 | * is kept at sysctl_sched_latency / sysctl_sched_min_granularity |
62 | */ | 62 | */ |
63 | static unsigned int sched_nr_latency = 3; | 63 | static unsigned int sched_nr_latency = 8; |
64 | 64 | ||
65 | /* | 65 | /* |
66 | * After fork, child runs first. If set to 0 (default) then | 66 | * After fork, child runs first. If set to 0 (default) then |
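The retuned defaults keep the documented invariant between these values: sched_nr_latency = sysctl_sched_latency / sysctl_sched_min_granularity. A quick check with the new base values (before the ilog(ncpus) scaling): 6,000,000 ns / 750,000 ns = 8, matching the new sched_nr_latency default of 8, just as the old 5 ms / 2 ms roughly matched the previous value of 3.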
@@ -519,7 +519,7 @@ __update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr, | |||
519 | static void update_curr(struct cfs_rq *cfs_rq) | 519 | static void update_curr(struct cfs_rq *cfs_rq) |
520 | { | 520 | { |
521 | struct sched_entity *curr = cfs_rq->curr; | 521 | struct sched_entity *curr = cfs_rq->curr; |
522 | u64 now = rq_of(cfs_rq)->clock; | 522 | u64 now = rq_of(cfs_rq)->clock_task; |
523 | unsigned long delta_exec; | 523 | unsigned long delta_exec; |
524 | 524 | ||
525 | if (unlikely(!curr)) | 525 | if (unlikely(!curr)) |
@@ -602,7 +602,7 @@ update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se) | |||
602 | /* | 602 | /* |
603 | * We are starting a new run period: | 603 | * We are starting a new run period: |
604 | */ | 604 | */ |
605 | se->exec_start = rq_of(cfs_rq)->clock; | 605 | se->exec_start = rq_of(cfs_rq)->clock_task; |
606 | } | 606 | } |
607 | 607 | ||
608 | /************************************************** | 608 | /************************************************** |
@@ -1313,7 +1313,7 @@ static struct sched_group * | |||
1313 | find_idlest_group(struct sched_domain *sd, struct task_struct *p, | 1313 | find_idlest_group(struct sched_domain *sd, struct task_struct *p, |
1314 | int this_cpu, int load_idx) | 1314 | int this_cpu, int load_idx) |
1315 | { | 1315 | { |
1316 | struct sched_group *idlest = NULL, *this = NULL, *group = sd->groups; | 1316 | struct sched_group *idlest = NULL, *group = sd->groups; |
1317 | unsigned long min_load = ULONG_MAX, this_load = 0; | 1317 | unsigned long min_load = ULONG_MAX, this_load = 0; |
1318 | int imbalance = 100 + (sd->imbalance_pct-100)/2; | 1318 | int imbalance = 100 + (sd->imbalance_pct-100)/2; |
1319 | 1319 | ||
@@ -1348,7 +1348,6 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, | |||
1348 | 1348 | ||
1349 | if (local_group) { | 1349 | if (local_group) { |
1350 | this_load = avg_load; | 1350 | this_load = avg_load; |
1351 | this = group; | ||
1352 | } else if (avg_load < min_load) { | 1351 | } else if (avg_load < min_load) { |
1353 | min_load = avg_load; | 1352 | min_load = avg_load; |
1354 | idlest = group; | 1353 | idlest = group; |
@@ -1765,6 +1764,10 @@ static void pull_task(struct rq *src_rq, struct task_struct *p, | |||
1765 | set_task_cpu(p, this_cpu); | 1764 | set_task_cpu(p, this_cpu); |
1766 | activate_task(this_rq, p, 0); | 1765 | activate_task(this_rq, p, 0); |
1767 | check_preempt_curr(this_rq, p, 0); | 1766 | check_preempt_curr(this_rq, p, 0); |
1767 | |||
1768 | /* re-arm NEWIDLE balancing when moving tasks */ | ||
1769 | src_rq->avg_idle = this_rq->avg_idle = 2*sysctl_sched_migration_cost; | ||
1770 | this_rq->idle_stamp = 0; | ||
1768 | } | 1771 | } |
1769 | 1772 | ||
1770 | /* | 1773 | /* |
@@ -1799,7 +1802,7 @@ int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu, | |||
1799 | * 2) too many balance attempts have failed. | 1802 | * 2) too many balance attempts have failed. |
1800 | */ | 1803 | */ |
1801 | 1804 | ||
1802 | tsk_cache_hot = task_hot(p, rq->clock, sd); | 1805 | tsk_cache_hot = task_hot(p, rq->clock_task, sd); |
1803 | if (!tsk_cache_hot || | 1806 | if (!tsk_cache_hot || |
1804 | sd->nr_balance_failed > sd->cache_nice_tries) { | 1807 | sd->nr_balance_failed > sd->cache_nice_tries) { |
1805 | #ifdef CONFIG_SCHEDSTATS | 1808 | #ifdef CONFIG_SCHEDSTATS |
@@ -2031,12 +2034,14 @@ struct sd_lb_stats { | |||
2031 | unsigned long this_load; | 2034 | unsigned long this_load; |
2032 | unsigned long this_load_per_task; | 2035 | unsigned long this_load_per_task; |
2033 | unsigned long this_nr_running; | 2036 | unsigned long this_nr_running; |
2037 | unsigned long this_has_capacity; | ||
2034 | 2038 | ||
2035 | /* Statistics of the busiest group */ | 2039 | /* Statistics of the busiest group */ |
2036 | unsigned long max_load; | 2040 | unsigned long max_load; |
2037 | unsigned long busiest_load_per_task; | 2041 | unsigned long busiest_load_per_task; |
2038 | unsigned long busiest_nr_running; | 2042 | unsigned long busiest_nr_running; |
2039 | unsigned long busiest_group_capacity; | 2043 | unsigned long busiest_group_capacity; |
2044 | unsigned long busiest_has_capacity; | ||
2040 | 2045 | ||
2041 | int group_imb; /* Is there imbalance in this sd */ | 2046 | int group_imb; /* Is there imbalance in this sd */ |
2042 | #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) | 2047 | #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) |
@@ -2059,6 +2064,7 @@ struct sg_lb_stats { | |||
2059 | unsigned long sum_weighted_load; /* Weighted load of group's tasks */ | 2064 | unsigned long sum_weighted_load; /* Weighted load of group's tasks */ |
2060 | unsigned long group_capacity; | 2065 | unsigned long group_capacity; |
2061 | int group_imb; /* Is there an imbalance in the group ? */ | 2066 | int group_imb; /* Is there an imbalance in the group ? */ |
2067 | int group_has_capacity; /* Is there extra capacity in the group? */ | ||
2062 | }; | 2068 | }; |
2063 | 2069 | ||
2064 | /** | 2070 | /** |
@@ -2268,10 +2274,14 @@ unsigned long scale_rt_power(int cpu) | |||
2268 | struct rq *rq = cpu_rq(cpu); | 2274 | struct rq *rq = cpu_rq(cpu); |
2269 | u64 total, available; | 2275 | u64 total, available; |
2270 | 2276 | ||
2271 | sched_avg_update(rq); | ||
2272 | |||
2273 | total = sched_avg_period() + (rq->clock - rq->age_stamp); | 2277 | total = sched_avg_period() + (rq->clock - rq->age_stamp); |
2274 | available = total - rq->rt_avg; | 2278 | |
2279 | if (unlikely(total < rq->rt_avg)) { | ||
2280 | /* Ensures that power won't end up being negative */ | ||
2281 | available = 0; | ||
2282 | } else { | ||
2283 | available = total - rq->rt_avg; | ||
2284 | } | ||
2275 | 2285 | ||
2276 | if (unlikely((s64)total < SCHED_LOAD_SCALE)) | 2286 | if (unlikely((s64)total < SCHED_LOAD_SCALE)) |
2277 | total = SCHED_LOAD_SCALE; | 2287 | total = SCHED_LOAD_SCALE; |
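The new branch in scale_rt_power() guards against total being smaller than rq->rt_avg: both are u64, so the old unchecked subtraction would not go negative but wrap around to a huge value and inflate the computed power. A tiny standalone demonstration of the wrap that the check avoids (illustrative values only):

    #include <stdio.h>
    #include <stdint.h>
    #include <inttypes.h>

    int main(void)
    {
        uint64_t total = 100, rt_avg = 150;

        /* Unchecked u64 subtraction wraps instead of going negative,
         * which is what the added 'total < rt_avg' check prevents. */
        uint64_t wrapped = total - rt_avg;
        uint64_t clamped = (total < rt_avg) ? 0 : total - rt_avg;

        printf("wrapped: %" PRIu64 "\n", wrapped); /* huge value */
        printf("clamped: %" PRIu64 "\n", clamped); /* 0 */
        return 0;
    }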
@@ -2287,13 +2297,6 @@ static void update_cpu_power(struct sched_domain *sd, int cpu) | |||
2287 | unsigned long power = SCHED_LOAD_SCALE; | 2297 | unsigned long power = SCHED_LOAD_SCALE; |
2288 | struct sched_group *sdg = sd->groups; | 2298 | struct sched_group *sdg = sd->groups; |
2289 | 2299 | ||
2290 | if (sched_feat(ARCH_POWER)) | ||
2291 | power *= arch_scale_freq_power(sd, cpu); | ||
2292 | else | ||
2293 | power *= default_scale_freq_power(sd, cpu); | ||
2294 | |||
2295 | power >>= SCHED_LOAD_SHIFT; | ||
2296 | |||
2297 | if ((sd->flags & SD_SHARE_CPUPOWER) && weight > 1) { | 2300 | if ((sd->flags & SD_SHARE_CPUPOWER) && weight > 1) { |
2298 | if (sched_feat(ARCH_POWER)) | 2301 | if (sched_feat(ARCH_POWER)) |
2299 | power *= arch_scale_smt_power(sd, cpu); | 2302 | power *= arch_scale_smt_power(sd, cpu); |
@@ -2303,6 +2306,15 @@ static void update_cpu_power(struct sched_domain *sd, int cpu) | |||
2303 | power >>= SCHED_LOAD_SHIFT; | 2306 | power >>= SCHED_LOAD_SHIFT; |
2304 | } | 2307 | } |
2305 | 2308 | ||
2309 | sdg->cpu_power_orig = power; | ||
2310 | |||
2311 | if (sched_feat(ARCH_POWER)) | ||
2312 | power *= arch_scale_freq_power(sd, cpu); | ||
2313 | else | ||
2314 | power *= default_scale_freq_power(sd, cpu); | ||
2315 | |||
2316 | power >>= SCHED_LOAD_SHIFT; | ||
2317 | |||
2306 | power *= scale_rt_power(cpu); | 2318 | power *= scale_rt_power(cpu); |
2307 | power >>= SCHED_LOAD_SHIFT; | 2319 | power >>= SCHED_LOAD_SHIFT; |
2308 | 2320 | ||
@@ -2335,6 +2347,31 @@ static void update_group_power(struct sched_domain *sd, int cpu) | |||
2335 | sdg->cpu_power = power; | 2347 | sdg->cpu_power = power; |
2336 | } | 2348 | } |
2337 | 2349 | ||
2350 | /* | ||
2351 | * Try and fix up capacity for tiny siblings, this is needed when | ||
2352 | * things like SD_ASYM_PACKING need f_b_g to select another sibling | ||
2353 | * which on its own isn't powerful enough. | ||
2354 | * | ||
2355 | * See update_sd_pick_busiest() and check_asym_packing(). | ||
2356 | */ | ||
2357 | static inline int | ||
2358 | fix_small_capacity(struct sched_domain *sd, struct sched_group *group) | ||
2359 | { | ||
2360 | /* | ||
2361 | * Only siblings can have significantly less than SCHED_LOAD_SCALE | ||
2362 | */ | ||
2363 | if (sd->level != SD_LV_SIBLING) | ||
2364 | return 0; | ||
2365 | |||
2366 | /* | ||
2367 | * If ~90% of the cpu_power is still there, we're good. | ||
2368 | */ | ||
2369 | if (group->cpu_power * 32 > group->cpu_power_orig * 29) | ||
2370 | return 1; | ||
2371 | |||
2372 | return 0; | ||
2373 | } | ||
2374 | |||
2338 | /** | 2375 | /** |
2339 | * update_sg_lb_stats - Update sched_group's statistics for load balancing. | 2376 | * update_sg_lb_stats - Update sched_group's statistics for load balancing. |
2340 | * @sd: The sched_domain whose statistics are to be updated. | 2377 | * @sd: The sched_domain whose statistics are to be updated. |
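The 32/29 comparison in fix_small_capacity() is an integer way of asking whether roughly 90% (29/32 ≈ 0.906) of the group's original power survived the frequency and RT-average scaling. A worked example under stated assumptions (SCHED_LOAD_SCALE = 1024 and the default smt_gain of about 1.15 * 1024 ≈ 1178): on a 4-thread SMT sibling domain each thread records cpu_power_orig ≈ 1178 / 4 ≈ 294, which always rounds to a group capacity of 0; the fixup restores a capacity of 1 as long as the final cpu_power stays above 294 * 29 / 32 ≈ 266, i.e. the thread has not been badly eaten into by RT activity or frequency scaling.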
@@ -2354,7 +2391,7 @@ static inline void update_sg_lb_stats(struct sched_domain *sd, | |||
2354 | int local_group, const struct cpumask *cpus, | 2391 | int local_group, const struct cpumask *cpus, |
2355 | int *balance, struct sg_lb_stats *sgs) | 2392 | int *balance, struct sg_lb_stats *sgs) |
2356 | { | 2393 | { |
2357 | unsigned long load, max_cpu_load, min_cpu_load; | 2394 | unsigned long load, max_cpu_load, min_cpu_load, max_nr_running; |
2358 | int i; | 2395 | int i; |
2359 | unsigned int balance_cpu = -1, first_idle_cpu = 0; | 2396 | unsigned int balance_cpu = -1, first_idle_cpu = 0; |
2360 | unsigned long avg_load_per_task = 0; | 2397 | unsigned long avg_load_per_task = 0; |
@@ -2365,6 +2402,7 @@ static inline void update_sg_lb_stats(struct sched_domain *sd, | |||
2365 | /* Tally up the load of all CPUs in the group */ | 2402 | /* Tally up the load of all CPUs in the group */ |
2366 | max_cpu_load = 0; | 2403 | max_cpu_load = 0; |
2367 | min_cpu_load = ~0UL; | 2404 | min_cpu_load = ~0UL; |
2405 | max_nr_running = 0; | ||
2368 | 2406 | ||
2369 | for_each_cpu_and(i, sched_group_cpus(group), cpus) { | 2407 | for_each_cpu_and(i, sched_group_cpus(group), cpus) { |
2370 | struct rq *rq = cpu_rq(i); | 2408 | struct rq *rq = cpu_rq(i); |
@@ -2382,8 +2420,10 @@ static inline void update_sg_lb_stats(struct sched_domain *sd, | |||
2382 | load = target_load(i, load_idx); | 2420 | load = target_load(i, load_idx); |
2383 | } else { | 2421 | } else { |
2384 | load = source_load(i, load_idx); | 2422 | load = source_load(i, load_idx); |
2385 | if (load > max_cpu_load) | 2423 | if (load > max_cpu_load) { |
2386 | max_cpu_load = load; | 2424 | max_cpu_load = load; |
2425 | max_nr_running = rq->nr_running; | ||
2426 | } | ||
2387 | if (min_cpu_load > load) | 2427 | if (min_cpu_load > load) |
2388 | min_cpu_load = load; | 2428 | min_cpu_load = load; |
2389 | } | 2429 | } |
@@ -2400,14 +2440,14 @@ static inline void update_sg_lb_stats(struct sched_domain *sd, | |||
2400 | * domains. In the newly idle case, we will allow all the cpu's | 2440 | * domains. In the newly idle case, we will allow all the cpu's |
2401 | * to do the newly idle load balance. | 2441 | * to do the newly idle load balance. |
2402 | */ | 2442 | */ |
2403 | if (idle != CPU_NEWLY_IDLE && local_group && | 2443 | if (idle != CPU_NEWLY_IDLE && local_group) { |
2404 | balance_cpu != this_cpu) { | 2444 | if (balance_cpu != this_cpu) { |
2405 | *balance = 0; | 2445 | *balance = 0; |
2406 | return; | 2446 | return; |
2447 | } | ||
2448 | update_group_power(sd, this_cpu); | ||
2407 | } | 2449 | } |
2408 | 2450 | ||
2409 | update_group_power(sd, this_cpu); | ||
2410 | |||
2411 | /* Adjust by relative CPU power of the group */ | 2451 | /* Adjust by relative CPU power of the group */ |
2412 | sgs->avg_load = (sgs->group_load * SCHED_LOAD_SCALE) / group->cpu_power; | 2452 | sgs->avg_load = (sgs->group_load * SCHED_LOAD_SCALE) / group->cpu_power; |
2413 | 2453 | ||
@@ -2423,11 +2463,58 @@ static inline void update_sg_lb_stats(struct sched_domain *sd, | |||
2423 | if (sgs->sum_nr_running) | 2463 | if (sgs->sum_nr_running) |
2424 | avg_load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running; | 2464 | avg_load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running; |
2425 | 2465 | ||
2426 | if ((max_cpu_load - min_cpu_load) > 2*avg_load_per_task) | 2466 | if ((max_cpu_load - min_cpu_load) > 2*avg_load_per_task && max_nr_running > 1) |
2427 | sgs->group_imb = 1; | 2467 | sgs->group_imb = 1; |
2428 | 2468 | ||
2429 | sgs->group_capacity = | 2469 | sgs->group_capacity = DIV_ROUND_CLOSEST(group->cpu_power, SCHED_LOAD_SCALE); |
2430 | DIV_ROUND_CLOSEST(group->cpu_power, SCHED_LOAD_SCALE); | 2470 | if (!sgs->group_capacity) |
2471 | sgs->group_capacity = fix_small_capacity(sd, group); | ||
2472 | |||
2473 | if (sgs->group_capacity > sgs->sum_nr_running) | ||
2474 | sgs->group_has_capacity = 1; | ||
2475 | } | ||
2476 | |||
2477 | /** | ||
2478 | * update_sd_pick_busiest - return 1 on busiest group | ||
2479 | * @sd: sched_domain whose statistics are to be checked | ||
2480 | * @sds: sched_domain statistics | ||
2481 | * @sg: sched_group candidate to be checked for being the busiest | ||
2482 | * @sgs: sched_group statistics | ||
2483 | * @this_cpu: the current cpu | ||
2484 | * | ||
2485 | * Determine if @sg is a busier group than the previously selected | ||
2486 | * busiest group. | ||
2487 | */ | ||
2488 | static bool update_sd_pick_busiest(struct sched_domain *sd, | ||
2489 | struct sd_lb_stats *sds, | ||
2490 | struct sched_group *sg, | ||
2491 | struct sg_lb_stats *sgs, | ||
2492 | int this_cpu) | ||
2493 | { | ||
2494 | if (sgs->avg_load <= sds->max_load) | ||
2495 | return false; | ||
2496 | |||
2497 | if (sgs->sum_nr_running > sgs->group_capacity) | ||
2498 | return true; | ||
2499 | |||
2500 | if (sgs->group_imb) | ||
2501 | return true; | ||
2502 | |||
2503 | /* | ||
2504 | * ASYM_PACKING needs to move all the work to the lowest | ||
2505 | * numbered CPUs in the group, therefore mark all groups | ||
2506 | * higher than ourself as busy. | ||
2507 | */ | ||
2508 | if ((sd->flags & SD_ASYM_PACKING) && sgs->sum_nr_running && | ||
2509 | this_cpu < group_first_cpu(sg)) { | ||
2510 | if (!sds->busiest) | ||
2511 | return true; | ||
2512 | |||
2513 | if (group_first_cpu(sds->busiest) > group_first_cpu(sg)) | ||
2514 | return true; | ||
2515 | } | ||
2516 | |||
2517 | return false; | ||
2431 | } | 2518 | } |
2432 | 2519 | ||
2433 | /** | 2520 | /** |
@@ -2435,7 +2522,7 @@ static inline void update_sg_lb_stats(struct sched_domain *sd, | |||
2435 | * @sd: sched_domain whose statistics are to be updated. | 2522 | * @sd: sched_domain whose statistics are to be updated. |
2436 | * @this_cpu: Cpu for which load balance is currently performed. | 2523 | * @this_cpu: Cpu for which load balance is currently performed. |
2437 | * @idle: Idle status of this_cpu | 2524 | * @idle: Idle status of this_cpu |
2438 | * @sd_idle: Idle status of the sched_domain containing group. | 2525 | * @sd_idle: Idle status of the sched_domain containing sg. |
2439 | * @cpus: Set of cpus considered for load balancing. | 2526 | * @cpus: Set of cpus considered for load balancing. |
2440 | * @balance: Should we balance. | 2527 | * @balance: Should we balance. |
2441 | * @sds: variable to hold the statistics for this sched_domain. | 2528 | * @sds: variable to hold the statistics for this sched_domain. |
@@ -2446,7 +2533,7 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu, | |||
2446 | struct sd_lb_stats *sds) | 2533 | struct sd_lb_stats *sds) |
2447 | { | 2534 | { |
2448 | struct sched_domain *child = sd->child; | 2535 | struct sched_domain *child = sd->child; |
2449 | struct sched_group *group = sd->groups; | 2536 | struct sched_group *sg = sd->groups; |
2450 | struct sg_lb_stats sgs; | 2537 | struct sg_lb_stats sgs; |
2451 | int load_idx, prefer_sibling = 0; | 2538 | int load_idx, prefer_sibling = 0; |
2452 | 2539 | ||
@@ -2459,45 +2546,100 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu, | |||
2459 | do { | 2546 | do { |
2460 | int local_group; | 2547 | int local_group; |
2461 | 2548 | ||
2462 | local_group = cpumask_test_cpu(this_cpu, | 2549 | local_group = cpumask_test_cpu(this_cpu, sched_group_cpus(sg)); |
2463 | sched_group_cpus(group)); | ||
2464 | memset(&sgs, 0, sizeof(sgs)); | 2550 | memset(&sgs, 0, sizeof(sgs)); |
2465 | update_sg_lb_stats(sd, group, this_cpu, idle, load_idx, sd_idle, | 2551 | update_sg_lb_stats(sd, sg, this_cpu, idle, load_idx, sd_idle, |
2466 | local_group, cpus, balance, &sgs); | 2552 | local_group, cpus, balance, &sgs); |
2467 | 2553 | ||
2468 | if (local_group && !(*balance)) | 2554 | if (local_group && !(*balance)) |
2469 | return; | 2555 | return; |
2470 | 2556 | ||
2471 | sds->total_load += sgs.group_load; | 2557 | sds->total_load += sgs.group_load; |
2472 | sds->total_pwr += group->cpu_power; | 2558 | sds->total_pwr += sg->cpu_power; |
2473 | 2559 | ||
2474 | /* | 2560 | /* |
2475 | * In case the child domain prefers tasks go to siblings | 2561 | * In case the child domain prefers tasks go to siblings |
2476 | * first, lower the group capacity to one so that we'll try | 2562 | * first, lower the sg capacity to one so that we'll try |
2477 | * and move all the excess tasks away. | 2563 | * and move all the excess tasks away. We lower the capacity |
2564 | * of a group only if the local group has the capacity to fit | ||
2565 | * these excess tasks, i.e. nr_running < group_capacity. The | ||
2566 | * extra check prevents the case where you always pull from the | ||
2567 | * heaviest group when it is already under-utilized (possible | ||
2568 | * when a single large weight task outweighs the tasks on the system). | ||
2478 | */ | 2569 | */ |
2479 | if (prefer_sibling) | 2570 | if (prefer_sibling && !local_group && sds->this_has_capacity) |
2480 | sgs.group_capacity = min(sgs.group_capacity, 1UL); | 2571 | sgs.group_capacity = min(sgs.group_capacity, 1UL); |
2481 | 2572 | ||
2482 | if (local_group) { | 2573 | if (local_group) { |
2483 | sds->this_load = sgs.avg_load; | 2574 | sds->this_load = sgs.avg_load; |
2484 | sds->this = group; | 2575 | sds->this = sg; |
2485 | sds->this_nr_running = sgs.sum_nr_running; | 2576 | sds->this_nr_running = sgs.sum_nr_running; |
2486 | sds->this_load_per_task = sgs.sum_weighted_load; | 2577 | sds->this_load_per_task = sgs.sum_weighted_load; |
2487 | } else if (sgs.avg_load > sds->max_load && | 2578 | sds->this_has_capacity = sgs.group_has_capacity; |
2488 | (sgs.sum_nr_running > sgs.group_capacity || | 2579 | } else if (update_sd_pick_busiest(sd, sds, sg, &sgs, this_cpu)) { |
2489 | sgs.group_imb)) { | ||
2490 | sds->max_load = sgs.avg_load; | 2580 | sds->max_load = sgs.avg_load; |
2491 | sds->busiest = group; | 2581 | sds->busiest = sg; |
2492 | sds->busiest_nr_running = sgs.sum_nr_running; | 2582 | sds->busiest_nr_running = sgs.sum_nr_running; |
2493 | sds->busiest_group_capacity = sgs.group_capacity; | 2583 | sds->busiest_group_capacity = sgs.group_capacity; |
2494 | sds->busiest_load_per_task = sgs.sum_weighted_load; | 2584 | sds->busiest_load_per_task = sgs.sum_weighted_load; |
2585 | sds->busiest_has_capacity = sgs.group_has_capacity; | ||
2495 | sds->group_imb = sgs.group_imb; | 2586 | sds->group_imb = sgs.group_imb; |
2496 | } | 2587 | } |
2497 | 2588 | ||
2498 | update_sd_power_savings_stats(group, sds, local_group, &sgs); | 2589 | update_sd_power_savings_stats(sg, sds, local_group, &sgs); |
2499 | group = group->next; | 2590 | sg = sg->next; |
2500 | } while (group != sd->groups); | 2591 | } while (sg != sd->groups); |
2592 | } | ||
2593 | |||
2594 | int __weak arch_sd_sibling_asym_packing(void) | ||
2595 | { | ||
2596 | return 0*SD_ASYM_PACKING; | ||
2597 | } | ||
2598 | |||
2599 | /** | ||
2600 | * check_asym_packing - Check to see if the group is packed into the | ||
2601 | * sched domain. | ||
2602 | * | ||
2603 | * This is primarily intended to be used at the sibling level. Some | ||
2604 | * cores like POWER7 prefer to use lower numbered SMT threads. In the | ||
2605 | * case of POWER7, it can move to lower SMT modes only when higher | ||
2606 | * threads are idle. When in lower SMT modes, the threads will | ||
2607 | * perform better since they share less core resources. Hence when we | ||
2608 | * have idle threads, we want them to be the higher ones. | ||
2609 | * | ||
2610 | * This packing function is run on idle threads. It checks to see if | ||
2611 | * the busiest CPU in this domain (core in the P7 case) has a higher | ||
2612 | * CPU number than the packing function is being run on. Here we are | ||
2613 | * assuming lower CPU number will be equivalent to lower a SMT thread | ||
2614 | * assuming a lower CPU number will be equivalent to a lower SMT thread | ||
2615 | * | ||
2616 | * Returns 1 when packing is required and a task should be moved to | ||
2617 | * this CPU. The amount of the imbalance is returned in *imbalance. | ||
2618 | * | ||
2619 | * @sd: The sched_domain whose packing is to be checked. | ||
2620 | * @sds: Statistics of the sched_domain which is to be packed | ||
2621 | * @this_cpu: The cpu at whose sched_domain we're performing load-balance. | ||
2622 | * @imbalance: returns amount of imbalance due to packing. | ||
2623 | */ | ||
2624 | static int check_asym_packing(struct sched_domain *sd, | ||
2625 | struct sd_lb_stats *sds, | ||
2626 | int this_cpu, unsigned long *imbalance) | ||
2627 | { | ||
2628 | int busiest_cpu; | ||
2629 | |||
2630 | if (!(sd->flags & SD_ASYM_PACKING)) | ||
2631 | return 0; | ||
2632 | |||
2633 | if (!sds->busiest) | ||
2634 | return 0; | ||
2635 | |||
2636 | busiest_cpu = group_first_cpu(sds->busiest); | ||
2637 | if (this_cpu > busiest_cpu) | ||
2638 | return 0; | ||
2639 | |||
2640 | *imbalance = DIV_ROUND_CLOSEST(sds->max_load * sds->busiest->cpu_power, | ||
2641 | SCHED_LOAD_SCALE); | ||
2642 | return 1; | ||
2501 | } | 2643 | } |
2502 | 2644 | ||
2503 | /** | 2645 | /** |
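The imbalance reported by check_asym_packing() undoes the earlier normalisation by cpu_power: imbalance = max_load * cpu_power / SCHED_LOAD_SCALE (rounded to nearest), which recovers roughly the busiest group's raw weighted load so all of it is eligible to be pulled toward the lower-numbered CPU. With illustrative numbers, max_load = 2048 and a busiest-group cpu_power of 512 give DIV_ROUND_CLOSEST(2048 * 512, 1024) = 1024, i.e. about one full load-unit of work to move.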
@@ -2637,6 +2779,7 @@ static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu, | |||
2637 | return fix_small_imbalance(sds, this_cpu, imbalance); | 2779 | return fix_small_imbalance(sds, this_cpu, imbalance); |
2638 | 2780 | ||
2639 | } | 2781 | } |
2782 | |||
2640 | /******* find_busiest_group() helpers end here *********************/ | 2783 | /******* find_busiest_group() helpers end here *********************/ |
2641 | 2784 | ||
2642 | /** | 2785 | /** |
@@ -2688,13 +2831,27 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, | |||
2688 | * 4) This group is more busy than the avg busyness at this | 2831 | * 4) This group is more busy than the avg busyness at this |
2689 | * sched_domain. | 2832 | * sched_domain. |
2690 | * 5) The imbalance is within the specified limit. | 2833 | * 5) The imbalance is within the specified limit. |
2834 | * | ||
2835 | * Note: when doing newidle balance, if the local group has excess | ||
2836 | * capacity (i.e. nr_running < group_capacity) and the busiest group | ||
2837 | * does not have any capacity, we force a load balance to pull tasks | ||
2838 | * to the local group. In this case, we skip past checks 3, 4 and 5. | ||
2691 | */ | 2839 | */ |
2692 | if (!(*balance)) | 2840 | if (!(*balance)) |
2693 | goto ret; | 2841 | goto ret; |
2694 | 2842 | ||
2843 | if ((idle == CPU_IDLE || idle == CPU_NEWLY_IDLE) && | ||
2844 | check_asym_packing(sd, &sds, this_cpu, imbalance)) | ||
2845 | return sds.busiest; | ||
2846 | |||
2695 | if (!sds.busiest || sds.busiest_nr_running == 0) | 2847 | if (!sds.busiest || sds.busiest_nr_running == 0) |
2696 | goto out_balanced; | 2848 | goto out_balanced; |
2697 | 2849 | ||
2850 | /* SD_BALANCE_NEWIDLE trumps SMP nice when underutilized */ | ||
2851 | if (idle == CPU_NEWLY_IDLE && sds.this_has_capacity && | ||
2852 | !sds.busiest_has_capacity) | ||
2853 | goto force_balance; | ||
2854 | |||
2698 | if (sds.this_load >= sds.max_load) | 2855 | if (sds.this_load >= sds.max_load) |
2699 | goto out_balanced; | 2856 | goto out_balanced; |
2700 | 2857 | ||
@@ -2706,6 +2863,7 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, | |||
2706 | if (100 * sds.max_load <= sd->imbalance_pct * sds.this_load) | 2863 | if (100 * sds.max_load <= sd->imbalance_pct * sds.this_load) |
2707 | goto out_balanced; | 2864 | goto out_balanced; |
2708 | 2865 | ||
2866 | force_balance: | ||
2709 | /* Looks like there is an imbalance. Compute it */ | 2867 | /* Looks like there is an imbalance. Compute it */ |
2710 | calculate_imbalance(&sds, this_cpu, imbalance); | 2868 | calculate_imbalance(&sds, this_cpu, imbalance); |
2711 | return sds.busiest; | 2869 | return sds.busiest; |
@@ -2726,8 +2884,9 @@ ret: | |||
2726 | * find_busiest_queue - find the busiest runqueue among the cpus in group. | 2884 | * find_busiest_queue - find the busiest runqueue among the cpus in group. |
2727 | */ | 2885 | */ |
2728 | static struct rq * | 2886 | static struct rq * |
2729 | find_busiest_queue(struct sched_group *group, enum cpu_idle_type idle, | 2887 | find_busiest_queue(struct sched_domain *sd, struct sched_group *group, |
2730 | unsigned long imbalance, const struct cpumask *cpus) | 2888 | enum cpu_idle_type idle, unsigned long imbalance, |
2889 | const struct cpumask *cpus) | ||
2731 | { | 2890 | { |
2732 | struct rq *busiest = NULL, *rq; | 2891 | struct rq *busiest = NULL, *rq; |
2733 | unsigned long max_load = 0; | 2892 | unsigned long max_load = 0; |
@@ -2738,6 +2897,9 @@ find_busiest_queue(struct sched_group *group, enum cpu_idle_type idle, | |||
2738 | unsigned long capacity = DIV_ROUND_CLOSEST(power, SCHED_LOAD_SCALE); | 2897 | unsigned long capacity = DIV_ROUND_CLOSEST(power, SCHED_LOAD_SCALE); |
2739 | unsigned long wl; | 2898 | unsigned long wl; |
2740 | 2899 | ||
2900 | if (!capacity) | ||
2901 | capacity = fix_small_capacity(sd, group); | ||
2902 | |||
2741 | if (!cpumask_test_cpu(i, cpus)) | 2903 | if (!cpumask_test_cpu(i, cpus)) |
2742 | continue; | 2904 | continue; |
2743 | 2905 | ||
@@ -2777,9 +2939,19 @@ find_busiest_queue(struct sched_group *group, enum cpu_idle_type idle, | |||
2777 | /* Working cpumask for load_balance and load_balance_newidle. */ | 2939 | /* Working cpumask for load_balance and load_balance_newidle. */ |
2778 | static DEFINE_PER_CPU(cpumask_var_t, load_balance_tmpmask); | 2940 | static DEFINE_PER_CPU(cpumask_var_t, load_balance_tmpmask); |
2779 | 2941 | ||
2780 | static int need_active_balance(struct sched_domain *sd, int sd_idle, int idle) | 2942 | static int need_active_balance(struct sched_domain *sd, int sd_idle, int idle, |
2943 | int busiest_cpu, int this_cpu) | ||
2781 | { | 2944 | { |
2782 | if (idle == CPU_NEWLY_IDLE) { | 2945 | if (idle == CPU_NEWLY_IDLE) { |
2946 | |||
2947 | /* | ||
2948 | * ASYM_PACKING needs to force migrate tasks from busy but | ||
2949 | * higher numbered CPUs in order to pack all tasks in the | ||
2950 | * lowest numbered CPUs. | ||
2951 | */ | ||
2952 | if ((sd->flags & SD_ASYM_PACKING) && busiest_cpu > this_cpu) | ||
2953 | return 1; | ||
2954 | |||
2783 | /* | 2955 | /* |
2784 | * The only task running in a non-idle cpu can be moved to this | 2956 | * The only task running in a non-idle cpu can be moved to this |
2785 | * cpu in an attempt to completely free up the other CPU | 2957 | * cpu in an attempt to completely free up the other CPU |
@@ -2854,7 +3026,7 @@ redo: | |||
2854 | goto out_balanced; | 3026 | goto out_balanced; |
2855 | } | 3027 | } |
2856 | 3028 | ||
2857 | busiest = find_busiest_queue(group, idle, imbalance, cpus); | 3029 | busiest = find_busiest_queue(sd, group, idle, imbalance, cpus); |
2858 | if (!busiest) { | 3030 | if (!busiest) { |
2859 | schedstat_inc(sd, lb_nobusyq[idle]); | 3031 | schedstat_inc(sd, lb_nobusyq[idle]); |
2860 | goto out_balanced; | 3032 | goto out_balanced; |
@@ -2896,9 +3068,17 @@ redo: | |||
2896 | 3068 | ||
2897 | if (!ld_moved) { | 3069 | if (!ld_moved) { |
2898 | schedstat_inc(sd, lb_failed[idle]); | 3070 | schedstat_inc(sd, lb_failed[idle]); |
2899 | sd->nr_balance_failed++; | 3071 | /* |
3072 | * Increment the failure counter only on periodic balance. | ||
3073 | * We do not want newidle balance, which can be very | ||
3074 | * frequent, pollute the failure counter causing | ||
3075 | * excessive cache_hot migrations and active balances. | ||
3076 | */ | ||
3077 | if (idle != CPU_NEWLY_IDLE) | ||
3078 | sd->nr_balance_failed++; | ||
2900 | 3079 | ||
2901 | if (need_active_balance(sd, sd_idle, idle)) { | 3080 | if (need_active_balance(sd, sd_idle, idle, cpu_of(busiest), |
3081 | this_cpu)) { | ||
2902 | raw_spin_lock_irqsave(&busiest->lock, flags); | 3082 | raw_spin_lock_irqsave(&busiest->lock, flags); |
2903 | 3083 | ||
2904 | /* don't kick the active_load_balance_cpu_stop, | 3084 | /* don't kick the active_load_balance_cpu_stop, |
@@ -3017,10 +3197,8 @@ static void idle_balance(int this_cpu, struct rq *this_rq) | |||
3017 | interval = msecs_to_jiffies(sd->balance_interval); | 3197 | interval = msecs_to_jiffies(sd->balance_interval); |
3018 | if (time_after(next_balance, sd->last_balance + interval)) | 3198 | if (time_after(next_balance, sd->last_balance + interval)) |
3019 | next_balance = sd->last_balance + interval; | 3199 | next_balance = sd->last_balance + interval; |
3020 | if (pulled_task) { | 3200 | if (pulled_task) |
3021 | this_rq->idle_stamp = 0; | ||
3022 | break; | 3201 | break; |
3023 | } | ||
3024 | } | 3202 | } |
3025 | 3203 | ||
3026 | raw_spin_lock(&this_rq->lock); | 3204 | raw_spin_lock(&this_rq->lock); |
@@ -3093,13 +3271,40 @@ out_unlock: | |||
3093 | } | 3271 | } |
3094 | 3272 | ||
3095 | #ifdef CONFIG_NO_HZ | 3273 | #ifdef CONFIG_NO_HZ |
3274 | |||
3275 | static DEFINE_PER_CPU(struct call_single_data, remote_sched_softirq_cb); | ||
3276 | |||
3277 | static void trigger_sched_softirq(void *data) | ||
3278 | { | ||
3279 | raise_softirq_irqoff(SCHED_SOFTIRQ); | ||
3280 | } | ||
3281 | |||
3282 | static inline void init_sched_softirq_csd(struct call_single_data *csd) | ||
3283 | { | ||
3284 | csd->func = trigger_sched_softirq; | ||
3285 | csd->info = NULL; | ||
3286 | csd->flags = 0; | ||
3287 | csd->priv = 0; | ||
3288 | } | ||
3289 | |||
3290 | /* | ||
3291 | * idle load balancing details | ||
3292 | * - One of the idle CPUs nominates itself as idle load_balancer, while | ||
3293 | * entering idle. | ||
3294 | * - This idle load balancer CPU will also go into tickless mode when | ||
3295 | * it is idle, just like all other idle CPUs | ||
3296 | * - When one of the busy CPUs notice that there may be an idle rebalancing | ||
3297 | * needed, they will kick the idle load balancer, which then does idle | ||
3298 | * load balancing for all the idle CPUs. | ||
3299 | */ | ||
3096 | static struct { | 3300 | static struct { |
3097 | atomic_t load_balancer; | 3301 | atomic_t load_balancer; |
3098 | cpumask_var_t cpu_mask; | 3302 | atomic_t first_pick_cpu; |
3099 | cpumask_var_t ilb_grp_nohz_mask; | 3303 | atomic_t second_pick_cpu; |
3100 | } nohz ____cacheline_aligned = { | 3304 | cpumask_var_t idle_cpus_mask; |
3101 | .load_balancer = ATOMIC_INIT(-1), | 3305 | cpumask_var_t grp_idle_mask; |
3102 | }; | 3306 | unsigned long next_balance; /* in jiffy units */ |
3307 | } nohz ____cacheline_aligned; | ||
3103 | 3308 | ||
3104 | int get_nohz_load_balancer(void) | 3309 | int get_nohz_load_balancer(void) |
3105 | { | 3310 | { |
@@ -3153,17 +3358,17 @@ static inline struct sched_domain *lowest_flag_domain(int cpu, int flag) | |||
3153 | */ | 3358 | */ |
3154 | static inline int is_semi_idle_group(struct sched_group *ilb_group) | 3359 | static inline int is_semi_idle_group(struct sched_group *ilb_group) |
3155 | { | 3360 | { |
3156 | cpumask_and(nohz.ilb_grp_nohz_mask, nohz.cpu_mask, | 3361 | cpumask_and(nohz.grp_idle_mask, nohz.idle_cpus_mask, |
3157 | sched_group_cpus(ilb_group)); | 3362 | sched_group_cpus(ilb_group)); |
3158 | 3363 | ||
3159 | /* | 3364 | /* |
3160 | * A sched_group is semi-idle when it has at least one busy cpu | 3365 | * A sched_group is semi-idle when it has at least one busy cpu |
3161 | * and at least one idle cpu. | 3366 | * and at least one idle cpu. |
3162 | */ | 3367 | */ |
3163 | if (cpumask_empty(nohz.ilb_grp_nohz_mask)) | 3368 | if (cpumask_empty(nohz.grp_idle_mask)) |
3164 | return 0; | 3369 | return 0; |
3165 | 3370 | ||
3166 | if (cpumask_equal(nohz.ilb_grp_nohz_mask, sched_group_cpus(ilb_group))) | 3371 | if (cpumask_equal(nohz.grp_idle_mask, sched_group_cpus(ilb_group))) |
3167 | return 0; | 3372 | return 0; |
3168 | 3373 | ||
3169 | return 1; | 3374 | return 1; |
@@ -3196,7 +3401,7 @@ static int find_new_ilb(int cpu) | |||
3196 | * Optimize for the case when we have no idle CPUs or only one | 3401 | * Optimize for the case when we have no idle CPUs or only one |
3197 | * idle CPU. Don't walk the sched_domain hierarchy in such cases | 3402 | * idle CPU. Don't walk the sched_domain hierarchy in such cases |
3198 | */ | 3403 | */ |
3199 | if (cpumask_weight(nohz.cpu_mask) < 2) | 3404 | if (cpumask_weight(nohz.idle_cpus_mask) < 2) |
3200 | goto out_done; | 3405 | goto out_done; |
3201 | 3406 | ||
3202 | for_each_flag_domain(cpu, sd, SD_POWERSAVINGS_BALANCE) { | 3407 | for_each_flag_domain(cpu, sd, SD_POWERSAVINGS_BALANCE) { |
@@ -3204,7 +3409,7 @@ static int find_new_ilb(int cpu) | |||
3204 | 3409 | ||
3205 | do { | 3410 | do { |
3206 | if (is_semi_idle_group(ilb_group)) | 3411 | if (is_semi_idle_group(ilb_group)) |
3207 | return cpumask_first(nohz.ilb_grp_nohz_mask); | 3412 | return cpumask_first(nohz.grp_idle_mask); |
3208 | 3413 | ||
3209 | ilb_group = ilb_group->next; | 3414 | ilb_group = ilb_group->next; |
3210 | 3415 | ||
@@ -3212,98 +3417,116 @@ static int find_new_ilb(int cpu) | |||
3212 | } | 3417 | } |
3213 | 3418 | ||
3214 | out_done: | 3419 | out_done: |
3215 | return cpumask_first(nohz.cpu_mask); | 3420 | return nr_cpu_ids; |
3216 | } | 3421 | } |
3217 | #else /* (CONFIG_SCHED_MC || CONFIG_SCHED_SMT) */ | 3422 | #else /* (CONFIG_SCHED_MC || CONFIG_SCHED_SMT) */ |
3218 | static inline int find_new_ilb(int call_cpu) | 3423 | static inline int find_new_ilb(int call_cpu) |
3219 | { | 3424 | { |
3220 | return cpumask_first(nohz.cpu_mask); | 3425 | return nr_cpu_ids; |
3221 | } | 3426 | } |
3222 | #endif | 3427 | #endif |
3223 | 3428 | ||
3224 | /* | 3429 | /* |
3430 | * Kick a CPU to do the nohz balancing, if it is time for it. We pick the | ||
3431 | * nohz_load_balancer CPU (if there is one) otherwise fallback to any idle | ||
3432 | * CPU (if there is one). | ||
3433 | */ | ||
3434 | static void nohz_balancer_kick(int cpu) | ||
3435 | { | ||
3436 | int ilb_cpu; | ||
3437 | |||
3438 | nohz.next_balance++; | ||
3439 | |||
3440 | ilb_cpu = get_nohz_load_balancer(); | ||
3441 | |||
3442 | if (ilb_cpu >= nr_cpu_ids) { | ||
3443 | ilb_cpu = cpumask_first(nohz.idle_cpus_mask); | ||
3444 | if (ilb_cpu >= nr_cpu_ids) | ||
3445 | return; | ||
3446 | } | ||
3447 | |||
3448 | if (!cpu_rq(ilb_cpu)->nohz_balance_kick) { | ||
3449 | struct call_single_data *cp; | ||
3450 | |||
3451 | cpu_rq(ilb_cpu)->nohz_balance_kick = 1; | ||
3452 | cp = &per_cpu(remote_sched_softirq_cb, cpu); | ||
3453 | __smp_call_function_single(ilb_cpu, cp, 0); | ||
3454 | } | ||
3455 | return; | ||
3456 | } | ||
3457 | |||
3458 | /* | ||
3225 | * This routine will try to nominate the ilb (idle load balancing) | 3459 | * This routine will try to nominate the ilb (idle load balancing) |
3226 | * owner among the cpus whose ticks are stopped. ilb owner will do the idle | 3460 | * owner among the cpus whose ticks are stopped. ilb owner will do the idle |
3227 | * load balancing on behalf of all those cpus. If all the cpus in the system | 3461 | * load balancing on behalf of all those cpus. |
3228 | * go into this tickless mode, then there will be no ilb owner (as there is | ||
3229 | * no need for one) and all the cpus will sleep till the next wakeup event | ||
3230 | * arrives... | ||
3231 | * | ||
3232 | * For the ilb owner, tick is not stopped. And this tick will be used | ||
3233 | * for idle load balancing. ilb owner will still be part of | ||
3234 | * nohz.cpu_mask.. | ||
3235 | * | 3462 | * |
3236 | * While stopping the tick, this cpu will become the ilb owner if there | 3463 | * When the ilb owner becomes busy, we will not have a new ilb owner until some |
3237 | * is no other owner. And will be the owner till that cpu becomes busy | 3464 | * idle CPU wakes up and goes back to idle or some busy CPU tries to kick |
3238 | * or if all cpus in the system stop their ticks at which point | 3465 | * idle load balancing by kicking one of the idle CPUs. |
3239 | * there is no need for ilb owner. | ||
3240 | * | 3466 | * |
3241 | * When the ilb owner becomes busy, it nominates another owner, during the | 3467 | * Ticks are stopped for the ilb owner as well, with busy CPU kicking this |
3242 | * next busy scheduler_tick() | 3468 | * ilb owner CPU in future (when there is a need for idle load balancing on |
3469 | * behalf of all idle CPUs). | ||
3243 | */ | 3470 | */ |
3244 | int select_nohz_load_balancer(int stop_tick) | 3471 | void select_nohz_load_balancer(int stop_tick) |
3245 | { | 3472 | { |
3246 | int cpu = smp_processor_id(); | 3473 | int cpu = smp_processor_id(); |
3247 | 3474 | ||
3248 | if (stop_tick) { | 3475 | if (stop_tick) { |
3249 | cpu_rq(cpu)->in_nohz_recently = 1; | ||
3250 | |||
3251 | if (!cpu_active(cpu)) { | 3476 | if (!cpu_active(cpu)) { |
3252 | if (atomic_read(&nohz.load_balancer) != cpu) | 3477 | if (atomic_read(&nohz.load_balancer) != cpu) |
3253 | return 0; | 3478 | return; |
3254 | 3479 | ||
3255 | /* | 3480 | /* |
3256 | * If we are going offline and still the leader, | 3481 | * If we are going offline and still the leader, |
3257 | * give up! | 3482 | * give up! |
3258 | */ | 3483 | */ |
3259 | if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu) | 3484 | if (atomic_cmpxchg(&nohz.load_balancer, cpu, |
3485 | nr_cpu_ids) != cpu) | ||
3260 | BUG(); | 3486 | BUG(); |
3261 | 3487 | ||
3262 | return 0; | 3488 | return; |
3263 | } | 3489 | } |
3264 | 3490 | ||
3265 | cpumask_set_cpu(cpu, nohz.cpu_mask); | 3491 | cpumask_set_cpu(cpu, nohz.idle_cpus_mask); |
3266 | 3492 | ||
3267 | /* time for ilb owner also to sleep */ | 3493 | if (atomic_read(&nohz.first_pick_cpu) == cpu) |
3268 | if (cpumask_weight(nohz.cpu_mask) == num_active_cpus()) { | 3494 | atomic_cmpxchg(&nohz.first_pick_cpu, cpu, nr_cpu_ids); |
3269 | if (atomic_read(&nohz.load_balancer) == cpu) | 3495 | if (atomic_read(&nohz.second_pick_cpu) == cpu) |
3270 | atomic_set(&nohz.load_balancer, -1); | 3496 | atomic_cmpxchg(&nohz.second_pick_cpu, cpu, nr_cpu_ids); |
3271 | return 0; | ||
3272 | } | ||
3273 | 3497 | ||
3274 | if (atomic_read(&nohz.load_balancer) == -1) { | 3498 | if (atomic_read(&nohz.load_balancer) >= nr_cpu_ids) { |
3275 | /* make me the ilb owner */ | ||
3276 | if (atomic_cmpxchg(&nohz.load_balancer, -1, cpu) == -1) | ||
3277 | return 1; | ||
3278 | } else if (atomic_read(&nohz.load_balancer) == cpu) { | ||
3279 | int new_ilb; | 3499 | int new_ilb; |
3280 | 3500 | ||
3281 | if (!(sched_smt_power_savings || | 3501 | /* make me the ilb owner */ |
3282 | sched_mc_power_savings)) | 3502 | if (atomic_cmpxchg(&nohz.load_balancer, nr_cpu_ids, |
3283 | return 1; | 3503 | cpu) != nr_cpu_ids) |
3504 | return; | ||
3505 | |||
3284 | /* | 3506 | /* |
3285 | * Check to see if there is a more power-efficient | 3507 | * Check to see if there is a more power-efficient |
3286 | * ilb. | 3508 | * ilb. |
3287 | */ | 3509 | */ |
3288 | new_ilb = find_new_ilb(cpu); | 3510 | new_ilb = find_new_ilb(cpu); |
3289 | if (new_ilb < nr_cpu_ids && new_ilb != cpu) { | 3511 | if (new_ilb < nr_cpu_ids && new_ilb != cpu) { |
3290 | atomic_set(&nohz.load_balancer, -1); | 3512 | atomic_set(&nohz.load_balancer, nr_cpu_ids); |
3291 | resched_cpu(new_ilb); | 3513 | resched_cpu(new_ilb); |
3292 | return 0; | 3514 | return; |
3293 | } | 3515 | } |
3294 | return 1; | 3516 | return; |
3295 | } | 3517 | } |
3296 | } else { | 3518 | } else { |
3297 | if (!cpumask_test_cpu(cpu, nohz.cpu_mask)) | 3519 | if (!cpumask_test_cpu(cpu, nohz.idle_cpus_mask)) |
3298 | return 0; | 3520 | return; |
3299 | 3521 | ||
3300 | cpumask_clear_cpu(cpu, nohz.cpu_mask); | 3522 | cpumask_clear_cpu(cpu, nohz.idle_cpus_mask); |
3301 | 3523 | ||
3302 | if (atomic_read(&nohz.load_balancer) == cpu) | 3524 | if (atomic_read(&nohz.load_balancer) == cpu) |
3303 | if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu) | 3525 | if (atomic_cmpxchg(&nohz.load_balancer, cpu, |
3526 | nr_cpu_ids) != cpu) | ||
3304 | BUG(); | 3527 | BUG(); |
3305 | } | 3528 | } |
3306 | return 0; | 3529 | return; |
3307 | } | 3530 | } |
3308 | #endif | 3531 | #endif |
3309 | 3532 | ||
@@ -3385,11 +3608,102 @@ out: | |||
3385 | rq->next_balance = next_balance; | 3608 | rq->next_balance = next_balance; |
3386 | } | 3609 | } |
3387 | 3610 | ||
3611 | #ifdef CONFIG_NO_HZ | ||
3388 | /* | 3612 | /* |
3389 | * run_rebalance_domains is triggered when needed from the scheduler tick. | 3613 | * In CONFIG_NO_HZ case, the idle balance kickee will do the |
3390 | * In CONFIG_NO_HZ case, the idle load balance owner will do the | ||
3391 | * rebalancing for all the cpus for whom scheduler ticks are stopped. | 3614 | * rebalancing for all the cpus for whom scheduler ticks are stopped. |
3392 | */ | 3615 | */ |
3616 | static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle) | ||
3617 | { | ||
3618 | struct rq *this_rq = cpu_rq(this_cpu); | ||
3619 | struct rq *rq; | ||
3620 | int balance_cpu; | ||
3621 | |||
3622 | if (idle != CPU_IDLE || !this_rq->nohz_balance_kick) | ||
3623 | return; | ||
3624 | |||
3625 | for_each_cpu(balance_cpu, nohz.idle_cpus_mask) { | ||
3626 | if (balance_cpu == this_cpu) | ||
3627 | continue; | ||
3628 | |||
3629 | /* | ||
3630 | * If this cpu gets work to do, stop the load balancing | ||
3631 | * work being done for other cpus. Next load | ||
3632 | * balancing owner will pick it up. | ||
3633 | */ | ||
3634 | if (need_resched()) { | ||
3635 | this_rq->nohz_balance_kick = 0; | ||
3636 | break; | ||
3637 | } | ||
3638 | |||
3639 | raw_spin_lock_irq(&this_rq->lock); | ||
3640 | update_rq_clock(this_rq); | ||
3641 | update_cpu_load(this_rq); | ||
3642 | raw_spin_unlock_irq(&this_rq->lock); | ||
3643 | |||
3644 | rebalance_domains(balance_cpu, CPU_IDLE); | ||
3645 | |||
3646 | rq = cpu_rq(balance_cpu); | ||
3647 | if (time_after(this_rq->next_balance, rq->next_balance)) | ||
3648 | this_rq->next_balance = rq->next_balance; | ||
3649 | } | ||
3650 | nohz.next_balance = this_rq->next_balance; | ||
3651 | this_rq->nohz_balance_kick = 0; | ||
3652 | } | ||
3653 | |||
3654 | /* | ||
3655 | * Current heuristic for kicking the idle load balancer | ||
3657 | * - first_pick_cpu is one of the busy CPUs. It will kick | ||
3657 | * idle load balancer when it has more than one process active. This | ||
3658 | * eliminates the need for idle load balancing altogether when we have | ||
3659 | * only one running process in the system (common case). | ||
3660 | * - If there is more than one busy CPU, idle load balancer may have | ||
3661 | * to run for active_load_balance to happen (i.e., two busy CPUs are | ||
3662 | * SMT or core siblings and can run better if they move to different | ||
3663 | * physical CPUs). So, second_pick_cpu is the second of the busy CPUs | ||
3664 | * which will kick idle load balancer as soon as it has any load. | ||
3665 | */ | ||
3666 | static inline int nohz_kick_needed(struct rq *rq, int cpu) | ||
3667 | { | ||
3668 | unsigned long now = jiffies; | ||
3669 | int ret; | ||
3670 | int first_pick_cpu, second_pick_cpu; | ||
3671 | |||
3672 | if (time_before(now, nohz.next_balance)) | ||
3673 | return 0; | ||
3674 | |||
3675 | if (rq->idle_at_tick) | ||
3676 | return 0; | ||
3677 | |||
3678 | first_pick_cpu = atomic_read(&nohz.first_pick_cpu); | ||
3679 | second_pick_cpu = atomic_read(&nohz.second_pick_cpu); | ||
3680 | |||
3681 | if (first_pick_cpu < nr_cpu_ids && first_pick_cpu != cpu && | ||
3682 | second_pick_cpu < nr_cpu_ids && second_pick_cpu != cpu) | ||
3683 | return 0; | ||
3684 | |||
3685 | ret = atomic_cmpxchg(&nohz.first_pick_cpu, nr_cpu_ids, cpu); | ||
3686 | if (ret == nr_cpu_ids || ret == cpu) { | ||
3687 | atomic_cmpxchg(&nohz.second_pick_cpu, cpu, nr_cpu_ids); | ||
3688 | if (rq->nr_running > 1) | ||
3689 | return 1; | ||
3690 | } else { | ||
3691 | ret = atomic_cmpxchg(&nohz.second_pick_cpu, nr_cpu_ids, cpu); | ||
3692 | if (ret == nr_cpu_ids || ret == cpu) { | ||
3693 | if (rq->nr_running) | ||
3694 | return 1; | ||
3695 | } | ||
3696 | } | ||
3697 | return 0; | ||
3698 | } | ||
3699 | #else | ||
3700 | static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle) { } | ||
3701 | #endif | ||
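The two-slot kick heuristic implemented by nohz_kick_needed() above is easier to follow outside kernel context. Below is a minimal user-space sketch using C11 atomics; NR_CPUS stands in for nr_cpu_ids, the time_before()/idle_at_tick checks are omitted, and all names are invented for the example:

#include <stdatomic.h>
#include <stdio.h>

#define NR_CPUS 8				/* stand-in for nr_cpu_ids */

static atomic_int first_pick = NR_CPUS;		/* NR_CPUS means "slot empty" */
static atomic_int second_pick = NR_CPUS;

static int kick_needed(int cpu, int nr_running)
{
	int expected = NR_CPUS;

	/* Claim (or confirm ownership of) the first_pick slot. */
	if (atomic_compare_exchange_strong(&first_pick, &expected, cpu) ||
	    expected == cpu) {
		/* Drop a stale second_pick claim held by this cpu. */
		int self = cpu;
		atomic_compare_exchange_strong(&second_pick, &self, NR_CPUS);
		/* The first busy cpu only kicks when it is overloaded. */
		return nr_running > 1;
	}

	expected = NR_CPUS;
	if (atomic_compare_exchange_strong(&second_pick, &expected, cpu) ||
	    expected == cpu)
		/* A second, different busy cpu kicks on any load. */
		return nr_running > 0;

	return 0;	/* both slots owned by other cpus */
}

int main(void)
{
	printf("%d\n", kick_needed(0, 1));	/* 0: single task, no kick */
	printf("%d\n", kick_needed(0, 2));	/* 1: first pick is overloaded */
	printf("%d\n", kick_needed(1, 1));	/* 1: second busy cpu, any load */
	return 0;
}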
3702 | |||
3703 | /* | ||
3704 | * run_rebalance_domains is triggered when needed from the scheduler tick. | ||
3705 | * Also triggered for nohz idle balancing (with nohz_balancing_kick set). | ||
3706 | */ | ||
3393 | static void run_rebalance_domains(struct softirq_action *h) | 3707 | static void run_rebalance_domains(struct softirq_action *h) |
3394 | { | 3708 | { |
3395 | int this_cpu = smp_processor_id(); | 3709 | int this_cpu = smp_processor_id(); |
@@ -3399,37 +3713,12 @@ static void run_rebalance_domains(struct softirq_action *h) | |||
3399 | 3713 | ||
3400 | rebalance_domains(this_cpu, idle); | 3714 | rebalance_domains(this_cpu, idle); |
3401 | 3715 | ||
3402 | #ifdef CONFIG_NO_HZ | ||
3403 | /* | 3716 | /* |
3404 | * If this cpu is the owner for idle load balancing, then do the | 3717 | * If this cpu has a pending nohz_balance_kick, then do the |
3405 | * balancing on behalf of the other idle cpus whose ticks are | 3718 | * balancing on behalf of the other idle cpus whose ticks are |
3406 | * stopped. | 3719 | * stopped. |
3407 | */ | 3720 | */ |
3408 | if (this_rq->idle_at_tick && | 3721 | nohz_idle_balance(this_cpu, idle); |
3409 | atomic_read(&nohz.load_balancer) == this_cpu) { | ||
3410 | struct rq *rq; | ||
3411 | int balance_cpu; | ||
3412 | |||
3413 | for_each_cpu(balance_cpu, nohz.cpu_mask) { | ||
3414 | if (balance_cpu == this_cpu) | ||
3415 | continue; | ||
3416 | |||
3417 | /* | ||
3418 | * If this cpu gets work to do, stop the load balancing | ||
3419 | * work being done for other cpus. Next load | ||
3420 | * balancing owner will pick it up. | ||
3421 | */ | ||
3422 | if (need_resched()) | ||
3423 | break; | ||
3424 | |||
3425 | rebalance_domains(balance_cpu, CPU_IDLE); | ||
3426 | |||
3427 | rq = cpu_rq(balance_cpu); | ||
3428 | if (time_after(this_rq->next_balance, rq->next_balance)) | ||
3429 | this_rq->next_balance = rq->next_balance; | ||
3430 | } | ||
3431 | } | ||
3432 | #endif | ||
3433 | } | 3722 | } |
3434 | 3723 | ||
3435 | static inline int on_null_domain(int cpu) | 3724 | static inline int on_null_domain(int cpu) |
@@ -3439,57 +3728,17 @@ static inline int on_null_domain(int cpu) | |||
3439 | 3728 | ||
3440 | /* | 3729 | /* |
3441 | * Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing. | 3730 | * Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing. |
3442 | * | ||
3443 | * In case of CONFIG_NO_HZ, this is the place where we nominate a new | ||
3444 | * idle load balancing owner or decide to stop the periodic load balancing, | ||
3445 | * if the whole system is idle. | ||
3446 | */ | 3731 | */ |
3447 | static inline void trigger_load_balance(struct rq *rq, int cpu) | 3732 | static inline void trigger_load_balance(struct rq *rq, int cpu) |
3448 | { | 3733 | { |
3449 | #ifdef CONFIG_NO_HZ | ||
3450 | /* | ||
3451 | * If we were in the nohz mode recently and busy at the current | ||
3452 | * scheduler tick, then check if we need to nominate new idle | ||
3453 | * load balancer. | ||
3454 | */ | ||
3455 | if (rq->in_nohz_recently && !rq->idle_at_tick) { | ||
3456 | rq->in_nohz_recently = 0; | ||
3457 | |||
3458 | if (atomic_read(&nohz.load_balancer) == cpu) { | ||
3459 | cpumask_clear_cpu(cpu, nohz.cpu_mask); | ||
3460 | atomic_set(&nohz.load_balancer, -1); | ||
3461 | } | ||
3462 | |||
3463 | if (atomic_read(&nohz.load_balancer) == -1) { | ||
3464 | int ilb = find_new_ilb(cpu); | ||
3465 | |||
3466 | if (ilb < nr_cpu_ids) | ||
3467 | resched_cpu(ilb); | ||
3468 | } | ||
3469 | } | ||
3470 | |||
3471 | /* | ||
3472 | * If this cpu is idle and doing idle load balancing for all the | ||
3473 | * cpus with ticks stopped, is it time for that to stop? | ||
3474 | */ | ||
3475 | if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) == cpu && | ||
3476 | cpumask_weight(nohz.cpu_mask) == num_online_cpus()) { | ||
3477 | resched_cpu(cpu); | ||
3478 | return; | ||
3479 | } | ||
3480 | |||
3481 | /* | ||
3482 | * If this cpu is idle and the idle load balancing is done by | ||
3483 | * someone else, then no need raise the SCHED_SOFTIRQ | ||
3484 | */ | ||
3485 | if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) != cpu && | ||
3486 | cpumask_test_cpu(cpu, nohz.cpu_mask)) | ||
3487 | return; | ||
3488 | #endif | ||
3489 | /* Don't need to rebalance while attached to NULL domain */ | 3734 | /* Don't need to rebalance while attached to NULL domain */ |
3490 | if (time_after_eq(jiffies, rq->next_balance) && | 3735 | if (time_after_eq(jiffies, rq->next_balance) && |
3491 | likely(!on_null_domain(cpu))) | 3736 | likely(!on_null_domain(cpu))) |
3492 | raise_softirq(SCHED_SOFTIRQ); | 3737 | raise_softirq(SCHED_SOFTIRQ); |
3738 | #ifdef CONFIG_NO_HZ | ||
3739 | else if (nohz_kick_needed(rq, cpu) && likely(!on_null_domain(cpu))) | ||
3740 | nohz_balancer_kick(cpu); | ||
3741 | #endif | ||
3493 | } | 3742 | } |
3494 | 3743 | ||
3495 | static void rq_online_fair(struct rq *rq) | 3744 | static void rq_online_fair(struct rq *rq) |
@@ -3542,8 +3791,13 @@ static void task_fork_fair(struct task_struct *p) | |||
3542 | 3791 | ||
3543 | raw_spin_lock_irqsave(&rq->lock, flags); | 3792 | raw_spin_lock_irqsave(&rq->lock, flags); |
3544 | 3793 | ||
3545 | if (unlikely(task_cpu(p) != this_cpu)) | 3794 | update_rq_clock(rq); |
3795 | |||
3796 | if (unlikely(task_cpu(p) != this_cpu)) { | ||
3797 | rcu_read_lock(); | ||
3546 | __set_task_cpu(p, this_cpu); | 3798 | __set_task_cpu(p, this_cpu); |
3799 | rcu_read_unlock(); | ||
3800 | } | ||
3547 | 3801 | ||
3548 | update_curr(cfs_rq); | 3802 | update_curr(cfs_rq); |
3549 | 3803 | ||
diff --git a/kernel/sched_features.h b/kernel/sched_features.h index 83c66e8ad3ee..185f920ec1a2 100644 --- a/kernel/sched_features.h +++ b/kernel/sched_features.h | |||
@@ -61,3 +61,8 @@ SCHED_FEAT(ASYM_EFF_LOAD, 1) | |||
61 | * release the lock. Decreases scheduling overhead. | 61 | * release the lock. Decreases scheduling overhead. |
62 | */ | 62 | */ |
63 | SCHED_FEAT(OWNER_SPIN, 1) | 63 | SCHED_FEAT(OWNER_SPIN, 1) |
64 | |||
65 | /* | ||
66 | * Decrement CPU power based on irq activity | ||
67 | */ | ||
68 | SCHED_FEAT(NONIRQ_POWER, 1) | ||
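The one-line description above is terse; the rough idea, sketched below under assumptions (the function name, the nanosecond window, and the 1024 fixed-point scale are illustrative, not taken from this diff), is to scale a CPU's load-balancing power by the fraction of recent time it spent outside hard/soft irq context:

unsigned long scale_power_by_irq(unsigned long power,
				 unsigned long total_ns,
				 unsigned long irq_ns)
{
	if (!total_ns || irq_ns >= total_ns)
		return 1;			/* keep power non-zero */

	/* e.g. power = 1024, 2ms of irq in a 10ms window -> ~819 */
	return power * (total_ns - irq_ns) / total_ns;
}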
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index 8afb953e31c6..bea7d79f7e9c 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c | |||
@@ -609,7 +609,7 @@ static void update_curr_rt(struct rq *rq) | |||
609 | if (!task_has_rt_policy(curr)) | 609 | if (!task_has_rt_policy(curr)) |
610 | return; | 610 | return; |
611 | 611 | ||
612 | delta_exec = rq->clock - curr->se.exec_start; | 612 | delta_exec = rq->clock_task - curr->se.exec_start; |
613 | if (unlikely((s64)delta_exec < 0)) | 613 | if (unlikely((s64)delta_exec < 0)) |
614 | delta_exec = 0; | 614 | delta_exec = 0; |
615 | 615 | ||
@@ -618,7 +618,7 @@ static void update_curr_rt(struct rq *rq) | |||
618 | curr->se.sum_exec_runtime += delta_exec; | 618 | curr->se.sum_exec_runtime += delta_exec; |
619 | account_group_exec_runtime(curr, delta_exec); | 619 | account_group_exec_runtime(curr, delta_exec); |
620 | 620 | ||
621 | curr->se.exec_start = rq->clock; | 621 | curr->se.exec_start = rq->clock_task; |
622 | cpuacct_charge(curr, delta_exec); | 622 | cpuacct_charge(curr, delta_exec); |
623 | 623 | ||
624 | sched_rt_avg_update(rq, delta_exec); | 624 | sched_rt_avg_update(rq, delta_exec); |
@@ -960,18 +960,19 @@ select_task_rq_rt(struct rq *rq, struct task_struct *p, int sd_flag, int flags) | |||
960 | * runqueue. Otherwise simply start this RT task | 960 | * runqueue. Otherwise simply start this RT task |
961 | * on its current runqueue. | 961 | * on its current runqueue. |
962 | * | 962 | * |
963 | * We want to avoid overloading runqueues. Even if | 963 | * We want to avoid overloading runqueues. If the woken |
964 | * the RT task is of higher priority than the current RT task. | 964 | * task is a higher priority, then it will stay on this CPU |
965 | * RT tasks behave differently than other tasks. If | 965 | * and the lower prio task should be moved to another CPU. |
966 | * one gets preempted, we try to push it off to another queue. | 966 | * Even though this will probably make the lower prio task |
967 | * So trying to keep a preempting RT task on the same | 967 | * lose its cache, we do not want to bounce a higher task |
968 | * cache hot CPU will force the running RT task to | 968 | * around just because it gave up its CPU, perhaps for a |
969 | * a cold CPU. So we waste all the cache for the lower | 969 | * lock? |
970 | * RT task in hopes of saving some of a RT task | 970 | * |
971 | * that is just being woken and probably will have | 971 | * For equal prio tasks, we just let the scheduler sort it out. |
972 | * cold cache anyway. | ||
973 | */ | 972 | */ |
974 | if (unlikely(rt_task(rq->curr)) && | 973 | if (unlikely(rt_task(rq->curr)) && |
974 | (rq->curr->rt.nr_cpus_allowed < 2 || | ||
975 | rq->curr->prio < p->prio) && | ||
975 | (p->rt.nr_cpus_allowed > 1)) { | 976 | (p->rt.nr_cpus_allowed > 1)) { |
976 | int cpu = find_lowest_rq(p); | 977 | int cpu = find_lowest_rq(p); |
977 | 978 | ||
@@ -1074,7 +1075,7 @@ static struct task_struct *_pick_next_task_rt(struct rq *rq) | |||
1074 | } while (rt_rq); | 1075 | } while (rt_rq); |
1075 | 1076 | ||
1076 | p = rt_task_of(rt_se); | 1077 | p = rt_task_of(rt_se); |
1077 | p->se.exec_start = rq->clock; | 1078 | p->se.exec_start = rq->clock_task; |
1078 | 1079 | ||
1079 | return p; | 1080 | return p; |
1080 | } | 1081 | } |
@@ -1139,7 +1140,7 @@ static struct task_struct *pick_next_highest_task_rt(struct rq *rq, int cpu) | |||
1139 | for_each_leaf_rt_rq(rt_rq, rq) { | 1140 | for_each_leaf_rt_rq(rt_rq, rq) { |
1140 | array = &rt_rq->active; | 1141 | array = &rt_rq->active; |
1141 | idx = sched_find_first_bit(array->bitmap); | 1142 | idx = sched_find_first_bit(array->bitmap); |
1142 | next_idx: | 1143 | next_idx: |
1143 | if (idx >= MAX_RT_PRIO) | 1144 | if (idx >= MAX_RT_PRIO) |
1144 | continue; | 1145 | continue; |
1145 | if (next && next->prio < idx) | 1146 | if (next && next->prio < idx) |
@@ -1315,7 +1316,7 @@ static int push_rt_task(struct rq *rq) | |||
1315 | if (!next_task) | 1316 | if (!next_task) |
1316 | return 0; | 1317 | return 0; |
1317 | 1318 | ||
1318 | retry: | 1319 | retry: |
1319 | if (unlikely(next_task == rq->curr)) { | 1320 | if (unlikely(next_task == rq->curr)) { |
1320 | WARN_ON(1); | 1321 | WARN_ON(1); |
1321 | return 0; | 1322 | return 0; |
@@ -1463,7 +1464,7 @@ static int pull_rt_task(struct rq *this_rq) | |||
1463 | * but possible) | 1464 | * but possible) |
1464 | */ | 1465 | */ |
1465 | } | 1466 | } |
1466 | skip: | 1467 | skip: |
1467 | double_unlock_balance(this_rq, src_rq); | 1468 | double_unlock_balance(this_rq, src_rq); |
1468 | } | 1469 | } |
1469 | 1470 | ||
@@ -1491,7 +1492,10 @@ static void task_woken_rt(struct rq *rq, struct task_struct *p) | |||
1491 | if (!task_running(rq, p) && | 1492 | if (!task_running(rq, p) && |
1492 | !test_tsk_need_resched(rq->curr) && | 1493 | !test_tsk_need_resched(rq->curr) && |
1493 | has_pushable_tasks(rq) && | 1494 | has_pushable_tasks(rq) && |
1494 | p->rt.nr_cpus_allowed > 1) | 1495 | p->rt.nr_cpus_allowed > 1 && |
1496 | rt_task(rq->curr) && | ||
1497 | (rq->curr->rt.nr_cpus_allowed < 2 || | ||
1498 | rq->curr->prio < p->prio)) | ||
1495 | push_rt_tasks(rq); | 1499 | push_rt_tasks(rq); |
1496 | } | 1500 | } |
1497 | 1501 | ||
@@ -1663,9 +1667,6 @@ static void watchdog(struct rq *rq, struct task_struct *p) | |||
1663 | { | 1667 | { |
1664 | unsigned long soft, hard; | 1668 | unsigned long soft, hard; |
1665 | 1669 | ||
1666 | if (!p->signal) | ||
1667 | return; | ||
1668 | |||
1669 | /* max may change after cur was read, this will be fixed next tick */ | 1670 | /* max may change after cur was read, this will be fixed next tick */ |
1670 | soft = task_rlimit(p, RLIMIT_RTTIME); | 1671 | soft = task_rlimit(p, RLIMIT_RTTIME); |
1671 | hard = task_rlimit_max(p, RLIMIT_RTTIME); | 1672 | hard = task_rlimit_max(p, RLIMIT_RTTIME); |
@@ -1712,7 +1713,7 @@ static void set_curr_task_rt(struct rq *rq) | |||
1712 | { | 1713 | { |
1713 | struct task_struct *p = rq->curr; | 1714 | struct task_struct *p = rq->curr; |
1714 | 1715 | ||
1715 | p->se.exec_start = rq->clock; | 1716 | p->se.exec_start = rq->clock_task; |
1716 | 1717 | ||
1717 | /* The running task is never eligible for pushing */ | 1718 | /* The running task is never eligible for pushing */ |
1718 | dequeue_pushable_task(rq, p); | 1719 | dequeue_pushable_task(rq, p); |
diff --git a/kernel/sched_stats.h b/kernel/sched_stats.h index 32d2bd4061b0..25c2f962f6fc 100644 --- a/kernel/sched_stats.h +++ b/kernel/sched_stats.h | |||
@@ -295,13 +295,7 @@ sched_info_switch(struct task_struct *prev, struct task_struct *next) | |||
295 | static inline void account_group_user_time(struct task_struct *tsk, | 295 | static inline void account_group_user_time(struct task_struct *tsk, |
296 | cputime_t cputime) | 296 | cputime_t cputime) |
297 | { | 297 | { |
298 | struct thread_group_cputimer *cputimer; | 298 | struct thread_group_cputimer *cputimer = &tsk->signal->cputimer; |
299 | |||
300 | /* tsk == current, ensure it is safe to use ->signal */ | ||
301 | if (unlikely(tsk->exit_state)) | ||
302 | return; | ||
303 | |||
304 | cputimer = &tsk->signal->cputimer; | ||
305 | 299 | ||
306 | if (!cputimer->running) | 300 | if (!cputimer->running) |
307 | return; | 301 | return; |
@@ -325,13 +319,7 @@ static inline void account_group_user_time(struct task_struct *tsk, | |||
325 | static inline void account_group_system_time(struct task_struct *tsk, | 319 | static inline void account_group_system_time(struct task_struct *tsk, |
326 | cputime_t cputime) | 320 | cputime_t cputime) |
327 | { | 321 | { |
328 | struct thread_group_cputimer *cputimer; | 322 | struct thread_group_cputimer *cputimer = &tsk->signal->cputimer; |
329 | |||
330 | /* tsk == current, ensure it is safe to use ->signal */ | ||
331 | if (unlikely(tsk->exit_state)) | ||
332 | return; | ||
333 | |||
334 | cputimer = &tsk->signal->cputimer; | ||
335 | 323 | ||
336 | if (!cputimer->running) | 324 | if (!cputimer->running) |
337 | return; | 325 | return; |
@@ -355,16 +343,7 @@ static inline void account_group_system_time(struct task_struct *tsk, | |||
355 | static inline void account_group_exec_runtime(struct task_struct *tsk, | 343 | static inline void account_group_exec_runtime(struct task_struct *tsk, |
356 | unsigned long long ns) | 344 | unsigned long long ns) |
357 | { | 345 | { |
358 | struct thread_group_cputimer *cputimer; | 346 | struct thread_group_cputimer *cputimer = &tsk->signal->cputimer; |
359 | struct signal_struct *sig; | ||
360 | |||
361 | sig = tsk->signal; | ||
362 | /* see __exit_signal()->task_rq_unlock_wait() */ | ||
363 | barrier(); | ||
364 | if (unlikely(!sig)) | ||
365 | return; | ||
366 | |||
367 | cputimer = &sig->cputimer; | ||
368 | 347 | ||
369 | if (!cputimer->running) | 348 | if (!cputimer->running) |
370 | return; | 349 | return; |
diff --git a/kernel/sched_stoptask.c b/kernel/sched_stoptask.c new file mode 100644 index 000000000000..45bddc0c1048 --- /dev/null +++ b/kernel/sched_stoptask.c | |||
@@ -0,0 +1,108 @@ | |||
1 | /* | ||
2 | * stop-task scheduling class. | ||
3 | * | ||
4 | * The stop task is the highest priority task in the system; it preempts | ||
5 | * everything and will be preempted by nothing. | ||
6 | * | ||
7 | * See kernel/stop_machine.c | ||
8 | */ | ||
9 | |||
10 | #ifdef CONFIG_SMP | ||
11 | static int | ||
12 | select_task_rq_stop(struct rq *rq, struct task_struct *p, | ||
13 | int sd_flag, int flags) | ||
14 | { | ||
15 | return task_cpu(p); /* stop tasks never migrate */ | ||
16 | } | ||
17 | #endif /* CONFIG_SMP */ | ||
18 | |||
19 | static void | ||
20 | check_preempt_curr_stop(struct rq *rq, struct task_struct *p, int flags) | ||
21 | { | ||
22 | resched_task(rq->curr); /* we preempt everything */ | ||
23 | } | ||
24 | |||
25 | static struct task_struct *pick_next_task_stop(struct rq *rq) | ||
26 | { | ||
27 | struct task_struct *stop = rq->stop; | ||
28 | |||
29 | if (stop && stop->state == TASK_RUNNING) | ||
30 | return stop; | ||
31 | |||
32 | return NULL; | ||
33 | } | ||
34 | |||
35 | static void | ||
36 | enqueue_task_stop(struct rq *rq, struct task_struct *p, int flags) | ||
37 | { | ||
38 | } | ||
39 | |||
40 | static void | ||
41 | dequeue_task_stop(struct rq *rq, struct task_struct *p, int flags) | ||
42 | { | ||
43 | } | ||
44 | |||
45 | static void yield_task_stop(struct rq *rq) | ||
46 | { | ||
47 | BUG(); /* the stop task should never yield, it's pointless. */ | ||
48 | } | ||
49 | |||
50 | static void put_prev_task_stop(struct rq *rq, struct task_struct *prev) | ||
51 | { | ||
52 | } | ||
53 | |||
54 | static void task_tick_stop(struct rq *rq, struct task_struct *curr, int queued) | ||
55 | { | ||
56 | } | ||
57 | |||
58 | static void set_curr_task_stop(struct rq *rq) | ||
59 | { | ||
60 | } | ||
61 | |||
62 | static void switched_to_stop(struct rq *rq, struct task_struct *p, | ||
63 | int running) | ||
64 | { | ||
65 | BUG(); /* it's impossible to change to this class */ | ||
66 | } | ||
67 | |||
68 | static void prio_changed_stop(struct rq *rq, struct task_struct *p, | ||
69 | int oldprio, int running) | ||
70 | { | ||
71 | BUG(); /* how!?, what priority? */ | ||
72 | } | ||
73 | |||
74 | static unsigned int | ||
75 | get_rr_interval_stop(struct rq *rq, struct task_struct *task) | ||
76 | { | ||
77 | return 0; | ||
78 | } | ||
79 | |||
80 | /* | ||
81 | * Simple, special scheduling class for the per-CPU stop tasks: | ||
82 | */ | ||
83 | static const struct sched_class stop_sched_class = { | ||
84 | .next = &rt_sched_class, | ||
85 | |||
86 | .enqueue_task = enqueue_task_stop, | ||
87 | .dequeue_task = dequeue_task_stop, | ||
88 | .yield_task = yield_task_stop, | ||
89 | |||
90 | .check_preempt_curr = check_preempt_curr_stop, | ||
91 | |||
92 | .pick_next_task = pick_next_task_stop, | ||
93 | .put_prev_task = put_prev_task_stop, | ||
94 | |||
95 | #ifdef CONFIG_SMP | ||
96 | .select_task_rq = select_task_rq_stop, | ||
97 | #endif | ||
98 | |||
99 | .set_curr_task = set_curr_task_stop, | ||
100 | .task_tick = task_tick_stop, | ||
101 | |||
102 | .get_rr_interval = get_rr_interval_stop, | ||
103 | |||
104 | .prio_changed = prio_changed_stop, | ||
105 | .switched_to = switched_to_stop, | ||
106 | |||
107 | /* no .task_new for stop tasks */ | ||
108 | }; | ||
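As a rough illustration of why the .next chaining above matters (a simplified sketch with stand-in types, not the kernel's actual pick path): the core scheduler queries classes in priority order, so the stop class, sitting ahead of rt_sched_class, is offered the CPU first whenever its task is runnable.

#include <stddef.h>

struct rq;
struct task_struct;

struct sched_class_sketch {
	const struct sched_class_sketch *next;
	struct task_struct *(*pick_next_task)(struct rq *rq);
};

static struct task_struct *
pick_highest(struct rq *rq, const struct sched_class_sketch *highest)
{
	const struct sched_class_sketch *class;

	for (class = highest; class; class = class->next) {
		struct task_struct *p = class->pick_next_task(rq);
		if (p)
			return p;	/* first class with a runnable task wins */
	}
	return NULL;			/* nothing runnable anywhere */
}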
diff --git a/kernel/signal.c b/kernel/signal.c index 906ae5a1779c..4e3cff10fdce 100644 --- a/kernel/signal.c +++ b/kernel/signal.c | |||
@@ -637,7 +637,7 @@ static inline bool si_fromuser(const struct siginfo *info) | |||
637 | 637 | ||
638 | /* | 638 | /* |
639 | * Bad permissions for sending the signal | 639 | * Bad permissions for sending the signal |
640 | * - the caller must hold at least the RCU read lock | 640 | * - the caller must hold the RCU read lock |
641 | */ | 641 | */ |
642 | static int check_kill_permission(int sig, struct siginfo *info, | 642 | static int check_kill_permission(int sig, struct siginfo *info, |
643 | struct task_struct *t) | 643 | struct task_struct *t) |
@@ -1105,7 +1105,8 @@ int zap_other_threads(struct task_struct *p) | |||
1105 | return count; | 1105 | return count; |
1106 | } | 1106 | } |
1107 | 1107 | ||
1108 | struct sighand_struct *lock_task_sighand(struct task_struct *tsk, unsigned long *flags) | 1108 | struct sighand_struct *__lock_task_sighand(struct task_struct *tsk, |
1109 | unsigned long *flags) | ||
1109 | { | 1110 | { |
1110 | struct sighand_struct *sighand; | 1111 | struct sighand_struct *sighand; |
1111 | 1112 | ||
@@ -1127,11 +1128,14 @@ struct sighand_struct *lock_task_sighand(struct task_struct *tsk, unsigned long | |||
1127 | 1128 | ||
1128 | /* | 1129 | /* |
1129 | * send signal info to all the members of a group | 1130 | * send signal info to all the members of a group |
1130 | * - the caller must hold the RCU read lock at least | ||
1131 | */ | 1131 | */ |
1132 | int group_send_sig_info(int sig, struct siginfo *info, struct task_struct *p) | 1132 | int group_send_sig_info(int sig, struct siginfo *info, struct task_struct *p) |
1133 | { | 1133 | { |
1134 | int ret = check_kill_permission(sig, info, p); | 1134 | int ret; |
1135 | |||
1136 | rcu_read_lock(); | ||
1137 | ret = check_kill_permission(sig, info, p); | ||
1138 | rcu_read_unlock(); | ||
1135 | 1139 | ||
1136 | if (!ret && sig) | 1140 | if (!ret && sig) |
1137 | ret = do_send_sig_info(sig, info, p, true); | 1141 | ret = do_send_sig_info(sig, info, p, true); |
@@ -1614,6 +1618,8 @@ static int sigkill_pending(struct task_struct *tsk) | |||
1614 | * is gone, we keep current->exit_code unless clear_code. | 1618 | * is gone, we keep current->exit_code unless clear_code. |
1615 | */ | 1619 | */ |
1616 | static void ptrace_stop(int exit_code, int clear_code, siginfo_t *info) | 1620 | static void ptrace_stop(int exit_code, int clear_code, siginfo_t *info) |
1621 | __releases(¤t->sighand->siglock) | ||
1622 | __acquires(¤t->sighand->siglock) | ||
1617 | { | 1623 | { |
1618 | if (arch_ptrace_stop_needed(exit_code, info)) { | 1624 | if (arch_ptrace_stop_needed(exit_code, info)) { |
1619 | /* | 1625 | /* |
@@ -2212,6 +2218,14 @@ int copy_siginfo_to_user(siginfo_t __user *to, siginfo_t *from) | |||
2212 | #ifdef __ARCH_SI_TRAPNO | 2218 | #ifdef __ARCH_SI_TRAPNO |
2213 | err |= __put_user(from->si_trapno, &to->si_trapno); | 2219 | err |= __put_user(from->si_trapno, &to->si_trapno); |
2214 | #endif | 2220 | #endif |
2221 | #ifdef BUS_MCEERR_AO | ||
2222 | /* | ||
2223 | * Other callers might not initialize the si_lsb field, | ||
2224 | * so check explicitly for the right codes here. | ||
2225 | */ | ||
2226 | if (from->si_code == BUS_MCEERR_AR || from->si_code == BUS_MCEERR_AO) | ||
2227 | err |= __put_user(from->si_addr_lsb, &to->si_addr_lsb); | ||
2228 | #endif | ||
2215 | break; | 2229 | break; |
2216 | case __SI_CHLD: | 2230 | case __SI_CHLD: |
2217 | err |= __put_user(from->si_pid, &to->si_pid); | 2231 | err |= __put_user(from->si_pid, &to->si_pid); |
diff --git a/kernel/slow-work-debugfs.c b/kernel/slow-work-debugfs.c deleted file mode 100644 index e45c43645298..000000000000 --- a/kernel/slow-work-debugfs.c +++ /dev/null | |||
@@ -1,227 +0,0 @@ | |||
1 | /* Slow work debugging | ||
2 | * | ||
3 | * Copyright (C) 2009 Red Hat, Inc. All Rights Reserved. | ||
4 | * Written by David Howells (dhowells@redhat.com) | ||
5 | * | ||
6 | * This program is free software; you can redistribute it and/or | ||
7 | * modify it under the terms of the GNU General Public Licence | ||
8 | * as published by the Free Software Foundation; either version | ||
9 | * 2 of the Licence, or (at your option) any later version. | ||
10 | */ | ||
11 | |||
12 | #include <linux/module.h> | ||
13 | #include <linux/slow-work.h> | ||
14 | #include <linux/fs.h> | ||
15 | #include <linux/time.h> | ||
16 | #include <linux/seq_file.h> | ||
17 | #include "slow-work.h" | ||
18 | |||
19 | #define ITERATOR_SHIFT (BITS_PER_LONG - 4) | ||
20 | #define ITERATOR_SELECTOR (0xfUL << ITERATOR_SHIFT) | ||
21 | #define ITERATOR_COUNTER (~ITERATOR_SELECTOR) | ||
22 | |||
23 | void slow_work_new_thread_desc(struct slow_work *work, struct seq_file *m) | ||
24 | { | ||
25 | seq_puts(m, "Slow-work: New thread"); | ||
26 | } | ||
27 | |||
28 | /* | ||
29 | * Render the time mark field on a work item into a 5-char time with units plus | ||
30 | * a space | ||
31 | */ | ||
32 | static void slow_work_print_mark(struct seq_file *m, struct slow_work *work) | ||
33 | { | ||
34 | struct timespec now, diff; | ||
35 | |||
36 | now = CURRENT_TIME; | ||
37 | diff = timespec_sub(now, work->mark); | ||
38 | |||
39 | if (diff.tv_sec < 0) | ||
40 | seq_puts(m, " -ve "); | ||
41 | else if (diff.tv_sec == 0 && diff.tv_nsec < 1000) | ||
42 | seq_printf(m, "%3luns ", diff.tv_nsec); | ||
43 | else if (diff.tv_sec == 0 && diff.tv_nsec < 1000000) | ||
44 | seq_printf(m, "%3luus ", diff.tv_nsec / 1000); | ||
45 | else if (diff.tv_sec == 0 && diff.tv_nsec < 1000000000) | ||
46 | seq_printf(m, "%3lums ", diff.tv_nsec / 1000000); | ||
47 | else if (diff.tv_sec <= 1) | ||
48 | seq_puts(m, " 1s "); | ||
49 | else if (diff.tv_sec < 60) | ||
50 | seq_printf(m, "%4lus ", diff.tv_sec); | ||
51 | else if (diff.tv_sec < 60 * 60) | ||
52 | seq_printf(m, "%4lum ", diff.tv_sec / 60); | ||
53 | else if (diff.tv_sec < 60 * 60 * 24) | ||
54 | seq_printf(m, "%4luh ", diff.tv_sec / 3600); | ||
55 | else | ||
56 | seq_puts(m, "exces "); | ||
57 | } | ||
58 | |||
59 | /* | ||
60 | * Describe a slow work item for debugfs | ||
61 | */ | ||
62 | static int slow_work_runqueue_show(struct seq_file *m, void *v) | ||
63 | { | ||
64 | struct slow_work *work; | ||
65 | struct list_head *p = v; | ||
66 | unsigned long id; | ||
67 | |||
68 | switch ((unsigned long) v) { | ||
69 | case 1: | ||
70 | seq_puts(m, "THR PID ITEM ADDR FL MARK DESC\n"); | ||
71 | return 0; | ||
72 | case 2: | ||
73 | seq_puts(m, "=== ===== ================ == ===== ==========\n"); | ||
74 | return 0; | ||
75 | |||
76 | case 3 ... 3 + SLOW_WORK_THREAD_LIMIT - 1: | ||
77 | id = (unsigned long) v - 3; | ||
78 | |||
79 | read_lock(&slow_work_execs_lock); | ||
80 | work = slow_work_execs[id]; | ||
81 | if (work) { | ||
82 | smp_read_barrier_depends(); | ||
83 | |||
84 | seq_printf(m, "%3lu %5d %16p %2lx ", | ||
85 | id, slow_work_pids[id], work, work->flags); | ||
86 | slow_work_print_mark(m, work); | ||
87 | |||
88 | if (work->ops->desc) | ||
89 | work->ops->desc(work, m); | ||
90 | seq_putc(m, '\n'); | ||
91 | } | ||
92 | read_unlock(&slow_work_execs_lock); | ||
93 | return 0; | ||
94 | |||
95 | default: | ||
96 | work = list_entry(p, struct slow_work, link); | ||
97 | seq_printf(m, "%3s - %16p %2lx ", | ||
98 | work->flags & SLOW_WORK_VERY_SLOW ? "vsq" : "sq", | ||
99 | work, work->flags); | ||
100 | slow_work_print_mark(m, work); | ||
101 | |||
102 | if (work->ops->desc) | ||
103 | work->ops->desc(work, m); | ||
104 | seq_putc(m, '\n'); | ||
105 | return 0; | ||
106 | } | ||
107 | } | ||
108 | |||
109 | /* | ||
110 | * map the iterator to a work item | ||
111 | */ | ||
112 | static void *slow_work_runqueue_index(struct seq_file *m, loff_t *_pos) | ||
113 | { | ||
114 | struct list_head *p; | ||
115 | unsigned long count, id; | ||
116 | |||
117 | switch (*_pos >> ITERATOR_SHIFT) { | ||
118 | case 0x0: | ||
119 | if (*_pos == 0) | ||
120 | *_pos = 1; | ||
121 | if (*_pos < 3) | ||
122 | return (void *)(unsigned long) *_pos; | ||
123 | if (*_pos < 3 + SLOW_WORK_THREAD_LIMIT) | ||
124 | for (id = *_pos - 3; | ||
125 | id < SLOW_WORK_THREAD_LIMIT; | ||
126 | id++, (*_pos)++) | ||
127 | if (slow_work_execs[id]) | ||
128 | return (void *)(unsigned long) *_pos; | ||
129 | *_pos = 0x1UL << ITERATOR_SHIFT; | ||
130 | |||
131 | case 0x1: | ||
132 | count = *_pos & ITERATOR_COUNTER; | ||
133 | list_for_each(p, &slow_work_queue) { | ||
134 | if (count == 0) | ||
135 | return p; | ||
136 | count--; | ||
137 | } | ||
138 | *_pos = 0x2UL << ITERATOR_SHIFT; | ||
139 | |||
140 | case 0x2: | ||
141 | count = *_pos & ITERATOR_COUNTER; | ||
142 | list_for_each(p, &vslow_work_queue) { | ||
143 | if (count == 0) | ||
144 | return p; | ||
145 | count--; | ||
146 | } | ||
147 | *_pos = 0x3UL << ITERATOR_SHIFT; | ||
148 | |||
149 | default: | ||
150 | return NULL; | ||
151 | } | ||
152 | } | ||
153 | |||
154 | /* | ||
155 | * set up the iterator to start reading from the first line | ||
156 | */ | ||
157 | static void *slow_work_runqueue_start(struct seq_file *m, loff_t *_pos) | ||
158 | { | ||
159 | spin_lock_irq(&slow_work_queue_lock); | ||
160 | return slow_work_runqueue_index(m, _pos); | ||
161 | } | ||
162 | |||
163 | /* | ||
164 | * move to the next line | ||
165 | */ | ||
166 | static void *slow_work_runqueue_next(struct seq_file *m, void *v, loff_t *_pos) | ||
167 | { | ||
168 | struct list_head *p = v; | ||
169 | unsigned long selector = *_pos >> ITERATOR_SHIFT; | ||
170 | |||
171 | (*_pos)++; | ||
172 | switch (selector) { | ||
173 | case 0x0: | ||
174 | return slow_work_runqueue_index(m, _pos); | ||
175 | |||
176 | case 0x1: | ||
177 | if (*_pos >> ITERATOR_SHIFT == 0x1) { | ||
178 | p = p->next; | ||
179 | if (p != &slow_work_queue) | ||
180 | return p; | ||
181 | } | ||
182 | *_pos = 0x2UL << ITERATOR_SHIFT; | ||
183 | p = &vslow_work_queue; | ||
184 | |||
185 | case 0x2: | ||
186 | if (*_pos >> ITERATOR_SHIFT == 0x2) { | ||
187 | p = p->next; | ||
188 | if (p != &vslow_work_queue) | ||
189 | return p; | ||
190 | } | ||
191 | *_pos = 0x3UL << ITERATOR_SHIFT; | ||
192 | |||
193 | default: | ||
194 | return NULL; | ||
195 | } | ||
196 | } | ||
197 | |||
198 | /* | ||
199 | * clean up after reading | ||
200 | */ | ||
201 | static void slow_work_runqueue_stop(struct seq_file *m, void *v) | ||
202 | { | ||
203 | spin_unlock_irq(&slow_work_queue_lock); | ||
204 | } | ||
205 | |||
206 | static const struct seq_operations slow_work_runqueue_ops = { | ||
207 | .start = slow_work_runqueue_start, | ||
208 | .stop = slow_work_runqueue_stop, | ||
209 | .next = slow_work_runqueue_next, | ||
210 | .show = slow_work_runqueue_show, | ||
211 | }; | ||
212 | |||
213 | /* | ||
214 | * open "/sys/kernel/debug/slow_work/runqueue" to list queue contents | ||
215 | */ | ||
216 | static int slow_work_runqueue_open(struct inode *inode, struct file *file) | ||
217 | { | ||
218 | return seq_open(file, &slow_work_runqueue_ops); | ||
219 | } | ||
220 | |||
221 | const struct file_operations slow_work_runqueue_fops = { | ||
222 | .owner = THIS_MODULE, | ||
223 | .open = slow_work_runqueue_open, | ||
224 | .read = seq_read, | ||
225 | .llseek = seq_lseek, | ||
226 | .release = seq_release, | ||
227 | }; | ||
diff --git a/kernel/slow-work.c b/kernel/slow-work.c deleted file mode 100644 index 7d3f4fa9ef4f..000000000000 --- a/kernel/slow-work.c +++ /dev/null | |||
@@ -1,1068 +0,0 @@ | |||
1 | /* Worker thread pool for slow items, such as filesystem lookups or mkdirs | ||
2 | * | ||
3 | * Copyright (C) 2008 Red Hat, Inc. All Rights Reserved. | ||
4 | * Written by David Howells (dhowells@redhat.com) | ||
5 | * | ||
6 | * This program is free software; you can redistribute it and/or | ||
7 | * modify it under the terms of the GNU General Public Licence | ||
8 | * as published by the Free Software Foundation; either version | ||
9 | * 2 of the Licence, or (at your option) any later version. | ||
10 | * | ||
11 | * See Documentation/slow-work.txt | ||
12 | */ | ||
13 | |||
14 | #include <linux/module.h> | ||
15 | #include <linux/slow-work.h> | ||
16 | #include <linux/kthread.h> | ||
17 | #include <linux/freezer.h> | ||
18 | #include <linux/wait.h> | ||
19 | #include <linux/debugfs.h> | ||
20 | #include "slow-work.h" | ||
21 | |||
22 | static void slow_work_cull_timeout(unsigned long); | ||
23 | static void slow_work_oom_timeout(unsigned long); | ||
24 | |||
25 | #ifdef CONFIG_SYSCTL | ||
26 | static int slow_work_min_threads_sysctl(struct ctl_table *, int, | ||
27 | void __user *, size_t *, loff_t *); | ||
28 | |||
29 | static int slow_work_max_threads_sysctl(struct ctl_table *, int , | ||
30 | void __user *, size_t *, loff_t *); | ||
31 | #endif | ||
32 | |||
33 | /* | ||
34 | * The pool of threads has at least min threads in it as long as someone is | ||
35 | * using the facility, and may have as many as max. | ||
36 | * | ||
37 | * A portion of the pool may be processing very slow operations. | ||
38 | */ | ||
39 | static unsigned slow_work_min_threads = 2; | ||
40 | static unsigned slow_work_max_threads = 4; | ||
41 | static unsigned vslow_work_proportion = 50; /* % of threads that may process | ||
42 | * very slow work */ | ||
43 | |||
44 | #ifdef CONFIG_SYSCTL | ||
45 | static const int slow_work_min_min_threads = 2; | ||
46 | static int slow_work_max_max_threads = SLOW_WORK_THREAD_LIMIT; | ||
47 | static const int slow_work_min_vslow = 1; | ||
48 | static const int slow_work_max_vslow = 99; | ||
49 | |||
50 | ctl_table slow_work_sysctls[] = { | ||
51 | { | ||
52 | .procname = "min-threads", | ||
53 | .data = &slow_work_min_threads, | ||
54 | .maxlen = sizeof(unsigned), | ||
55 | .mode = 0644, | ||
56 | .proc_handler = slow_work_min_threads_sysctl, | ||
57 | .extra1 = (void *) &slow_work_min_min_threads, | ||
58 | .extra2 = &slow_work_max_threads, | ||
59 | }, | ||
60 | { | ||
61 | .procname = "max-threads", | ||
62 | .data = &slow_work_max_threads, | ||
63 | .maxlen = sizeof(unsigned), | ||
64 | .mode = 0644, | ||
65 | .proc_handler = slow_work_max_threads_sysctl, | ||
66 | .extra1 = &slow_work_min_threads, | ||
67 | .extra2 = (void *) &slow_work_max_max_threads, | ||
68 | }, | ||
69 | { | ||
70 | .procname = "vslow-percentage", | ||
71 | .data = &vslow_work_proportion, | ||
72 | .maxlen = sizeof(unsigned), | ||
73 | .mode = 0644, | ||
74 | .proc_handler = proc_dointvec_minmax, | ||
75 | .extra1 = (void *) &slow_work_min_vslow, | ||
76 | .extra2 = (void *) &slow_work_max_vslow, | ||
77 | }, | ||
78 | {} | ||
79 | }; | ||
80 | #endif | ||
81 | |||
82 | /* | ||
83 | * The active state of the thread pool | ||
84 | */ | ||
85 | static atomic_t slow_work_thread_count; | ||
86 | static atomic_t vslow_work_executing_count; | ||
87 | |||
88 | static bool slow_work_may_not_start_new_thread; | ||
89 | static bool slow_work_cull; /* cull a thread due to lack of activity */ | ||
90 | static DEFINE_TIMER(slow_work_cull_timer, slow_work_cull_timeout, 0, 0); | ||
91 | static DEFINE_TIMER(slow_work_oom_timer, slow_work_oom_timeout, 0, 0); | ||
92 | static struct slow_work slow_work_new_thread; /* new thread starter */ | ||
93 | |||
94 | /* | ||
95 | * slow work ID allocation (use slow_work_queue_lock) | ||
96 | */ | ||
97 | static DECLARE_BITMAP(slow_work_ids, SLOW_WORK_THREAD_LIMIT); | ||
98 | |||
99 | /* | ||
100 | * Unregistration tracking to prevent put_ref() from disappearing during module | ||
101 | * unload | ||
102 | */ | ||
103 | #ifdef CONFIG_MODULES | ||
104 | static struct module *slow_work_thread_processing[SLOW_WORK_THREAD_LIMIT]; | ||
105 | static struct module *slow_work_unreg_module; | ||
106 | static struct slow_work *slow_work_unreg_work_item; | ||
107 | static DECLARE_WAIT_QUEUE_HEAD(slow_work_unreg_wq); | ||
108 | static DEFINE_MUTEX(slow_work_unreg_sync_lock); | ||
109 | |||
110 | static void slow_work_set_thread_processing(int id, struct slow_work *work) | ||
111 | { | ||
112 | if (work) | ||
113 | slow_work_thread_processing[id] = work->owner; | ||
114 | } | ||
115 | static void slow_work_done_thread_processing(int id, struct slow_work *work) | ||
116 | { | ||
117 | struct module *module = slow_work_thread_processing[id]; | ||
118 | |||
119 | slow_work_thread_processing[id] = NULL; | ||
120 | smp_mb(); | ||
121 | if (slow_work_unreg_work_item == work || | ||
122 | slow_work_unreg_module == module) | ||
123 | wake_up_all(&slow_work_unreg_wq); | ||
124 | } | ||
125 | static void slow_work_clear_thread_processing(int id) | ||
126 | { | ||
127 | slow_work_thread_processing[id] = NULL; | ||
128 | } | ||
129 | #else | ||
130 | static void slow_work_set_thread_processing(int id, struct slow_work *work) {} | ||
131 | static void slow_work_done_thread_processing(int id, struct slow_work *work) {} | ||
132 | static void slow_work_clear_thread_processing(int id) {} | ||
133 | #endif | ||
134 | |||
135 | /* | ||
136 | * Data for tracking currently executing items for indication through /proc | ||
137 | */ | ||
138 | #ifdef CONFIG_SLOW_WORK_DEBUG | ||
139 | struct slow_work *slow_work_execs[SLOW_WORK_THREAD_LIMIT]; | ||
140 | pid_t slow_work_pids[SLOW_WORK_THREAD_LIMIT]; | ||
141 | DEFINE_RWLOCK(slow_work_execs_lock); | ||
142 | #endif | ||
143 | |||
144 | /* | ||
145 | * The queues of work items and the lock governing access to them. These are | ||
146 | * shared between all the CPUs. It doesn't make sense to have per-CPU queues | ||
147 | * as the number of threads bears no relation to the number of CPUs. | ||
148 | * | ||
149 | * There are two queues of work items: one for slow work items, and one for | ||
150 | * very slow work items. | ||
151 | */ | ||
152 | LIST_HEAD(slow_work_queue); | ||
153 | LIST_HEAD(vslow_work_queue); | ||
154 | DEFINE_SPINLOCK(slow_work_queue_lock); | ||
155 | |||
156 | /* | ||
157 | * The following are two wait queues that get pinged when a work item is placed | ||
158 | * on an empty queue. These allow work items that are hogging a thread by | ||
159 | * sleeping in a way that could be deferred to yield their thread and enqueue | ||
160 | * themselves. | ||
161 | */ | ||
162 | static DECLARE_WAIT_QUEUE_HEAD(slow_work_queue_waits_for_occupation); | ||
163 | static DECLARE_WAIT_QUEUE_HEAD(vslow_work_queue_waits_for_occupation); | ||
164 | |||
165 | /* | ||
166 | * The thread controls. A variable used to signal to the threads that they | ||
167 | * should exit when the queue is empty, a waitqueue used by the threads to wait | ||
168 | * for signals, and a completion set by the last thread to exit. | ||
169 | */ | ||
170 | static bool slow_work_threads_should_exit; | ||
171 | static DECLARE_WAIT_QUEUE_HEAD(slow_work_thread_wq); | ||
172 | static DECLARE_COMPLETION(slow_work_last_thread_exited); | ||
173 | |||
174 | /* | ||
175 | * The number of users of the thread pool and its lock. Whilst this is zero we | ||
176 | * have no threads hanging around, and when this reaches zero, we wait for all | ||
177 | * active or queued work items to complete and kill all the threads we do have. | ||
178 | */ | ||
179 | static int slow_work_user_count; | ||
180 | static DEFINE_MUTEX(slow_work_user_lock); | ||
181 | |||
182 | static inline int slow_work_get_ref(struct slow_work *work) | ||
183 | { | ||
184 | if (work->ops->get_ref) | ||
185 | return work->ops->get_ref(work); | ||
186 | |||
187 | return 0; | ||
188 | } | ||
189 | |||
190 | static inline void slow_work_put_ref(struct slow_work *work) | ||
191 | { | ||
192 | if (work->ops->put_ref) | ||
193 | work->ops->put_ref(work); | ||
194 | } | ||
195 | |||
196 | /* | ||
197 | * Calculate the maximum number of active threads in the pool that are | ||
198 | * permitted to process very slow work items. | ||
199 | * | ||
200 | * The answer is rounded up to at least 1, but may not equal or exceed the | ||
201 | * maximum number of the threads in the pool. This means we always have at | ||
202 | * least one thread that can process slow work items, and we always have at | ||
203 | * least one thread that won't get tied up doing so. | ||
204 | */ | ||
205 | static unsigned slow_work_calc_vsmax(void) | ||
206 | { | ||
207 | unsigned vsmax; | ||
208 | |||
209 | vsmax = atomic_read(&slow_work_thread_count) * vslow_work_proportion; | ||
210 | vsmax /= 100; | ||
211 | vsmax = max(vsmax, 1U); | ||
212 | return min(vsmax, slow_work_max_threads - 1); | ||
213 | } | ||
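The rounding rules spelled out in the comment above are easy to sanity-check in isolation. A stand-alone sketch, illustrative only (this whole file is being removed by the series):

#include <stdio.h>

static unsigned calc_vsmax(unsigned threads, unsigned proportion,
			   unsigned max_threads)
{
	unsigned vsmax = threads * proportion / 100;

	if (vsmax < 1)
		vsmax = 1;			/* at least one vslow-capable thread */
	if (vsmax > max_threads - 1)
		vsmax = max_threads - 1;	/* and one thread that never is */
	return vsmax;
}

int main(void)
{
	printf("%u\n", calc_vsmax(4, 50, 4));	/* 2 */
	printf("%u\n", calc_vsmax(2, 50, 4));	/* 1 */
	return 0;
}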
214 | |||
215 | /* | ||
216 | * Attempt to execute stuff queued on a slow thread. Return true if we managed | ||
217 | * it, false if there was nothing to do. | ||
218 | */ | ||
219 | static noinline bool slow_work_execute(int id) | ||
220 | { | ||
221 | struct slow_work *work = NULL; | ||
222 | unsigned vsmax; | ||
223 | bool very_slow; | ||
224 | |||
225 | vsmax = slow_work_calc_vsmax(); | ||
226 | |||
227 | /* see if we can schedule a new thread to be started if we're not | ||
228 | * keeping up with the work */ | ||
229 | if (!waitqueue_active(&slow_work_thread_wq) && | ||
230 | (!list_empty(&slow_work_queue) || !list_empty(&vslow_work_queue)) && | ||
231 | atomic_read(&slow_work_thread_count) < slow_work_max_threads && | ||
232 | !slow_work_may_not_start_new_thread) | ||
233 | slow_work_enqueue(&slow_work_new_thread); | ||
234 | |||
235 | /* find something to execute */ | ||
236 | spin_lock_irq(&slow_work_queue_lock); | ||
237 | if (!list_empty(&vslow_work_queue) && | ||
238 | atomic_read(&vslow_work_executing_count) < vsmax) { | ||
239 | work = list_entry(vslow_work_queue.next, | ||
240 | struct slow_work, link); | ||
241 | if (test_and_set_bit_lock(SLOW_WORK_EXECUTING, &work->flags)) | ||
242 | BUG(); | ||
243 | list_del_init(&work->link); | ||
244 | atomic_inc(&vslow_work_executing_count); | ||
245 | very_slow = true; | ||
246 | } else if (!list_empty(&slow_work_queue)) { | ||
247 | work = list_entry(slow_work_queue.next, | ||
248 | struct slow_work, link); | ||
249 | if (test_and_set_bit_lock(SLOW_WORK_EXECUTING, &work->flags)) | ||
250 | BUG(); | ||
251 | list_del_init(&work->link); | ||
252 | very_slow = false; | ||
253 | } else { | ||
254 | very_slow = false; /* avoid the compiler warning */ | ||
255 | } | ||
256 | |||
257 | slow_work_set_thread_processing(id, work); | ||
258 | if (work) { | ||
259 | slow_work_mark_time(work); | ||
260 | slow_work_begin_exec(id, work); | ||
261 | } | ||
262 | |||
263 | spin_unlock_irq(&slow_work_queue_lock); | ||
264 | |||
265 | if (!work) | ||
266 | return false; | ||
267 | |||
268 | if (!test_and_clear_bit(SLOW_WORK_PENDING, &work->flags)) | ||
269 | BUG(); | ||
270 | |||
271 | /* don't execute if the work is in the process of being cancelled */ | ||
272 | if (!test_bit(SLOW_WORK_CANCELLING, &work->flags)) | ||
273 | work->ops->execute(work); | ||
274 | |||
275 | if (very_slow) | ||
276 | atomic_dec(&vslow_work_executing_count); | ||
277 | clear_bit_unlock(SLOW_WORK_EXECUTING, &work->flags); | ||
278 | |||
279 | /* wake up anyone waiting for this work to be complete */ | ||
280 | wake_up_bit(&work->flags, SLOW_WORK_EXECUTING); | ||
281 | |||
282 | slow_work_end_exec(id, work); | ||
283 | |||
284 | /* if someone tried to enqueue the item whilst we were executing it, | ||
285 | * then it'll be left unenqueued to avoid multiple threads trying to | ||
286 | * execute it simultaneously | ||
287 | * | ||
288 | * there is, however, a race between us testing the pending flag and | ||
289 | * getting the spinlock, and between the enqueuer setting the pending | ||
290 | * flag and getting the spinlock, so we use a deferral bit to tell us | ||
291 | * if the enqueuer got there first | ||
292 | */ | ||
293 | if (test_bit(SLOW_WORK_PENDING, &work->flags)) { | ||
294 | spin_lock_irq(&slow_work_queue_lock); | ||
295 | |||
296 | if (!test_bit(SLOW_WORK_EXECUTING, &work->flags) && | ||
297 | test_and_clear_bit(SLOW_WORK_ENQ_DEFERRED, &work->flags)) | ||
298 | goto auto_requeue; | ||
299 | |||
300 | spin_unlock_irq(&slow_work_queue_lock); | ||
301 | } | ||
302 | |||
303 | /* sort out the race between module unloading and put_ref() */ | ||
304 | slow_work_put_ref(work); | ||
305 | slow_work_done_thread_processing(id, work); | ||
306 | |||
307 | return true; | ||
308 | |||
309 | auto_requeue: | ||
310 | /* we must complete the enqueue operation | ||
311 | * - we transfer our ref on the item back to the appropriate queue | ||
312 | * - don't wake another thread up as we're awake already | ||
313 | */ | ||
314 | slow_work_mark_time(work); | ||
315 | if (test_bit(SLOW_WORK_VERY_SLOW, &work->flags)) | ||
316 | list_add_tail(&work->link, &vslow_work_queue); | ||
317 | else | ||
318 | list_add_tail(&work->link, &slow_work_queue); | ||
319 | spin_unlock_irq(&slow_work_queue_lock); | ||
320 | slow_work_clear_thread_processing(id); | ||
321 | return true; | ||
322 | } | ||
323 | |||
324 | /** | ||
325 | * slow_work_sleep_till_thread_needed - Sleep till thread needed by other work | ||
326 | * work: The work item under execution that wants to sleep | ||
327 | * _timeout: Scheduler sleep timeout | ||
328 | * | ||
329 | * Allow a requeueable work item to sleep on a slow-work processor thread until | ||
330 | * that thread is needed to do some other work or the sleep is interrupted by | ||
331 | * some other event. | ||
332 | * | ||
333 | * The caller must set up a wake up event before calling this and must have set | ||
334 | * the appropriate sleep mode (such as TASK_UNINTERRUPTIBLE) and tested its own | ||
335 | * condition before calling this function as no test is made here. | ||
336 | * | ||
337 | * False is returned if there is nothing on the queue; true is returned if the | ||
338 | * work item should be requeued | ||
339 | */ | ||
340 | bool slow_work_sleep_till_thread_needed(struct slow_work *work, | ||
341 | signed long *_timeout) | ||
342 | { | ||
343 | wait_queue_head_t *wfo_wq; | ||
344 | struct list_head *queue; | ||
345 | |||
346 | DEFINE_WAIT(wait); | ||
347 | |||
348 | if (test_bit(SLOW_WORK_VERY_SLOW, &work->flags)) { | ||
349 | wfo_wq = &vslow_work_queue_waits_for_occupation; | ||
350 | queue = &vslow_work_queue; | ||
351 | } else { | ||
352 | wfo_wq = &slow_work_queue_waits_for_occupation; | ||
353 | queue = &slow_work_queue; | ||
354 | } | ||
355 | |||
356 | if (!list_empty(queue)) | ||
357 | return true; | ||
358 | |||
359 | add_wait_queue_exclusive(wfo_wq, &wait); | ||
360 | if (list_empty(queue)) | ||
361 | *_timeout = schedule_timeout(*_timeout); | ||
362 | finish_wait(wfo_wq, &wait); | ||
363 | |||
364 | return !list_empty(queue); | ||
365 | } | ||
366 | EXPORT_SYMBOL(slow_work_sleep_till_thread_needed); | ||
367 | |||
368 | /** | ||
369 | * slow_work_enqueue - Schedule a slow work item for processing | ||
370 | * @work: The work item to queue | ||
371 | * | ||
372 | * Schedule a slow work item for processing. If the item is already undergoing | ||
373 | * execution, this guarantees not to re-enter the execution routine until the | ||
374 | * first execution finishes. | ||
375 | * | ||
376 | * The item is pinned by this function as it retains a reference to it, managed | ||
377 | * through the item operations. The item is unpinned once it has been | ||
378 | * executed. | ||
379 | * | ||
380 | * An item may hog the thread that is running it for a relatively large amount | ||
381 | * of time, sufficient, for example, to perform several lookup, mkdir, create | ||
382 | * and setxattr operations. It may sleep on I/O and may sleep to obtain locks. | ||
383 | * | ||
384 | * Conversely, if a number of items are awaiting processing, it may take some | ||
385 | * time before any given item is given attention. The number of threads in the | ||
386 | * pool may be increased to deal with demand, but only up to a limit. | ||
387 | * | ||
388 | * If SLOW_WORK_VERY_SLOW is set on the work item, then it will be placed in | ||
389 | * the very slow queue, from which only a portion of the threads will be | ||
390 | * allowed to pick items to execute. This ensures that very slow items won't | ||
391 | * overly block ones that are just ordinarily slow. | ||
392 | * | ||
393 | * Returns 0 if successful, -EAGAIN if not (or -ECANCELED if cancelled work is | ||
394 | * attempted queued) | ||
395 | */ | ||
396 | int slow_work_enqueue(struct slow_work *work) | ||
397 | { | ||
398 | wait_queue_head_t *wfo_wq; | ||
399 | struct list_head *queue; | ||
400 | unsigned long flags; | ||
401 | int ret; | ||
402 | |||
403 | if (test_bit(SLOW_WORK_CANCELLING, &work->flags)) | ||
404 | return -ECANCELED; | ||
405 | |||
406 | BUG_ON(slow_work_user_count <= 0); | ||
407 | BUG_ON(!work); | ||
408 | BUG_ON(!work->ops); | ||
409 | |||
410 | /* when honouring an enqueue request, we only promise that we will run | ||
411 | * the work function in the future; we do not promise to run it once | ||
412 | * per enqueue request | ||
413 | * | ||
414 | * we use the PENDING bit to merge together repeat requests without | ||
415 | * having to disable IRQs and take the spinlock, whilst still | ||
416 | * maintaining our promise | ||
417 | */ | ||
418 | if (!test_and_set_bit_lock(SLOW_WORK_PENDING, &work->flags)) { | ||
419 | if (test_bit(SLOW_WORK_VERY_SLOW, &work->flags)) { | ||
420 | wfo_wq = &vslow_work_queue_waits_for_occupation; | ||
421 | queue = &vslow_work_queue; | ||
422 | } else { | ||
423 | wfo_wq = &slow_work_queue_waits_for_occupation; | ||
424 | queue = &slow_work_queue; | ||
425 | } | ||
426 | |||
427 | spin_lock_irqsave(&slow_work_queue_lock, flags); | ||
428 | |||
429 | if (unlikely(test_bit(SLOW_WORK_CANCELLING, &work->flags))) | ||
430 | goto cancelled; | ||
431 | |||
432 | /* we promise that we will not attempt to execute the work | ||
433 | * function in more than one thread simultaneously | ||
434 | * | ||
435 | * this, however, leaves us with a problem if we're asked to | ||
436 | * enqueue the work whilst someone is executing the work | ||
437 | * function as simply queueing the work immediately means that | ||
438 | * another thread may try executing it whilst it is already | ||
439 | * under execution | ||
440 | * | ||
441 | * to deal with this, we set the ENQ_DEFERRED bit instead of | ||
442 | * enqueueing, and the thread currently executing the work | ||
443 | * function will enqueue the work item when the work function | ||
444 | * returns and it has cleared the EXECUTING bit | ||
445 | */ | ||
446 | if (test_bit(SLOW_WORK_EXECUTING, &work->flags)) { | ||
447 | set_bit(SLOW_WORK_ENQ_DEFERRED, &work->flags); | ||
448 | } else { | ||
449 | ret = slow_work_get_ref(work); | ||
450 | if (ret < 0) | ||
451 | goto failed; | ||
452 | slow_work_mark_time(work); | ||
453 | list_add_tail(&work->link, queue); | ||
454 | wake_up(&slow_work_thread_wq); | ||
455 | |||
456 | /* if someone who could be requeued is sleeping on a | ||
457 | * thread, then ask them to yield their thread */ | ||
458 | if (work->link.prev == queue) | ||
459 | wake_up(wfo_wq); | ||
460 | } | ||
461 | |||
462 | spin_unlock_irqrestore(&slow_work_queue_lock, flags); | ||
463 | } | ||
464 | return 0; | ||
465 | |||
466 | cancelled: | ||
467 | ret = -ECANCELED; | ||
468 | failed: | ||
469 | spin_unlock_irqrestore(&slow_work_queue_lock, flags); | ||
470 | return ret; | ||
471 | } | ||
472 | EXPORT_SYMBOL(slow_work_enqueue); | ||
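For readers without the now-removed header to hand, a minimal usage sketch of the API documented above. The my_* names are illustrative only; the ops layout follows include/linux/slow_work.h as it stood before this removal:

	#include <linux/module.h>
	#include <linux/slow_work.h>

	static int my_get_ref(struct slow_work *work)
	{
		/* pin the object embedding the work item; 0 on success */
		return 0;
	}

	static void my_put_ref(struct slow_work *work)
	{
		/* drop the reference taken in my_get_ref() */
	}

	static void my_execute(struct slow_work *work)
	{
		/* the slow part: may sleep, do I/O and take locks */
	}

	static const struct slow_work_ops my_ops = {
		.owner   = THIS_MODULE,
		.get_ref = my_get_ref,
		.put_ref = my_put_ref,
		.execute = my_execute,
	};

	static struct slow_work my_work;

	static int my_setup_and_kick(void)
	{
		slow_work_init(&my_work, &my_ops);

		/* may be called repeatedly; duplicate requests are merged
		 * via the PENDING bit as described above */
		return slow_work_enqueue(&my_work);	/* 0, or e.g. -ECANCELED */
	}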
473 | |||
474 | static int slow_work_wait(void *word) | ||
475 | { | ||
476 | schedule(); | ||
477 | return 0; | ||
478 | } | ||
479 | |||
480 | /** | ||
481 | * slow_work_cancel - Cancel a slow work item | ||
482 | * @work: The work item to cancel | ||
483 | * | ||
484 | * This function will cancel a previously enqueued work item. If we cannot | ||
485 | * cancel the work item, it is guaranteed to have run when this function | ||
486 | * returns. | ||
487 | */ | ||
488 | void slow_work_cancel(struct slow_work *work) | ||
489 | { | ||
490 | bool wait = true, put = false; | ||
491 | |||
492 | set_bit(SLOW_WORK_CANCELLING, &work->flags); | ||
493 | smp_mb(); | ||
494 | |||
495 | /* if the work item is a delayed work item with an active timer, we | ||
496 | * need to wait for the timer to finish _before_ getting the spinlock, | ||
497 | * lest we deadlock against the timer routine | ||
498 | * | ||
499 | * the timer routine will leave DELAYED set if it notices the | ||
500 | * CANCELLING flag in time | ||
501 | */ | ||
502 | if (test_bit(SLOW_WORK_DELAYED, &work->flags)) { | ||
503 | struct delayed_slow_work *dwork = | ||
504 | container_of(work, struct delayed_slow_work, work); | ||
505 | del_timer_sync(&dwork->timer); | ||
506 | } | ||
507 | |||
508 | spin_lock_irq(&slow_work_queue_lock); | ||
509 | |||
510 | if (test_bit(SLOW_WORK_DELAYED, &work->flags)) { | ||
511 | /* the timer routine aborted or never happened, so we are left | ||
512 | * holding the timer's reference on the item and should just | ||
513 | * drop the pending flag and wait for any ongoing execution to | ||
514 | * finish */ | ||
515 | struct delayed_slow_work *dwork = | ||
516 | container_of(work, struct delayed_slow_work, work); | ||
517 | |||
518 | BUG_ON(timer_pending(&dwork->timer)); | ||
519 | BUG_ON(!list_empty(&work->link)); | ||
520 | |||
521 | clear_bit(SLOW_WORK_DELAYED, &work->flags); | ||
522 | put = true; | ||
523 | clear_bit(SLOW_WORK_PENDING, &work->flags); | ||
524 | |||
525 | } else if (test_bit(SLOW_WORK_PENDING, &work->flags) && | ||
526 | !list_empty(&work->link)) { | ||
527 | /* the link in the pending queue holds a reference on the item | ||
528 | * that we will need to release */ | ||
529 | list_del_init(&work->link); | ||
530 | wait = false; | ||
531 | put = true; | ||
532 | clear_bit(SLOW_WORK_PENDING, &work->flags); | ||
533 | |||
534 | } else if (test_and_clear_bit(SLOW_WORK_ENQ_DEFERRED, &work->flags)) { | ||
535 | /* the executor is holding our only reference on the item, so | ||
536 | * we merely need to wait for it to finish executing */ | ||
537 | clear_bit(SLOW_WORK_PENDING, &work->flags); | ||
538 | } | ||
539 | |||
540 | spin_unlock_irq(&slow_work_queue_lock); | ||
541 | |||
542 | /* the EXECUTING flag is set by the executor whilst the spinlock is set | ||
543 | * and before the item is dequeued - so assuming the above doesn't | ||
544 | * actually dequeue it, simply waiting for the EXECUTING flag to be | ||
545 | * released here should be sufficient */ | ||
546 | if (wait) | ||
547 | wait_on_bit(&work->flags, SLOW_WORK_EXECUTING, slow_work_wait, | ||
548 | TASK_UNINTERRUPTIBLE); | ||
549 | |||
550 | clear_bit(SLOW_WORK_CANCELLING, &work->flags); | ||
551 | if (put) | ||
552 | slow_work_put_ref(work); | ||
553 | } | ||
554 | EXPORT_SYMBOL(slow_work_cancel); | ||
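A corresponding cancellation sketch, reusing the illustrative my_work above: on return either the item never ran or its ->execute() has completed, so the embedding object can be torn down safely.

	static void my_teardown(void)
	{
		slow_work_cancel(&my_work);
		/* no execution of my_work is in progress past this point */
	}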
555 | |||
556 | /* | ||
557 | * Handle expiry of the delay timer, indicating that a delayed slow work item | ||
558 | * should now be queued if not cancelled | ||
559 | */ | ||
560 | static void delayed_slow_work_timer(unsigned long data) | ||
561 | { | ||
562 | wait_queue_head_t *wfo_wq; | ||
563 | struct list_head *queue; | ||
564 | struct slow_work *work = (struct slow_work *) data; | ||
565 | unsigned long flags; | ||
566 | bool queued = false, put = false, first = false; | ||
567 | |||
568 | if (test_bit(SLOW_WORK_VERY_SLOW, &work->flags)) { | ||
569 | wfo_wq = &vslow_work_queue_waits_for_occupation; | ||
570 | queue = &vslow_work_queue; | ||
571 | } else { | ||
572 | wfo_wq = &slow_work_queue_waits_for_occupation; | ||
573 | queue = &slow_work_queue; | ||
574 | } | ||
575 | |||
576 | spin_lock_irqsave(&slow_work_queue_lock, flags); | ||
577 | if (likely(!test_bit(SLOW_WORK_CANCELLING, &work->flags))) { | ||
578 | clear_bit(SLOW_WORK_DELAYED, &work->flags); | ||
579 | |||
580 | if (test_bit(SLOW_WORK_EXECUTING, &work->flags)) { | ||
581 | /* we discard the reference the timer was holding in | ||
582 | * favour of the one the executor holds */ | ||
583 | set_bit(SLOW_WORK_ENQ_DEFERRED, &work->flags); | ||
584 | put = true; | ||
585 | } else { | ||
586 | slow_work_mark_time(work); | ||
587 | list_add_tail(&work->link, queue); | ||
588 | queued = true; | ||
589 | if (work->link.prev == queue) | ||
590 | first = true; | ||
591 | } | ||
592 | } | ||
593 | |||
594 | spin_unlock_irqrestore(&slow_work_queue_lock, flags); | ||
595 | if (put) | ||
596 | slow_work_put_ref(work); | ||
597 | if (first) | ||
598 | wake_up(wfo_wq); | ||
599 | if (queued) | ||
600 | wake_up(&slow_work_thread_wq); | ||
601 | } | ||
602 | |||
603 | /** | ||
604 | * delayed_slow_work_enqueue - Schedule a delayed slow work item for processing | ||
605 | * @dwork: The delayed work item to queue | ||
606 | * @delay: When to start executing the work, in jiffies from now | ||
607 | * | ||
608 | * This is similar to slow_work_enqueue(), but it adds a delay before the work | ||
609 | * is actually queued for processing. | ||
610 | * | ||
611 | * The item can have delayed processing requested on it whilst it is being | ||
612 | * executed. The delay will begin immediately, and if it expires before the | ||
613 | * item finishes executing, the item will be placed back on the queue when it | ||
614 | * has finished executing. | ||
615 | */ | ||
616 | int delayed_slow_work_enqueue(struct delayed_slow_work *dwork, | ||
617 | unsigned long delay) | ||
618 | { | ||
619 | struct slow_work *work = &dwork->work; | ||
620 | unsigned long flags; | ||
621 | int ret; | ||
622 | |||
623 | if (delay == 0) | ||
624 | return slow_work_enqueue(&dwork->work); | ||
625 | |||
626 | BUG_ON(slow_work_user_count <= 0); | ||
627 | BUG_ON(!work); | ||
628 | BUG_ON(!work->ops); | ||
629 | |||
630 | if (test_bit(SLOW_WORK_CANCELLING, &work->flags)) | ||
631 | return -ECANCELED; | ||
632 | |||
633 | if (!test_and_set_bit_lock(SLOW_WORK_PENDING, &work->flags)) { | ||
634 | spin_lock_irqsave(&slow_work_queue_lock, flags); | ||
635 | |||
636 | if (test_bit(SLOW_WORK_CANCELLING, &work->flags)) | ||
637 | goto cancelled; | ||
638 | |||
639 | /* the timer holds a reference whilst it is pending */ | ||
640 | ret = slow_work_get_ref(work); | ||
641 | if (ret < 0) | ||
642 | goto cant_get_ref; | ||
643 | |||
644 | if (test_and_set_bit(SLOW_WORK_DELAYED, &work->flags)) | ||
645 | BUG(); | ||
646 | dwork->timer.expires = jiffies + delay; | ||
647 | dwork->timer.data = (unsigned long) work; | ||
648 | dwork->timer.function = delayed_slow_work_timer; | ||
649 | add_timer(&dwork->timer); | ||
650 | |||
651 | spin_unlock_irqrestore(&slow_work_queue_lock, flags); | ||
652 | } | ||
653 | |||
654 | return 0; | ||
655 | |||
656 | cancelled: | ||
657 | ret = -ECANCELED; | ||
658 | cant_get_ref: | ||
659 | spin_unlock_irqrestore(&slow_work_queue_lock, flags); | ||
660 | return ret; | ||
661 | } | ||
662 | EXPORT_SYMBOL(delayed_slow_work_enqueue); | ||
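And a delayed variant, assuming delayed_slow_work_init() from the same removed header; the five-second figure is arbitrary:

	static struct delayed_slow_work my_dwork;

	static int my_kick_later(void)
	{
		delayed_slow_work_init(&my_dwork, &my_ops);

		/* a zero delay falls straight through to slow_work_enqueue() */
		return delayed_slow_work_enqueue(&my_dwork, 5 * HZ);
	}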
663 | |||
664 | /* | ||
665 | * Schedule a cull of the thread pool at some time in the near future | ||
666 | */ | ||
667 | static void slow_work_schedule_cull(void) | ||
668 | { | ||
669 | mod_timer(&slow_work_cull_timer, | ||
670 | round_jiffies(jiffies + SLOW_WORK_CULL_TIMEOUT)); | ||
671 | } | ||
672 | |||
673 | /* | ||
674 | * Worker thread culling algorithm | ||
675 | */ | ||
676 | static bool slow_work_cull_thread(void) | ||
677 | { | ||
678 | unsigned long flags; | ||
679 | bool do_cull = false; | ||
680 | |||
681 | spin_lock_irqsave(&slow_work_queue_lock, flags); | ||
682 | |||
683 | if (slow_work_cull) { | ||
684 | slow_work_cull = false; | ||
685 | |||
686 | if (list_empty(&slow_work_queue) && | ||
687 | list_empty(&vslow_work_queue) && | ||
688 | atomic_read(&slow_work_thread_count) > | ||
689 | slow_work_min_threads) { | ||
690 | slow_work_schedule_cull(); | ||
691 | do_cull = true; | ||
692 | } | ||
693 | } | ||
694 | |||
695 | spin_unlock_irqrestore(&slow_work_queue_lock, flags); | ||
696 | return do_cull; | ||
697 | } | ||
698 | |||
699 | /* | ||
700 | * Determine if there is slow work available for dispatch | ||
701 | */ | ||
702 | static inline bool slow_work_available(int vsmax) | ||
703 | { | ||
704 | return !list_empty(&slow_work_queue) || | ||
705 | (!list_empty(&vslow_work_queue) && | ||
706 | atomic_read(&vslow_work_executing_count) < vsmax); | ||
707 | } | ||
708 | |||
709 | /* | ||
710 | * Worker thread dispatcher | ||
711 | */ | ||
712 | static int slow_work_thread(void *_data) | ||
713 | { | ||
714 | int vsmax, id; | ||
715 | |||
716 | DEFINE_WAIT(wait); | ||
717 | |||
718 | set_freezable(); | ||
719 | set_user_nice(current, -5); | ||
720 | |||
721 | /* allocate ourselves an ID */ | ||
722 | spin_lock_irq(&slow_work_queue_lock); | ||
723 | id = find_first_zero_bit(slow_work_ids, SLOW_WORK_THREAD_LIMIT); | ||
724 | BUG_ON(id < 0 || id >= SLOW_WORK_THREAD_LIMIT); | ||
725 | __set_bit(id, slow_work_ids); | ||
726 | slow_work_set_thread_pid(id, current->pid); | ||
727 | spin_unlock_irq(&slow_work_queue_lock); | ||
728 | |||
729 | sprintf(current->comm, "kslowd%03u", id); | ||
730 | |||
731 | for (;;) { | ||
732 | vsmax = vslow_work_proportion; | ||
733 | vsmax *= atomic_read(&slow_work_thread_count); | ||
734 | vsmax /= 100; | ||
735 | |||
736 | prepare_to_wait_exclusive(&slow_work_thread_wq, &wait, | ||
737 | TASK_INTERRUPTIBLE); | ||
738 | if (!freezing(current) && | ||
739 | !slow_work_threads_should_exit && | ||
740 | !slow_work_available(vsmax) && | ||
741 | !slow_work_cull) | ||
742 | schedule(); | ||
743 | finish_wait(&slow_work_thread_wq, &wait); | ||
744 | |||
745 | try_to_freeze(); | ||
746 | |||
747 | vsmax = vslow_work_proportion; | ||
748 | vsmax *= atomic_read(&slow_work_thread_count); | ||
749 | vsmax /= 100; | ||
750 | |||
751 | if (slow_work_available(vsmax) && slow_work_execute(id)) { | ||
752 | cond_resched(); | ||
753 | if (list_empty(&slow_work_queue) && | ||
754 | list_empty(&vslow_work_queue) && | ||
755 | atomic_read(&slow_work_thread_count) > | ||
756 | slow_work_min_threads) | ||
757 | slow_work_schedule_cull(); | ||
758 | continue; | ||
759 | } | ||
760 | |||
761 | if (slow_work_threads_should_exit) | ||
762 | break; | ||
763 | |||
764 | if (slow_work_cull && slow_work_cull_thread()) | ||
765 | break; | ||
766 | } | ||
767 | |||
768 | spin_lock_irq(&slow_work_queue_lock); | ||
769 | slow_work_set_thread_pid(id, 0); | ||
770 | __clear_bit(id, slow_work_ids); | ||
771 | spin_unlock_irq(&slow_work_queue_lock); | ||
772 | |||
773 | if (atomic_dec_and_test(&slow_work_thread_count)) | ||
774 | complete_and_exit(&slow_work_last_thread_exited, 0); | ||
775 | return 0; | ||
776 | } | ||
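To make the vsmax calculation in the dispatcher loop above concrete (the figures are only an example): with vslow_work_proportion set to 50 and eight threads in the pool,

	vsmax = 50 * 8 / 100;	/* = 4 */

so at most four threads may take items from the very-slow queue at once, which is the cap slow_work_available() enforces through vslow_work_executing_count.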
777 | |||
778 | /* | ||
779 | * Handle thread cull timer expiration | ||
780 | */ | ||
781 | static void slow_work_cull_timeout(unsigned long data) | ||
782 | { | ||
783 | slow_work_cull = true; | ||
784 | wake_up(&slow_work_thread_wq); | ||
785 | } | ||
786 | |||
787 | /* | ||
788 | * Start a new slow work thread | ||
789 | */ | ||
790 | static void slow_work_new_thread_execute(struct slow_work *work) | ||
791 | { | ||
792 | struct task_struct *p; | ||
793 | |||
794 | if (slow_work_threads_should_exit) | ||
795 | return; | ||
796 | |||
797 | if (atomic_read(&slow_work_thread_count) >= slow_work_max_threads) | ||
798 | return; | ||
799 | |||
800 | if (!mutex_trylock(&slow_work_user_lock)) | ||
801 | return; | ||
802 | |||
803 | slow_work_may_not_start_new_thread = true; | ||
804 | atomic_inc(&slow_work_thread_count); | ||
805 | p = kthread_run(slow_work_thread, NULL, "kslowd"); | ||
806 | if (IS_ERR(p)) { | ||
807 | printk(KERN_DEBUG "Slow work thread pool: OOM\n"); | ||
808 | if (atomic_dec_and_test(&slow_work_thread_count)) | ||
809 | BUG(); /* we're running on a slow work thread... */ | ||
810 | mod_timer(&slow_work_oom_timer, | ||
811 | round_jiffies(jiffies + SLOW_WORK_OOM_TIMEOUT)); | ||
812 | } else { | ||
813 | /* ratelimit the starting of new threads */ | ||
814 | mod_timer(&slow_work_oom_timer, jiffies + 1); | ||
815 | } | ||
816 | |||
817 | mutex_unlock(&slow_work_user_lock); | ||
818 | } | ||
819 | |||
820 | static const struct slow_work_ops slow_work_new_thread_ops = { | ||
821 | .owner = THIS_MODULE, | ||
822 | .execute = slow_work_new_thread_execute, | ||
823 | #ifdef CONFIG_SLOW_WORK_DEBUG | ||
824 | .desc = slow_work_new_thread_desc, | ||
825 | #endif | ||
826 | }; | ||
827 | |||
828 | /* | ||
829 | * post-OOM new thread start suppression expiration | ||
830 | */ | ||
831 | static void slow_work_oom_timeout(unsigned long data) | ||
832 | { | ||
833 | slow_work_may_not_start_new_thread = false; | ||
834 | } | ||
835 | |||
836 | #ifdef CONFIG_SYSCTL | ||
837 | /* | ||
838 | * Handle adjustment of the minimum number of threads | ||
839 | */ | ||
840 | static int slow_work_min_threads_sysctl(struct ctl_table *table, int write, | ||
841 | void __user *buffer, | ||
842 | size_t *lenp, loff_t *ppos) | ||
843 | { | ||
844 | int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); | ||
845 | int n; | ||
846 | |||
847 | if (ret == 0) { | ||
848 | mutex_lock(&slow_work_user_lock); | ||
849 | if (slow_work_user_count > 0) { | ||
850 | /* see if we need to start or stop threads */ | ||
851 | n = atomic_read(&slow_work_thread_count) - | ||
852 | slow_work_min_threads; | ||
853 | |||
854 | if (n < 0 && !slow_work_may_not_start_new_thread) | ||
855 | slow_work_enqueue(&slow_work_new_thread); | ||
856 | else if (n > 0) | ||
857 | slow_work_schedule_cull(); | ||
858 | } | ||
859 | mutex_unlock(&slow_work_user_lock); | ||
860 | } | ||
861 | |||
862 | return ret; | ||
863 | } | ||
864 | |||
865 | /* | ||
866 | * Handle adjustment of the maximum number of threads | ||
867 | */ | ||
868 | static int slow_work_max_threads_sysctl(struct ctl_table *table, int write, | ||
869 | void __user *buffer, | ||
870 | size_t *lenp, loff_t *ppos) | ||
871 | { | ||
872 | int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); | ||
873 | int n; | ||
874 | |||
875 | if (ret == 0) { | ||
876 | mutex_lock(&slow_work_user_lock); | ||
877 | if (slow_work_user_count > 0) { | ||
878 | /* see if we need to stop threads */ | ||
879 | n = slow_work_max_threads - | ||
880 | atomic_read(&slow_work_thread_count); | ||
881 | |||
882 | if (n < 0) | ||
883 | slow_work_schedule_cull(); | ||
884 | } | ||
885 | mutex_unlock(&slow_work_user_lock); | ||
886 | } | ||
887 | |||
888 | return ret; | ||
889 | } | ||
890 | #endif /* CONFIG_SYSCTL */ | ||
891 | |||
892 | /** | ||
893 | * slow_work_register_user - Register a user of the facility | ||
894 | * @module: The module about to make use of the facility | ||
895 | * | ||
896 | * Register a user of the facility, starting up the initial threads if there | ||
897 | * aren't any other users at this point. This will return 0 if successful, or | ||
898 | * an error if not. | ||
899 | */ | ||
900 | int slow_work_register_user(struct module *module) | ||
901 | { | ||
902 | struct task_struct *p; | ||
903 | int loop; | ||
904 | |||
905 | mutex_lock(&slow_work_user_lock); | ||
906 | |||
907 | if (slow_work_user_count == 0) { | ||
908 | printk(KERN_NOTICE "Slow work thread pool: Starting up\n"); | ||
909 | init_completion(&slow_work_last_thread_exited); | ||
910 | |||
911 | slow_work_threads_should_exit = false; | ||
912 | slow_work_init(&slow_work_new_thread, | ||
913 | &slow_work_new_thread_ops); | ||
914 | slow_work_may_not_start_new_thread = false; | ||
915 | slow_work_cull = false; | ||
916 | |||
917 | /* start the minimum number of threads */ | ||
918 | for (loop = 0; loop < slow_work_min_threads; loop++) { | ||
919 | atomic_inc(&slow_work_thread_count); | ||
920 | p = kthread_run(slow_work_thread, NULL, "kslowd"); | ||
921 | if (IS_ERR(p)) | ||
922 | goto error; | ||
923 | } | ||
924 | printk(KERN_NOTICE "Slow work thread pool: Ready\n"); | ||
925 | } | ||
926 | |||
927 | slow_work_user_count++; | ||
928 | mutex_unlock(&slow_work_user_lock); | ||
929 | return 0; | ||
930 | |||
931 | error: | ||
932 | if (atomic_dec_and_test(&slow_work_thread_count)) | ||
933 | complete(&slow_work_last_thread_exited); | ||
934 | if (loop > 0) { | ||
935 | printk(KERN_ERR "Slow work thread pool:" | ||
936 | " Aborting startup on ENOMEM\n"); | ||
937 | slow_work_threads_should_exit = true; | ||
938 | wake_up_all(&slow_work_thread_wq); | ||
939 | wait_for_completion(&slow_work_last_thread_exited); | ||
940 | printk(KERN_ERR "Slow work thread pool: Aborted\n"); | ||
941 | } | ||
942 | mutex_unlock(&slow_work_user_lock); | ||
943 | return PTR_ERR(p); | ||
944 | } | ||
945 | EXPORT_SYMBOL(slow_work_register_user); | ||
946 | |||
947 | /* | ||
948 | * wait for all outstanding items from the calling module to complete | ||
949 | * - note that more items may be queued whilst we're waiting | ||
950 | */ | ||
951 | static void slow_work_wait_for_items(struct module *module) | ||
952 | { | ||
953 | #ifdef CONFIG_MODULES | ||
954 | DECLARE_WAITQUEUE(myself, current); | ||
955 | struct slow_work *work; | ||
956 | int loop; | ||
957 | |||
958 | mutex_lock(&slow_work_unreg_sync_lock); | ||
959 | add_wait_queue(&slow_work_unreg_wq, &myself); | ||
960 | |||
961 | for (;;) { | ||
962 | spin_lock_irq(&slow_work_queue_lock); | ||
963 | |||
964 | /* first of all, we wait for the last queued item in each list | ||
965 | * to be processed */ | ||
966 | list_for_each_entry_reverse(work, &vslow_work_queue, link) { | ||
967 | if (work->owner == module) { | ||
968 | set_current_state(TASK_UNINTERRUPTIBLE); | ||
969 | slow_work_unreg_work_item = work; | ||
970 | goto do_wait; | ||
971 | } | ||
972 | } | ||
973 | list_for_each_entry_reverse(work, &slow_work_queue, link) { | ||
974 | if (work->owner == module) { | ||
975 | set_current_state(TASK_UNINTERRUPTIBLE); | ||
976 | slow_work_unreg_work_item = work; | ||
977 | goto do_wait; | ||
978 | } | ||
979 | } | ||
980 | |||
981 | /* then we wait for the items being processed to finish */ | ||
982 | slow_work_unreg_module = module; | ||
983 | smp_mb(); | ||
984 | for (loop = 0; loop < SLOW_WORK_THREAD_LIMIT; loop++) { | ||
985 | if (slow_work_thread_processing[loop] == module) | ||
986 | goto do_wait; | ||
987 | } | ||
988 | spin_unlock_irq(&slow_work_queue_lock); | ||
989 | break; /* okay, we're done */ | ||
990 | |||
991 | do_wait: | ||
992 | spin_unlock_irq(&slow_work_queue_lock); | ||
993 | schedule(); | ||
994 | slow_work_unreg_work_item = NULL; | ||
995 | slow_work_unreg_module = NULL; | ||
996 | } | ||
997 | |||
998 | remove_wait_queue(&slow_work_unreg_wq, &myself); | ||
999 | mutex_unlock(&slow_work_unreg_sync_lock); | ||
1000 | #endif /* CONFIG_MODULES */ | ||
1001 | } | ||
1002 | |||
1003 | /** | ||
1004 | * slow_work_unregister_user - Unregister a user of the facility | ||
1005 | * @module: The module whose items should be cleared | ||
1006 | * | ||
1007 | * Unregister a user of the facility, killing all the threads if this was the | ||
1008 | * last one. | ||
1009 | * | ||
1010 | * This waits for all the work items belonging to the nominated module to go | ||
1011 | * away before proceeding. | ||
1012 | */ | ||
1013 | void slow_work_unregister_user(struct module *module) | ||
1014 | { | ||
1015 | /* first of all, wait for all outstanding items from the calling module | ||
1016 | * to complete */ | ||
1017 | if (module) | ||
1018 | slow_work_wait_for_items(module); | ||
1019 | |||
1020 | /* then we can actually go about shutting down the facility if need | ||
1021 | * be */ | ||
1022 | mutex_lock(&slow_work_user_lock); | ||
1023 | |||
1024 | BUG_ON(slow_work_user_count <= 0); | ||
1025 | |||
1026 | slow_work_user_count--; | ||
1027 | if (slow_work_user_count == 0) { | ||
1028 | printk(KERN_NOTICE "Slow work thread pool: Shutting down\n"); | ||
1029 | slow_work_threads_should_exit = true; | ||
1030 | del_timer_sync(&slow_work_cull_timer); | ||
1031 | del_timer_sync(&slow_work_oom_timer); | ||
1032 | wake_up_all(&slow_work_thread_wq); | ||
1033 | wait_for_completion(&slow_work_last_thread_exited); | ||
1034 | printk(KERN_NOTICE "Slow work thread pool:" | ||
1035 | " Shut down complete\n"); | ||
1036 | } | ||
1037 | |||
1038 | mutex_unlock(&slow_work_user_lock); | ||
1039 | } | ||
1040 | EXPORT_SYMBOL(slow_work_unregister_user); | ||
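Putting the registration calls together, a hedged sketch of how a client module would bracket its use of the facility (module boilerplate omitted; my_* names are illustrative):

	static int __init my_init(void)
	{
		int ret = slow_work_register_user(THIS_MODULE);
		if (ret < 0)
			return ret;
		/* ... initialise and enqueue work items ... */
		return 0;
	}

	static void __exit my_exit(void)
	{
		/* waits for this module's outstanding items, then shuts the
		 * pool down if we were the last user */
		slow_work_unregister_user(THIS_MODULE);
	}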
1041 | |||
1042 | /* | ||
1043 | * Initialise the slow work facility | ||
1044 | */ | ||
1045 | static int __init init_slow_work(void) | ||
1046 | { | ||
1047 | unsigned nr_cpus = num_possible_cpus(); | ||
1048 | |||
1049 | if (slow_work_max_threads < nr_cpus) | ||
1050 | slow_work_max_threads = nr_cpus; | ||
1051 | #ifdef CONFIG_SYSCTL | ||
1052 | if (slow_work_max_max_threads < nr_cpus * 2) | ||
1053 | slow_work_max_max_threads = nr_cpus * 2; | ||
1054 | #endif | ||
1055 | #ifdef CONFIG_SLOW_WORK_DEBUG | ||
1056 | { | ||
1057 | struct dentry *dbdir; | ||
1058 | |||
1059 | dbdir = debugfs_create_dir("slow_work", NULL); | ||
1060 | if (dbdir && !IS_ERR(dbdir)) | ||
1061 | debugfs_create_file("runqueue", S_IFREG | 0400, dbdir, | ||
1062 | NULL, &slow_work_runqueue_fops); | ||
1063 | } | ||
1064 | #endif | ||
1065 | return 0; | ||
1066 | } | ||
1067 | |||
1068 | subsys_initcall(init_slow_work); | ||
diff --git a/kernel/slow-work.h b/kernel/slow-work.h deleted file mode 100644 index a29ebd1ef41d..000000000000 --- a/kernel/slow-work.h +++ /dev/null | |||
@@ -1,72 +0,0 @@ | |||
1 | /* Slow work private definitions | ||
2 | * | ||
3 | * Copyright (C) 2009 Red Hat, Inc. All Rights Reserved. | ||
4 | * Written by David Howells (dhowells@redhat.com) | ||
5 | * | ||
6 | * This program is free software; you can redistribute it and/or | ||
7 | * modify it under the terms of the GNU General Public Licence | ||
8 | * as published by the Free Software Foundation; either version | ||
9 | * 2 of the Licence, or (at your option) any later version. | ||
10 | */ | ||
11 | |||
12 | #define SLOW_WORK_CULL_TIMEOUT (5 * HZ) /* cull threads 5s after running out of | ||
13 | * things to do */ | ||
14 | #define SLOW_WORK_OOM_TIMEOUT (5 * HZ) /* can't start new threads for 5s after | ||
15 | * OOM */ | ||
16 | |||
17 | #define SLOW_WORK_THREAD_LIMIT 255 /* abs maximum number of slow-work threads */ | ||
18 | |||
19 | /* | ||
20 | * slow-work.c | ||
21 | */ | ||
22 | #ifdef CONFIG_SLOW_WORK_DEBUG | ||
23 | extern struct slow_work *slow_work_execs[]; | ||
24 | extern pid_t slow_work_pids[]; | ||
25 | extern rwlock_t slow_work_execs_lock; | ||
26 | #endif | ||
27 | |||
28 | extern struct list_head slow_work_queue; | ||
29 | extern struct list_head vslow_work_queue; | ||
30 | extern spinlock_t slow_work_queue_lock; | ||
31 | |||
32 | /* | ||
33 | * slow-work-debugfs.c | ||
34 | */ | ||
35 | #ifdef CONFIG_SLOW_WORK_DEBUG | ||
36 | extern const struct file_operations slow_work_runqueue_fops; | ||
37 | |||
38 | extern void slow_work_new_thread_desc(struct slow_work *, struct seq_file *); | ||
39 | #endif | ||
40 | |||
41 | /* | ||
42 | * Helper functions | ||
43 | */ | ||
44 | static inline void slow_work_set_thread_pid(int id, pid_t pid) | ||
45 | { | ||
46 | #ifdef CONFIG_SLOW_WORK_DEBUG | ||
47 | slow_work_pids[id] = pid; | ||
48 | #endif | ||
49 | } | ||
50 | |||
51 | static inline void slow_work_mark_time(struct slow_work *work) | ||
52 | { | ||
53 | #ifdef CONFIG_SLOW_WORK_DEBUG | ||
54 | work->mark = CURRENT_TIME; | ||
55 | #endif | ||
56 | } | ||
57 | |||
58 | static inline void slow_work_begin_exec(int id, struct slow_work *work) | ||
59 | { | ||
60 | #ifdef CONFIG_SLOW_WORK_DEBUG | ||
61 | slow_work_execs[id] = work; | ||
62 | #endif | ||
63 | } | ||
64 | |||
65 | static inline void slow_work_end_exec(int id, struct slow_work *work) | ||
66 | { | ||
67 | #ifdef CONFIG_SLOW_WORK_DEBUG | ||
68 | write_lock(&slow_work_execs_lock); | ||
69 | slow_work_execs[id] = NULL; | ||
70 | write_unlock(&slow_work_execs_lock); | ||
71 | #endif | ||
72 | } | ||
diff --git a/kernel/smp.c b/kernel/smp.c index 75c970c715d3..12ed8b013e2d 100644 --- a/kernel/smp.c +++ b/kernel/smp.c | |||
@@ -267,7 +267,7 @@ static DEFINE_PER_CPU_SHARED_ALIGNED(struct call_single_data, csd_data); | |||
267 | * | 267 | * |
268 | * Returns 0 on success, else a negative status code. | 268 | * Returns 0 on success, else a negative status code. |
269 | */ | 269 | */ |
270 | int smp_call_function_single(int cpu, void (*func) (void *info), void *info, | 270 | int smp_call_function_single(int cpu, smp_call_func_t func, void *info, |
271 | int wait) | 271 | int wait) |
272 | { | 272 | { |
273 | struct call_single_data d = { | 273 | struct call_single_data d = { |
@@ -336,7 +336,7 @@ EXPORT_SYMBOL(smp_call_function_single); | |||
336 | * 3) any other online cpu in @mask | 336 | * 3) any other online cpu in @mask |
337 | */ | 337 | */ |
338 | int smp_call_function_any(const struct cpumask *mask, | 338 | int smp_call_function_any(const struct cpumask *mask, |
339 | void (*func)(void *info), void *info, int wait) | 339 | smp_call_func_t func, void *info, int wait) |
340 | { | 340 | { |
341 | unsigned int cpu; | 341 | unsigned int cpu; |
342 | const struct cpumask *nodemask; | 342 | const struct cpumask *nodemask; |
@@ -365,9 +365,10 @@ call: | |||
365 | EXPORT_SYMBOL_GPL(smp_call_function_any); | 365 | EXPORT_SYMBOL_GPL(smp_call_function_any); |
366 | 366 | ||
367 | /** | 367 | /** |
368 | * __smp_call_function_single(): Run a function on another CPU | 368 | * __smp_call_function_single(): Run a function on a specific CPU |
369 | * @cpu: The CPU to run on. | 369 | * @cpu: The CPU to run on. |
370 | * @data: Pre-allocated and setup data structure | 370 | * @data: Pre-allocated and setup data structure |
371 | * @wait: If true, wait until function has completed on specified CPU. | ||
371 | * | 372 | * |
372 | * Like smp_call_function_single(), but allow caller to pass in a | 373 | * Like smp_call_function_single(), but allow caller to pass in a |
373 | * pre-allocated data structure. Useful for embedding @data inside | 374 | * pre-allocated data structure. Useful for embedding @data inside |
@@ -376,8 +377,10 @@ EXPORT_SYMBOL_GPL(smp_call_function_any); | |||
376 | void __smp_call_function_single(int cpu, struct call_single_data *data, | 377 | void __smp_call_function_single(int cpu, struct call_single_data *data, |
377 | int wait) | 378 | int wait) |
378 | { | 379 | { |
379 | csd_lock(data); | 380 | unsigned int this_cpu; |
381 | unsigned long flags; | ||
380 | 382 | ||
383 | this_cpu = get_cpu(); | ||
381 | /* | 384 | /* |
382 | * Can deadlock when called with interrupts disabled. | 385 | * Can deadlock when called with interrupts disabled. |
383 | * We allow cpu's that are not yet online though, as no one else can | 386 | * We allow cpu's that are not yet online though, as no one else can |
@@ -387,7 +390,15 @@ void __smp_call_function_single(int cpu, struct call_single_data *data, | |||
387 | WARN_ON_ONCE(cpu_online(smp_processor_id()) && wait && irqs_disabled() | 390 | WARN_ON_ONCE(cpu_online(smp_processor_id()) && wait && irqs_disabled() |
388 | && !oops_in_progress); | 391 | && !oops_in_progress); |
389 | 392 | ||
390 | generic_exec_single(cpu, data, wait); | 393 | if (cpu == this_cpu) { |
394 | local_irq_save(flags); | ||
395 | data->func(data->info); | ||
396 | local_irq_restore(flags); | ||
397 | } else { | ||
398 | csd_lock(data); | ||
399 | generic_exec_single(cpu, data, wait); | ||
400 | } | ||
401 | put_cpu(); | ||
391 | } | 402 | } |
392 | 403 | ||
393 | /** | 404 | /** |
@@ -405,7 +416,7 @@ void __smp_call_function_single(int cpu, struct call_single_data *data, | |||
405 | * must be disabled when calling this function. | 416 | * must be disabled when calling this function. |
406 | */ | 417 | */ |
407 | void smp_call_function_many(const struct cpumask *mask, | 418 | void smp_call_function_many(const struct cpumask *mask, |
408 | void (*func)(void *), void *info, bool wait) | 419 | smp_call_func_t func, void *info, bool wait) |
409 | { | 420 | { |
410 | struct call_function_data *data; | 421 | struct call_function_data *data; |
411 | unsigned long flags; | 422 | unsigned long flags; |
@@ -489,7 +500,7 @@ EXPORT_SYMBOL(smp_call_function_many); | |||
489 | * You must not call this function with disabled interrupts or from a | 500 | * You must not call this function with disabled interrupts or from a |
490 | * hardware interrupt handler or from a bottom half handler. | 501 | * hardware interrupt handler or from a bottom half handler. |
491 | */ | 502 | */ |
492 | int smp_call_function(void (*func)(void *), void *info, int wait) | 503 | int smp_call_function(smp_call_func_t func, void *info, int wait) |
493 | { | 504 | { |
494 | preempt_disable(); | 505 | preempt_disable(); |
495 | smp_call_function_many(cpu_online_mask, func, info, wait); | 506 | smp_call_function_many(cpu_online_mask, func, info, wait); |
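The smp.c changes above are largely a type cleanup: the open-coded void (*)(void *) pointers become smp_call_func_t, and __smp_call_function_single() now short-circuits the cpu == this_cpu case by running the function directly with interrupts disabled. Callers are unaffected; a minimal sketch (do_poke and the CPU number are made up for illustration):

	#include <linux/smp.h>

	static void do_poke(void *info)
	{
		/* runs on the target CPU in interrupt context: no sleeping */
	}

	static int poke_cpu2(void)
	{
		/* run do_poke on CPU 2 and wait for it to complete */
		return smp_call_function_single(2, do_poke, NULL, 1);
	}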
diff --git a/kernel/softirq.c b/kernel/softirq.c index 07b4f1b1a73a..18f4be0d5fe0 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c | |||
@@ -67,7 +67,7 @@ char *softirq_to_name[NR_SOFTIRQS] = { | |||
67 | * to the pending events, so we let the scheduler balance | 67 | * to the pending events, so we let the scheduler balance |
68 | * the softirq load for us. | 68 | * the softirq load for us. |
69 | */ | 69 | */ |
70 | void wakeup_softirqd(void) | 70 | static void wakeup_softirqd(void) |
71 | { | 71 | { |
72 | /* Interrupts are disabled: no need to stop preemption */ | 72 | /* Interrupts are disabled: no need to stop preemption */ |
73 | struct task_struct *tsk = __get_cpu_var(ksoftirqd); | 73 | struct task_struct *tsk = __get_cpu_var(ksoftirqd); |
@@ -77,11 +77,21 @@ void wakeup_softirqd(void) | |||
77 | } | 77 | } |
78 | 78 | ||
79 | /* | 79 | /* |
80 | * preempt_count and SOFTIRQ_OFFSET usage: | ||
81 | * - preempt_count is changed by SOFTIRQ_OFFSET on entering or leaving | ||
82 | * softirq processing. | ||
83 | * - preempt_count is changed by SOFTIRQ_DISABLE_OFFSET (= 2 * SOFTIRQ_OFFSET) | ||
84 | * on local_bh_disable or local_bh_enable. | ||
85 | * This lets us distinguish between whether we are currently processing | ||
86 | * softirq and whether we just have bh disabled. | ||
87 | */ | ||
88 | |||
89 | /* | ||
80 | * This one is for softirq.c-internal use, | 90 | * This one is for softirq.c-internal use, |
81 | * where hardirqs are disabled legitimately: | 91 | * where hardirqs are disabled legitimately: |
82 | */ | 92 | */ |
83 | #ifdef CONFIG_TRACE_IRQFLAGS | 93 | #ifdef CONFIG_TRACE_IRQFLAGS |
84 | static void __local_bh_disable(unsigned long ip) | 94 | static void __local_bh_disable(unsigned long ip, unsigned int cnt) |
85 | { | 95 | { |
86 | unsigned long flags; | 96 | unsigned long flags; |
87 | 97 | ||
@@ -95,32 +105,43 @@ static void __local_bh_disable(unsigned long ip) | |||
95 | * We must manually increment preempt_count here and manually | 105 | * We must manually increment preempt_count here and manually |
96 | * call the trace_preempt_off later. | 106 | * call the trace_preempt_off later. |
97 | */ | 107 | */ |
98 | preempt_count() += SOFTIRQ_OFFSET; | 108 | preempt_count() += cnt; |
99 | /* | 109 | /* |
100 | * Were softirqs turned off above: | 110 | * Were softirqs turned off above: |
101 | */ | 111 | */ |
102 | if (softirq_count() == SOFTIRQ_OFFSET) | 112 | if (softirq_count() == cnt) |
103 | trace_softirqs_off(ip); | 113 | trace_softirqs_off(ip); |
104 | raw_local_irq_restore(flags); | 114 | raw_local_irq_restore(flags); |
105 | 115 | ||
106 | if (preempt_count() == SOFTIRQ_OFFSET) | 116 | if (preempt_count() == cnt) |
107 | trace_preempt_off(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1)); | 117 | trace_preempt_off(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1)); |
108 | } | 118 | } |
109 | #else /* !CONFIG_TRACE_IRQFLAGS */ | 119 | #else /* !CONFIG_TRACE_IRQFLAGS */ |
110 | static inline void __local_bh_disable(unsigned long ip) | 120 | static inline void __local_bh_disable(unsigned long ip, unsigned int cnt) |
111 | { | 121 | { |
112 | add_preempt_count(SOFTIRQ_OFFSET); | 122 | add_preempt_count(cnt); |
113 | barrier(); | 123 | barrier(); |
114 | } | 124 | } |
115 | #endif /* CONFIG_TRACE_IRQFLAGS */ | 125 | #endif /* CONFIG_TRACE_IRQFLAGS */ |
116 | 126 | ||
117 | void local_bh_disable(void) | 127 | void local_bh_disable(void) |
118 | { | 128 | { |
119 | __local_bh_disable((unsigned long)__builtin_return_address(0)); | 129 | __local_bh_disable((unsigned long)__builtin_return_address(0), |
130 | SOFTIRQ_DISABLE_OFFSET); | ||
120 | } | 131 | } |
121 | 132 | ||
122 | EXPORT_SYMBOL(local_bh_disable); | 133 | EXPORT_SYMBOL(local_bh_disable); |
123 | 134 | ||
135 | static void __local_bh_enable(unsigned int cnt) | ||
136 | { | ||
137 | WARN_ON_ONCE(in_irq()); | ||
138 | WARN_ON_ONCE(!irqs_disabled()); | ||
139 | |||
140 | if (softirq_count() == cnt) | ||
141 | trace_softirqs_on((unsigned long)__builtin_return_address(0)); | ||
142 | sub_preempt_count(cnt); | ||
143 | } | ||
144 | |||
124 | /* | 145 | /* |
125 | * Special-case - softirqs can safely be enabled in | 146 | * Special-case - softirqs can safely be enabled in |
126 | * cond_resched_softirq(), or by __do_softirq(), | 147 | * cond_resched_softirq(), or by __do_softirq(), |
@@ -128,12 +149,7 @@ EXPORT_SYMBOL(local_bh_disable); | |||
128 | */ | 149 | */ |
129 | void _local_bh_enable(void) | 150 | void _local_bh_enable(void) |
130 | { | 151 | { |
131 | WARN_ON_ONCE(in_irq()); | 152 | __local_bh_enable(SOFTIRQ_DISABLE_OFFSET); |
132 | WARN_ON_ONCE(!irqs_disabled()); | ||
133 | |||
134 | if (softirq_count() == SOFTIRQ_OFFSET) | ||
135 | trace_softirqs_on((unsigned long)__builtin_return_address(0)); | ||
136 | sub_preempt_count(SOFTIRQ_OFFSET); | ||
137 | } | 153 | } |
138 | 154 | ||
139 | EXPORT_SYMBOL(_local_bh_enable); | 155 | EXPORT_SYMBOL(_local_bh_enable); |
@@ -147,13 +163,13 @@ static inline void _local_bh_enable_ip(unsigned long ip) | |||
147 | /* | 163 | /* |
148 | * Are softirqs going to be turned on now: | 164 | * Are softirqs going to be turned on now: |
149 | */ | 165 | */ |
150 | if (softirq_count() == SOFTIRQ_OFFSET) | 166 | if (softirq_count() == SOFTIRQ_DISABLE_OFFSET) |
151 | trace_softirqs_on(ip); | 167 | trace_softirqs_on(ip); |
152 | /* | 168 | /* |
153 | * Keep preemption disabled until we are done with | 169 | * Keep preemption disabled until we are done with |
154 | * softirq processing: | 170 | * softirq processing: |
155 | */ | 171 | */ |
156 | sub_preempt_count(SOFTIRQ_OFFSET - 1); | 172 | sub_preempt_count(SOFTIRQ_DISABLE_OFFSET - 1); |
157 | 173 | ||
158 | if (unlikely(!in_interrupt() && local_softirq_pending())) | 174 | if (unlikely(!in_interrupt() && local_softirq_pending())) |
159 | do_softirq(); | 175 | do_softirq(); |
@@ -198,7 +214,8 @@ asmlinkage void __do_softirq(void) | |||
198 | pending = local_softirq_pending(); | 214 | pending = local_softirq_pending(); |
199 | account_system_vtime(current); | 215 | account_system_vtime(current); |
200 | 216 | ||
201 | __local_bh_disable((unsigned long)__builtin_return_address(0)); | 217 | __local_bh_disable((unsigned long)__builtin_return_address(0), |
218 | SOFTIRQ_OFFSET); | ||
202 | lockdep_softirq_enter(); | 219 | lockdep_softirq_enter(); |
203 | 220 | ||
204 | cpu = smp_processor_id(); | 221 | cpu = smp_processor_id(); |
@@ -212,18 +229,20 @@ restart: | |||
212 | 229 | ||
213 | do { | 230 | do { |
214 | if (pending & 1) { | 231 | if (pending & 1) { |
232 | unsigned int vec_nr = h - softirq_vec; | ||
215 | int prev_count = preempt_count(); | 233 | int prev_count = preempt_count(); |
216 | kstat_incr_softirqs_this_cpu(h - softirq_vec); | ||
217 | 234 | ||
218 | trace_softirq_entry(h, softirq_vec); | 235 | kstat_incr_softirqs_this_cpu(vec_nr); |
236 | |||
237 | trace_softirq_entry(vec_nr); | ||
219 | h->action(h); | 238 | h->action(h); |
220 | trace_softirq_exit(h, softirq_vec); | 239 | trace_softirq_exit(vec_nr); |
221 | if (unlikely(prev_count != preempt_count())) { | 240 | if (unlikely(prev_count != preempt_count())) { |
222 | printk(KERN_ERR "huh, entered softirq %td %s %p" | 241 | printk(KERN_ERR "huh, entered softirq %u %s %p" |
223 | "with preempt_count %08x," | 242 | "with preempt_count %08x," |
224 | " exited with %08x?\n", h - softirq_vec, | 243 | " exited with %08x?\n", vec_nr, |
225 | softirq_to_name[h - softirq_vec], | 244 | softirq_to_name[vec_nr], h->action, |
226 | h->action, prev_count, preempt_count()); | 245 | prev_count, preempt_count()); |
227 | preempt_count() = prev_count; | 246 | preempt_count() = prev_count; |
228 | } | 247 | } |
229 | 248 | ||
@@ -245,7 +264,7 @@ restart: | |||
245 | lockdep_softirq_exit(); | 264 | lockdep_softirq_exit(); |
246 | 265 | ||
247 | account_system_vtime(current); | 266 | account_system_vtime(current); |
248 | _local_bh_enable(); | 267 | __local_bh_enable(SOFTIRQ_OFFSET); |
249 | } | 268 | } |
250 | 269 | ||
251 | #ifndef __ARCH_HAS_DO_SOFTIRQ | 270 | #ifndef __ARCH_HAS_DO_SOFTIRQ |
@@ -279,10 +298,16 @@ void irq_enter(void) | |||
279 | 298 | ||
280 | rcu_irq_enter(); | 299 | rcu_irq_enter(); |
281 | if (idle_cpu(cpu) && !in_interrupt()) { | 300 | if (idle_cpu(cpu) && !in_interrupt()) { |
282 | __irq_enter(); | 301 | /* |
302 | * Prevent raise_softirq from needlessly waking up ksoftirqd | ||
303 | * here, as softirq will be serviced on return from interrupt. | ||
304 | */ | ||
305 | local_bh_disable(); | ||
283 | tick_check_idle(cpu); | 306 | tick_check_idle(cpu); |
284 | } else | 307 | _local_bh_enable(); |
285 | __irq_enter(); | 308 | } |
309 | |||
310 | __irq_enter(); | ||
286 | } | 311 | } |
287 | 312 | ||
288 | #ifdef __ARCH_IRQ_EXIT_IRQS_DISABLED | 313 | #ifdef __ARCH_IRQ_EXIT_IRQS_DISABLED |
@@ -696,6 +721,7 @@ static int run_ksoftirqd(void * __bind_cpu) | |||
696 | { | 721 | { |
697 | set_current_state(TASK_INTERRUPTIBLE); | 722 | set_current_state(TASK_INTERRUPTIBLE); |
698 | 723 | ||
724 | current->flags |= PF_KSOFTIRQD; | ||
699 | while (!kthread_should_stop()) { | 725 | while (!kthread_should_stop()) { |
700 | preempt_disable(); | 726 | preempt_disable(); |
701 | if (!local_softirq_pending()) { | 727 | if (!local_softirq_pending()) { |
@@ -886,17 +912,14 @@ int __init __weak early_irq_init(void) | |||
886 | return 0; | 912 | return 0; |
887 | } | 913 | } |
888 | 914 | ||
915 | #ifdef CONFIG_GENERIC_HARDIRQS | ||
889 | int __init __weak arch_probe_nr_irqs(void) | 916 | int __init __weak arch_probe_nr_irqs(void) |
890 | { | 917 | { |
891 | return 0; | 918 | return NR_IRQS_LEGACY; |
892 | } | 919 | } |
893 | 920 | ||
894 | int __init __weak arch_early_irq_init(void) | 921 | int __init __weak arch_early_irq_init(void) |
895 | { | 922 | { |
896 | return 0; | 923 | return 0; |
897 | } | 924 | } |
898 | 925 | #endif | |
899 | int __weak arch_init_chip_data(struct irq_desc *desc, int node) | ||
900 | { | ||
901 | return 0; | ||
902 | } | ||
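Stepping back to the preempt_count comment introduced at the top of this softirq.c diff: because __do_softirq() now adds SOFTIRQ_OFFSET while local_bh_disable() adds SOFTIRQ_DISABLE_OFFSET (twice that), the softirq field of preempt_count can distinguish "serving a softirq" from "bh merely disabled". A hedged sketch of the resulting tests; the my_* macros are illustrative, and an equivalent in_serving_softirq() helper is assumed to come from the same series:

	/* any bh-off section, including softirq processing itself */
	#define my_in_softirq()		(preempt_count() & SOFTIRQ_MASK)

	/* only while __do_softirq() is actually running: it is the sole
	 * path that adds the odd SOFTIRQ_OFFSET increment */
	#define my_in_serving_softirq()	(preempt_count() & SOFTIRQ_OFFSET)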
diff --git a/kernel/softlockup.c b/kernel/softlockup.c deleted file mode 100644 index 4b493f67dcb5..000000000000 --- a/kernel/softlockup.c +++ /dev/null | |||
@@ -1,293 +0,0 @@ | |||
1 | /* | ||
2 | * Detect Soft Lockups | ||
3 | * | ||
4 | * started by Ingo Molnar, Copyright (C) 2005, 2006 Red Hat, Inc. | ||
5 | * | ||
6 | * this code detects soft lockups: incidents in which, on a CPU, | ||
7 | * the kernel does not reschedule for 10 seconds or more. | ||
8 | */ | ||
9 | #include <linux/mm.h> | ||
10 | #include <linux/cpu.h> | ||
11 | #include <linux/nmi.h> | ||
12 | #include <linux/init.h> | ||
13 | #include <linux/delay.h> | ||
14 | #include <linux/freezer.h> | ||
15 | #include <linux/kthread.h> | ||
16 | #include <linux/lockdep.h> | ||
17 | #include <linux/notifier.h> | ||
18 | #include <linux/module.h> | ||
19 | #include <linux/sysctl.h> | ||
20 | |||
21 | #include <asm/irq_regs.h> | ||
22 | |||
23 | static DEFINE_SPINLOCK(print_lock); | ||
24 | |||
25 | static DEFINE_PER_CPU(unsigned long, softlockup_touch_ts); /* touch timestamp */ | ||
26 | static DEFINE_PER_CPU(unsigned long, softlockup_print_ts); /* print timestamp */ | ||
27 | static DEFINE_PER_CPU(struct task_struct *, softlockup_watchdog); | ||
28 | static DEFINE_PER_CPU(bool, softlock_touch_sync); | ||
29 | |||
30 | static int __read_mostly did_panic; | ||
31 | int __read_mostly softlockup_thresh = 60; | ||
32 | |||
33 | /* | ||
34 | * Should we panic (and reboot, if panic_timeout= is set) when a | ||
35 | * soft-lockup occurs: | ||
36 | */ | ||
37 | unsigned int __read_mostly softlockup_panic = | ||
38 | CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC_VALUE; | ||
39 | |||
40 | static int __init softlockup_panic_setup(char *str) | ||
41 | { | ||
42 | softlockup_panic = simple_strtoul(str, NULL, 0); | ||
43 | |||
44 | return 1; | ||
45 | } | ||
46 | __setup("softlockup_panic=", softlockup_panic_setup); | ||
47 | |||
48 | static int | ||
49 | softlock_panic(struct notifier_block *this, unsigned long event, void *ptr) | ||
50 | { | ||
51 | did_panic = 1; | ||
52 | |||
53 | return NOTIFY_DONE; | ||
54 | } | ||
55 | |||
56 | static struct notifier_block panic_block = { | ||
57 | .notifier_call = softlock_panic, | ||
58 | }; | ||
59 | |||
60 | /* | ||
61 | * Returns seconds, approximately. We don't need nanosecond | ||
62 | * resolution, and we don't need to waste time with a big divide when | ||
63 | * 2^30ns == 1.074s. | ||
64 | */ | ||
65 | static unsigned long get_timestamp(int this_cpu) | ||
66 | { | ||
67 | return cpu_clock(this_cpu) >> 30LL; /* 2^30 ~= 10^9 */ | ||
68 | } | ||
69 | |||
70 | static void __touch_softlockup_watchdog(void) | ||
71 | { | ||
72 | int this_cpu = raw_smp_processor_id(); | ||
73 | |||
74 | __raw_get_cpu_var(softlockup_touch_ts) = get_timestamp(this_cpu); | ||
75 | } | ||
76 | |||
77 | void touch_softlockup_watchdog(void) | ||
78 | { | ||
79 | __raw_get_cpu_var(softlockup_touch_ts) = 0; | ||
80 | } | ||
81 | EXPORT_SYMBOL(touch_softlockup_watchdog); | ||
82 | |||
83 | void touch_softlockup_watchdog_sync(void) | ||
84 | { | ||
85 | __raw_get_cpu_var(softlock_touch_sync) = true; | ||
86 | __raw_get_cpu_var(softlockup_touch_ts) = 0; | ||
87 | } | ||
88 | |||
89 | void touch_all_softlockup_watchdogs(void) | ||
90 | { | ||
91 | int cpu; | ||
92 | |||
93 | /* Cause each CPU to re-update its timestamp rather than complain */ | ||
94 | for_each_online_cpu(cpu) | ||
95 | per_cpu(softlockup_touch_ts, cpu) = 0; | ||
96 | } | ||
97 | EXPORT_SYMBOL(touch_all_softlockup_watchdogs); | ||
98 | |||
99 | int proc_dosoftlockup_thresh(struct ctl_table *table, int write, | ||
100 | void __user *buffer, | ||
101 | size_t *lenp, loff_t *ppos) | ||
102 | { | ||
103 | touch_all_softlockup_watchdogs(); | ||
104 | return proc_dointvec_minmax(table, write, buffer, lenp, ppos); | ||
105 | } | ||
106 | |||
107 | /* | ||
108 | * This callback runs from the timer interrupt, and checks | ||
109 | * whether the watchdog thread has hung or not: | ||
110 | */ | ||
111 | void softlockup_tick(void) | ||
112 | { | ||
113 | int this_cpu = smp_processor_id(); | ||
114 | unsigned long touch_ts = per_cpu(softlockup_touch_ts, this_cpu); | ||
115 | unsigned long print_ts; | ||
116 | struct pt_regs *regs = get_irq_regs(); | ||
117 | unsigned long now; | ||
118 | |||
119 | /* Is detection switched off? */ | ||
120 | if (!per_cpu(softlockup_watchdog, this_cpu) || softlockup_thresh <= 0) { | ||
121 | /* Be sure we don't false trigger if switched back on */ | ||
122 | if (touch_ts) | ||
123 | per_cpu(softlockup_touch_ts, this_cpu) = 0; | ||
124 | return; | ||
125 | } | ||
126 | |||
127 | if (touch_ts == 0) { | ||
128 | if (unlikely(per_cpu(softlock_touch_sync, this_cpu))) { | ||
129 | /* | ||
130 | * If the time stamp was touched atomically | ||
131 | * make sure the scheduler tick is up to date. | ||
132 | */ | ||
133 | per_cpu(softlock_touch_sync, this_cpu) = false; | ||
134 | sched_clock_tick(); | ||
135 | } | ||
136 | __touch_softlockup_watchdog(); | ||
137 | return; | ||
138 | } | ||
139 | |||
140 | print_ts = per_cpu(softlockup_print_ts, this_cpu); | ||
141 | |||
142 | /* report at most once a second */ | ||
143 | if (print_ts == touch_ts || did_panic) | ||
144 | return; | ||
145 | |||
146 | /* do not print during early bootup: */ | ||
147 | if (unlikely(system_state != SYSTEM_RUNNING)) { | ||
148 | __touch_softlockup_watchdog(); | ||
149 | return; | ||
150 | } | ||
151 | |||
152 | now = get_timestamp(this_cpu); | ||
153 | |||
154 | /* | ||
155 | * Wake up the high-prio watchdog task twice per | ||
156 | * threshold timespan. | ||
157 | */ | ||
158 | if (time_after(now - softlockup_thresh/2, touch_ts)) | ||
159 | wake_up_process(per_cpu(softlockup_watchdog, this_cpu)); | ||
160 | |||
161 | /* Warn about unreasonable delays: */ | ||
162 | if (time_before_eq(now - softlockup_thresh, touch_ts)) | ||
163 | return; | ||
164 | |||
165 | per_cpu(softlockup_print_ts, this_cpu) = touch_ts; | ||
166 | |||
167 | spin_lock(&print_lock); | ||
168 | printk(KERN_ERR "BUG: soft lockup - CPU#%d stuck for %lus! [%s:%d]\n", | ||
169 | this_cpu, now - touch_ts, | ||
170 | current->comm, task_pid_nr(current)); | ||
171 | print_modules(); | ||
172 | print_irqtrace_events(current); | ||
173 | if (regs) | ||
174 | show_regs(regs); | ||
175 | else | ||
176 | dump_stack(); | ||
177 | spin_unlock(&print_lock); | ||
178 | |||
179 | if (softlockup_panic) | ||
180 | panic("softlockup: hung tasks"); | ||
181 | } | ||
182 | |||
183 | /* | ||
184 | * The watchdog thread - runs every second and touches the timestamp. | ||
185 | */ | ||
186 | static int watchdog(void *__bind_cpu) | ||
187 | { | ||
188 | struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 }; | ||
189 | |||
190 | sched_setscheduler(current, SCHED_FIFO, ¶m); | ||
191 | |||
192 | /* initialize timestamp */ | ||
193 | __touch_softlockup_watchdog(); | ||
194 | |||
195 | set_current_state(TASK_INTERRUPTIBLE); | ||
196 | /* | ||
197 | * Run briefly once per second to reset the softlockup timestamp. | ||
198 | * If this gets delayed for more than 60 seconds then the | ||
199 | * debug-printout triggers in softlockup_tick(). | ||
200 | */ | ||
201 | while (!kthread_should_stop()) { | ||
202 | __touch_softlockup_watchdog(); | ||
203 | schedule(); | ||
204 | |||
205 | if (kthread_should_stop()) | ||
206 | break; | ||
207 | |||
208 | set_current_state(TASK_INTERRUPTIBLE); | ||
209 | } | ||
210 | __set_current_state(TASK_RUNNING); | ||
211 | |||
212 | return 0; | ||
213 | } | ||
214 | |||
215 | /* | ||
216 | * Create/destroy watchdog threads as CPUs come and go: | ||
217 | */ | ||
218 | static int __cpuinit | ||
219 | cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) | ||
220 | { | ||
221 | int hotcpu = (unsigned long)hcpu; | ||
222 | struct task_struct *p; | ||
223 | |||
224 | switch (action) { | ||
225 | case CPU_UP_PREPARE: | ||
226 | case CPU_UP_PREPARE_FROZEN: | ||
227 | BUG_ON(per_cpu(softlockup_watchdog, hotcpu)); | ||
228 | p = kthread_create(watchdog, hcpu, "watchdog/%d", hotcpu); | ||
229 | if (IS_ERR(p)) { | ||
230 | printk(KERN_ERR "watchdog for %i failed\n", hotcpu); | ||
231 | return NOTIFY_BAD; | ||
232 | } | ||
233 | per_cpu(softlockup_touch_ts, hotcpu) = 0; | ||
234 | per_cpu(softlockup_watchdog, hotcpu) = p; | ||
235 | kthread_bind(p, hotcpu); | ||
236 | break; | ||
237 | case CPU_ONLINE: | ||
238 | case CPU_ONLINE_FROZEN: | ||
239 | wake_up_process(per_cpu(softlockup_watchdog, hotcpu)); | ||
240 | break; | ||
241 | #ifdef CONFIG_HOTPLUG_CPU | ||
242 | case CPU_UP_CANCELED: | ||
243 | case CPU_UP_CANCELED_FROZEN: | ||
244 | if (!per_cpu(softlockup_watchdog, hotcpu)) | ||
245 | break; | ||
246 | /* Unbind so it can run. Fall thru. */ | ||
247 | kthread_bind(per_cpu(softlockup_watchdog, hotcpu), | ||
248 | cpumask_any(cpu_online_mask)); | ||
249 | case CPU_DEAD: | ||
250 | case CPU_DEAD_FROZEN: | ||
251 | p = per_cpu(softlockup_watchdog, hotcpu); | ||
252 | per_cpu(softlockup_watchdog, hotcpu) = NULL; | ||
253 | kthread_stop(p); | ||
254 | break; | ||
255 | #endif /* CONFIG_HOTPLUG_CPU */ | ||
256 | } | ||
257 | return NOTIFY_OK; | ||
258 | } | ||
259 | |||
260 | static struct notifier_block __cpuinitdata cpu_nfb = { | ||
261 | .notifier_call = cpu_callback | ||
262 | }; | ||
263 | |||
264 | static int __initdata nosoftlockup; | ||
265 | |||
266 | static int __init nosoftlockup_setup(char *str) | ||
267 | { | ||
268 | nosoftlockup = 1; | ||
269 | return 1; | ||
270 | } | ||
271 | __setup("nosoftlockup", nosoftlockup_setup); | ||
272 | |||
273 | static int __init spawn_softlockup_task(void) | ||
274 | { | ||
275 | void *cpu = (void *)(long)smp_processor_id(); | ||
276 | int err; | ||
277 | |||
278 | if (nosoftlockup) | ||
279 | return 0; | ||
280 | |||
281 | err = cpu_callback(&cpu_nfb, CPU_UP_PREPARE, cpu); | ||
282 | if (err == NOTIFY_BAD) { | ||
283 | BUG(); | ||
284 | return 1; | ||
285 | } | ||
286 | cpu_callback(&cpu_nfb, CPU_ONLINE, cpu); | ||
287 | register_cpu_notifier(&cpu_nfb); | ||
288 | |||
289 | atomic_notifier_chain_register(&panic_notifier_list, &panic_block); | ||
290 | |||
291 | return 0; | ||
292 | } | ||
293 | early_initcall(spawn_softlockup_task); | ||
diff --git a/kernel/srcu.c b/kernel/srcu.c index 2980da3fd509..c71e07500536 100644 --- a/kernel/srcu.c +++ b/kernel/srcu.c | |||
@@ -46,11 +46,9 @@ static int init_srcu_struct_fields(struct srcu_struct *sp) | |||
46 | int __init_srcu_struct(struct srcu_struct *sp, const char *name, | 46 | int __init_srcu_struct(struct srcu_struct *sp, const char *name, |
47 | struct lock_class_key *key) | 47 | struct lock_class_key *key) |
48 | { | 48 | { |
49 | #ifdef CONFIG_DEBUG_LOCK_ALLOC | ||
50 | /* Don't re-initialize a lock while it is held. */ | 49 | /* Don't re-initialize a lock while it is held. */ |
51 | debug_check_no_locks_freed((void *)sp, sizeof(*sp)); | 50 | debug_check_no_locks_freed((void *)sp, sizeof(*sp)); |
52 | lockdep_init_map(&sp->dep_map, name, key, 0); | 51 | lockdep_init_map(&sp->dep_map, name, key, 0); |
53 | #endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */ | ||
54 | return init_srcu_struct_fields(sp); | 52 | return init_srcu_struct_fields(sp); |
55 | } | 53 | } |
56 | EXPORT_SYMBOL_GPL(__init_srcu_struct); | 54 | EXPORT_SYMBOL_GPL(__init_srcu_struct); |
diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c index 70f8d90331e9..2df820b03beb 100644 --- a/kernel/stop_machine.c +++ b/kernel/stop_machine.c | |||
@@ -35,9 +35,9 @@ struct cpu_stop_done { | |||
35 | /* the actual stopper, one per every possible cpu, enabled on online cpus */ | 35 | /* the actual stopper, one per every possible cpu, enabled on online cpus */ |
36 | struct cpu_stopper { | 36 | struct cpu_stopper { |
37 | spinlock_t lock; | 37 | spinlock_t lock; |
38 | bool enabled; /* is this stopper enabled? */ | ||
38 | struct list_head works; /* list of pending works */ | 39 | struct list_head works; /* list of pending works */ |
39 | struct task_struct *thread; /* stopper thread */ | 40 | struct task_struct *thread; /* stopper thread */ |
40 | bool enabled; /* is this stopper enabled? */ | ||
41 | }; | 41 | }; |
42 | 42 | ||
43 | static DEFINE_PER_CPU(struct cpu_stopper, cpu_stopper); | 43 | static DEFINE_PER_CPU(struct cpu_stopper, cpu_stopper); |
@@ -262,7 +262,7 @@ repeat: | |||
262 | cpu_stop_fn_t fn = work->fn; | 262 | cpu_stop_fn_t fn = work->fn; |
263 | void *arg = work->arg; | 263 | void *arg = work->arg; |
264 | struct cpu_stop_done *done = work->done; | 264 | struct cpu_stop_done *done = work->done; |
265 | char ksym_buf[KSYM_NAME_LEN]; | 265 | char ksym_buf[KSYM_NAME_LEN] __maybe_unused; |
266 | 266 | ||
267 | __set_current_state(TASK_RUNNING); | 267 | __set_current_state(TASK_RUNNING); |
268 | 268 | ||
@@ -287,11 +287,12 @@ repeat: | |||
287 | goto repeat; | 287 | goto repeat; |
288 | } | 288 | } |
289 | 289 | ||
290 | extern void sched_set_stop_task(int cpu, struct task_struct *stop); | ||
291 | |||
290 | /* manage stopper for a cpu, mostly lifted from sched migration thread mgmt */ | 292 | /* manage stopper for a cpu, mostly lifted from sched migration thread mgmt */ |
291 | static int __cpuinit cpu_stop_cpu_callback(struct notifier_block *nfb, | 293 | static int __cpuinit cpu_stop_cpu_callback(struct notifier_block *nfb, |
292 | unsigned long action, void *hcpu) | 294 | unsigned long action, void *hcpu) |
293 | { | 295 | { |
294 | struct sched_param param = { .sched_priority = MAX_RT_PRIO - 1 }; | ||
295 | unsigned int cpu = (unsigned long)hcpu; | 296 | unsigned int cpu = (unsigned long)hcpu; |
296 | struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu); | 297 | struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu); |
297 | struct task_struct *p; | 298 | struct task_struct *p; |
@@ -303,14 +304,14 @@ static int __cpuinit cpu_stop_cpu_callback(struct notifier_block *nfb, | |||
303 | p = kthread_create(cpu_stopper_thread, stopper, "migration/%d", | 304 | p = kthread_create(cpu_stopper_thread, stopper, "migration/%d", |
304 | cpu); | 305 | cpu); |
305 | if (IS_ERR(p)) | 306 | if (IS_ERR(p)) |
306 | return NOTIFY_BAD; | 307 | return notifier_from_errno(PTR_ERR(p)); |
307 | sched_setscheduler_nocheck(p, SCHED_FIFO, ¶m); | ||
308 | get_task_struct(p); | 308 | get_task_struct(p); |
309 | kthread_bind(p, cpu); | ||
310 | sched_set_stop_task(cpu, p); | ||
309 | stopper->thread = p; | 311 | stopper->thread = p; |
310 | break; | 312 | break; |
311 | 313 | ||
312 | case CPU_ONLINE: | 314 | case CPU_ONLINE: |
313 | kthread_bind(stopper->thread, cpu); | ||
314 | /* strictly unnecessary, as first user will wake it */ | 315 | /* strictly unnecessary, as first user will wake it */ |
315 | wake_up_process(stopper->thread); | 316 | wake_up_process(stopper->thread); |
316 | /* mark enabled */ | 317 | /* mark enabled */ |
@@ -325,6 +326,7 @@ static int __cpuinit cpu_stop_cpu_callback(struct notifier_block *nfb, | |||
325 | { | 326 | { |
326 | struct cpu_stop_work *work; | 327 | struct cpu_stop_work *work; |
327 | 328 | ||
329 | sched_set_stop_task(cpu, NULL); | ||
328 | /* kill the stopper */ | 330 | /* kill the stopper */ |
329 | kthread_stop(stopper->thread); | 331 | kthread_stop(stopper->thread); |
330 | /* drain remaining works */ | 332 | /* drain remaining works */ |
@@ -370,7 +372,7 @@ static int __init cpu_stop_init(void) | |||
370 | /* start one for the boot cpu */ | 372 | /* start one for the boot cpu */ |
371 | err = cpu_stop_cpu_callback(&cpu_stop_cpu_notifier, CPU_UP_PREPARE, | 373 | err = cpu_stop_cpu_callback(&cpu_stop_cpu_notifier, CPU_UP_PREPARE, |
372 | bcpu); | 374 | bcpu); |
373 | BUG_ON(err == NOTIFY_BAD); | 375 | BUG_ON(err != NOTIFY_OK); |
374 | cpu_stop_cpu_callback(&cpu_stop_cpu_notifier, CPU_ONLINE, bcpu); | 376 | cpu_stop_cpu_callback(&cpu_stop_cpu_notifier, CPU_ONLINE, bcpu); |
375 | register_cpu_notifier(&cpu_stop_cpu_notifier); | 377 | register_cpu_notifier(&cpu_stop_cpu_notifier); |
376 | 378 | ||
diff --git a/kernel/sys.c b/kernel/sys.c index e83ddbbaf89d..7f5a0cd296a9 100644 --- a/kernel/sys.c +++ b/kernel/sys.c | |||
@@ -931,6 +931,7 @@ SYSCALL_DEFINE2(setpgid, pid_t, pid, pid_t, pgid) | |||
931 | pgid = pid; | 931 | pgid = pid; |
932 | if (pgid < 0) | 932 | if (pgid < 0) |
933 | return -EINVAL; | 933 | return -EINVAL; |
934 | rcu_read_lock(); | ||
934 | 935 | ||
935 | /* From this point forward we keep holding onto the tasklist lock | 936 | /* From this point forward we keep holding onto the tasklist lock |
936 | * so that our parent does not change from under us. -DaveM | 937 | * so that our parent does not change from under us. -DaveM |
@@ -984,6 +985,7 @@ SYSCALL_DEFINE2(setpgid, pid_t, pid, pid_t, pgid) | |||
984 | out: | 985 | out: |
985 | /* All paths lead to here, thus we are safe. -DaveM */ | 986 | /* All paths lead to here, thus we are safe. -DaveM */ |
986 | write_unlock_irq(&tasklist_lock); | 987 | write_unlock_irq(&tasklist_lock); |
988 | rcu_read_unlock(); | ||
987 | return err; | 989 | return err; |
988 | } | 990 | } |
989 | 991 | ||
@@ -1236,15 +1238,14 @@ SYSCALL_DEFINE2(setdomainname, char __user *, name, int, len) | |||
1236 | 1238 | ||
1237 | SYSCALL_DEFINE2(getrlimit, unsigned int, resource, struct rlimit __user *, rlim) | 1239 | SYSCALL_DEFINE2(getrlimit, unsigned int, resource, struct rlimit __user *, rlim) |
1238 | { | 1240 | { |
1239 | if (resource >= RLIM_NLIMITS) | 1241 | struct rlimit value; |
1240 | return -EINVAL; | 1242 | int ret; |
1241 | else { | 1243 | |
1242 | struct rlimit value; | 1244 | ret = do_prlimit(current, resource, NULL, &value); |
1243 | task_lock(current->group_leader); | 1245 | if (!ret) |
1244 | value = current->signal->rlim[resource]; | 1246 | ret = copy_to_user(rlim, &value, sizeof(*rlim)) ? -EFAULT : 0; |
1245 | task_unlock(current->group_leader); | 1247 | |
1246 | return copy_to_user(rlim, &value, sizeof(*rlim)) ? -EFAULT : 0; | 1248 | return ret; |
1247 | } | ||
1248 | } | 1249 | } |
1249 | 1250 | ||
1250 | #ifdef __ARCH_WANT_SYS_OLD_GETRLIMIT | 1251 | #ifdef __ARCH_WANT_SYS_OLD_GETRLIMIT |
@@ -1272,44 +1273,89 @@ SYSCALL_DEFINE2(old_getrlimit, unsigned int, resource, | |||
1272 | 1273 | ||
1273 | #endif | 1274 | #endif |
1274 | 1275 | ||
1275 | SYSCALL_DEFINE2(setrlimit, unsigned int, resource, struct rlimit __user *, rlim) | 1276 | static inline bool rlim64_is_infinity(__u64 rlim64) |
1276 | { | 1277 | { |
1277 | struct rlimit new_rlim, *old_rlim; | 1278 | #if BITS_PER_LONG < 64 |
1278 | int retval; | 1279 | return rlim64 >= ULONG_MAX; |
1280 | #else | ||
1281 | return rlim64 == RLIM64_INFINITY; | ||
1282 | #endif | ||
1283 | } | ||
1284 | |||
1285 | static void rlim_to_rlim64(const struct rlimit *rlim, struct rlimit64 *rlim64) | ||
1286 | { | ||
1287 | if (rlim->rlim_cur == RLIM_INFINITY) | ||
1288 | rlim64->rlim_cur = RLIM64_INFINITY; | ||
1289 | else | ||
1290 | rlim64->rlim_cur = rlim->rlim_cur; | ||
1291 | if (rlim->rlim_max == RLIM_INFINITY) | ||
1292 | rlim64->rlim_max = RLIM64_INFINITY; | ||
1293 | else | ||
1294 | rlim64->rlim_max = rlim->rlim_max; | ||
1295 | } | ||
1296 | |||
1297 | static void rlim64_to_rlim(const struct rlimit64 *rlim64, struct rlimit *rlim) | ||
1298 | { | ||
1299 | if (rlim64_is_infinity(rlim64->rlim_cur)) | ||
1300 | rlim->rlim_cur = RLIM_INFINITY; | ||
1301 | else | ||
1302 | rlim->rlim_cur = (unsigned long)rlim64->rlim_cur; | ||
1303 | if (rlim64_is_infinity(rlim64->rlim_max)) | ||
1304 | rlim->rlim_max = RLIM_INFINITY; | ||
1305 | else | ||
1306 | rlim->rlim_max = (unsigned long)rlim64->rlim_max; | ||
1307 | } | ||
1308 | |||
1309 | /* make sure you are allowed to change @tsk limits before calling this */ | ||
1310 | int do_prlimit(struct task_struct *tsk, unsigned int resource, | ||
1311 | struct rlimit *new_rlim, struct rlimit *old_rlim) | ||
1312 | { | ||
1313 | struct rlimit *rlim; | ||
1314 | int retval = 0; | ||
1279 | 1315 | ||
1280 | if (resource >= RLIM_NLIMITS) | 1316 | if (resource >= RLIM_NLIMITS) |
1281 | return -EINVAL; | 1317 | return -EINVAL; |
1282 | if (copy_from_user(&new_rlim, rlim, sizeof(*rlim))) | 1318 | if (new_rlim) { |
1283 | return -EFAULT; | 1319 | if (new_rlim->rlim_cur > new_rlim->rlim_max) |
1284 | if (new_rlim.rlim_cur > new_rlim.rlim_max) | 1320 | return -EINVAL; |
1285 | return -EINVAL; | 1321 | if (resource == RLIMIT_NOFILE && |
1286 | old_rlim = current->signal->rlim + resource; | 1322 | new_rlim->rlim_max > sysctl_nr_open) |
1287 | if ((new_rlim.rlim_max > old_rlim->rlim_max) && | 1323 | return -EPERM; |
1288 | !capable(CAP_SYS_RESOURCE)) | ||
1289 | return -EPERM; | ||
1290 | if (resource == RLIMIT_NOFILE && new_rlim.rlim_max > sysctl_nr_open) | ||
1291 | return -EPERM; | ||
1292 | |||
1293 | retval = security_task_setrlimit(resource, &new_rlim); | ||
1294 | if (retval) | ||
1295 | return retval; | ||
1296 | |||
1297 | if (resource == RLIMIT_CPU && new_rlim.rlim_cur == 0) { | ||
1298 | /* | ||
1299 | * The caller is asking for an immediate RLIMIT_CPU | ||
1300 | * expiry. But we use the zero value to mean "it was | ||
1301 | * never set". So let's cheat and make it one second | ||
1302 | * instead | ||
1303 | */ | ||
1304 | new_rlim.rlim_cur = 1; | ||
1305 | } | 1324 | } |
1306 | 1325 | ||
1307 | task_lock(current->group_leader); | 1326 | /* protect tsk->signal and tsk->sighand from disappearing */ |
1308 | *old_rlim = new_rlim; | 1327 | read_lock(&tasklist_lock); |
1309 | task_unlock(current->group_leader); | 1328 | if (!tsk->sighand) { |
1310 | 1329 | retval = -ESRCH; | |
1311 | if (resource != RLIMIT_CPU) | ||
1312 | goto out; | 1330 | goto out; |
1331 | } | ||
1332 | |||
1333 | rlim = tsk->signal->rlim + resource; | ||
1334 | task_lock(tsk->group_leader); | ||
1335 | if (new_rlim) { | ||
1336 | if (new_rlim->rlim_max > rlim->rlim_max && | ||
1337 | !capable(CAP_SYS_RESOURCE)) | ||
1338 | retval = -EPERM; | ||
1339 | if (!retval) | ||
1340 | retval = security_task_setrlimit(tsk->group_leader, | ||
1341 | resource, new_rlim); | ||
1342 | if (resource == RLIMIT_CPU && new_rlim->rlim_cur == 0) { | ||
1343 | /* | ||
1344 | * The caller is asking for an immediate RLIMIT_CPU | ||
1345 | * expiry. But we use the zero value to mean "it was | ||
1346 | * never set". So let's cheat and make it one second | ||
1347 | * instead | ||
1348 | */ | ||
1349 | new_rlim->rlim_cur = 1; | ||
1350 | } | ||
1351 | } | ||
1352 | if (!retval) { | ||
1353 | if (old_rlim) | ||
1354 | *old_rlim = *rlim; | ||
1355 | if (new_rlim) | ||
1356 | *rlim = *new_rlim; | ||
1357 | } | ||
1358 | task_unlock(tsk->group_leader); | ||
1313 | 1359 | ||
1314 | /* | 1360 | /* |
1315 | * RLIMIT_CPU handling. Note that the kernel fails to return an error | 1361 | * RLIMIT_CPU handling. Note that the kernel fails to return an error |
@@ -1317,14 +1363,84 @@ SYSCALL_DEFINE2(setrlimit, unsigned int, resource, struct rlimit __user *, rlim) | |||
1317 | * very long-standing error, and fixing it now risks breakage of | 1363 | * very long-standing error, and fixing it now risks breakage of |
1318 | * applications, so we live with it | 1364 | * applications, so we live with it |
1319 | */ | 1365 | */ |
1320 | if (new_rlim.rlim_cur == RLIM_INFINITY) | 1366 | if (!retval && new_rlim && resource == RLIMIT_CPU && |
1321 | goto out; | 1367 | new_rlim->rlim_cur != RLIM_INFINITY) |
1322 | 1368 | update_rlimit_cpu(tsk, new_rlim->rlim_cur); | |
1323 | update_rlimit_cpu(new_rlim.rlim_cur); | ||
1324 | out: | 1369 | out: |
1370 | read_unlock(&tasklist_lock); | ||
1371 | return retval; | ||
1372 | } | ||
1373 | |||
1374 | /* rcu lock must be held */ | ||
1375 | static int check_prlimit_permission(struct task_struct *task) | ||
1376 | { | ||
1377 | const struct cred *cred = current_cred(), *tcred; | ||
1378 | |||
1379 | tcred = __task_cred(task); | ||
1380 | if ((cred->uid != tcred->euid || | ||
1381 | cred->uid != tcred->suid || | ||
1382 | cred->uid != tcred->uid || | ||
1383 | cred->gid != tcred->egid || | ||
1384 | cred->gid != tcred->sgid || | ||
1385 | cred->gid != tcred->gid) && | ||
1386 | !capable(CAP_SYS_RESOURCE)) { | ||
1387 | return -EPERM; | ||
1388 | } | ||
1389 | |||
1325 | return 0; | 1390 | return 0; |
1326 | } | 1391 | } |
1327 | 1392 | ||
1393 | SYSCALL_DEFINE4(prlimit64, pid_t, pid, unsigned int, resource, | ||
1394 | const struct rlimit64 __user *, new_rlim, | ||
1395 | struct rlimit64 __user *, old_rlim) | ||
1396 | { | ||
1397 | struct rlimit64 old64, new64; | ||
1398 | struct rlimit old, new; | ||
1399 | struct task_struct *tsk; | ||
1400 | int ret; | ||
1401 | |||
1402 | if (new_rlim) { | ||
1403 | if (copy_from_user(&new64, new_rlim, sizeof(new64))) | ||
1404 | return -EFAULT; | ||
1405 | rlim64_to_rlim(&new64, &new); | ||
1406 | } | ||
1407 | |||
1408 | rcu_read_lock(); | ||
1409 | tsk = pid ? find_task_by_vpid(pid) : current; | ||
1410 | if (!tsk) { | ||
1411 | rcu_read_unlock(); | ||
1412 | return -ESRCH; | ||
1413 | } | ||
1414 | ret = check_prlimit_permission(tsk); | ||
1415 | if (ret) { | ||
1416 | rcu_read_unlock(); | ||
1417 | return ret; | ||
1418 | } | ||
1419 | get_task_struct(tsk); | ||
1420 | rcu_read_unlock(); | ||
1421 | |||
1422 | ret = do_prlimit(tsk, resource, new_rlim ? &new : NULL, | ||
1423 | old_rlim ? &old : NULL); | ||
1424 | |||
1425 | if (!ret && old_rlim) { | ||
1426 | rlim_to_rlim64(&old, &old64); | ||
1427 | if (copy_to_user(old_rlim, &old64, sizeof(old64))) | ||
1428 | ret = -EFAULT; | ||
1429 | } | ||
1430 | |||
1431 | put_task_struct(tsk); | ||
1432 | return ret; | ||
1433 | } | ||
1434 | |||
1435 | SYSCALL_DEFINE2(setrlimit, unsigned int, resource, struct rlimit __user *, rlim) | ||
1436 | { | ||
1437 | struct rlimit new_rlim; | ||
1438 | |||
1439 | if (copy_from_user(&new_rlim, rlim, sizeof(*rlim))) | ||
1440 | return -EFAULT; | ||
1441 | return do_prlimit(current, resource, &new_rlim, NULL); | ||
1442 | } | ||
1443 | |||
1328 | /* | 1444 | /* |
1329 | * It would make sense to put struct rusage in the task_struct, | 1445 | * It would make sense to put struct rusage in the task_struct, |
1330 | * except that would make the task_struct be *really big*. After | 1446 | * except that would make the task_struct be *really big*. After |
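For a sense of how the new syscall is meant to be used, here is a minimal, hypothetical userspace sketch that queries another process's RLIMIT_NOFILE; it assumes the architecture has wired up __NR_prlimit64 and simply mirrors the kernel's struct rlimit64 layout rather than relying on a libc wrapper. Pid 0 targets the calling task, and a non-NULL new limit goes through the same check_prlimit_permission() credential checks shown above.

#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <sys/resource.h>

struct my_rlimit64 {                    /* same layout as the kernel's struct rlimit64 */
        uint64_t rlim_cur;
        uint64_t rlim_max;
};

int main(int argc, char **argv)
{
        pid_t pid = argc > 1 ? atoi(argv[1]) : 0;   /* 0 means "the calling task" */
        struct my_rlimit64 old;

        /* new_rlim == NULL: pure query, do_prlimit() only copies the old limits out */
        if (syscall(__NR_prlimit64, pid, RLIMIT_NOFILE, NULL, &old) == -1) {
                perror("prlimit64");
                return 1;
        }
        printf("RLIMIT_NOFILE: cur=%llu max=%llu\n",
               (unsigned long long)old.rlim_cur,
               (unsigned long long)old.rlim_max);
        return 0;
}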
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index 70f2ea758ffe..c782fe9924c7 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c | |||
@@ -50,6 +50,7 @@ cond_syscall(compat_sys_sendmsg); | |||
50 | cond_syscall(sys_recvmsg); | 50 | cond_syscall(sys_recvmsg); |
51 | cond_syscall(sys_recvmmsg); | 51 | cond_syscall(sys_recvmmsg); |
52 | cond_syscall(compat_sys_recvmsg); | 52 | cond_syscall(compat_sys_recvmsg); |
53 | cond_syscall(compat_sys_recv); | ||
53 | cond_syscall(compat_sys_recvfrom); | 54 | cond_syscall(compat_sys_recvfrom); |
54 | cond_syscall(compat_sys_recvmmsg); | 55 | cond_syscall(compat_sys_recvmmsg); |
55 | cond_syscall(sys_socketcall); | 56 | cond_syscall(sys_socketcall); |
@@ -181,3 +182,7 @@ cond_syscall(sys_eventfd2); | |||
181 | 182 | ||
182 | /* performance counters: */ | 183 | /* performance counters: */ |
183 | cond_syscall(sys_perf_event_open); | 184 | cond_syscall(sys_perf_event_open); |
185 | |||
186 | /* fanotify! */ | ||
187 | cond_syscall(sys_fanotify_init); | ||
188 | cond_syscall(sys_fanotify_mark); | ||
diff --git a/kernel/sysctl.c b/kernel/sysctl.c index d24f761f4876..c33a1edb799f 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c | |||
@@ -44,16 +44,17 @@ | |||
44 | #include <linux/times.h> | 44 | #include <linux/times.h> |
45 | #include <linux/limits.h> | 45 | #include <linux/limits.h> |
46 | #include <linux/dcache.h> | 46 | #include <linux/dcache.h> |
47 | #include <linux/dnotify.h> | ||
47 | #include <linux/syscalls.h> | 48 | #include <linux/syscalls.h> |
48 | #include <linux/vmstat.h> | 49 | #include <linux/vmstat.h> |
49 | #include <linux/nfs_fs.h> | 50 | #include <linux/nfs_fs.h> |
50 | #include <linux/acpi.h> | 51 | #include <linux/acpi.h> |
51 | #include <linux/reboot.h> | 52 | #include <linux/reboot.h> |
52 | #include <linux/ftrace.h> | 53 | #include <linux/ftrace.h> |
53 | #include <linux/slow-work.h> | ||
54 | #include <linux/perf_event.h> | 54 | #include <linux/perf_event.h> |
55 | #include <linux/kprobes.h> | 55 | #include <linux/kprobes.h> |
56 | #include <linux/pipe_fs_i.h> | 56 | #include <linux/pipe_fs_i.h> |
57 | #include <linux/oom.h> | ||
57 | 58 | ||
58 | #include <asm/uaccess.h> | 59 | #include <asm/uaccess.h> |
59 | #include <asm/processor.h> | 60 | #include <asm/processor.h> |
@@ -76,15 +77,16 @@ | |||
76 | #include <scsi/sg.h> | 77 | #include <scsi/sg.h> |
77 | #endif | 78 | #endif |
78 | 79 | ||
80 | #ifdef CONFIG_LOCKUP_DETECTOR | ||
81 | #include <linux/nmi.h> | ||
82 | #endif | ||
83 | |||
79 | 84 | ||
80 | #if defined(CONFIG_SYSCTL) | 85 | #if defined(CONFIG_SYSCTL) |
81 | 86 | ||
82 | /* External variables not in a header file. */ | 87 | /* External variables not in a header file. */ |
83 | extern int sysctl_overcommit_memory; | 88 | extern int sysctl_overcommit_memory; |
84 | extern int sysctl_overcommit_ratio; | 89 | extern int sysctl_overcommit_ratio; |
85 | extern int sysctl_panic_on_oom; | ||
86 | extern int sysctl_oom_kill_allocating_task; | ||
87 | extern int sysctl_oom_dump_tasks; | ||
88 | extern int max_threads; | 90 | extern int max_threads; |
89 | extern int core_uses_pid; | 91 | extern int core_uses_pid; |
90 | extern int suid_dumpable; | 92 | extern int suid_dumpable; |
@@ -106,7 +108,7 @@ extern int blk_iopoll_enabled; | |||
106 | #endif | 108 | #endif |
107 | 109 | ||
108 | /* Constants used for minimum and maximum */ | 110 | /* Constants used for minimum and maximum */ |
109 | #ifdef CONFIG_DETECT_SOFTLOCKUP | 111 | #ifdef CONFIG_LOCKUP_DETECTOR |
110 | static int sixty = 60; | 112 | static int sixty = 60; |
111 | static int neg_one = -1; | 113 | static int neg_one = -1; |
112 | #endif | 114 | #endif |
@@ -130,6 +132,9 @@ static int min_percpu_pagelist_fract = 8; | |||
130 | 132 | ||
131 | static int ngroups_max = NGROUPS_MAX; | 133 | static int ngroups_max = NGROUPS_MAX; |
132 | 134 | ||
135 | #ifdef CONFIG_INOTIFY_USER | ||
136 | #include <linux/inotify.h> | ||
137 | #endif | ||
133 | #ifdef CONFIG_SPARC | 138 | #ifdef CONFIG_SPARC |
134 | #include <asm/system.h> | 139 | #include <asm/system.h> |
135 | #endif | 140 | #endif |
@@ -156,8 +161,6 @@ extern int no_unaligned_warning; | |||
156 | extern int unaligned_dump_stack; | 161 | extern int unaligned_dump_stack; |
157 | #endif | 162 | #endif |
158 | 163 | ||
159 | extern struct ratelimit_state printk_ratelimit_state; | ||
160 | |||
161 | #ifdef CONFIG_PROC_SYSCTL | 164 | #ifdef CONFIG_PROC_SYSCTL |
162 | static int proc_do_cad_pid(struct ctl_table *table, int write, | 165 | static int proc_do_cad_pid(struct ctl_table *table, int write, |
163 | void __user *buffer, size_t *lenp, loff_t *ppos); | 166 | void __user *buffer, size_t *lenp, loff_t *ppos); |
@@ -206,9 +209,6 @@ static struct ctl_table fs_table[]; | |||
206 | static struct ctl_table debug_table[]; | 209 | static struct ctl_table debug_table[]; |
207 | static struct ctl_table dev_table[]; | 210 | static struct ctl_table dev_table[]; |
208 | extern struct ctl_table random_table[]; | 211 | extern struct ctl_table random_table[]; |
209 | #ifdef CONFIG_INOTIFY_USER | ||
210 | extern struct ctl_table inotify_table[]; | ||
211 | #endif | ||
212 | #ifdef CONFIG_EPOLL | 212 | #ifdef CONFIG_EPOLL |
213 | extern struct ctl_table epoll_table[]; | 213 | extern struct ctl_table epoll_table[]; |
214 | #endif | 214 | #endif |
@@ -562,7 +562,7 @@ static struct ctl_table kern_table[] = { | |||
562 | .extra2 = &one, | 562 | .extra2 = &one, |
563 | }, | 563 | }, |
564 | #endif | 564 | #endif |
565 | #if defined(CONFIG_HOTPLUG) && defined(CONFIG_NET) | 565 | #ifdef CONFIG_HOTPLUG |
566 | { | 566 | { |
567 | .procname = "hotplug", | 567 | .procname = "hotplug", |
568 | .data = &uevent_helper, | 568 | .data = &uevent_helper, |
@@ -710,7 +710,34 @@ static struct ctl_table kern_table[] = { | |||
710 | .mode = 0444, | 710 | .mode = 0444, |
711 | .proc_handler = proc_dointvec, | 711 | .proc_handler = proc_dointvec, |
712 | }, | 712 | }, |
713 | #if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86) | 713 | #if defined(CONFIG_LOCKUP_DETECTOR) |
714 | { | ||
715 | .procname = "watchdog", | ||
716 | .data = &watchdog_enabled, | ||
717 | .maxlen = sizeof (int), | ||
718 | .mode = 0644, | ||
719 | .proc_handler = proc_dowatchdog_enabled, | ||
720 | }, | ||
721 | { | ||
722 | .procname = "watchdog_thresh", | ||
723 | .data = &softlockup_thresh, | ||
724 | .maxlen = sizeof(int), | ||
725 | .mode = 0644, | ||
726 | .proc_handler = proc_dowatchdog_thresh, | ||
727 | .extra1 = &neg_one, | ||
728 | .extra2 = &sixty, | ||
729 | }, | ||
730 | { | ||
731 | .procname = "softlockup_panic", | ||
732 | .data = &softlockup_panic, | ||
733 | .maxlen = sizeof(int), | ||
734 | .mode = 0644, | ||
735 | .proc_handler = proc_dointvec_minmax, | ||
736 | .extra1 = &zero, | ||
737 | .extra2 = &one, | ||
738 | }, | ||
739 | #endif | ||
740 | #if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86) && !defined(CONFIG_LOCKUP_DETECTOR) | ||
714 | { | 741 | { |
715 | .procname = "unknown_nmi_panic", | 742 | .procname = "unknown_nmi_panic", |
716 | .data = &unknown_nmi_panic, | 743 | .data = &unknown_nmi_panic, |
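Because these entries sit in kern_table, they appear as /proc/sys/kernel/watchdog, /proc/sys/kernel/watchdog_thresh and /proc/sys/kernel/softlockup_panic. A trivial, purely illustrative userspace snippet for switching the lockup detector off at runtime (equivalent to echoing 0 into the file):

#include <stdio.h>

int main(void)
{
        FILE *f = fopen("/proc/sys/kernel/watchdog", "w");

        if (!f) {
                perror("/proc/sys/kernel/watchdog");
                return 1;
        }
        fputs("0\n", f);        /* 0 disables, 1 re-enables the detector */
        return fclose(f) ? 1 : 0;
}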
@@ -813,26 +840,6 @@ static struct ctl_table kern_table[] = { | |||
813 | .proc_handler = proc_dointvec, | 840 | .proc_handler = proc_dointvec, |
814 | }, | 841 | }, |
815 | #endif | 842 | #endif |
816 | #ifdef CONFIG_DETECT_SOFTLOCKUP | ||
817 | { | ||
818 | .procname = "softlockup_panic", | ||
819 | .data = &softlockup_panic, | ||
820 | .maxlen = sizeof(int), | ||
821 | .mode = 0644, | ||
822 | .proc_handler = proc_dointvec_minmax, | ||
823 | .extra1 = &zero, | ||
824 | .extra2 = &one, | ||
825 | }, | ||
826 | { | ||
827 | .procname = "softlockup_thresh", | ||
828 | .data = &softlockup_thresh, | ||
829 | .maxlen = sizeof(int), | ||
830 | .mode = 0644, | ||
831 | .proc_handler = proc_dosoftlockup_thresh, | ||
832 | .extra1 = &neg_one, | ||
833 | .extra2 = &sixty, | ||
834 | }, | ||
835 | #endif | ||
836 | #ifdef CONFIG_DETECT_HUNG_TASK | 843 | #ifdef CONFIG_DETECT_HUNG_TASK |
837 | { | 844 | { |
838 | .procname = "hung_task_panic", | 845 | .procname = "hung_task_panic", |
@@ -906,13 +913,6 @@ static struct ctl_table kern_table[] = { | |||
906 | .proc_handler = proc_dointvec, | 913 | .proc_handler = proc_dointvec, |
907 | }, | 914 | }, |
908 | #endif | 915 | #endif |
909 | #ifdef CONFIG_SLOW_WORK | ||
910 | { | ||
911 | .procname = "slow-work", | ||
912 | .mode = 0555, | ||
913 | .child = slow_work_sysctls, | ||
914 | }, | ||
915 | #endif | ||
916 | #ifdef CONFIG_PERF_EVENTS | 916 | #ifdef CONFIG_PERF_EVENTS |
917 | { | 917 | { |
918 | .procname = "perf_event_paranoid", | 918 | .procname = "perf_event_paranoid", |
@@ -1338,28 +1338,28 @@ static struct ctl_table fs_table[] = { | |||
1338 | .data = &inodes_stat, | 1338 | .data = &inodes_stat, |
1339 | .maxlen = 2*sizeof(int), | 1339 | .maxlen = 2*sizeof(int), |
1340 | .mode = 0444, | 1340 | .mode = 0444, |
1341 | .proc_handler = proc_dointvec, | 1341 | .proc_handler = proc_nr_inodes, |
1342 | }, | 1342 | }, |
1343 | { | 1343 | { |
1344 | .procname = "inode-state", | 1344 | .procname = "inode-state", |
1345 | .data = &inodes_stat, | 1345 | .data = &inodes_stat, |
1346 | .maxlen = 7*sizeof(int), | 1346 | .maxlen = 7*sizeof(int), |
1347 | .mode = 0444, | 1347 | .mode = 0444, |
1348 | .proc_handler = proc_dointvec, | 1348 | .proc_handler = proc_nr_inodes, |
1349 | }, | 1349 | }, |
1350 | { | 1350 | { |
1351 | .procname = "file-nr", | 1351 | .procname = "file-nr", |
1352 | .data = &files_stat, | 1352 | .data = &files_stat, |
1353 | .maxlen = 3*sizeof(int), | 1353 | .maxlen = sizeof(files_stat), |
1354 | .mode = 0444, | 1354 | .mode = 0444, |
1355 | .proc_handler = proc_nr_files, | 1355 | .proc_handler = proc_nr_files, |
1356 | }, | 1356 | }, |
1357 | { | 1357 | { |
1358 | .procname = "file-max", | 1358 | .procname = "file-max", |
1359 | .data = &files_stat.max_files, | 1359 | .data = &files_stat.max_files, |
1360 | .maxlen = sizeof(int), | 1360 | .maxlen = sizeof(files_stat.max_files), |
1361 | .mode = 0644, | 1361 | .mode = 0644, |
1362 | .proc_handler = proc_dointvec, | 1362 | .proc_handler = proc_doulongvec_minmax, |
1363 | }, | 1363 | }, |
1364 | { | 1364 | { |
1365 | .procname = "nr_open", | 1365 | .procname = "nr_open", |
@@ -1375,7 +1375,7 @@ static struct ctl_table fs_table[] = { | |||
1375 | .data = &dentry_stat, | 1375 | .data = &dentry_stat, |
1376 | .maxlen = 6*sizeof(int), | 1376 | .maxlen = 6*sizeof(int), |
1377 | .mode = 0444, | 1377 | .mode = 0444, |
1378 | .proc_handler = proc_dointvec, | 1378 | .proc_handler = proc_nr_dentry, |
1379 | }, | 1379 | }, |
1380 | { | 1380 | { |
1381 | .procname = "overflowuid", | 1381 | .procname = "overflowuid", |
@@ -1711,10 +1711,7 @@ static __init int sysctl_init(void) | |||
1711 | { | 1711 | { |
1712 | sysctl_set_parent(NULL, root_table); | 1712 | sysctl_set_parent(NULL, root_table); |
1713 | #ifdef CONFIG_SYSCTL_SYSCALL_CHECK | 1713 | #ifdef CONFIG_SYSCTL_SYSCALL_CHECK |
1714 | { | 1714 | sysctl_check_table(current->nsproxy, root_table); |
1715 | int err; | ||
1716 | err = sysctl_check_table(current->nsproxy, root_table); | ||
1717 | } | ||
1718 | #endif | 1715 | #endif |
1719 | return 0; | 1716 | return 0; |
1720 | } | 1717 | } |
@@ -2486,7 +2483,7 @@ static int __do_proc_doulongvec_minmax(void *data, struct ctl_table *table, int | |||
2486 | kbuf[left] = 0; | 2483 | kbuf[left] = 0; |
2487 | } | 2484 | } |
2488 | 2485 | ||
2489 | for (; left && vleft--; i++, min++, max++, first=0) { | 2486 | for (; left && vleft--; i++, first = 0) { |
2490 | unsigned long val; | 2487 | unsigned long val; |
2491 | 2488 | ||
2492 | if (write) { | 2489 | if (write) { |
diff --git a/kernel/sysctl_check.c b/kernel/sysctl_check.c index 04cdcf72c827..10b90d8a03c4 100644 --- a/kernel/sysctl_check.c +++ b/kernel/sysctl_check.c | |||
@@ -143,15 +143,6 @@ int sysctl_check_table(struct nsproxy *namespaces, struct ctl_table *table) | |||
143 | if (!table->maxlen) | 143 | if (!table->maxlen) |
144 | set_fail(&fail, table, "No maxlen"); | 144 | set_fail(&fail, table, "No maxlen"); |
145 | } | 145 | } |
146 | if ((table->proc_handler == proc_doulongvec_minmax) || | ||
147 | (table->proc_handler == proc_doulongvec_ms_jiffies_minmax)) { | ||
148 | if (table->maxlen > sizeof (unsigned long)) { | ||
149 | if (!table->extra1) | ||
150 | set_fail(&fail, table, "No min"); | ||
151 | if (!table->extra2) | ||
152 | set_fail(&fail, table, "No max"); | ||
153 | } | ||
154 | } | ||
155 | #ifdef CONFIG_PROC_SYSCTL | 146 | #ifdef CONFIG_PROC_SYSCTL |
156 | if (table->procname && !table->proc_handler) | 147 | if (table->procname && !table->proc_handler) |
157 | set_fail(&fail, table, "No proc_handler"); | 148 | set_fail(&fail, table, "No proc_handler"); |
diff --git a/kernel/taskstats.c b/kernel/taskstats.c index 11281d5792bd..c8231fb15708 100644 --- a/kernel/taskstats.c +++ b/kernel/taskstats.c | |||
@@ -175,22 +175,8 @@ static void send_cpu_listeners(struct sk_buff *skb, | |||
175 | up_write(&listeners->sem); | 175 | up_write(&listeners->sem); |
176 | } | 176 | } |
177 | 177 | ||
178 | static int fill_pid(pid_t pid, struct task_struct *tsk, | 178 | static void fill_stats(struct task_struct *tsk, struct taskstats *stats) |
179 | struct taskstats *stats) | ||
180 | { | 179 | { |
181 | int rc = 0; | ||
182 | |||
183 | if (!tsk) { | ||
184 | rcu_read_lock(); | ||
185 | tsk = find_task_by_vpid(pid); | ||
186 | if (tsk) | ||
187 | get_task_struct(tsk); | ||
188 | rcu_read_unlock(); | ||
189 | if (!tsk) | ||
190 | return -ESRCH; | ||
191 | } else | ||
192 | get_task_struct(tsk); | ||
193 | |||
194 | memset(stats, 0, sizeof(*stats)); | 180 | memset(stats, 0, sizeof(*stats)); |
195 | /* | 181 | /* |
196 | * Each accounting subsystem adds calls to its functions to | 182 | * Each accounting subsystem adds calls to its functions to |
@@ -209,17 +195,27 @@ static int fill_pid(pid_t pid, struct task_struct *tsk, | |||
209 | 195 | ||
210 | /* fill in extended acct fields */ | 196 | /* fill in extended acct fields */ |
211 | xacct_add_tsk(stats, tsk); | 197 | xacct_add_tsk(stats, tsk); |
198 | } | ||
212 | 199 | ||
213 | /* Define err: label here if needed */ | 200 | static int fill_stats_for_pid(pid_t pid, struct taskstats *stats) |
214 | put_task_struct(tsk); | 201 | { |
215 | return rc; | 202 | struct task_struct *tsk; |
216 | 203 | ||
204 | rcu_read_lock(); | ||
205 | tsk = find_task_by_vpid(pid); | ||
206 | if (tsk) | ||
207 | get_task_struct(tsk); | ||
208 | rcu_read_unlock(); | ||
209 | if (!tsk) | ||
210 | return -ESRCH; | ||
211 | fill_stats(tsk, stats); | ||
212 | put_task_struct(tsk); | ||
213 | return 0; | ||
217 | } | 214 | } |
218 | 215 | ||
219 | static int fill_tgid(pid_t tgid, struct task_struct *first, | 216 | static int fill_stats_for_tgid(pid_t tgid, struct taskstats *stats) |
220 | struct taskstats *stats) | ||
221 | { | 217 | { |
222 | struct task_struct *tsk; | 218 | struct task_struct *tsk, *first; |
223 | unsigned long flags; | 219 | unsigned long flags; |
224 | int rc = -ESRCH; | 220 | int rc = -ESRCH; |
225 | 221 | ||
@@ -228,8 +224,7 @@ static int fill_tgid(pid_t tgid, struct task_struct *first, | |||
228 | * leaders who are already counted with the dead tasks | 224 | * leaders who are already counted with the dead tasks |
229 | */ | 225 | */ |
230 | rcu_read_lock(); | 226 | rcu_read_lock(); |
231 | if (!first) | 227 | first = find_task_by_vpid(tgid); |
232 | first = find_task_by_vpid(tgid); | ||
233 | 228 | ||
234 | if (!first || !lock_task_sighand(first, &flags)) | 229 | if (!first || !lock_task_sighand(first, &flags)) |
235 | goto out; | 230 | goto out; |
@@ -268,7 +263,6 @@ out: | |||
268 | return rc; | 263 | return rc; |
269 | } | 264 | } |
270 | 265 | ||
271 | |||
272 | static void fill_tgid_exit(struct task_struct *tsk) | 266 | static void fill_tgid_exit(struct task_struct *tsk) |
273 | { | 267 | { |
274 | unsigned long flags; | 268 | unsigned long flags; |
@@ -360,6 +354,12 @@ static struct taskstats *mk_reply(struct sk_buff *skb, int type, u32 pid) | |||
360 | struct nlattr *na, *ret; | 354 | struct nlattr *na, *ret; |
361 | int aggr; | 355 | int aggr; |
362 | 356 | ||
357 | /* If we don't pad, we end up with alignment on a 4 byte boundary. | ||
358 | * This causes lots of runtime warnings on systems requiring 8 byte | ||
359 | * alignment */ | ||
360 | u32 pids[2] = { pid, 0 }; | ||
361 | int pid_size = ALIGN(sizeof(pid), sizeof(long)); | ||
362 | |||
363 | aggr = (type == TASKSTATS_TYPE_PID) | 363 | aggr = (type == TASKSTATS_TYPE_PID) |
364 | ? TASKSTATS_TYPE_AGGR_PID | 364 | ? TASKSTATS_TYPE_AGGR_PID |
365 | : TASKSTATS_TYPE_AGGR_TGID; | 365 | : TASKSTATS_TYPE_AGGR_TGID; |
@@ -367,7 +367,7 @@ static struct taskstats *mk_reply(struct sk_buff *skb, int type, u32 pid) | |||
367 | na = nla_nest_start(skb, aggr); | 367 | na = nla_nest_start(skb, aggr); |
368 | if (!na) | 368 | if (!na) |
369 | goto err; | 369 | goto err; |
370 | if (nla_put(skb, type, sizeof(pid), &pid) < 0) | 370 | if (nla_put(skb, type, pid_size, pids) < 0) |
371 | goto err; | 371 | goto err; |
372 | ret = nla_reserve(skb, TASKSTATS_TYPE_STATS, sizeof(struct taskstats)); | 372 | ret = nla_reserve(skb, TASKSTATS_TYPE_STATS, sizeof(struct taskstats)); |
373 | if (!ret) | 373 | if (!ret) |
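The padding relies on ALIGN() rounding the 4-byte pid attribute up to the machine word size, so the following TASKSTATS_TYPE_STATS attribute starts 8-byte aligned on 64-bit systems. Conceptually (a simplified stand-in for the kernel's ALIGN macro, for illustration only):

#define MY_ALIGN(x, a)  (((x) + (a) - 1) & ~((a) - 1))  /* a must be a power of two */

/* 64-bit kernel: sizeof(u32) == 4, sizeof(long) == 8 */
int pid_size = MY_ALIGN(sizeof(u32), sizeof(long));     /* == 8; pids[1] supplies the pad */

/* 32-bit kernel: both sizes are 4, so the attribute size is unchanged */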
@@ -424,39 +424,46 @@ err: | |||
424 | return rc; | 424 | return rc; |
425 | } | 425 | } |
426 | 426 | ||
427 | static int taskstats_user_cmd(struct sk_buff *skb, struct genl_info *info) | 427 | static int cmd_attr_register_cpumask(struct genl_info *info) |
428 | { | 428 | { |
429 | int rc; | ||
430 | struct sk_buff *rep_skb; | ||
431 | struct taskstats *stats; | ||
432 | size_t size; | ||
433 | cpumask_var_t mask; | 429 | cpumask_var_t mask; |
430 | int rc; | ||
434 | 431 | ||
435 | if (!alloc_cpumask_var(&mask, GFP_KERNEL)) | 432 | if (!alloc_cpumask_var(&mask, GFP_KERNEL)) |
436 | return -ENOMEM; | 433 | return -ENOMEM; |
437 | |||
438 | rc = parse(info->attrs[TASKSTATS_CMD_ATTR_REGISTER_CPUMASK], mask); | 434 | rc = parse(info->attrs[TASKSTATS_CMD_ATTR_REGISTER_CPUMASK], mask); |
439 | if (rc < 0) | 435 | if (rc < 0) |
440 | goto free_return_rc; | 436 | goto out; |
441 | if (rc == 0) { | 437 | rc = add_del_listener(info->snd_pid, mask, REGISTER); |
442 | rc = add_del_listener(info->snd_pid, mask, REGISTER); | 438 | out: |
443 | goto free_return_rc; | 439 | free_cpumask_var(mask); |
444 | } | 440 | return rc; |
441 | } | ||
442 | |||
443 | static int cmd_attr_deregister_cpumask(struct genl_info *info) | ||
444 | { | ||
445 | cpumask_var_t mask; | ||
446 | int rc; | ||
445 | 447 | ||
448 | if (!alloc_cpumask_var(&mask, GFP_KERNEL)) | ||
449 | return -ENOMEM; | ||
446 | rc = parse(info->attrs[TASKSTATS_CMD_ATTR_DEREGISTER_CPUMASK], mask); | 450 | rc = parse(info->attrs[TASKSTATS_CMD_ATTR_DEREGISTER_CPUMASK], mask); |
447 | if (rc < 0) | 451 | if (rc < 0) |
448 | goto free_return_rc; | 452 | goto out; |
449 | if (rc == 0) { | 453 | rc = add_del_listener(info->snd_pid, mask, DEREGISTER); |
450 | rc = add_del_listener(info->snd_pid, mask, DEREGISTER); | 454 | out: |
451 | free_return_rc: | ||
452 | free_cpumask_var(mask); | ||
453 | return rc; | ||
454 | } | ||
455 | free_cpumask_var(mask); | 455 | free_cpumask_var(mask); |
456 | return rc; | ||
457 | } | ||
458 | |||
459 | static int cmd_attr_pid(struct genl_info *info) | ||
460 | { | ||
461 | struct taskstats *stats; | ||
462 | struct sk_buff *rep_skb; | ||
463 | size_t size; | ||
464 | u32 pid; | ||
465 | int rc; | ||
456 | 466 | ||
457 | /* | ||
458 | * Size includes space for nested attributes | ||
459 | */ | ||
460 | size = nla_total_size(sizeof(u32)) + | 467 | size = nla_total_size(sizeof(u32)) + |
461 | nla_total_size(sizeof(struct taskstats)) + nla_total_size(0); | 468 | nla_total_size(sizeof(struct taskstats)) + nla_total_size(0); |
462 | 469 | ||
@@ -465,33 +472,64 @@ free_return_rc: | |||
465 | return rc; | 472 | return rc; |
466 | 473 | ||
467 | rc = -EINVAL; | 474 | rc = -EINVAL; |
468 | if (info->attrs[TASKSTATS_CMD_ATTR_PID]) { | 475 | pid = nla_get_u32(info->attrs[TASKSTATS_CMD_ATTR_PID]); |
469 | u32 pid = nla_get_u32(info->attrs[TASKSTATS_CMD_ATTR_PID]); | 476 | stats = mk_reply(rep_skb, TASKSTATS_TYPE_PID, pid); |
470 | stats = mk_reply(rep_skb, TASKSTATS_TYPE_PID, pid); | 477 | if (!stats) |
471 | if (!stats) | 478 | goto err; |
472 | goto err; | 479 | |
473 | 480 | rc = fill_stats_for_pid(pid, stats); | |
474 | rc = fill_pid(pid, NULL, stats); | 481 | if (rc < 0) |
475 | if (rc < 0) | 482 | goto err; |
476 | goto err; | 483 | return send_reply(rep_skb, info); |
477 | } else if (info->attrs[TASKSTATS_CMD_ATTR_TGID]) { | 484 | err: |
478 | u32 tgid = nla_get_u32(info->attrs[TASKSTATS_CMD_ATTR_TGID]); | 485 | nlmsg_free(rep_skb); |
479 | stats = mk_reply(rep_skb, TASKSTATS_TYPE_TGID, tgid); | 486 | return rc; |
480 | if (!stats) | 487 | } |
481 | goto err; | 488 | |
482 | 489 | static int cmd_attr_tgid(struct genl_info *info) | |
483 | rc = fill_tgid(tgid, NULL, stats); | 490 | { |
484 | if (rc < 0) | 491 | struct taskstats *stats; |
485 | goto err; | 492 | struct sk_buff *rep_skb; |
486 | } else | 493 | size_t size; |
494 | u32 tgid; | ||
495 | int rc; | ||
496 | |||
497 | size = nla_total_size(sizeof(u32)) + | ||
498 | nla_total_size(sizeof(struct taskstats)) + nla_total_size(0); | ||
499 | |||
500 | rc = prepare_reply(info, TASKSTATS_CMD_NEW, &rep_skb, size); | ||
501 | if (rc < 0) | ||
502 | return rc; | ||
503 | |||
504 | rc = -EINVAL; | ||
505 | tgid = nla_get_u32(info->attrs[TASKSTATS_CMD_ATTR_TGID]); | ||
506 | stats = mk_reply(rep_skb, TASKSTATS_TYPE_TGID, tgid); | ||
507 | if (!stats) | ||
487 | goto err; | 508 | goto err; |
488 | 509 | ||
510 | rc = fill_stats_for_tgid(tgid, stats); | ||
511 | if (rc < 0) | ||
512 | goto err; | ||
489 | return send_reply(rep_skb, info); | 513 | return send_reply(rep_skb, info); |
490 | err: | 514 | err: |
491 | nlmsg_free(rep_skb); | 515 | nlmsg_free(rep_skb); |
492 | return rc; | 516 | return rc; |
493 | } | 517 | } |
494 | 518 | ||
519 | static int taskstats_user_cmd(struct sk_buff *skb, struct genl_info *info) | ||
520 | { | ||
521 | if (info->attrs[TASKSTATS_CMD_ATTR_REGISTER_CPUMASK]) | ||
522 | return cmd_attr_register_cpumask(info); | ||
523 | else if (info->attrs[TASKSTATS_CMD_ATTR_DEREGISTER_CPUMASK]) | ||
524 | return cmd_attr_deregister_cpumask(info); | ||
525 | else if (info->attrs[TASKSTATS_CMD_ATTR_PID]) | ||
526 | return cmd_attr_pid(info); | ||
527 | else if (info->attrs[TASKSTATS_CMD_ATTR_TGID]) | ||
528 | return cmd_attr_tgid(info); | ||
529 | else | ||
530 | return -EINVAL; | ||
531 | } | ||
532 | |||
495 | static struct taskstats *taskstats_tgid_alloc(struct task_struct *tsk) | 533 | static struct taskstats *taskstats_tgid_alloc(struct task_struct *tsk) |
496 | { | 534 | { |
497 | struct signal_struct *sig = tsk->signal; | 535 | struct signal_struct *sig = tsk->signal; |
@@ -555,9 +593,7 @@ void taskstats_exit(struct task_struct *tsk, int group_dead) | |||
555 | if (!stats) | 593 | if (!stats) |
556 | goto err; | 594 | goto err; |
557 | 595 | ||
558 | rc = fill_pid(-1, tsk, stats); | 596 | fill_stats(tsk, stats); |
559 | if (rc < 0) | ||
560 | goto err; | ||
561 | 597 | ||
562 | /* | 598 | /* |
563 | * Doesn't matter if tsk is the leader or the last group member leaving | 599 | * Doesn't matter if tsk is the leader or the last group member leaving |
diff --git a/kernel/test_kprobes.c b/kernel/test_kprobes.c index 4f104515a19b..f8b11a283171 100644 --- a/kernel/test_kprobes.c +++ b/kernel/test_kprobes.c | |||
@@ -115,7 +115,9 @@ static int test_kprobes(void) | |||
115 | int ret; | 115 | int ret; |
116 | struct kprobe *kps[2] = {&kp, &kp2}; | 116 | struct kprobe *kps[2] = {&kp, &kp2}; |
117 | 117 | ||
118 | kp.addr = 0; /* addr should be cleard for reusing kprobe. */ | 118 | /* addr and flags should be cleared for reusing kprobe. */ |
119 | kp.addr = NULL; | ||
120 | kp.flags = 0; | ||
119 | ret = register_kprobes(kps, 2); | 121 | ret = register_kprobes(kps, 2); |
120 | if (ret < 0) { | 122 | if (ret < 0) { |
121 | printk(KERN_ERR "Kprobe smoke test failed: " | 123 | printk(KERN_ERR "Kprobe smoke test failed: " |
@@ -210,7 +212,9 @@ static int test_jprobes(void) | |||
210 | int ret; | 212 | int ret; |
211 | struct jprobe *jps[2] = {&jp, &jp2}; | 213 | struct jprobe *jps[2] = {&jp, &jp2}; |
212 | 214 | ||
213 | jp.kp.addr = 0; /* addr should be cleard for reusing kprobe. */ | 215 | /* addr and flags should be cleared for reusing kprobe. */ |
216 | jp.kp.addr = NULL; | ||
217 | jp.kp.flags = 0; | ||
214 | ret = register_jprobes(jps, 2); | 218 | ret = register_jprobes(jps, 2); |
215 | if (ret < 0) { | 219 | if (ret < 0) { |
216 | printk(KERN_ERR "Kprobe smoke test failed: " | 220 | printk(KERN_ERR "Kprobe smoke test failed: " |
@@ -323,7 +327,9 @@ static int test_kretprobes(void) | |||
323 | int ret; | 327 | int ret; |
324 | struct kretprobe *rps[2] = {&rp, &rp2}; | 328 | struct kretprobe *rps[2] = {&rp, &rp2}; |
325 | 329 | ||
326 | rp.kp.addr = 0; /* addr should be cleard for reusing kprobe. */ | 330 | /* addr and flags should be cleared for reusing kprobe. */ |
331 | rp.kp.addr = NULL; | ||
332 | rp.kp.flags = 0; | ||
327 | ret = register_kretprobes(rps, 2); | 333 | ret = register_kretprobes(rps, 2); |
328 | if (ret < 0) { | 334 | if (ret < 0) { |
329 | printk(KERN_ERR "Kprobe smoke test failed: " | 335 | printk(KERN_ERR "Kprobe smoke test failed: " |
diff --git a/kernel/time.c b/kernel/time.c index 848b1c2ab09a..ba9b338d1835 100644 --- a/kernel/time.c +++ b/kernel/time.c | |||
@@ -300,22 +300,6 @@ struct timespec timespec_trunc(struct timespec t, unsigned gran) | |||
300 | } | 300 | } |
301 | EXPORT_SYMBOL(timespec_trunc); | 301 | EXPORT_SYMBOL(timespec_trunc); |
302 | 302 | ||
303 | #ifndef CONFIG_GENERIC_TIME | ||
304 | /* | ||
305 | * Simulate gettimeofday using do_gettimeofday which only allows a timeval | ||
306 | * and therefore only yields usec accuracy | ||
307 | */ | ||
308 | void getnstimeofday(struct timespec *tv) | ||
309 | { | ||
310 | struct timeval x; | ||
311 | |||
312 | do_gettimeofday(&x); | ||
313 | tv->tv_sec = x.tv_sec; | ||
314 | tv->tv_nsec = x.tv_usec * NSEC_PER_USEC; | ||
315 | } | ||
316 | EXPORT_SYMBOL_GPL(getnstimeofday); | ||
317 | #endif | ||
318 | |||
319 | /* Converts Gregorian date to seconds since 1970-01-01 00:00:00. | 303 | /* Converts Gregorian date to seconds since 1970-01-01 00:00:00. |
320 | * Assumes input in normal date format, i.e. 1980-12-31 23:59:59 | 304 | * Assumes input in normal date format, i.e. 1980-12-31 23:59:59 |
321 | * => year=1980, mon=12, day=31, hour=23, min=59, sec=59. | 305 | * => year=1980, mon=12, day=31, hour=23, min=59, sec=59. |
diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig index 95ed42951e0a..f06a8a365648 100644 --- a/kernel/time/Kconfig +++ b/kernel/time/Kconfig | |||
@@ -6,7 +6,7 @@ config TICK_ONESHOT | |||
6 | 6 | ||
7 | config NO_HZ | 7 | config NO_HZ |
8 | bool "Tickless System (Dynamic Ticks)" | 8 | bool "Tickless System (Dynamic Ticks)" |
9 | depends on GENERIC_TIME && GENERIC_CLOCKEVENTS | 9 | depends on !ARCH_USES_GETTIMEOFFSET && GENERIC_CLOCKEVENTS |
10 | select TICK_ONESHOT | 10 | select TICK_ONESHOT |
11 | help | 11 | help |
12 | This option enables a tickless system: timer interrupts will | 12 | This option enables a tickless system: timer interrupts will |
@@ -15,7 +15,7 @@ config NO_HZ | |||
15 | 15 | ||
16 | config HIGH_RES_TIMERS | 16 | config HIGH_RES_TIMERS |
17 | bool "High Resolution Timer Support" | 17 | bool "High Resolution Timer Support" |
18 | depends on GENERIC_TIME && GENERIC_CLOCKEVENTS | 18 | depends on !ARCH_USES_GETTIMEOFFSET && GENERIC_CLOCKEVENTS |
19 | select TICK_ONESHOT | 19 | select TICK_ONESHOT |
20 | help | 20 | help |
21 | This option enables high resolution timer support. If your | 21 | This option enables high resolution timer support. If your |
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index f08e99c1d561..c18d7efa1b4b 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c | |||
@@ -531,7 +531,7 @@ static u64 clocksource_max_deferment(struct clocksource *cs) | |||
531 | return max_nsecs - (max_nsecs >> 5); | 531 | return max_nsecs - (max_nsecs >> 5); |
532 | } | 532 | } |
533 | 533 | ||
534 | #ifdef CONFIG_GENERIC_TIME | 534 | #ifndef CONFIG_ARCH_USES_GETTIMEOFFSET |
535 | 535 | ||
536 | /** | 536 | /** |
537 | * clocksource_select - Select the best clocksource available | 537 | * clocksource_select - Select the best clocksource available |
@@ -577,7 +577,7 @@ static void clocksource_select(void) | |||
577 | } | 577 | } |
578 | } | 578 | } |
579 | 579 | ||
580 | #else /* CONFIG_GENERIC_TIME */ | 580 | #else /* !CONFIG_ARCH_USES_GETTIMEOFFSET */ |
581 | 581 | ||
582 | static inline void clocksource_select(void) { } | 582 | static inline void clocksource_select(void) { } |
583 | 583 | ||
@@ -639,19 +639,18 @@ static void clocksource_enqueue(struct clocksource *cs) | |||
639 | #define MAX_UPDATE_LENGTH 5 /* Seconds */ | 639 | #define MAX_UPDATE_LENGTH 5 /* Seconds */ |
640 | 640 | ||
641 | /** | 641 | /** |
642 | * __clocksource_register_scale - Used to install new clocksources | 642 | * __clocksource_updatefreq_scale - Used to update clocksource with new freq |
643 | * @t: clocksource to be registered | 643 | * @t: clocksource to be registered |
644 | * @scale: Scale factor multiplied against freq to get clocksource hz | 644 | * @scale: Scale factor multiplied against freq to get clocksource hz |
645 | * @freq: clocksource frequency (cycles per second) divided by scale | 645 | * @freq: clocksource frequency (cycles per second) divided by scale |
646 | * | 646 | * |
647 | * Returns -EBUSY if registration fails, zero otherwise. | 647 | * This should only be called from the clocksource->enable() method. |
648 | * | 648 | * |
649 | * This *SHOULD NOT* be called directly! Please use the | 649 | * This *SHOULD NOT* be called directly! Please use the |
650 | * clocksource_register_hz() or clocksource_register_khz helper functions. | 650 | * clocksource_updatefreq_hz() or clocksource_updatefreq_khz helper functions. |
651 | */ | 651 | */ |
652 | int __clocksource_register_scale(struct clocksource *cs, u32 scale, u32 freq) | 652 | void __clocksource_updatefreq_scale(struct clocksource *cs, u32 scale, u32 freq) |
653 | { | 653 | { |
654 | |||
655 | /* | 654 | /* |
656 | * Ideally we want to use some of the limits used in | 655 | * Ideally we want to use some of the limits used in |
657 | * clocksource_max_deferment, to provide a more informed | 656 | * clocksource_max_deferment, to provide a more informed |
@@ -662,7 +661,27 @@ int __clocksource_register_scale(struct clocksource *cs, u32 scale, u32 freq) | |||
662 | NSEC_PER_SEC/scale, | 661 | NSEC_PER_SEC/scale, |
663 | MAX_UPDATE_LENGTH*scale); | 662 | MAX_UPDATE_LENGTH*scale); |
664 | cs->max_idle_ns = clocksource_max_deferment(cs); | 663 | cs->max_idle_ns = clocksource_max_deferment(cs); |
664 | } | ||
665 | EXPORT_SYMBOL_GPL(__clocksource_updatefreq_scale); | ||
666 | |||
667 | /** | ||
668 | * __clocksource_register_scale - Used to install new clocksources | ||
669 | * @t: clocksource to be registered | ||
670 | * @scale: Scale factor multiplied against freq to get clocksource hz | ||
671 | * @freq: clocksource frequency (cycles per second) divided by scale | ||
672 | * | ||
673 | * Returns -EBUSY if registration fails, zero otherwise. | ||
674 | * | ||
675 | * This *SHOULD NOT* be called directly! Please use the | ||
676 | * clocksource_register_hz() or clocksource_register_khz helper functions. | ||
677 | */ | ||
678 | int __clocksource_register_scale(struct clocksource *cs, u32 scale, u32 freq) | ||
679 | { | ||
680 | |||
681 | /* Initialize mult/shift and max_idle_ns */ | ||
682 | __clocksource_updatefreq_scale(cs, scale, freq); | ||
665 | 683 | ||
684 | /* Add clocksource to the clocksource list */ | ||
666 | mutex_lock(&clocksource_mutex); | 685 | mutex_lock(&clocksource_mutex); |
667 | clocksource_enqueue(cs); | 686 | clocksource_enqueue(cs); |
668 | clocksource_select(); | 687 | clocksource_select(); |
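A hypothetical driver's view of the split might look as follows; my_cs, my_counter_read() and my_current_rate_hz() are invented names, while the registration and update helpers are the ones named in the kernel-doc above:

#include <linux/clocksource.h>

static cycle_t my_cs_read(struct clocksource *cs)
{
        return (cycle_t)my_counter_read();      /* hypothetical hardware counter */
}

static int my_cs_enable(struct clocksource *cs)
{
        /* only the input frequency changed: refresh mult/shift, no re-registration */
        clocksource_updatefreq_hz(cs, my_current_rate_hz());
        return 0;
}

static struct clocksource my_cs = {
        .name   = "my-timer",
        .rating = 300,
        .read   = my_cs_read,
        .enable = my_cs_enable,
        .mask   = CLOCKSOURCE_MASK(32),
        .flags  = CLOCK_SOURCE_IS_CONTINUOUS,
};

static int __init my_timer_init(unsigned long rate_hz)
{
        /* probe time: computes mult/shift/max_idle_ns, then enqueues and selects */
        return clocksource_register_hz(&my_cs, rate_hz);
}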
diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index c63116863a80..d2321891538f 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c | |||
@@ -149,10 +149,18 @@ static void ntp_update_offset(long offset) | |||
149 | time_reftime = get_seconds(); | 149 | time_reftime = get_seconds(); |
150 | 150 | ||
151 | offset64 = offset; | 151 | offset64 = offset; |
152 | freq_adj = (offset64 * secs) << | 152 | freq_adj = ntp_update_offset_fll(offset64, secs); |
153 | (NTP_SCALE_SHIFT - 2 * (SHIFT_PLL + 2 + time_constant)); | ||
154 | 153 | ||
155 | freq_adj += ntp_update_offset_fll(offset64, secs); | 154 | /* |
155 | * Clamp update interval to reduce PLL gain with low | ||
156 | * sampling rate (e.g. intermittent network connection) | ||
157 | * to avoid instability. | ||
158 | */ | ||
159 | if (unlikely(secs > 1 << (SHIFT_PLL + 1 + time_constant))) | ||
160 | secs = 1 << (SHIFT_PLL + 1 + time_constant); | ||
161 | |||
162 | freq_adj += (offset64 * secs) << | ||
163 | (NTP_SCALE_SHIFT - 2 * (SHIFT_PLL + 2 + time_constant)); | ||
156 | 164 | ||
157 | freq_adj = min(freq_adj + time_freq, MAXFREQ_SCALED); | 165 | freq_adj = min(freq_adj + time_freq, MAXFREQ_SCALED); |
158 | 166 | ||
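For a rough feel of the clamp, assume SHIFT_PLL is 2 (include/linux/timex.h) and time_constant holds its default of 2; both values sit outside this hunk, so treat them as assumptions. A sample arriving after an hour offline is then handled as if only 32 seconds had passed:

        long time_constant = 2;         /* assumed default PLL time constant */
        long secs = 3600;               /* e.g. one sample per hour of connectivity */

        if (secs > 1 << (SHIFT_PLL + 1 + time_constant))    /* 1 << (2 + 1 + 2) == 32 */
                secs = 1 << (SHIFT_PLL + 1 + time_constant);

        /* secs == 32, so the (offset64 * secs) PLL term stays bounded */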
diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c index b3bafd5fc66d..48b2761b5668 100644 --- a/kernel/time/tick-broadcast.c +++ b/kernel/time/tick-broadcast.c | |||
@@ -188,7 +188,7 @@ static void tick_handle_periodic_broadcast(struct clock_event_device *dev) | |||
188 | /* | 188 | /* |
189 | * Setup the next period for devices, which do not have | 189 | * Setup the next period for devices, which do not have |
190 | * periodic mode. We read dev->next_event first and add to it | 190 | * periodic mode. We read dev->next_event first and add to it |
191 | * when the event alrady expired. clockevents_program_event() | 191 | * when the event already expired. clockevents_program_event() |
192 | * sets dev->next_event only when the event is really | 192 | * sets dev->next_event only when the event is really |
193 | * programmed to the device. | 193 | * programmed to the device. |
194 | */ | 194 | */ |
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 813993b5fb61..3e216e01bbd1 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c | |||
@@ -325,7 +325,7 @@ void tick_nohz_stop_sched_tick(int inidle) | |||
325 | } while (read_seqretry(&xtime_lock, seq)); | 325 | } while (read_seqretry(&xtime_lock, seq)); |
326 | 326 | ||
327 | if (rcu_needs_cpu(cpu) || printk_needs_cpu(cpu) || | 327 | if (rcu_needs_cpu(cpu) || printk_needs_cpu(cpu) || |
328 | arch_needs_cpu(cpu) || nohz_ratelimit(cpu)) { | 328 | arch_needs_cpu(cpu)) { |
329 | next_jiffies = last_jiffies + 1; | 329 | next_jiffies = last_jiffies + 1; |
330 | delta_jiffies = 1; | 330 | delta_jiffies = 1; |
331 | } else { | 331 | } else { |
@@ -405,13 +405,7 @@ void tick_nohz_stop_sched_tick(int inidle) | |||
405 | * the scheduler tick in nohz_restart_sched_tick. | 405 | * the scheduler tick in nohz_restart_sched_tick. |
406 | */ | 406 | */ |
407 | if (!ts->tick_stopped) { | 407 | if (!ts->tick_stopped) { |
408 | if (select_nohz_load_balancer(1)) { | 408 | select_nohz_load_balancer(1); |
409 | /* | ||
410 | * sched tick not stopped! | ||
411 | */ | ||
412 | cpumask_clear_cpu(cpu, nohz_cpu_mask); | ||
413 | goto out; | ||
414 | } | ||
415 | 409 | ||
416 | ts->idle_tick = hrtimer_get_expires(&ts->sched_timer); | 410 | ts->idle_tick = hrtimer_get_expires(&ts->sched_timer); |
417 | ts->tick_stopped = 1; | 411 | ts->tick_stopped = 1; |
@@ -780,7 +774,6 @@ void tick_setup_sched_timer(void) | |||
780 | { | 774 | { |
781 | struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched); | 775 | struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched); |
782 | ktime_t now = ktime_get(); | 776 | ktime_t now = ktime_get(); |
783 | u64 offset; | ||
784 | 777 | ||
785 | /* | 778 | /* |
786 | * Emulate tick processing via per-CPU hrtimers: | 779 | * Emulate tick processing via per-CPU hrtimers: |
@@ -790,10 +783,6 @@ void tick_setup_sched_timer(void) | |||
790 | 783 | ||
791 | /* Get the next period (per cpu) */ | 784 | /* Get the next period (per cpu) */ |
792 | hrtimer_set_expires(&ts->sched_timer, tick_init_jiffy_update()); | 785 | hrtimer_set_expires(&ts->sched_timer, tick_init_jiffy_update()); |
793 | offset = ktime_to_ns(tick_period) >> 1; | ||
794 | do_div(offset, num_possible_cpus()); | ||
795 | offset *= smp_processor_id(); | ||
796 | hrtimer_add_expires_ns(&ts->sched_timer, offset); | ||
797 | 786 | ||
798 | for (;;) { | 787 | for (;;) { |
799 | hrtimer_forward(&ts->sched_timer, now, tick_period); | 788 | hrtimer_forward(&ts->sched_timer, now, tick_period); |
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index caf8d4d4f5c8..49010d822f72 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c | |||
@@ -153,8 +153,8 @@ __cacheline_aligned_in_smp DEFINE_SEQLOCK(xtime_lock); | |||
153 | * - wall_to_monotonic is no longer the boot time, getboottime must be | 153 | * - wall_to_monotonic is no longer the boot time, getboottime must be |
154 | * used instead. | 154 | * used instead. |
155 | */ | 155 | */ |
156 | struct timespec xtime __attribute__ ((aligned (16))); | 156 | static struct timespec xtime __attribute__ ((aligned (16))); |
157 | struct timespec wall_to_monotonic __attribute__ ((aligned (16))); | 157 | static struct timespec wall_to_monotonic __attribute__ ((aligned (16))); |
158 | static struct timespec total_sleep_time; | 158 | static struct timespec total_sleep_time; |
159 | 159 | ||
160 | /* | 160 | /* |
@@ -170,11 +170,10 @@ void timekeeping_leap_insert(int leapsecond) | |||
170 | { | 170 | { |
171 | xtime.tv_sec += leapsecond; | 171 | xtime.tv_sec += leapsecond; |
172 | wall_to_monotonic.tv_sec -= leapsecond; | 172 | wall_to_monotonic.tv_sec -= leapsecond; |
173 | update_vsyscall(&xtime, timekeeper.clock, timekeeper.mult); | 173 | update_vsyscall(&xtime, &wall_to_monotonic, timekeeper.clock, |
174 | timekeeper.mult); | ||
174 | } | 175 | } |
175 | 176 | ||
176 | #ifdef CONFIG_GENERIC_TIME | ||
177 | |||
178 | /** | 177 | /** |
179 | * timekeeping_forward_now - update clock to the current time | 178 | * timekeeping_forward_now - update clock to the current time |
180 | * | 179 | * |
@@ -328,7 +327,8 @@ int do_settimeofday(struct timespec *tv) | |||
328 | timekeeper.ntp_error = 0; | 327 | timekeeper.ntp_error = 0; |
329 | ntp_clear(); | 328 | ntp_clear(); |
330 | 329 | ||
331 | update_vsyscall(&xtime, timekeeper.clock, timekeeper.mult); | 330 | update_vsyscall(&xtime, &wall_to_monotonic, timekeeper.clock, |
331 | timekeeper.mult); | ||
332 | 332 | ||
333 | write_sequnlock_irqrestore(&xtime_lock, flags); | 333 | write_sequnlock_irqrestore(&xtime_lock, flags); |
334 | 334 | ||
@@ -376,52 +376,6 @@ void timekeeping_notify(struct clocksource *clock) | |||
376 | tick_clock_notify(); | 376 | tick_clock_notify(); |
377 | } | 377 | } |
378 | 378 | ||
379 | #else /* GENERIC_TIME */ | ||
380 | |||
381 | static inline void timekeeping_forward_now(void) { } | ||
382 | |||
383 | /** | ||
384 | * ktime_get - get the monotonic time in ktime_t format | ||
385 | * | ||
386 | * returns the time in ktime_t format | ||
387 | */ | ||
388 | ktime_t ktime_get(void) | ||
389 | { | ||
390 | struct timespec now; | ||
391 | |||
392 | ktime_get_ts(&now); | ||
393 | |||
394 | return timespec_to_ktime(now); | ||
395 | } | ||
396 | EXPORT_SYMBOL_GPL(ktime_get); | ||
397 | |||
398 | /** | ||
399 | * ktime_get_ts - get the monotonic clock in timespec format | ||
400 | * @ts: pointer to timespec variable | ||
401 | * | ||
402 | * The function calculates the monotonic clock from the realtime | ||
403 | * clock and the wall_to_monotonic offset and stores the result | ||
404 | * in normalized timespec format in the variable pointed to by @ts. | ||
405 | */ | ||
406 | void ktime_get_ts(struct timespec *ts) | ||
407 | { | ||
408 | struct timespec tomono; | ||
409 | unsigned long seq; | ||
410 | |||
411 | do { | ||
412 | seq = read_seqbegin(&xtime_lock); | ||
413 | getnstimeofday(ts); | ||
414 | tomono = wall_to_monotonic; | ||
415 | |||
416 | } while (read_seqretry(&xtime_lock, seq)); | ||
417 | |||
418 | set_normalized_timespec(ts, ts->tv_sec + tomono.tv_sec, | ||
419 | ts->tv_nsec + tomono.tv_nsec); | ||
420 | } | ||
421 | EXPORT_SYMBOL_GPL(ktime_get_ts); | ||
422 | |||
423 | #endif /* !GENERIC_TIME */ | ||
424 | |||
425 | /** | 379 | /** |
426 | * ktime_get_real - get the real (wall-) time in ktime_t format | 380 | * ktime_get_real - get the real (wall-) time in ktime_t format |
427 | * | 381 | * |
@@ -579,9 +533,9 @@ static int timekeeping_resume(struct sys_device *dev) | |||
579 | 533 | ||
580 | if (timespec_compare(&ts, &timekeeping_suspend_time) > 0) { | 534 | if (timespec_compare(&ts, &timekeeping_suspend_time) > 0) { |
581 | ts = timespec_sub(ts, timekeeping_suspend_time); | 535 | ts = timespec_sub(ts, timekeeping_suspend_time); |
582 | xtime = timespec_add_safe(xtime, ts); | 536 | xtime = timespec_add(xtime, ts); |
583 | wall_to_monotonic = timespec_sub(wall_to_monotonic, ts); | 537 | wall_to_monotonic = timespec_sub(wall_to_monotonic, ts); |
584 | total_sleep_time = timespec_add_safe(total_sleep_time, ts); | 538 | total_sleep_time = timespec_add(total_sleep_time, ts); |
585 | } | 539 | } |
586 | /* re-base the last cycle value */ | 540 | /* re-base the last cycle value */ |
587 | timekeeper.clock->cycle_last = timekeeper.clock->read(timekeeper.clock); | 541 | timekeeper.clock->cycle_last = timekeeper.clock->read(timekeeper.clock); |
@@ -736,6 +690,7 @@ static void timekeeping_adjust(s64 offset) | |||
736 | static cycle_t logarithmic_accumulation(cycle_t offset, int shift) | 690 | static cycle_t logarithmic_accumulation(cycle_t offset, int shift) |
737 | { | 691 | { |
738 | u64 nsecps = (u64)NSEC_PER_SEC << timekeeper.shift; | 692 | u64 nsecps = (u64)NSEC_PER_SEC << timekeeper.shift; |
693 | u64 raw_nsecs; | ||
739 | 694 | ||
740 | /* If the offset is smaller then a shifted interval, do nothing */ | 695 | /* If the offset is smaller then a shifted interval, do nothing */ |
741 | if (offset < timekeeper.cycle_interval<<shift) | 696 | if (offset < timekeeper.cycle_interval<<shift) |
@@ -752,12 +707,15 @@ static cycle_t logarithmic_accumulation(cycle_t offset, int shift) | |||
752 | second_overflow(); | 707 | second_overflow(); |
753 | } | 708 | } |
754 | 709 | ||
755 | /* Accumulate into raw time */ | 710 | /* Accumulate raw time */ |
756 | raw_time.tv_nsec += timekeeper.raw_interval << shift;; | 711 | raw_nsecs = timekeeper.raw_interval << shift; |
757 | while (raw_time.tv_nsec >= NSEC_PER_SEC) { | 712 | raw_nsecs += raw_time.tv_nsec; |
758 | raw_time.tv_nsec -= NSEC_PER_SEC; | 713 | if (raw_nsecs >= NSEC_PER_SEC) { |
759 | raw_time.tv_sec++; | 714 | u64 raw_secs = raw_nsecs; |
715 | raw_nsecs = do_div(raw_secs, NSEC_PER_SEC); | ||
716 | raw_time.tv_sec += raw_secs; | ||
760 | } | 717 | } |
718 | raw_time.tv_nsec = raw_nsecs; | ||
761 | 719 | ||
762 | /* Accumulate error between NTP and clock interval */ | 720 | /* Accumulate error between NTP and clock interval */ |
763 | timekeeper.ntp_error += tick_length << shift; | 721 | timekeeper.ntp_error += tick_length << shift; |
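The rewrite depends on do_div() semantics: it divides its 64-bit first argument in place and returns the remainder, so raw_secs ends up holding whole seconds while the return value is the leftover nanoseconds. A tiny illustration with invented values:

        u64 raw_secs = 2500000000ULL;           /* 2.5 seconds worth of nanoseconds */
        u32 rem = do_div(raw_secs, NSEC_PER_SEC);

        /* raw_secs == 2, rem == 500000000 */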
@@ -784,10 +742,11 @@ void update_wall_time(void) | |||
784 | return; | 742 | return; |
785 | 743 | ||
786 | clock = timekeeper.clock; | 744 | clock = timekeeper.clock; |
787 | #ifdef CONFIG_GENERIC_TIME | 745 | |
788 | offset = (clock->read(clock) - clock->cycle_last) & clock->mask; | 746 | #ifdef CONFIG_ARCH_USES_GETTIMEOFFSET |
789 | #else | ||
790 | offset = timekeeper.cycle_interval; | 747 | offset = timekeeper.cycle_interval; |
748 | #else | ||
749 | offset = (clock->read(clock) - clock->cycle_last) & clock->mask; | ||
791 | #endif | 750 | #endif |
792 | timekeeper.xtime_nsec = (s64)xtime.tv_nsec << timekeeper.shift; | 751 | timekeeper.xtime_nsec = (s64)xtime.tv_nsec << timekeeper.shift; |
793 | 752 | ||
@@ -856,7 +815,8 @@ void update_wall_time(void) | |||
856 | } | 815 | } |
857 | 816 | ||
858 | /* check to see if there is a new clocksource to use */ | 817 | /* check to see if there is a new clocksource to use */ |
859 | update_vsyscall(&xtime, timekeeper.clock, timekeeper.mult); | 818 | update_vsyscall(&xtime, &wall_to_monotonic, timekeeper.clock, |
819 | timekeeper.mult); | ||
860 | } | 820 | } |
861 | 821 | ||
862 | /** | 822 | /** |
@@ -887,7 +847,7 @@ EXPORT_SYMBOL_GPL(getboottime); | |||
887 | */ | 847 | */ |
888 | void monotonic_to_bootbased(struct timespec *ts) | 848 | void monotonic_to_bootbased(struct timespec *ts) |
889 | { | 849 | { |
890 | *ts = timespec_add_safe(*ts, total_sleep_time); | 850 | *ts = timespec_add(*ts, total_sleep_time); |
891 | } | 851 | } |
892 | EXPORT_SYMBOL_GPL(monotonic_to_bootbased); | 852 | EXPORT_SYMBOL_GPL(monotonic_to_bootbased); |
893 | 853 | ||
@@ -902,6 +862,11 @@ struct timespec __current_kernel_time(void) | |||
902 | return xtime; | 862 | return xtime; |
903 | } | 863 | } |
904 | 864 | ||
865 | struct timespec __get_wall_to_monotonic(void) | ||
866 | { | ||
867 | return wall_to_monotonic; | ||
868 | } | ||
869 | |||
905 | struct timespec current_kernel_time(void) | 870 | struct timespec current_kernel_time(void) |
906 | { | 871 | { |
907 | struct timespec now; | 872 | struct timespec now; |
diff --git a/kernel/timer.c b/kernel/timer.c index ee305c8d4e18..68a9ae7679b7 100644 --- a/kernel/timer.c +++ b/kernel/timer.c | |||
@@ -37,7 +37,7 @@ | |||
37 | #include <linux/delay.h> | 37 | #include <linux/delay.h> |
38 | #include <linux/tick.h> | 38 | #include <linux/tick.h> |
39 | #include <linux/kallsyms.h> | 39 | #include <linux/kallsyms.h> |
40 | #include <linux/perf_event.h> | 40 | #include <linux/irq_work.h> |
41 | #include <linux/sched.h> | 41 | #include <linux/sched.h> |
42 | #include <linux/slab.h> | 42 | #include <linux/slab.h> |
43 | 43 | ||
@@ -90,8 +90,13 @@ static DEFINE_PER_CPU(struct tvec_base *, tvec_bases) = &boot_tvec_bases; | |||
90 | 90 | ||
91 | /* | 91 | /* |
92 | * Note that all tvec_bases are 2 byte aligned and lower bit of | 92 | * Note that all tvec_bases are 2 byte aligned and lower bit of |
93 | * base in timer_list is guaranteed to be zero. Use the LSB for | 93 | * base in timer_list is guaranteed to be zero. Use the LSB to |
94 | * the new flag to indicate whether the timer is deferrable | 94 | * indicate whether the timer is deferrable. |
95 | * | ||
96 | * A deferrable timer will work normally when the system is busy, but | ||
97 | * will not cause a CPU to come out of idle just to service it; instead, | ||
98 | * the timer will be serviced when the CPU eventually wakes up with a | ||
99 | * subsequent non-deferrable timer. | ||
95 | */ | 100 | */ |
96 | #define TBASE_DEFERRABLE_FLAG (0x1) | 101 | #define TBASE_DEFERRABLE_FLAG (0x1) |
97 | 102 | ||
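To make the new comment concrete, a hypothetical user of a deferrable timer (shown with the pre-existing init_timer_deferrable()/mod_timer() interface rather than the on-stack helper added further down) might look like this; the callback only runs once the CPU wakes up for some other reason:

#include <linux/timer.h>
#include <linux/jiffies.h>

static struct timer_list housekeeping_timer;    /* hypothetical periodic cleanup */

static void housekeeping_fn(unsigned long data)
{
        /* do non-urgent work, then re-arm for roughly five seconds later */
        mod_timer(&housekeeping_timer, jiffies + 5 * HZ);
}

static void housekeeping_start(void)
{
        init_timer_deferrable(&housekeeping_timer);     /* sets TBASE_DEFERRABLE_FLAG */
        housekeeping_timer.function = housekeeping_fn;
        housekeeping_timer.data = 0;
        mod_timer(&housekeeping_timer, jiffies + 5 * HZ);
}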
@@ -321,6 +326,7 @@ EXPORT_SYMBOL_GPL(round_jiffies_up_relative); | |||
321 | 326 | ||
322 | /** | 327 | /** |
323 | * set_timer_slack - set the allowed slack for a timer | 328 | * set_timer_slack - set the allowed slack for a timer |
329 | * @timer: the timer to be modified | ||
324 | * @slack_hz: the amount of time (in jiffies) allowed for rounding | 330 | * @slack_hz: the amount of time (in jiffies) allowed for rounding |
325 | * | 331 | * |
326 | * Set the amount of time, in jiffies, that a certain timer has | 332 | * Set the amount of time, in jiffies, that a certain timer has |
@@ -577,6 +583,19 @@ static void __init_timer(struct timer_list *timer, | |||
577 | lockdep_init_map(&timer->lockdep_map, name, key, 0); | 583 | lockdep_init_map(&timer->lockdep_map, name, key, 0); |
578 | } | 584 | } |
579 | 585 | ||
586 | void setup_deferrable_timer_on_stack_key(struct timer_list *timer, | ||
587 | const char *name, | ||
588 | struct lock_class_key *key, | ||
589 | void (*function)(unsigned long), | ||
590 | unsigned long data) | ||
591 | { | ||
592 | timer->function = function; | ||
593 | timer->data = data; | ||
594 | init_timer_on_stack_key(timer, name, key); | ||
595 | timer_set_deferrable(timer); | ||
596 | } | ||
597 | EXPORT_SYMBOL_GPL(setup_deferrable_timer_on_stack_key); | ||
598 | |||
580 | /** | 599 | /** |
581 | * init_timer_key - initialize a timer | 600 | * init_timer_key - initialize a timer |
582 | * @timer: the timer to be initialized | 601 | * @timer: the timer to be initialized |
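A hedged usage sketch for the new export: arming an on-stack deferrable timer for low-priority housekeeping. setup_deferrable_timer_on_stack() is assumed here to be the convenience wrapper that expands to the *_key variant above; the callback and timeout are illustrative.

static void my_poll_fn(unsigned long data)
{
        /* low-priority work; fine to wait for the next non-idle tick */
}

static void poll_with_deferrable_timeout(void)
{
        struct timer_list t;

        setup_deferrable_timer_on_stack(&t, my_poll_fn, 0); /* assumed wrapper */
        mod_timer(&t, jiffies + HZ);    /* ~1s, but will not wake an idle CPU */
        /* ... wait for the event being polled for ... */
        del_timer_sync(&t);
        destroy_timer_on_stack(&t);
}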
@@ -679,12 +698,8 @@ __mod_timer(struct timer_list *timer, unsigned long expires, | |||
679 | cpu = smp_processor_id(); | 698 | cpu = smp_processor_id(); |
680 | 699 | ||
681 | #if defined(CONFIG_NO_HZ) && defined(CONFIG_SMP) | 700 | #if defined(CONFIG_NO_HZ) && defined(CONFIG_SMP) |
682 | if (!pinned && get_sysctl_timer_migration() && idle_cpu(cpu)) { | 701 | if (!pinned && get_sysctl_timer_migration() && idle_cpu(cpu)) |
683 | int preferred_cpu = get_nohz_load_balancer(); | 702 | cpu = get_nohz_timer_target(); |
684 | |||
685 | if (preferred_cpu >= 0) | ||
686 | cpu = preferred_cpu; | ||
687 | } | ||
688 | #endif | 703 | #endif |
689 | new_base = per_cpu(tvec_bases, cpu); | 704 | new_base = per_cpu(tvec_bases, cpu); |
690 | 705 | ||
@@ -1264,7 +1279,10 @@ void update_process_times(int user_tick) | |||
1264 | run_local_timers(); | 1279 | run_local_timers(); |
1265 | rcu_check_callbacks(cpu, user_tick); | 1280 | rcu_check_callbacks(cpu, user_tick); |
1266 | printk_tick(); | 1281 | printk_tick(); |
1267 | perf_event_do_pending(); | 1282 | #ifdef CONFIG_IRQ_WORK |
1283 | if (in_irq()) | ||
1284 | irq_work_run(); | ||
1285 | #endif | ||
1268 | scheduler_tick(); | 1286 | scheduler_tick(); |
1269 | run_posix_cpu_timers(p); | 1287 | run_posix_cpu_timers(p); |
1270 | } | 1288 | } |
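update_process_times() now drains pending irq_work from the hard-IRQ tick instead of calling the perf-specific perf_event_do_pending() hook. A minimal sketch of the producer side of that API, as introduced with the irq_work infrastructure (the callback body and the call site are illustrative):

#include <linux/irq_work.h>

static void my_deferred_fn(struct irq_work *work)
{
        /* runs later from a safe hard-IRQ context, e.g. the timer tick */
}

static struct irq_work my_work;        /* init_irq_work(&my_work, my_deferred_fn)
                                        * is assumed to have run at init time */

static void from_nmi_or_other_unsafe_context(void)
{
        irq_work_queue(&my_work);      /* serviced by a later irq_work_run() */
}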
@@ -1289,7 +1307,6 @@ void run_local_timers(void) | |||
1289 | { | 1307 | { |
1290 | hrtimer_run_queues(); | 1308 | hrtimer_run_queues(); |
1291 | raise_softirq(TIMER_SOFTIRQ); | 1309 | raise_softirq(TIMER_SOFTIRQ); |
1292 | softlockup_tick(); | ||
1293 | } | 1310 | } |
1294 | 1311 | ||
1295 | /* | 1312 | /* |
@@ -1750,3 +1767,25 @@ unsigned long msleep_interruptible(unsigned int msecs) | |||
1750 | } | 1767 | } |
1751 | 1768 | ||
1752 | EXPORT_SYMBOL(msleep_interruptible); | 1769 | EXPORT_SYMBOL(msleep_interruptible); |
1770 | |||
1771 | static int __sched do_usleep_range(unsigned long min, unsigned long max) | ||
1772 | { | ||
1773 | ktime_t kmin; | ||
1774 | unsigned long delta; | ||
1775 | |||
1776 | kmin = ktime_set(0, min * NSEC_PER_USEC); | ||
1777 | delta = (max - min) * NSEC_PER_USEC; | ||
1778 | return schedule_hrtimeout_range(&kmin, delta, HRTIMER_MODE_REL); | ||
1779 | } | ||
1780 | |||
1781 | /** | ||
1782 | * usleep_range - Drop in replacement for udelay where wakeup is flexible | ||
1783 | * @min: Minimum time in usecs to sleep | ||
1784 | * @max: Maximum time in usecs to sleep | ||
1785 | */ | ||
1786 | void usleep_range(unsigned long min, unsigned long max) | ||
1787 | { | ||
1788 | __set_current_state(TASK_UNINTERRUPTIBLE); | ||
1789 | do_usleep_range(min, max); | ||
1790 | } | ||
1791 | EXPORT_SYMBOL(usleep_range); | ||
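usleep_range() gives the hrtimer code a window in which to coalesce the wakeup with other timers, so sleepable code no longer needs to busy-wait in udelay() for microsecond-scale delays. A minimal sketch of the intended use, with hypothetical device and register names:

static int mydev_reset(struct mydev *dev)       /* hypothetical driver helper */
{
        writel(MYDEV_CTRL_RESET, dev->regs + MYDEV_CTRL);
        usleep_range(100, 200);         /* sleep 100-200 us instead of udelay(100) */
        return (readl(dev->regs + MYDEV_STATUS) & MYDEV_READY) ? 0 : -EIO;
}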
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 8b1797c4545b..e04b8bcdef88 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig | |||
@@ -49,6 +49,11 @@ config HAVE_SYSCALL_TRACEPOINTS | |||
49 | help | 49 | help |
50 | See Documentation/trace/ftrace-design.txt | 50 | See Documentation/trace/ftrace-design.txt |
51 | 51 | ||
52 | config HAVE_C_RECORDMCOUNT | ||
53 | bool | ||
54 | help | ||
55 | C version of recordmcount available? | ||
56 | |||
52 | config TRACER_MAX_TRACE | 57 | config TRACER_MAX_TRACE |
53 | bool | 58 | bool |
54 | 59 | ||
@@ -121,7 +126,7 @@ if FTRACE | |||
121 | config FUNCTION_TRACER | 126 | config FUNCTION_TRACER |
122 | bool "Kernel Function Tracer" | 127 | bool "Kernel Function Tracer" |
123 | depends on HAVE_FUNCTION_TRACER | 128 | depends on HAVE_FUNCTION_TRACER |
124 | select FRAME_POINTER | 129 | select FRAME_POINTER if (!ARM_UNWIND) |
125 | select KALLSYMS | 130 | select KALLSYMS |
126 | select GENERIC_TRACER | 131 | select GENERIC_TRACER |
127 | select CONTEXT_SWITCH_TRACER | 132 | select CONTEXT_SWITCH_TRACER |
@@ -153,7 +158,7 @@ config IRQSOFF_TRACER | |||
153 | bool "Interrupts-off Latency Tracer" | 158 | bool "Interrupts-off Latency Tracer" |
154 | default n | 159 | default n |
155 | depends on TRACE_IRQFLAGS_SUPPORT | 160 | depends on TRACE_IRQFLAGS_SUPPORT |
156 | depends on GENERIC_TIME | 161 | depends on !ARCH_USES_GETTIMEOFFSET |
157 | select TRACE_IRQFLAGS | 162 | select TRACE_IRQFLAGS |
158 | select GENERIC_TRACER | 163 | select GENERIC_TRACER |
159 | select TRACER_MAX_TRACE | 164 | select TRACER_MAX_TRACE |
@@ -175,7 +180,7 @@ config IRQSOFF_TRACER | |||
175 | config PREEMPT_TRACER | 180 | config PREEMPT_TRACER |
176 | bool "Preemption-off Latency Tracer" | 181 | bool "Preemption-off Latency Tracer" |
177 | default n | 182 | default n |
178 | depends on GENERIC_TIME | 183 | depends on !ARCH_USES_GETTIMEOFFSET |
179 | depends on PREEMPT | 184 | depends on PREEMPT |
180 | select GENERIC_TRACER | 185 | select GENERIC_TRACER |
181 | select TRACER_MAX_TRACE | 186 | select TRACER_MAX_TRACE |
@@ -194,15 +199,6 @@ config PREEMPT_TRACER | |||
194 | enabled. This option and the irqs-off timing option can be | 199 | enabled. This option and the irqs-off timing option can be |
195 | used together or separately.) | 200 | used together or separately.) |
196 | 201 | ||
197 | config SYSPROF_TRACER | ||
198 | bool "Sysprof Tracer" | ||
199 | depends on X86 | ||
200 | select GENERIC_TRACER | ||
201 | select CONTEXT_SWITCH_TRACER | ||
202 | help | ||
203 | This tracer provides the trace needed by the 'Sysprof' userspace | ||
204 | tool. | ||
205 | |||
206 | config SCHED_TRACER | 202 | config SCHED_TRACER |
207 | bool "Scheduling Latency Tracer" | 203 | bool "Scheduling Latency Tracer" |
208 | select GENERIC_TRACER | 204 | select GENERIC_TRACER |
@@ -229,23 +225,6 @@ config FTRACE_SYSCALLS | |||
229 | help | 225 | help |
230 | Basic tracer to catch the syscall entry and exit events. | 226 | Basic tracer to catch the syscall entry and exit events. |
231 | 227 | ||
232 | config BOOT_TRACER | ||
233 | bool "Trace boot initcalls" | ||
234 | select GENERIC_TRACER | ||
235 | select CONTEXT_SWITCH_TRACER | ||
236 | help | ||
237 | This tracer helps developers to optimize boot times: it records | ||
238 | the timings of the initcalls and traces key events and the identity | ||
239 | of tasks that can cause boot delays, such as context-switches. | ||
240 | |||
241 | Its aim is to be parsed by the scripts/bootgraph.pl tool to | ||
242 | produce pretty graphics about boot inefficiencies, giving a visual | ||
243 | representation of the delays during initcalls - but the raw | ||
244 | /debug/tracing/trace text output is readable too. | ||
245 | |||
246 | You must pass in initcall_debug and ftrace=initcall to the kernel | ||
247 | command line to enable this on bootup. | ||
248 | |||
249 | config TRACE_BRANCH_PROFILING | 228 | config TRACE_BRANCH_PROFILING |
250 | bool | 229 | bool |
251 | select GENERIC_TRACER | 230 | select GENERIC_TRACER |
@@ -325,28 +304,6 @@ config BRANCH_TRACER | |||
325 | 304 | ||
326 | Say N if unsure. | 305 | Say N if unsure. |
327 | 306 | ||
328 | config KSYM_TRACER | ||
329 | bool "Trace read and write access on kernel memory locations" | ||
330 | depends on HAVE_HW_BREAKPOINT | ||
331 | select TRACING | ||
332 | help | ||
333 | This tracer helps find read and write operations on any given kernel | ||
334 | symbol i.e. /proc/kallsyms. | ||
335 | |||
336 | config PROFILE_KSYM_TRACER | ||
337 | bool "Profile all kernel memory accesses on 'watched' variables" | ||
338 | depends on KSYM_TRACER | ||
339 | help | ||
340 | This tracer profiles kernel accesses on variables watched through the | ||
341 | ksym tracer ftrace plugin. Depending upon the hardware, all read | ||
342 | and write operations on kernel variables can be monitored for | ||
343 | accesses. | ||
344 | |||
345 | The results will be displayed in: | ||
346 | /debugfs/tracing/profile_ksym | ||
347 | |||
348 | Say N if unsure. | ||
349 | |||
350 | config STACK_TRACER | 307 | config STACK_TRACER |
351 | bool "Trace max stack" | 308 | bool "Trace max stack" |
352 | depends on HAVE_FUNCTION_TRACER | 309 | depends on HAVE_FUNCTION_TRACER |
@@ -371,37 +328,6 @@ config STACK_TRACER | |||
371 | 328 | ||
372 | Say N if unsure. | 329 | Say N if unsure. |
373 | 330 | ||
374 | config KMEMTRACE | ||
375 | bool "Trace SLAB allocations" | ||
376 | select GENERIC_TRACER | ||
377 | help | ||
378 | kmemtrace provides tracing for slab allocator functions, such as | ||
379 | kmalloc, kfree, kmem_cache_alloc, kmem_cache_free, etc. Collected | ||
380 | data is then fed to the userspace application in order to analyse | ||
381 | allocation hotspots, internal fragmentation and so on, making it | ||
382 | possible to see how well an allocator performs, as well as debug | ||
383 | and profile kernel code. | ||
384 | |||
385 | This requires an userspace application to use. See | ||
386 | Documentation/trace/kmemtrace.txt for more information. | ||
387 | |||
388 | Saying Y will make the kernel somewhat larger and slower. However, | ||
389 | if you disable kmemtrace at run-time or boot-time, the performance | ||
390 | impact is minimal (depending on the arch the kernel is built for). | ||
391 | |||
392 | If unsure, say N. | ||
393 | |||
394 | config WORKQUEUE_TRACER | ||
395 | bool "Trace workqueues" | ||
396 | select GENERIC_TRACER | ||
397 | help | ||
398 | The workqueue tracer provides some statistical information | ||
399 | about each cpu workqueue thread such as the number of the | ||
400 | works inserted and executed since their creation. It can help | ||
401 | to evaluate the amount of work each of them has to perform. | ||
402 | For example it can help a developer to decide whether he should | ||
403 | choose a per-cpu workqueue instead of a singlethreaded one. | ||
404 | |||
405 | config BLK_DEV_IO_TRACE | 331 | config BLK_DEV_IO_TRACE |
406 | bool "Support for tracing block IO actions" | 332 | bool "Support for tracing block IO actions" |
407 | depends on SYSFS | 333 | depends on SYSFS |
diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile index ffb1a5b0550e..53f338190b26 100644 --- a/kernel/trace/Makefile +++ b/kernel/trace/Makefile | |||
@@ -30,7 +30,6 @@ obj-$(CONFIG_TRACING) += trace_output.o | |||
30 | obj-$(CONFIG_TRACING) += trace_stat.o | 30 | obj-$(CONFIG_TRACING) += trace_stat.o |
31 | obj-$(CONFIG_TRACING) += trace_printk.o | 31 | obj-$(CONFIG_TRACING) += trace_printk.o |
32 | obj-$(CONFIG_CONTEXT_SWITCH_TRACER) += trace_sched_switch.o | 32 | obj-$(CONFIG_CONTEXT_SWITCH_TRACER) += trace_sched_switch.o |
33 | obj-$(CONFIG_SYSPROF_TRACER) += trace_sysprof.o | ||
34 | obj-$(CONFIG_FUNCTION_TRACER) += trace_functions.o | 33 | obj-$(CONFIG_FUNCTION_TRACER) += trace_functions.o |
35 | obj-$(CONFIG_IRQSOFF_TRACER) += trace_irqsoff.o | 34 | obj-$(CONFIG_IRQSOFF_TRACER) += trace_irqsoff.o |
36 | obj-$(CONFIG_PREEMPT_TRACER) += trace_irqsoff.o | 35 | obj-$(CONFIG_PREEMPT_TRACER) += trace_irqsoff.o |
@@ -38,10 +37,8 @@ obj-$(CONFIG_SCHED_TRACER) += trace_sched_wakeup.o | |||
38 | obj-$(CONFIG_NOP_TRACER) += trace_nop.o | 37 | obj-$(CONFIG_NOP_TRACER) += trace_nop.o |
39 | obj-$(CONFIG_STACK_TRACER) += trace_stack.o | 38 | obj-$(CONFIG_STACK_TRACER) += trace_stack.o |
40 | obj-$(CONFIG_MMIOTRACE) += trace_mmiotrace.o | 39 | obj-$(CONFIG_MMIOTRACE) += trace_mmiotrace.o |
41 | obj-$(CONFIG_BOOT_TRACER) += trace_boot.o | ||
42 | obj-$(CONFIG_FUNCTION_GRAPH_TRACER) += trace_functions_graph.o | 40 | obj-$(CONFIG_FUNCTION_GRAPH_TRACER) += trace_functions_graph.o |
43 | obj-$(CONFIG_TRACE_BRANCH_PROFILING) += trace_branch.o | 41 | obj-$(CONFIG_TRACE_BRANCH_PROFILING) += trace_branch.o |
44 | obj-$(CONFIG_KMEMTRACE) += kmemtrace.o | ||
45 | obj-$(CONFIG_WORKQUEUE_TRACER) += trace_workqueue.o | 42 | obj-$(CONFIG_WORKQUEUE_TRACER) += trace_workqueue.o |
46 | obj-$(CONFIG_BLK_DEV_IO_TRACE) += blktrace.o | 43 | obj-$(CONFIG_BLK_DEV_IO_TRACE) += blktrace.o |
47 | ifeq ($(CONFIG_BLOCK),y) | 44 | ifeq ($(CONFIG_BLOCK),y) |
@@ -55,7 +52,9 @@ obj-$(CONFIG_EVENT_TRACING) += trace_event_perf.o | |||
55 | endif | 52 | endif |
56 | obj-$(CONFIG_EVENT_TRACING) += trace_events_filter.o | 53 | obj-$(CONFIG_EVENT_TRACING) += trace_events_filter.o |
57 | obj-$(CONFIG_KPROBE_EVENT) += trace_kprobe.o | 54 | obj-$(CONFIG_KPROBE_EVENT) += trace_kprobe.o |
58 | obj-$(CONFIG_KSYM_TRACER) += trace_ksym.o | ||
59 | obj-$(CONFIG_EVENT_TRACING) += power-traces.o | 55 | obj-$(CONFIG_EVENT_TRACING) += power-traces.o |
56 | ifeq ($(CONFIG_TRACING),y) | ||
57 | obj-$(CONFIG_KGDB_KDB) += trace_kdb.o | ||
58 | endif | ||
60 | 59 | ||
61 | libftrace-y := ftrace.o | 60 | libftrace-y := ftrace.o |
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index 638711c17504..bc251ed66724 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c | |||
@@ -23,7 +23,6 @@ | |||
23 | #include <linux/mutex.h> | 23 | #include <linux/mutex.h> |
24 | #include <linux/slab.h> | 24 | #include <linux/slab.h> |
25 | #include <linux/debugfs.h> | 25 | #include <linux/debugfs.h> |
26 | #include <linux/smp_lock.h> | ||
27 | #include <linux/time.h> | 26 | #include <linux/time.h> |
28 | #include <linux/uaccess.h> | 27 | #include <linux/uaccess.h> |
29 | 28 | ||
@@ -169,9 +168,12 @@ static int act_log_check(struct blk_trace *bt, u32 what, sector_t sector, | |||
169 | static const u32 ddir_act[2] = { BLK_TC_ACT(BLK_TC_READ), | 168 | static const u32 ddir_act[2] = { BLK_TC_ACT(BLK_TC_READ), |
170 | BLK_TC_ACT(BLK_TC_WRITE) }; | 169 | BLK_TC_ACT(BLK_TC_WRITE) }; |
171 | 170 | ||
171 | #define BLK_TC_HARDBARRIER BLK_TC_BARRIER | ||
172 | #define BLK_TC_RAHEAD BLK_TC_AHEAD | ||
173 | |||
172 | /* The ilog2() calls fall out because they're constant */ | 174 | /* The ilog2() calls fall out because they're constant */ |
173 | #define MASK_TC_BIT(rw, __name) ((rw & (1 << BIO_RW_ ## __name)) << \ | 175 | #define MASK_TC_BIT(rw, __name) ((rw & REQ_ ## __name) << \ |
174 | (ilog2(BLK_TC_ ## __name) + BLK_TC_SHIFT - BIO_RW_ ## __name)) | 176 | (ilog2(BLK_TC_ ## __name) + BLK_TC_SHIFT - __REQ_ ## __name)) |
175 | 177 | ||
176 | /* | 178 | /* |
177 | * The worker for the various blk_add_trace*() types. Fills out a | 179 | * The worker for the various blk_add_trace*() types. Fills out a |
@@ -194,9 +196,9 @@ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes, | |||
194 | return; | 196 | return; |
195 | 197 | ||
196 | what |= ddir_act[rw & WRITE]; | 198 | what |= ddir_act[rw & WRITE]; |
197 | what |= MASK_TC_BIT(rw, BARRIER); | 199 | what |= MASK_TC_BIT(rw, HARDBARRIER); |
198 | what |= MASK_TC_BIT(rw, SYNCIO); | 200 | what |= MASK_TC_BIT(rw, SYNC); |
199 | what |= MASK_TC_BIT(rw, AHEAD); | 201 | what |= MASK_TC_BIT(rw, RAHEAD); |
200 | what |= MASK_TC_BIT(rw, META); | 202 | what |= MASK_TC_BIT(rw, META); |
201 | what |= MASK_TC_BIT(rw, DISCARD); | 203 | what |= MASK_TC_BIT(rw, DISCARD); |
202 | 204 | ||
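The rewritten MASK_TC_BIT() now keys off REQ_* request flags instead of BIO_RW_* bio bits; the shift simply moves the single flag bit from its request-flag position up to the matching BLK_TC_* category bit above BLK_TC_SHIFT. Purely mechanically, the expansion for one of the uses above is:

        /* MASK_TC_BIT(rw, SYNC) expands to: */
        (rw & REQ_SYNC) << (ilog2(BLK_TC_SYNC) + BLK_TC_SHIFT - __REQ_SYNC)
        /* i.e. isolate the REQ_SYNC bit of rw, then shift it so it lands on
         * BLK_TC_SYNC's bit inside the action mask (the exact bit numbers
         * vary per flag and are intentionally not spelled out here). */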
@@ -323,6 +325,7 @@ static const struct file_operations blk_dropped_fops = { | |||
323 | .owner = THIS_MODULE, | 325 | .owner = THIS_MODULE, |
324 | .open = blk_dropped_open, | 326 | .open = blk_dropped_open, |
325 | .read = blk_dropped_read, | 327 | .read = blk_dropped_read, |
328 | .llseek = default_llseek, | ||
326 | }; | 329 | }; |
327 | 330 | ||
328 | static int blk_msg_open(struct inode *inode, struct file *filp) | 331 | static int blk_msg_open(struct inode *inode, struct file *filp) |
@@ -362,6 +365,7 @@ static const struct file_operations blk_msg_fops = { | |||
362 | .owner = THIS_MODULE, | 365 | .owner = THIS_MODULE, |
363 | .open = blk_msg_open, | 366 | .open = blk_msg_open, |
364 | .write = blk_msg_write, | 367 | .write = blk_msg_write, |
368 | .llseek = noop_llseek, | ||
365 | }; | 369 | }; |
366 | 370 | ||
367 | /* | 371 | /* |
@@ -549,6 +553,41 @@ int blk_trace_setup(struct request_queue *q, char *name, dev_t dev, | |||
549 | } | 553 | } |
550 | EXPORT_SYMBOL_GPL(blk_trace_setup); | 554 | EXPORT_SYMBOL_GPL(blk_trace_setup); |
551 | 555 | ||
556 | #if defined(CONFIG_COMPAT) && defined(CONFIG_X86_64) | ||
557 | static int compat_blk_trace_setup(struct request_queue *q, char *name, | ||
558 | dev_t dev, struct block_device *bdev, | ||
559 | char __user *arg) | ||
560 | { | ||
561 | struct blk_user_trace_setup buts; | ||
562 | struct compat_blk_user_trace_setup cbuts; | ||
563 | int ret; | ||
564 | |||
565 | if (copy_from_user(&cbuts, arg, sizeof(cbuts))) | ||
566 | return -EFAULT; | ||
567 | |||
568 | buts = (struct blk_user_trace_setup) { | ||
569 | .act_mask = cbuts.act_mask, | ||
570 | .buf_size = cbuts.buf_size, | ||
571 | .buf_nr = cbuts.buf_nr, | ||
572 | .start_lba = cbuts.start_lba, | ||
573 | .end_lba = cbuts.end_lba, | ||
574 | .pid = cbuts.pid, | ||
575 | }; | ||
576 | memcpy(&buts.name, &cbuts.name, 32); | ||
577 | |||
578 | ret = do_blk_trace_setup(q, name, dev, bdev, &buts); | ||
579 | if (ret) | ||
580 | return ret; | ||
581 | |||
582 | if (copy_to_user(arg, &buts.name, 32)) { | ||
583 | blk_trace_remove(q); | ||
584 | return -EFAULT; | ||
585 | } | ||
586 | |||
587 | return 0; | ||
588 | } | ||
589 | #endif | ||
590 | |||
552 | int blk_trace_startstop(struct request_queue *q, int start) | 591 | int blk_trace_startstop(struct request_queue *q, int start) |
553 | { | 592 | { |
554 | int ret; | 593 | int ret; |
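compat_blk_trace_setup() exists because struct blk_user_trace_setup carries 64-bit fields whose size and alignment differ between 32-bit and 64-bit ABIs, so a 32-bit BLKTRACESETUP issued by compat userspace must be widened by hand. The conversion above implies a 32-bit layout roughly like the following; the field names come from the code, the types are an assumption:

struct compat_blk_user_trace_setup {    /* hedged reconstruction, not verbatim */
        char name[32];
        u16 act_mask;
        u32 buf_size;
        u32 buf_nr;
        compat_u64 start_lba;
        compat_u64 end_lba;
        u32 pid;
};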
@@ -608,6 +647,12 @@ int blk_trace_ioctl(struct block_device *bdev, unsigned cmd, char __user *arg) | |||
608 | bdevname(bdev, b); | 647 | bdevname(bdev, b); |
609 | ret = blk_trace_setup(q, b, bdev->bd_dev, bdev, arg); | 648 | ret = blk_trace_setup(q, b, bdev->bd_dev, bdev, arg); |
610 | break; | 649 | break; |
650 | #if defined(CONFIG_COMPAT) && defined(CONFIG_X86_64) | ||
651 | case BLKTRACESETUP32: | ||
652 | bdevname(bdev, b); | ||
653 | ret = compat_blk_trace_setup(q, b, bdev->bd_dev, bdev, arg); | ||
654 | break; | ||
655 | #endif | ||
611 | case BLKTRACESTART: | 656 | case BLKTRACESTART: |
612 | start = 1; | 657 | start = 1; |
613 | case BLKTRACESTOP: | 658 | case BLKTRACESTOP: |
@@ -661,10 +706,13 @@ static void blk_add_trace_rq(struct request_queue *q, struct request *rq, | |||
661 | if (likely(!bt)) | 706 | if (likely(!bt)) |
662 | return; | 707 | return; |
663 | 708 | ||
664 | if (blk_discard_rq(rq)) | 709 | if (rq->cmd_flags & REQ_DISCARD) |
665 | rw |= (1 << BIO_RW_DISCARD); | 710 | rw |= REQ_DISCARD; |
711 | |||
712 | if (rq->cmd_flags & REQ_SECURE) | ||
713 | rw |= REQ_SECURE; | ||
666 | 714 | ||
667 | if (blk_pc_request(rq)) { | 715 | if (rq->cmd_type == REQ_TYPE_BLOCK_PC) { |
668 | what |= BLK_TC_ACT(BLK_TC_PC); | 716 | what |= BLK_TC_ACT(BLK_TC_PC); |
669 | __blk_add_trace(bt, 0, blk_rq_bytes(rq), rw, | 717 | __blk_add_trace(bt, 0, blk_rq_bytes(rq), rw, |
670 | what, rq->errors, rq->cmd_len, rq->cmd); | 718 | what, rq->errors, rq->cmd_len, rq->cmd); |
@@ -925,7 +973,7 @@ void blk_add_driver_data(struct request_queue *q, | |||
925 | if (likely(!bt)) | 973 | if (likely(!bt)) |
926 | return; | 974 | return; |
927 | 975 | ||
928 | if (blk_pc_request(rq)) | 976 | if (rq->cmd_type == REQ_TYPE_BLOCK_PC) |
929 | __blk_add_trace(bt, 0, blk_rq_bytes(rq), 0, | 977 | __blk_add_trace(bt, 0, blk_rq_bytes(rq), 0, |
930 | BLK_TA_DRV_DATA, rq->errors, len, data); | 978 | BLK_TA_DRV_DATA, rq->errors, len, data); |
931 | else | 979 | else |
@@ -1603,10 +1651,9 @@ static ssize_t sysfs_blk_trace_attr_show(struct device *dev, | |||
1603 | struct block_device *bdev; | 1651 | struct block_device *bdev; |
1604 | ssize_t ret = -ENXIO; | 1652 | ssize_t ret = -ENXIO; |
1605 | 1653 | ||
1606 | lock_kernel(); | ||
1607 | bdev = bdget(part_devt(p)); | 1654 | bdev = bdget(part_devt(p)); |
1608 | if (bdev == NULL) | 1655 | if (bdev == NULL) |
1609 | goto out_unlock_kernel; | 1656 | goto out; |
1610 | 1657 | ||
1611 | q = blk_trace_get_queue(bdev); | 1658 | q = blk_trace_get_queue(bdev); |
1612 | if (q == NULL) | 1659 | if (q == NULL) |
@@ -1634,8 +1681,7 @@ out_unlock_bdev: | |||
1634 | mutex_unlock(&bdev->bd_mutex); | 1681 | mutex_unlock(&bdev->bd_mutex); |
1635 | out_bdput: | 1682 | out_bdput: |
1636 | bdput(bdev); | 1683 | bdput(bdev); |
1637 | out_unlock_kernel: | 1684 | out: |
1638 | unlock_kernel(); | ||
1639 | return ret; | 1685 | return ret; |
1640 | } | 1686 | } |
1641 | 1687 | ||
@@ -1665,11 +1711,10 @@ static ssize_t sysfs_blk_trace_attr_store(struct device *dev, | |||
1665 | 1711 | ||
1666 | ret = -ENXIO; | 1712 | ret = -ENXIO; |
1667 | 1713 | ||
1668 | lock_kernel(); | ||
1669 | p = dev_to_part(dev); | 1714 | p = dev_to_part(dev); |
1670 | bdev = bdget(part_devt(p)); | 1715 | bdev = bdget(part_devt(p)); |
1671 | if (bdev == NULL) | 1716 | if (bdev == NULL) |
1672 | goto out_unlock_kernel; | 1717 | goto out; |
1673 | 1718 | ||
1674 | q = blk_trace_get_queue(bdev); | 1719 | q = blk_trace_get_queue(bdev); |
1675 | if (q == NULL) | 1720 | if (q == NULL) |
@@ -1704,8 +1749,6 @@ out_unlock_bdev: | |||
1704 | mutex_unlock(&bdev->bd_mutex); | 1749 | mutex_unlock(&bdev->bd_mutex); |
1705 | out_bdput: | 1750 | out_bdput: |
1706 | bdput(bdev); | 1751 | bdput(bdev); |
1707 | out_unlock_kernel: | ||
1708 | unlock_kernel(); | ||
1709 | out: | 1752 | out: |
1710 | return ret ? ret : count; | 1753 | return ret ? ret : count; |
1711 | } | 1754 | } |
@@ -1730,7 +1773,7 @@ void blk_dump_cmd(char *buf, struct request *rq) | |||
1730 | int len = rq->cmd_len; | 1773 | int len = rq->cmd_len; |
1731 | unsigned char *cmd = rq->cmd; | 1774 | unsigned char *cmd = rq->cmd; |
1732 | 1775 | ||
1733 | if (!blk_pc_request(rq)) { | 1776 | if (rq->cmd_type != REQ_TYPE_BLOCK_PC) { |
1734 | buf[0] = '\0'; | 1777 | buf[0] = '\0'; |
1735 | return; | 1778 | return; |
1736 | } | 1779 | } |
@@ -1755,21 +1798,23 @@ void blk_fill_rwbs(char *rwbs, u32 rw, int bytes) | |||
1755 | 1798 | ||
1756 | if (rw & WRITE) | 1799 | if (rw & WRITE) |
1757 | rwbs[i++] = 'W'; | 1800 | rwbs[i++] = 'W'; |
1758 | else if (rw & 1 << BIO_RW_DISCARD) | 1801 | else if (rw & REQ_DISCARD) |
1759 | rwbs[i++] = 'D'; | 1802 | rwbs[i++] = 'D'; |
1760 | else if (bytes) | 1803 | else if (bytes) |
1761 | rwbs[i++] = 'R'; | 1804 | rwbs[i++] = 'R'; |
1762 | else | 1805 | else |
1763 | rwbs[i++] = 'N'; | 1806 | rwbs[i++] = 'N'; |
1764 | 1807 | ||
1765 | if (rw & 1 << BIO_RW_AHEAD) | 1808 | if (rw & REQ_RAHEAD) |
1766 | rwbs[i++] = 'A'; | 1809 | rwbs[i++] = 'A'; |
1767 | if (rw & 1 << BIO_RW_BARRIER) | 1810 | if (rw & REQ_HARDBARRIER) |
1768 | rwbs[i++] = 'B'; | 1811 | rwbs[i++] = 'B'; |
1769 | if (rw & 1 << BIO_RW_SYNCIO) | 1812 | if (rw & REQ_SYNC) |
1770 | rwbs[i++] = 'S'; | 1813 | rwbs[i++] = 'S'; |
1771 | if (rw & 1 << BIO_RW_META) | 1814 | if (rw & REQ_META) |
1772 | rwbs[i++] = 'M'; | 1815 | rwbs[i++] = 'M'; |
1816 | if (rw & REQ_SECURE) | ||
1817 | rwbs[i++] = 'E'; | ||
1773 | 1818 | ||
1774 | rwbs[i] = '\0'; | 1819 | rwbs[i] = '\0'; |
1775 | } | 1820 | } |
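A worked example of the updated blk_fill_rwbs(): for a synchronous metadata write the tests above fire in order W, then S, then M, so the caller gets back "WSM" ('E' is appended only for REQ_SECURE requests):

        char rwbs[8];

        blk_fill_rwbs(rwbs, WRITE | REQ_SYNC | REQ_META, 4096);
        /* rwbs now holds "WSM": Write, Sync, Meta */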
@@ -1779,8 +1824,11 @@ void blk_fill_rwbs_rq(char *rwbs, struct request *rq) | |||
1779 | int rw = rq->cmd_flags & 0x03; | 1824 | int rw = rq->cmd_flags & 0x03; |
1780 | int bytes; | 1825 | int bytes; |
1781 | 1826 | ||
1782 | if (blk_discard_rq(rq)) | 1827 | if (rq->cmd_flags & REQ_DISCARD) |
1783 | rw |= (1 << BIO_RW_DISCARD); | 1828 | rw |= REQ_DISCARD; |
1829 | |||
1830 | if (rq->cmd_flags & REQ_SECURE) | ||
1831 | rw |= REQ_SECURE; | ||
1784 | 1832 | ||
1785 | bytes = blk_rq_bytes(rq); | 1833 | bytes = blk_rq_bytes(rq); |
1786 | 1834 | ||
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 6d2cb14f9449..f3dadae83883 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c | |||
@@ -381,12 +381,19 @@ static int function_stat_show(struct seq_file *m, void *v) | |||
381 | { | 381 | { |
382 | struct ftrace_profile *rec = v; | 382 | struct ftrace_profile *rec = v; |
383 | char str[KSYM_SYMBOL_LEN]; | 383 | char str[KSYM_SYMBOL_LEN]; |
384 | int ret = 0; | ||
384 | #ifdef CONFIG_FUNCTION_GRAPH_TRACER | 385 | #ifdef CONFIG_FUNCTION_GRAPH_TRACER |
385 | static DEFINE_MUTEX(mutex); | ||
386 | static struct trace_seq s; | 386 | static struct trace_seq s; |
387 | unsigned long long avg; | 387 | unsigned long long avg; |
388 | unsigned long long stddev; | 388 | unsigned long long stddev; |
389 | #endif | 389 | #endif |
390 | mutex_lock(&ftrace_profile_lock); | ||
391 | |||
392 | /* we raced with function_profile_reset() */ | ||
393 | if (unlikely(rec->counter == 0)) { | ||
394 | ret = -EBUSY; | ||
395 | goto out; | ||
396 | } | ||
390 | 397 | ||
391 | kallsyms_lookup(rec->ip, NULL, NULL, NULL, str); | 398 | kallsyms_lookup(rec->ip, NULL, NULL, NULL, str); |
392 | seq_printf(m, " %-30.30s %10lu", str, rec->counter); | 399 | seq_printf(m, " %-30.30s %10lu", str, rec->counter); |
@@ -408,7 +415,6 @@ static int function_stat_show(struct seq_file *m, void *v) | |||
408 | do_div(stddev, (rec->counter - 1) * 1000); | 415 | do_div(stddev, (rec->counter - 1) * 1000); |
409 | } | 416 | } |
410 | 417 | ||
411 | mutex_lock(&mutex); | ||
412 | trace_seq_init(&s); | 418 | trace_seq_init(&s); |
413 | trace_print_graph_duration(rec->time, &s); | 419 | trace_print_graph_duration(rec->time, &s); |
414 | trace_seq_puts(&s, " "); | 420 | trace_seq_puts(&s, " "); |
@@ -416,11 +422,12 @@ static int function_stat_show(struct seq_file *m, void *v) | |||
416 | trace_seq_puts(&s, " "); | 422 | trace_seq_puts(&s, " "); |
417 | trace_print_graph_duration(stddev, &s); | 423 | trace_print_graph_duration(stddev, &s); |
418 | trace_print_seq(m, &s); | 424 | trace_print_seq(m, &s); |
419 | mutex_unlock(&mutex); | ||
420 | #endif | 425 | #endif |
421 | seq_putc(m, '\n'); | 426 | seq_putc(m, '\n'); |
427 | out: | ||
428 | mutex_unlock(&ftrace_profile_lock); | ||
422 | 429 | ||
423 | return 0; | 430 | return ret; |
424 | } | 431 | } |
425 | 432 | ||
426 | static void ftrace_profile_reset(struct ftrace_profile_stat *stat) | 433 | static void ftrace_profile_reset(struct ftrace_profile_stat *stat) |
@@ -793,6 +800,7 @@ static const struct file_operations ftrace_profile_fops = { | |||
793 | .open = tracing_open_generic, | 800 | .open = tracing_open_generic, |
794 | .read = ftrace_profile_read, | 801 | .read = ftrace_profile_read, |
795 | .write = ftrace_profile_write, | 802 | .write = ftrace_profile_write, |
803 | .llseek = default_llseek, | ||
796 | }; | 804 | }; |
797 | 805 | ||
798 | /* used to initialize the real stat files */ | 806 | /* used to initialize the real stat files */ |
@@ -877,10 +885,8 @@ enum { | |||
877 | FTRACE_ENABLE_CALLS = (1 << 0), | 885 | FTRACE_ENABLE_CALLS = (1 << 0), |
878 | FTRACE_DISABLE_CALLS = (1 << 1), | 886 | FTRACE_DISABLE_CALLS = (1 << 1), |
879 | FTRACE_UPDATE_TRACE_FUNC = (1 << 2), | 887 | FTRACE_UPDATE_TRACE_FUNC = (1 << 2), |
880 | FTRACE_ENABLE_MCOUNT = (1 << 3), | 888 | FTRACE_START_FUNC_RET = (1 << 3), |
881 | FTRACE_DISABLE_MCOUNT = (1 << 4), | 889 | FTRACE_STOP_FUNC_RET = (1 << 4), |
882 | FTRACE_START_FUNC_RET = (1 << 5), | ||
883 | FTRACE_STOP_FUNC_RET = (1 << 6), | ||
884 | }; | 890 | }; |
885 | 891 | ||
886 | static int ftrace_filtered; | 892 | static int ftrace_filtered; |
@@ -1219,8 +1225,6 @@ static void ftrace_shutdown(int command) | |||
1219 | 1225 | ||
1220 | static void ftrace_startup_sysctl(void) | 1226 | static void ftrace_startup_sysctl(void) |
1221 | { | 1227 | { |
1222 | int command = FTRACE_ENABLE_MCOUNT; | ||
1223 | |||
1224 | if (unlikely(ftrace_disabled)) | 1228 | if (unlikely(ftrace_disabled)) |
1225 | return; | 1229 | return; |
1226 | 1230 | ||
@@ -1228,23 +1232,17 @@ static void ftrace_startup_sysctl(void) | |||
1228 | saved_ftrace_func = NULL; | 1232 | saved_ftrace_func = NULL; |
1229 | /* ftrace_start_up is true if we want ftrace running */ | 1233 | /* ftrace_start_up is true if we want ftrace running */ |
1230 | if (ftrace_start_up) | 1234 | if (ftrace_start_up) |
1231 | command |= FTRACE_ENABLE_CALLS; | 1235 | ftrace_run_update_code(FTRACE_ENABLE_CALLS); |
1232 | |||
1233 | ftrace_run_update_code(command); | ||
1234 | } | 1236 | } |
1235 | 1237 | ||
1236 | static void ftrace_shutdown_sysctl(void) | 1238 | static void ftrace_shutdown_sysctl(void) |
1237 | { | 1239 | { |
1238 | int command = FTRACE_DISABLE_MCOUNT; | ||
1239 | |||
1240 | if (unlikely(ftrace_disabled)) | 1240 | if (unlikely(ftrace_disabled)) |
1241 | return; | 1241 | return; |
1242 | 1242 | ||
1243 | /* ftrace_start_up is true if ftrace is running */ | 1243 | /* ftrace_start_up is true if ftrace is running */ |
1244 | if (ftrace_start_up) | 1244 | if (ftrace_start_up) |
1245 | command |= FTRACE_DISABLE_CALLS; | 1245 | ftrace_run_update_code(FTRACE_DISABLE_CALLS); |
1246 | |||
1247 | ftrace_run_update_code(command); | ||
1248 | } | 1246 | } |
1249 | 1247 | ||
1250 | static cycle_t ftrace_update_time; | 1248 | static cycle_t ftrace_update_time; |
@@ -1361,24 +1359,29 @@ enum { | |||
1361 | #define FTRACE_BUFF_MAX (KSYM_SYMBOL_LEN+4) /* room for wildcards */ | 1359 | #define FTRACE_BUFF_MAX (KSYM_SYMBOL_LEN+4) /* room for wildcards */ |
1362 | 1360 | ||
1363 | struct ftrace_iterator { | 1361 | struct ftrace_iterator { |
1364 | struct ftrace_page *pg; | 1362 | loff_t pos; |
1365 | int hidx; | 1363 | loff_t func_pos; |
1366 | int idx; | 1364 | struct ftrace_page *pg; |
1367 | unsigned flags; | 1365 | struct dyn_ftrace *func; |
1368 | struct trace_parser parser; | 1366 | struct ftrace_func_probe *probe; |
1367 | struct trace_parser parser; | ||
1368 | int hidx; | ||
1369 | int idx; | ||
1370 | unsigned flags; | ||
1369 | }; | 1371 | }; |
1370 | 1372 | ||
1371 | static void * | 1373 | static void * |
1372 | t_hash_next(struct seq_file *m, void *v, loff_t *pos) | 1374 | t_hash_next(struct seq_file *m, loff_t *pos) |
1373 | { | 1375 | { |
1374 | struct ftrace_iterator *iter = m->private; | 1376 | struct ftrace_iterator *iter = m->private; |
1375 | struct hlist_node *hnd = v; | 1377 | struct hlist_node *hnd = NULL; |
1376 | struct hlist_head *hhd; | 1378 | struct hlist_head *hhd; |
1377 | 1379 | ||
1378 | WARN_ON(!(iter->flags & FTRACE_ITER_HASH)); | ||
1379 | |||
1380 | (*pos)++; | 1380 | (*pos)++; |
1381 | iter->pos = *pos; | ||
1381 | 1382 | ||
1383 | if (iter->probe) | ||
1384 | hnd = &iter->probe->node; | ||
1382 | retry: | 1385 | retry: |
1383 | if (iter->hidx >= FTRACE_FUNC_HASHSIZE) | 1386 | if (iter->hidx >= FTRACE_FUNC_HASHSIZE) |
1384 | return NULL; | 1387 | return NULL; |
@@ -1401,7 +1404,12 @@ t_hash_next(struct seq_file *m, void *v, loff_t *pos) | |||
1401 | } | 1404 | } |
1402 | } | 1405 | } |
1403 | 1406 | ||
1404 | return hnd; | 1407 | if (WARN_ON_ONCE(!hnd)) |
1408 | return NULL; | ||
1409 | |||
1410 | iter->probe = hlist_entry(hnd, struct ftrace_func_probe, node); | ||
1411 | |||
1412 | return iter; | ||
1405 | } | 1413 | } |
1406 | 1414 | ||
1407 | static void *t_hash_start(struct seq_file *m, loff_t *pos) | 1415 | static void *t_hash_start(struct seq_file *m, loff_t *pos) |
@@ -1410,26 +1418,32 @@ static void *t_hash_start(struct seq_file *m, loff_t *pos) | |||
1410 | void *p = NULL; | 1418 | void *p = NULL; |
1411 | loff_t l; | 1419 | loff_t l; |
1412 | 1420 | ||
1413 | if (!(iter->flags & FTRACE_ITER_HASH)) | 1421 | if (iter->func_pos > *pos) |
1414 | *pos = 0; | 1422 | return NULL; |
1415 | |||
1416 | iter->flags |= FTRACE_ITER_HASH; | ||
1417 | 1423 | ||
1418 | iter->hidx = 0; | 1424 | iter->hidx = 0; |
1419 | for (l = 0; l <= *pos; ) { | 1425 | for (l = 0; l <= (*pos - iter->func_pos); ) { |
1420 | p = t_hash_next(m, p, &l); | 1426 | p = t_hash_next(m, &l); |
1421 | if (!p) | 1427 | if (!p) |
1422 | break; | 1428 | break; |
1423 | } | 1429 | } |
1424 | return p; | 1430 | if (!p) |
1431 | return NULL; | ||
1432 | |||
1433 | /* Only set this if we have an item */ | ||
1434 | iter->flags |= FTRACE_ITER_HASH; | ||
1435 | |||
1436 | return iter; | ||
1425 | } | 1437 | } |
1426 | 1438 | ||
1427 | static int t_hash_show(struct seq_file *m, void *v) | 1439 | static int |
1440 | t_hash_show(struct seq_file *m, struct ftrace_iterator *iter) | ||
1428 | { | 1441 | { |
1429 | struct ftrace_func_probe *rec; | 1442 | struct ftrace_func_probe *rec; |
1430 | struct hlist_node *hnd = v; | ||
1431 | 1443 | ||
1432 | rec = hlist_entry(hnd, struct ftrace_func_probe, node); | 1444 | rec = iter->probe; |
1445 | if (WARN_ON_ONCE(!rec)) | ||
1446 | return -EIO; | ||
1433 | 1447 | ||
1434 | if (rec->ops->print) | 1448 | if (rec->ops->print) |
1435 | return rec->ops->print(m, rec->ip, rec->ops, rec->data); | 1449 | return rec->ops->print(m, rec->ip, rec->ops, rec->data); |
@@ -1450,12 +1464,13 @@ t_next(struct seq_file *m, void *v, loff_t *pos) | |||
1450 | struct dyn_ftrace *rec = NULL; | 1464 | struct dyn_ftrace *rec = NULL; |
1451 | 1465 | ||
1452 | if (iter->flags & FTRACE_ITER_HASH) | 1466 | if (iter->flags & FTRACE_ITER_HASH) |
1453 | return t_hash_next(m, v, pos); | 1467 | return t_hash_next(m, pos); |
1454 | 1468 | ||
1455 | (*pos)++; | 1469 | (*pos)++; |
1470 | iter->pos = *pos; | ||
1456 | 1471 | ||
1457 | if (iter->flags & FTRACE_ITER_PRINTALL) | 1472 | if (iter->flags & FTRACE_ITER_PRINTALL) |
1458 | return NULL; | 1473 | return t_hash_start(m, pos); |
1459 | 1474 | ||
1460 | retry: | 1475 | retry: |
1461 | if (iter->idx >= iter->pg->index) { | 1476 | if (iter->idx >= iter->pg->index) { |
@@ -1484,7 +1499,20 @@ t_next(struct seq_file *m, void *v, loff_t *pos) | |||
1484 | } | 1499 | } |
1485 | } | 1500 | } |
1486 | 1501 | ||
1487 | return rec; | 1502 | if (!rec) |
1503 | return t_hash_start(m, pos); | ||
1504 | |||
1505 | iter->func_pos = *pos; | ||
1506 | iter->func = rec; | ||
1507 | |||
1508 | return iter; | ||
1509 | } | ||
1510 | |||
1511 | static void reset_iter_read(struct ftrace_iterator *iter) | ||
1512 | { | ||
1513 | iter->pos = 0; | ||
1514 | iter->func_pos = 0; | ||
1515 | iter->flags &= ~(FTRACE_ITER_PRINTALL & FTRACE_ITER_HASH); | ||
1488 | } | 1516 | } |
1489 | 1517 | ||
1490 | static void *t_start(struct seq_file *m, loff_t *pos) | 1518 | static void *t_start(struct seq_file *m, loff_t *pos) |
@@ -1495,6 +1523,12 @@ static void *t_start(struct seq_file *m, loff_t *pos) | |||
1495 | 1523 | ||
1496 | mutex_lock(&ftrace_lock); | 1524 | mutex_lock(&ftrace_lock); |
1497 | /* | 1525 | /* |
1526 | * If an lseek was done, then reset and start from beginning. | ||
1527 | */ | ||
1528 | if (*pos < iter->pos) | ||
1529 | reset_iter_read(iter); | ||
1530 | |||
1531 | /* | ||
1498 | * For set_ftrace_filter reading, if we have the filter | 1532 | * For set_ftrace_filter reading, if we have the filter |
1499 | * off, we can short cut and just print out that all | 1533 | * off, we can short cut and just print out that all |
1500 | * functions are enabled. | 1534 | * functions are enabled. |
@@ -1503,12 +1537,19 @@ static void *t_start(struct seq_file *m, loff_t *pos) | |||
1503 | if (*pos > 0) | 1537 | if (*pos > 0) |
1504 | return t_hash_start(m, pos); | 1538 | return t_hash_start(m, pos); |
1505 | iter->flags |= FTRACE_ITER_PRINTALL; | 1539 | iter->flags |= FTRACE_ITER_PRINTALL; |
1540 | /* reset in case of seek/pread */ | ||
1541 | iter->flags &= ~FTRACE_ITER_HASH; | ||
1506 | return iter; | 1542 | return iter; |
1507 | } | 1543 | } |
1508 | 1544 | ||
1509 | if (iter->flags & FTRACE_ITER_HASH) | 1545 | if (iter->flags & FTRACE_ITER_HASH) |
1510 | return t_hash_start(m, pos); | 1546 | return t_hash_start(m, pos); |
1511 | 1547 | ||
1548 | /* | ||
1549 | * Unfortunately, we need to restart at ftrace_pages_start | ||
1550 | * every time we let go of the ftrace_mutex. This is because | ||
1551 | * those pointers can change without the lock. | ||
1552 | */ | ||
1512 | iter->pg = ftrace_pages_start; | 1553 | iter->pg = ftrace_pages_start; |
1513 | iter->idx = 0; | 1554 | iter->idx = 0; |
1514 | for (l = 0; l <= *pos; ) { | 1555 | for (l = 0; l <= *pos; ) { |
@@ -1517,10 +1558,14 @@ static void *t_start(struct seq_file *m, loff_t *pos) | |||
1517 | break; | 1558 | break; |
1518 | } | 1559 | } |
1519 | 1560 | ||
1520 | if (!p && iter->flags & FTRACE_ITER_FILTER) | 1561 | if (!p) { |
1521 | return t_hash_start(m, pos); | 1562 | if (iter->flags & FTRACE_ITER_FILTER) |
1563 | return t_hash_start(m, pos); | ||
1522 | 1564 | ||
1523 | return p; | 1565 | return NULL; |
1566 | } | ||
1567 | |||
1568 | return iter; | ||
1524 | } | 1569 | } |
1525 | 1570 | ||
1526 | static void t_stop(struct seq_file *m, void *p) | 1571 | static void t_stop(struct seq_file *m, void *p) |
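The reworked set_ftrace_filter iterator is a plain seq_file iterator that now keeps its own cursor (pos/func_pos) so that lseek/pread can rewind it via t_start(). For reference, the generic shape such an iterator plugs into, as a minimal self-contained sketch with hypothetical names:

#include <linux/kernel.h>
#include <linux/seq_file.h>

struct my_item { const char *name; };
static struct my_item my_items[] = { { "one" }, { "two" }, { "three" } };
#define MY_COUNT ARRAY_SIZE(my_items)

static void *my_start(struct seq_file *m, loff_t *pos)
{
        return (*pos < MY_COUNT) ? &my_items[*pos] : NULL;
}

static void *my_next(struct seq_file *m, void *v, loff_t *pos)
{
        (*pos)++;
        return (*pos < MY_COUNT) ? &my_items[*pos] : NULL;
}

static void my_stop(struct seq_file *m, void *v)
{
        /* drop any locks taken in my_start() */
}

static int my_show(struct seq_file *m, void *v)
{
        seq_printf(m, "%s\n", ((struct my_item *)v)->name);
        return 0;
}

static const struct seq_operations my_seq_ops = {
        .start = my_start,
        .next  = my_next,
        .stop  = my_stop,
        .show  = my_show,
};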
@@ -1531,16 +1576,18 @@ static void t_stop(struct seq_file *m, void *p) | |||
1531 | static int t_show(struct seq_file *m, void *v) | 1576 | static int t_show(struct seq_file *m, void *v) |
1532 | { | 1577 | { |
1533 | struct ftrace_iterator *iter = m->private; | 1578 | struct ftrace_iterator *iter = m->private; |
1534 | struct dyn_ftrace *rec = v; | 1579 | struct dyn_ftrace *rec; |
1535 | 1580 | ||
1536 | if (iter->flags & FTRACE_ITER_HASH) | 1581 | if (iter->flags & FTRACE_ITER_HASH) |
1537 | return t_hash_show(m, v); | 1582 | return t_hash_show(m, iter); |
1538 | 1583 | ||
1539 | if (iter->flags & FTRACE_ITER_PRINTALL) { | 1584 | if (iter->flags & FTRACE_ITER_PRINTALL) { |
1540 | seq_printf(m, "#### all functions enabled ####\n"); | 1585 | seq_printf(m, "#### all functions enabled ####\n"); |
1541 | return 0; | 1586 | return 0; |
1542 | } | 1587 | } |
1543 | 1588 | ||
1589 | rec = iter->func; | ||
1590 | |||
1544 | if (!rec) | 1591 | if (!rec) |
1545 | return 0; | 1592 | return 0; |
1546 | 1593 | ||
@@ -1592,8 +1639,8 @@ ftrace_failures_open(struct inode *inode, struct file *file) | |||
1592 | 1639 | ||
1593 | ret = ftrace_avail_open(inode, file); | 1640 | ret = ftrace_avail_open(inode, file); |
1594 | if (!ret) { | 1641 | if (!ret) { |
1595 | m = (struct seq_file *)file->private_data; | 1642 | m = file->private_data; |
1596 | iter = (struct ftrace_iterator *)m->private; | 1643 | iter = m->private; |
1597 | iter->flags = FTRACE_ITER_FAILURES; | 1644 | iter->flags = FTRACE_ITER_FAILURES; |
1598 | } | 1645 | } |
1599 | 1646 | ||
@@ -1883,7 +1930,6 @@ function_trace_probe_call(unsigned long ip, unsigned long parent_ip) | |||
1883 | struct hlist_head *hhd; | 1930 | struct hlist_head *hhd; |
1884 | struct hlist_node *n; | 1931 | struct hlist_node *n; |
1885 | unsigned long key; | 1932 | unsigned long key; |
1886 | int resched; | ||
1887 | 1933 | ||
1888 | key = hash_long(ip, FTRACE_HASH_BITS); | 1934 | key = hash_long(ip, FTRACE_HASH_BITS); |
1889 | 1935 | ||
@@ -1897,12 +1943,12 @@ function_trace_probe_call(unsigned long ip, unsigned long parent_ip) | |||
1897 | * period. This syncs the hash iteration and freeing of items | 1943 | * period. This syncs the hash iteration and freeing of items |
1898 | * on the hash. rcu_read_lock is too dangerous here. | 1944 | * on the hash. rcu_read_lock is too dangerous here. |
1899 | */ | 1945 | */ |
1900 | resched = ftrace_preempt_disable(); | 1946 | preempt_disable_notrace(); |
1901 | hlist_for_each_entry_rcu(entry, n, hhd, node) { | 1947 | hlist_for_each_entry_rcu(entry, n, hhd, node) { |
1902 | if (entry->ip == ip) | 1948 | if (entry->ip == ip) |
1903 | entry->ops->func(ip, parent_ip, &entry->data); | 1949 | entry->ops->func(ip, parent_ip, &entry->data); |
1904 | } | 1950 | } |
1905 | ftrace_preempt_enable(resched); | 1951 | preempt_enable_notrace(); |
1906 | } | 1952 | } |
1907 | 1953 | ||
1908 | static struct ftrace_ops trace_probe_ops __read_mostly = | 1954 | static struct ftrace_ops trace_probe_ops __read_mostly = |
@@ -2624,6 +2670,7 @@ static const struct file_operations ftrace_graph_fops = { | |||
2624 | .read = seq_read, | 2670 | .read = seq_read, |
2625 | .write = ftrace_graph_write, | 2671 | .write = ftrace_graph_write, |
2626 | .release = ftrace_graph_release, | 2672 | .release = ftrace_graph_release, |
2673 | .llseek = seq_lseek, | ||
2627 | }; | 2674 | }; |
2628 | #endif /* CONFIG_FUNCTION_GRAPH_TRACER */ | 2675 | #endif /* CONFIG_FUNCTION_GRAPH_TRACER */ |
2629 | 2676 | ||
diff --git a/kernel/trace/kmemtrace.c b/kernel/trace/kmemtrace.c deleted file mode 100644 index bbfc1bb1660b..000000000000 --- a/kernel/trace/kmemtrace.c +++ /dev/null | |||
@@ -1,529 +0,0 @@ | |||
1 | /* | ||
2 | * Memory allocator tracing | ||
3 | * | ||
4 | * Copyright (C) 2008 Eduard - Gabriel Munteanu | ||
5 | * Copyright (C) 2008 Pekka Enberg <penberg@cs.helsinki.fi> | ||
6 | * Copyright (C) 2008 Frederic Weisbecker <fweisbec@gmail.com> | ||
7 | */ | ||
8 | |||
9 | #include <linux/tracepoint.h> | ||
10 | #include <linux/seq_file.h> | ||
11 | #include <linux/debugfs.h> | ||
12 | #include <linux/dcache.h> | ||
13 | #include <linux/fs.h> | ||
14 | |||
15 | #include <linux/kmemtrace.h> | ||
16 | |||
17 | #include "trace_output.h" | ||
18 | #include "trace.h" | ||
19 | |||
20 | /* Select an alternative, minimalistic output than the original one */ | ||
21 | #define TRACE_KMEM_OPT_MINIMAL 0x1 | ||
22 | |||
23 | static struct tracer_opt kmem_opts[] = { | ||
24 | /* Default disable the minimalistic output */ | ||
25 | { TRACER_OPT(kmem_minimalistic, TRACE_KMEM_OPT_MINIMAL) }, | ||
26 | { } | ||
27 | }; | ||
28 | |||
29 | static struct tracer_flags kmem_tracer_flags = { | ||
30 | .val = 0, | ||
31 | .opts = kmem_opts | ||
32 | }; | ||
33 | |||
34 | static struct trace_array *kmemtrace_array; | ||
35 | |||
36 | /* Trace allocations */ | ||
37 | static inline void kmemtrace_alloc(enum kmemtrace_type_id type_id, | ||
38 | unsigned long call_site, | ||
39 | const void *ptr, | ||
40 | size_t bytes_req, | ||
41 | size_t bytes_alloc, | ||
42 | gfp_t gfp_flags, | ||
43 | int node) | ||
44 | { | ||
45 | struct ftrace_event_call *call = &event_kmem_alloc; | ||
46 | struct trace_array *tr = kmemtrace_array; | ||
47 | struct kmemtrace_alloc_entry *entry; | ||
48 | struct ring_buffer_event *event; | ||
49 | |||
50 | event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry)); | ||
51 | if (!event) | ||
52 | return; | ||
53 | |||
54 | entry = ring_buffer_event_data(event); | ||
55 | tracing_generic_entry_update(&entry->ent, 0, 0); | ||
56 | |||
57 | entry->ent.type = TRACE_KMEM_ALLOC; | ||
58 | entry->type_id = type_id; | ||
59 | entry->call_site = call_site; | ||
60 | entry->ptr = ptr; | ||
61 | entry->bytes_req = bytes_req; | ||
62 | entry->bytes_alloc = bytes_alloc; | ||
63 | entry->gfp_flags = gfp_flags; | ||
64 | entry->node = node; | ||
65 | |||
66 | if (!filter_check_discard(call, entry, tr->buffer, event)) | ||
67 | ring_buffer_unlock_commit(tr->buffer, event); | ||
68 | |||
69 | trace_wake_up(); | ||
70 | } | ||
71 | |||
72 | static inline void kmemtrace_free(enum kmemtrace_type_id type_id, | ||
73 | unsigned long call_site, | ||
74 | const void *ptr) | ||
75 | { | ||
76 | struct ftrace_event_call *call = &event_kmem_free; | ||
77 | struct trace_array *tr = kmemtrace_array; | ||
78 | struct kmemtrace_free_entry *entry; | ||
79 | struct ring_buffer_event *event; | ||
80 | |||
81 | event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry)); | ||
82 | if (!event) | ||
83 | return; | ||
84 | entry = ring_buffer_event_data(event); | ||
85 | tracing_generic_entry_update(&entry->ent, 0, 0); | ||
86 | |||
87 | entry->ent.type = TRACE_KMEM_FREE; | ||
88 | entry->type_id = type_id; | ||
89 | entry->call_site = call_site; | ||
90 | entry->ptr = ptr; | ||
91 | |||
92 | if (!filter_check_discard(call, entry, tr->buffer, event)) | ||
93 | ring_buffer_unlock_commit(tr->buffer, event); | ||
94 | |||
95 | trace_wake_up(); | ||
96 | } | ||
97 | |||
98 | static void kmemtrace_kmalloc(void *ignore, | ||
99 | unsigned long call_site, | ||
100 | const void *ptr, | ||
101 | size_t bytes_req, | ||
102 | size_t bytes_alloc, | ||
103 | gfp_t gfp_flags) | ||
104 | { | ||
105 | kmemtrace_alloc(KMEMTRACE_TYPE_KMALLOC, call_site, ptr, | ||
106 | bytes_req, bytes_alloc, gfp_flags, -1); | ||
107 | } | ||
108 | |||
109 | static void kmemtrace_kmem_cache_alloc(void *ignore, | ||
110 | unsigned long call_site, | ||
111 | const void *ptr, | ||
112 | size_t bytes_req, | ||
113 | size_t bytes_alloc, | ||
114 | gfp_t gfp_flags) | ||
115 | { | ||
116 | kmemtrace_alloc(KMEMTRACE_TYPE_CACHE, call_site, ptr, | ||
117 | bytes_req, bytes_alloc, gfp_flags, -1); | ||
118 | } | ||
119 | |||
120 | static void kmemtrace_kmalloc_node(void *ignore, | ||
121 | unsigned long call_site, | ||
122 | const void *ptr, | ||
123 | size_t bytes_req, | ||
124 | size_t bytes_alloc, | ||
125 | gfp_t gfp_flags, | ||
126 | int node) | ||
127 | { | ||
128 | kmemtrace_alloc(KMEMTRACE_TYPE_KMALLOC, call_site, ptr, | ||
129 | bytes_req, bytes_alloc, gfp_flags, node); | ||
130 | } | ||
131 | |||
132 | static void kmemtrace_kmem_cache_alloc_node(void *ignore, | ||
133 | unsigned long call_site, | ||
134 | const void *ptr, | ||
135 | size_t bytes_req, | ||
136 | size_t bytes_alloc, | ||
137 | gfp_t gfp_flags, | ||
138 | int node) | ||
139 | { | ||
140 | kmemtrace_alloc(KMEMTRACE_TYPE_CACHE, call_site, ptr, | ||
141 | bytes_req, bytes_alloc, gfp_flags, node); | ||
142 | } | ||
143 | |||
144 | static void | ||
145 | kmemtrace_kfree(void *ignore, unsigned long call_site, const void *ptr) | ||
146 | { | ||
147 | kmemtrace_free(KMEMTRACE_TYPE_KMALLOC, call_site, ptr); | ||
148 | } | ||
149 | |||
150 | static void kmemtrace_kmem_cache_free(void *ignore, | ||
151 | unsigned long call_site, const void *ptr) | ||
152 | { | ||
153 | kmemtrace_free(KMEMTRACE_TYPE_CACHE, call_site, ptr); | ||
154 | } | ||
155 | |||
156 | static int kmemtrace_start_probes(void) | ||
157 | { | ||
158 | int err; | ||
159 | |||
160 | err = register_trace_kmalloc(kmemtrace_kmalloc, NULL); | ||
161 | if (err) | ||
162 | return err; | ||
163 | err = register_trace_kmem_cache_alloc(kmemtrace_kmem_cache_alloc, NULL); | ||
164 | if (err) | ||
165 | return err; | ||
166 | err = register_trace_kmalloc_node(kmemtrace_kmalloc_node, NULL); | ||
167 | if (err) | ||
168 | return err; | ||
169 | err = register_trace_kmem_cache_alloc_node(kmemtrace_kmem_cache_alloc_node, NULL); | ||
170 | if (err) | ||
171 | return err; | ||
172 | err = register_trace_kfree(kmemtrace_kfree, NULL); | ||
173 | if (err) | ||
174 | return err; | ||
175 | err = register_trace_kmem_cache_free(kmemtrace_kmem_cache_free, NULL); | ||
176 | |||
177 | return err; | ||
178 | } | ||
179 | |||
180 | static void kmemtrace_stop_probes(void) | ||
181 | { | ||
182 | unregister_trace_kmalloc(kmemtrace_kmalloc, NULL); | ||
183 | unregister_trace_kmem_cache_alloc(kmemtrace_kmem_cache_alloc, NULL); | ||
184 | unregister_trace_kmalloc_node(kmemtrace_kmalloc_node, NULL); | ||
185 | unregister_trace_kmem_cache_alloc_node(kmemtrace_kmem_cache_alloc_node, NULL); | ||
186 | unregister_trace_kfree(kmemtrace_kfree, NULL); | ||
187 | unregister_trace_kmem_cache_free(kmemtrace_kmem_cache_free, NULL); | ||
188 | } | ||
189 | |||
190 | static int kmem_trace_init(struct trace_array *tr) | ||
191 | { | ||
192 | kmemtrace_array = tr; | ||
193 | |||
194 | tracing_reset_online_cpus(tr); | ||
195 | |||
196 | kmemtrace_start_probes(); | ||
197 | |||
198 | return 0; | ||
199 | } | ||
200 | |||
201 | static void kmem_trace_reset(struct trace_array *tr) | ||
202 | { | ||
203 | kmemtrace_stop_probes(); | ||
204 | } | ||
205 | |||
206 | static void kmemtrace_headers(struct seq_file *s) | ||
207 | { | ||
208 | /* Don't need headers for the original kmemtrace output */ | ||
209 | if (!(kmem_tracer_flags.val & TRACE_KMEM_OPT_MINIMAL)) | ||
210 | return; | ||
211 | |||
212 | seq_printf(s, "#\n"); | ||
213 | seq_printf(s, "# ALLOC TYPE REQ GIVEN FLAGS " | ||
214 | " POINTER NODE CALLER\n"); | ||
215 | seq_printf(s, "# FREE | | | | " | ||
216 | " | | | |\n"); | ||
217 | seq_printf(s, "# |\n\n"); | ||
218 | } | ||
219 | |||
220 | /* | ||
221 | * The following functions give the original output from kmemtrace, | ||
222 | * plus the origin CPU, since reordering occurs in-kernel now. | ||
223 | */ | ||
224 | |||
225 | #define KMEMTRACE_USER_ALLOC 0 | ||
226 | #define KMEMTRACE_USER_FREE 1 | ||
227 | |||
228 | struct kmemtrace_user_event { | ||
229 | u8 event_id; | ||
230 | u8 type_id; | ||
231 | u16 event_size; | ||
232 | u32 cpu; | ||
233 | u64 timestamp; | ||
234 | unsigned long call_site; | ||
235 | unsigned long ptr; | ||
236 | }; | ||
237 | |||
238 | struct kmemtrace_user_event_alloc { | ||
239 | size_t bytes_req; | ||
240 | size_t bytes_alloc; | ||
241 | unsigned gfp_flags; | ||
242 | int node; | ||
243 | }; | ||
244 | |||
245 | static enum print_line_t | ||
246 | kmemtrace_print_alloc(struct trace_iterator *iter, int flags, | ||
247 | struct trace_event *event) | ||
248 | { | ||
249 | struct trace_seq *s = &iter->seq; | ||
250 | struct kmemtrace_alloc_entry *entry; | ||
251 | int ret; | ||
252 | |||
253 | trace_assign_type(entry, iter->ent); | ||
254 | |||
255 | ret = trace_seq_printf(s, "type_id %d call_site %pF ptr %lu " | ||
256 | "bytes_req %lu bytes_alloc %lu gfp_flags %lu node %d\n", | ||
257 | entry->type_id, (void *)entry->call_site, (unsigned long)entry->ptr, | ||
258 | (unsigned long)entry->bytes_req, (unsigned long)entry->bytes_alloc, | ||
259 | (unsigned long)entry->gfp_flags, entry->node); | ||
260 | |||
261 | if (!ret) | ||
262 | return TRACE_TYPE_PARTIAL_LINE; | ||
263 | return TRACE_TYPE_HANDLED; | ||
264 | } | ||
265 | |||
266 | static enum print_line_t | ||
267 | kmemtrace_print_free(struct trace_iterator *iter, int flags, | ||
268 | struct trace_event *event) | ||
269 | { | ||
270 | struct trace_seq *s = &iter->seq; | ||
271 | struct kmemtrace_free_entry *entry; | ||
272 | int ret; | ||
273 | |||
274 | trace_assign_type(entry, iter->ent); | ||
275 | |||
276 | ret = trace_seq_printf(s, "type_id %d call_site %pF ptr %lu\n", | ||
277 | entry->type_id, (void *)entry->call_site, | ||
278 | (unsigned long)entry->ptr); | ||
279 | |||
280 | if (!ret) | ||
281 | return TRACE_TYPE_PARTIAL_LINE; | ||
282 | return TRACE_TYPE_HANDLED; | ||
283 | } | ||
284 | |||
285 | static enum print_line_t | ||
286 | kmemtrace_print_alloc_user(struct trace_iterator *iter, int flags, | ||
287 | struct trace_event *event) | ||
288 | { | ||
289 | struct trace_seq *s = &iter->seq; | ||
290 | struct kmemtrace_alloc_entry *entry; | ||
291 | struct kmemtrace_user_event *ev; | ||
292 | struct kmemtrace_user_event_alloc *ev_alloc; | ||
293 | |||
294 | trace_assign_type(entry, iter->ent); | ||
295 | |||
296 | ev = trace_seq_reserve(s, sizeof(*ev)); | ||
297 | if (!ev) | ||
298 | return TRACE_TYPE_PARTIAL_LINE; | ||
299 | |||
300 | ev->event_id = KMEMTRACE_USER_ALLOC; | ||
301 | ev->type_id = entry->type_id; | ||
302 | ev->event_size = sizeof(*ev) + sizeof(*ev_alloc); | ||
303 | ev->cpu = iter->cpu; | ||
304 | ev->timestamp = iter->ts; | ||
305 | ev->call_site = entry->call_site; | ||
306 | ev->ptr = (unsigned long)entry->ptr; | ||
307 | |||
308 | ev_alloc = trace_seq_reserve(s, sizeof(*ev_alloc)); | ||
309 | if (!ev_alloc) | ||
310 | return TRACE_TYPE_PARTIAL_LINE; | ||
311 | |||
312 | ev_alloc->bytes_req = entry->bytes_req; | ||
313 | ev_alloc->bytes_alloc = entry->bytes_alloc; | ||
314 | ev_alloc->gfp_flags = entry->gfp_flags; | ||
315 | ev_alloc->node = entry->node; | ||
316 | |||
317 | return TRACE_TYPE_HANDLED; | ||
318 | } | ||
319 | |||
320 | static enum print_line_t | ||
321 | kmemtrace_print_free_user(struct trace_iterator *iter, int flags, | ||
322 | struct trace_event *event) | ||
323 | { | ||
324 | struct trace_seq *s = &iter->seq; | ||
325 | struct kmemtrace_free_entry *entry; | ||
326 | struct kmemtrace_user_event *ev; | ||
327 | |||
328 | trace_assign_type(entry, iter->ent); | ||
329 | |||
330 | ev = trace_seq_reserve(s, sizeof(*ev)); | ||
331 | if (!ev) | ||
332 | return TRACE_TYPE_PARTIAL_LINE; | ||
333 | |||
334 | ev->event_id = KMEMTRACE_USER_FREE; | ||
335 | ev->type_id = entry->type_id; | ||
336 | ev->event_size = sizeof(*ev); | ||
337 | ev->cpu = iter->cpu; | ||
338 | ev->timestamp = iter->ts; | ||
339 | ev->call_site = entry->call_site; | ||
340 | ev->ptr = (unsigned long)entry->ptr; | ||
341 | |||
342 | return TRACE_TYPE_HANDLED; | ||
343 | } | ||
344 | |||
345 | /* The two other following provide a more minimalistic output */ | ||
346 | static enum print_line_t | ||
347 | kmemtrace_print_alloc_compress(struct trace_iterator *iter) | ||
348 | { | ||
349 | struct kmemtrace_alloc_entry *entry; | ||
350 | struct trace_seq *s = &iter->seq; | ||
351 | int ret; | ||
352 | |||
353 | trace_assign_type(entry, iter->ent); | ||
354 | |||
355 | /* Alloc entry */ | ||
356 | ret = trace_seq_printf(s, " + "); | ||
357 | if (!ret) | ||
358 | return TRACE_TYPE_PARTIAL_LINE; | ||
359 | |||
360 | /* Type */ | ||
361 | switch (entry->type_id) { | ||
362 | case KMEMTRACE_TYPE_KMALLOC: | ||
363 | ret = trace_seq_printf(s, "K "); | ||
364 | break; | ||
365 | case KMEMTRACE_TYPE_CACHE: | ||
366 | ret = trace_seq_printf(s, "C "); | ||
367 | break; | ||
368 | case KMEMTRACE_TYPE_PAGES: | ||
369 | ret = trace_seq_printf(s, "P "); | ||
370 | break; | ||
371 | default: | ||
372 | ret = trace_seq_printf(s, "? "); | ||
373 | } | ||
374 | |||
375 | if (!ret) | ||
376 | return TRACE_TYPE_PARTIAL_LINE; | ||
377 | |||
378 | /* Requested */ | ||
379 | ret = trace_seq_printf(s, "%4zu ", entry->bytes_req); | ||
380 | if (!ret) | ||
381 | return TRACE_TYPE_PARTIAL_LINE; | ||
382 | |||
383 | /* Allocated */ | ||
384 | ret = trace_seq_printf(s, "%4zu ", entry->bytes_alloc); | ||
385 | if (!ret) | ||
386 | return TRACE_TYPE_PARTIAL_LINE; | ||
387 | |||
388 | /* Flags | ||
389 | * TODO: would be better to see the name of the GFP flag names | ||
390 | */ | ||
391 | ret = trace_seq_printf(s, "%08x ", entry->gfp_flags); | ||
392 | if (!ret) | ||
393 | return TRACE_TYPE_PARTIAL_LINE; | ||
394 | |||
395 | /* Pointer to allocated */ | ||
396 | ret = trace_seq_printf(s, "0x%tx ", (ptrdiff_t)entry->ptr); | ||
397 | if (!ret) | ||
398 | return TRACE_TYPE_PARTIAL_LINE; | ||
399 | |||
400 | /* Node and call site*/ | ||
401 | ret = trace_seq_printf(s, "%4d %pf\n", entry->node, | ||
402 | (void *)entry->call_site); | ||
403 | if (!ret) | ||
404 | return TRACE_TYPE_PARTIAL_LINE; | ||
405 | |||
406 | return TRACE_TYPE_HANDLED; | ||
407 | } | ||
408 | |||
409 | static enum print_line_t | ||
410 | kmemtrace_print_free_compress(struct trace_iterator *iter) | ||
411 | { | ||
412 | struct kmemtrace_free_entry *entry; | ||
413 | struct trace_seq *s = &iter->seq; | ||
414 | int ret; | ||
415 | |||
416 | trace_assign_type(entry, iter->ent); | ||
417 | |||
418 | /* Free entry */ | ||
419 | ret = trace_seq_printf(s, " - "); | ||
420 | if (!ret) | ||
421 | return TRACE_TYPE_PARTIAL_LINE; | ||
422 | |||
423 | /* Type */ | ||
424 | switch (entry->type_id) { | ||
425 | case KMEMTRACE_TYPE_KMALLOC: | ||
426 | ret = trace_seq_printf(s, "K "); | ||
427 | break; | ||
428 | case KMEMTRACE_TYPE_CACHE: | ||
429 | ret = trace_seq_printf(s, "C "); | ||
430 | break; | ||
431 | case KMEMTRACE_TYPE_PAGES: | ||
432 | ret = trace_seq_printf(s, "P "); | ||
433 | break; | ||
434 | default: | ||
435 | ret = trace_seq_printf(s, "? "); | ||
436 | } | ||
437 | |||
438 | if (!ret) | ||
439 | return TRACE_TYPE_PARTIAL_LINE; | ||
440 | |||
441 | /* Skip requested/allocated/flags */ | ||
442 | ret = trace_seq_printf(s, " "); | ||
443 | if (!ret) | ||
444 | return TRACE_TYPE_PARTIAL_LINE; | ||
445 | |||
446 | /* Pointer to allocated */ | ||
447 | ret = trace_seq_printf(s, "0x%tx ", (ptrdiff_t)entry->ptr); | ||
448 | if (!ret) | ||
449 | return TRACE_TYPE_PARTIAL_LINE; | ||
450 | |||
451 | /* Skip node and print call site*/ | ||
452 | ret = trace_seq_printf(s, " %pf\n", (void *)entry->call_site); | ||
453 | if (!ret) | ||
454 | return TRACE_TYPE_PARTIAL_LINE; | ||
455 | |||
456 | return TRACE_TYPE_HANDLED; | ||
457 | } | ||
458 | |||
459 | static enum print_line_t kmemtrace_print_line(struct trace_iterator *iter) | ||
460 | { | ||
461 | struct trace_entry *entry = iter->ent; | ||
462 | |||
463 | if (!(kmem_tracer_flags.val & TRACE_KMEM_OPT_MINIMAL)) | ||
464 | return TRACE_TYPE_UNHANDLED; | ||
465 | |||
466 | switch (entry->type) { | ||
467 | case TRACE_KMEM_ALLOC: | ||
468 | return kmemtrace_print_alloc_compress(iter); | ||
469 | case TRACE_KMEM_FREE: | ||
470 | return kmemtrace_print_free_compress(iter); | ||
471 | default: | ||
472 | return TRACE_TYPE_UNHANDLED; | ||
473 | } | ||
474 | } | ||
475 | |||
476 | static struct trace_event_functions kmem_trace_alloc_funcs = { | ||
477 | .trace = kmemtrace_print_alloc, | ||
478 | .binary = kmemtrace_print_alloc_user, | ||
479 | }; | ||
480 | |||
481 | static struct trace_event kmem_trace_alloc = { | ||
482 | .type = TRACE_KMEM_ALLOC, | ||
483 | .funcs = &kmem_trace_alloc_funcs, | ||
484 | }; | ||
485 | |||
486 | static struct trace_event_functions kmem_trace_free_funcs = { | ||
487 | .trace = kmemtrace_print_free, | ||
488 | .binary = kmemtrace_print_free_user, | ||
489 | }; | ||
490 | |||
491 | static struct trace_event kmem_trace_free = { | ||
492 | .type = TRACE_KMEM_FREE, | ||
493 | .funcs = &kmem_trace_free_funcs, | ||
494 | }; | ||
495 | |||
496 | static struct tracer kmem_tracer __read_mostly = { | ||
497 | .name = "kmemtrace", | ||
498 | .init = kmem_trace_init, | ||
499 | .reset = kmem_trace_reset, | ||
500 | .print_line = kmemtrace_print_line, | ||
501 | .print_header = kmemtrace_headers, | ||
502 | .flags = &kmem_tracer_flags | ||
503 | }; | ||
504 | |||
505 | void kmemtrace_init(void) | ||
506 | { | ||
507 | /* earliest opportunity to start kmem tracing */ | ||
508 | } | ||
509 | |||
510 | static int __init init_kmem_tracer(void) | ||
511 | { | ||
512 | if (!register_ftrace_event(&kmem_trace_alloc)) { | ||
513 | pr_warning("Warning: could not register kmem events\n"); | ||
514 | return 1; | ||
515 | } | ||
516 | |||
517 | if (!register_ftrace_event(&kmem_trace_free)) { | ||
518 | pr_warning("Warning: could not register kmem events\n"); | ||
519 | return 1; | ||
520 | } | ||
521 | |||
522 | if (register_tracer(&kmem_tracer) != 0) { | ||
523 | pr_warning("Warning: could not register the kmem tracer\n"); | ||
524 | return 1; | ||
525 | } | ||
526 | |||
527 | return 0; | ||
528 | } | ||
529 | device_initcall(init_kmem_tracer); | ||
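The printers above all follow the same trace_seq convention: every formatted write is checked, and the first one that fails aborts the line with TRACE_TYPE_PARTIAL_LINE so the core can retry once more room is available. A minimal userspace sketch of that pattern, with invented names (struct seq, seq_printf_checked) standing in for struct trace_seq and trace_seq_printf:

#include <stdarg.h>
#include <stddef.h>
#include <stdio.h>

/* toy stand-in for struct trace_seq: a fixed buffer plus a fill level */
struct seq { char buf[64]; size_t len; };

/* returns 1 on success, 0 once the buffer is full (like trace_seq_printf) */
static int seq_printf_checked(struct seq *s, const char *fmt, ...)
{
        va_list ap;
        int n;

        va_start(ap, fmt);
        n = vsnprintf(s->buf + s->len, sizeof(s->buf) - s->len, fmt, ap);
        va_end(ap);

        if (n < 0 || (size_t)n >= sizeof(s->buf) - s->len)
                return 0;               /* would overflow: partial line */
        s->len += (size_t)n;
        return 1;
}

int main(void)
{
        struct seq s = { .len = 0 };

        /* mirror of the alloc printer: bail out at the first failed write */
        if (!seq_printf_checked(&s, "%4zu ", (size_t)32) ||
            !seq_printf_checked(&s, "%08x ", 0x10u) ||
            !seq_printf_checked(&s, "0x%tx\n", (ptrdiff_t)0xdead))
                return 1;               /* TRACE_TYPE_PARTIAL_LINE analogue */

        fputs(s.buf, stdout);
        return 0;
}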
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 1da7b6ea8b85..9ed509a015d8 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c | |||
@@ -224,6 +224,9 @@ enum { | |||
224 | RB_LEN_TIME_STAMP = 16, | 224 | RB_LEN_TIME_STAMP = 16, |
225 | }; | 225 | }; |
226 | 226 | ||
227 | #define skip_time_extend(event) \ | ||
228 | ((struct ring_buffer_event *)((char *)event + RB_LEN_TIME_EXTEND)) | ||
229 | |||
227 | static inline int rb_null_event(struct ring_buffer_event *event) | 230 | static inline int rb_null_event(struct ring_buffer_event *event) |
228 | { | 231 | { |
229 | return event->type_len == RINGBUF_TYPE_PADDING && !event->time_delta; | 232 | return event->type_len == RINGBUF_TYPE_PADDING && !event->time_delta; |
@@ -248,8 +251,12 @@ rb_event_data_length(struct ring_buffer_event *event) | |||
248 | return length + RB_EVNT_HDR_SIZE; | 251 | return length + RB_EVNT_HDR_SIZE; |
249 | } | 252 | } |
250 | 253 | ||
251 | /* inline for ring buffer fast paths */ | 254 | /* |
252 | static unsigned | 255 | * Return the length of the given event. Will return |
256 | * the length of the time extend if the event is a | ||
257 | * time extend. | ||
258 | */ | ||
259 | static inline unsigned | ||
253 | rb_event_length(struct ring_buffer_event *event) | 260 | rb_event_length(struct ring_buffer_event *event) |
254 | { | 261 | { |
255 | switch (event->type_len) { | 262 | switch (event->type_len) { |
@@ -274,13 +281,41 @@ rb_event_length(struct ring_buffer_event *event) | |||
274 | return 0; | 281 | return 0; |
275 | } | 282 | } |
276 | 283 | ||
284 | /* | ||
285 | * Return total length of time extend and data, | ||
286 | * or just the event length for all other events. | ||
287 | */ | ||
288 | static inline unsigned | ||
289 | rb_event_ts_length(struct ring_buffer_event *event) | ||
290 | { | ||
291 | unsigned len = 0; | ||
292 | |||
293 | if (event->type_len == RINGBUF_TYPE_TIME_EXTEND) { | ||
294 | /* a time extend includes the data event after it */ | ||
295 | len = RB_LEN_TIME_EXTEND; | ||
296 | event = skip_time_extend(event); | ||
297 | } | ||
298 | return len + rb_event_length(event); | ||
299 | } | ||
300 | |||
277 | /** | 301 | /** |
278 | * ring_buffer_event_length - return the length of the event | 302 | * ring_buffer_event_length - return the length of the event |
279 | * @event: the event to get the length of | 303 | * @event: the event to get the length of |
304 | * | ||
305 | * Returns the size of the data load of a data event. | ||
306 | * If the event is something other than a data event, it | ||
307 | * returns the size of the event itself. With the exception | ||
308 | * of a TIME EXTEND, where it still returns the size of the | ||
309 | * data load of the data event after it. | ||
280 | */ | 310 | */ |
281 | unsigned ring_buffer_event_length(struct ring_buffer_event *event) | 311 | unsigned ring_buffer_event_length(struct ring_buffer_event *event) |
282 | { | 312 | { |
283 | unsigned length = rb_event_length(event); | 313 | unsigned length; |
314 | |||
315 | if (event->type_len == RINGBUF_TYPE_TIME_EXTEND) | ||
316 | event = skip_time_extend(event); | ||
317 | |||
318 | length = rb_event_length(event); | ||
284 | if (event->type_len > RINGBUF_TYPE_DATA_TYPE_LEN_MAX) | 319 | if (event->type_len > RINGBUF_TYPE_DATA_TYPE_LEN_MAX) |
285 | return length; | 320 | return length; |
286 | length -= RB_EVNT_HDR_SIZE; | 321 | length -= RB_EVNT_HDR_SIZE; |
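The new rb_event_ts_length() exists so a time extend and the data event glued behind it are always measured as one unit. A standalone sketch of that accounting, using invented toy types and sizes (TOY_HDR, TOY_TIME_EXTEND) rather than the real ring-buffer constants:

#include <stdio.h>

enum toy_type { TOY_DATA, TOY_TIME_EXTEND };

struct toy_event {
        enum toy_type type;
        unsigned int payload;           /* data bytes of a DATA event */
};

#define TOY_HDR         4u              /* stand-in for RB_EVNT_HDR_SIZE   */
#define TOY_TIME_EXTEND 8u              /* stand-in for RB_LEN_TIME_EXTEND */

static unsigned int toy_event_length(const struct toy_event *e)
{
        return TOY_HDR + e->payload;    /* only DATA events reach here */
}

/* counterpart of rb_event_ts_length(): extend + data measured together */
static unsigned int toy_event_ts_length(const struct toy_event *e)
{
        unsigned int len = 0;

        if (e->type == TOY_TIME_EXTEND) {
                len = TOY_TIME_EXTEND;
                e++;                    /* skip_time_extend() analogue */
        }
        return len + toy_event_length(e);
}

int main(void)
{
        struct toy_event pair[2] = {
                { TOY_TIME_EXTEND, 0 },
                { TOY_DATA, 24 },
        };

        printf("pair length: %u\n", toy_event_ts_length(pair));  /* 36 */
        return 0;
}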
@@ -294,6 +329,8 @@ EXPORT_SYMBOL_GPL(ring_buffer_event_length); | |||
294 | static void * | 329 | static void * |
295 | rb_event_data(struct ring_buffer_event *event) | 330 | rb_event_data(struct ring_buffer_event *event) |
296 | { | 331 | { |
332 | if (event->type_len == RINGBUF_TYPE_TIME_EXTEND) | ||
333 | event = skip_time_extend(event); | ||
297 | BUG_ON(event->type_len > RINGBUF_TYPE_DATA_TYPE_LEN_MAX); | 334 | BUG_ON(event->type_len > RINGBUF_TYPE_DATA_TYPE_LEN_MAX); |
298 | /* If length is in len field, then array[0] has the data */ | 335 | /* If length is in len field, then array[0] has the data */ |
299 | if (event->type_len) | 336 | if (event->type_len) |
@@ -404,9 +441,6 @@ static inline int test_time_stamp(u64 delta) | |||
404 | /* Max payload is BUF_PAGE_SIZE - header (8bytes) */ | 441 | /* Max payload is BUF_PAGE_SIZE - header (8bytes) */ |
405 | #define BUF_MAX_DATA_SIZE (BUF_PAGE_SIZE - (sizeof(u32) * 2)) | 442 | #define BUF_MAX_DATA_SIZE (BUF_PAGE_SIZE - (sizeof(u32) * 2)) |
406 | 443 | ||
407 | /* Max number of timestamps that can fit on a page */ | ||
408 | #define RB_TIMESTAMPS_PER_PAGE (BUF_PAGE_SIZE / RB_LEN_TIME_STAMP) | ||
409 | |||
410 | int ring_buffer_print_page_header(struct trace_seq *s) | 444 | int ring_buffer_print_page_header(struct trace_seq *s) |
411 | { | 445 | { |
412 | struct buffer_data_page field; | 446 | struct buffer_data_page field; |
@@ -443,6 +477,7 @@ int ring_buffer_print_page_header(struct trace_seq *s) | |||
443 | */ | 477 | */ |
444 | struct ring_buffer_per_cpu { | 478 | struct ring_buffer_per_cpu { |
445 | int cpu; | 479 | int cpu; |
480 | atomic_t record_disabled; | ||
446 | struct ring_buffer *buffer; | 481 | struct ring_buffer *buffer; |
447 | spinlock_t reader_lock; /* serialize readers */ | 482 | spinlock_t reader_lock; /* serialize readers */ |
448 | arch_spinlock_t lock; | 483 | arch_spinlock_t lock; |
@@ -462,7 +497,6 @@ struct ring_buffer_per_cpu { | |||
462 | unsigned long read; | 497 | unsigned long read; |
463 | u64 write_stamp; | 498 | u64 write_stamp; |
464 | u64 read_stamp; | 499 | u64 read_stamp; |
465 | atomic_t record_disabled; | ||
466 | }; | 500 | }; |
467 | 501 | ||
468 | struct ring_buffer { | 502 | struct ring_buffer { |
@@ -1546,6 +1580,25 @@ static void rb_inc_iter(struct ring_buffer_iter *iter) | |||
1546 | iter->head = 0; | 1580 | iter->head = 0; |
1547 | } | 1581 | } |
1548 | 1582 | ||
1583 | /* Slow path, do not inline */ | ||
1584 | static noinline struct ring_buffer_event * | ||
1585 | rb_add_time_stamp(struct ring_buffer_event *event, u64 delta) | ||
1586 | { | ||
1587 | event->type_len = RINGBUF_TYPE_TIME_EXTEND; | ||
1588 | |||
1589 | /* Not the first event on the page? */ | ||
1590 | if (rb_event_index(event)) { | ||
1591 | event->time_delta = delta & TS_MASK; | ||
1592 | event->array[0] = delta >> TS_SHIFT; | ||
1593 | } else { | ||
1594 | /* nope, just zero it */ | ||
1595 | event->time_delta = 0; | ||
1596 | event->array[0] = 0; | ||
1597 | } | ||
1598 | |||
1599 | return skip_time_extend(event); | ||
1600 | } | ||
1601 | |||
1549 | /** | 1602 | /** |
1550 | * ring_buffer_update_event - update event type and data | 1603 | * ring_buffer_update_event - update event type and data |
1551 | * @event: the event to update | 1604 | * @event: the event to update |
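rb_add_time_stamp() splits the oversized delta across the extend event: the low bits go into time_delta and the high bits into array[0]. A small standalone example of that split, and of the reassembly later done by rb_update_write_stamp(), assuming the mainline 27-bit TS_SHIFT/TS_MASK layout:

#include <stdint.h>
#include <stdio.h>

/* Assumed to mirror the kernel's TS_SHIFT/TS_MASK (27-bit low field). */
#define TS_SHIFT 27
#define TS_MASK  ((1ULL << TS_SHIFT) - 1)

int main(void)
{
        uint64_t delta = (1ULL << 40) + 12345;    /* far too big for 27 bits */
        uint32_t time_delta = delta & TS_MASK;    /* -> event->time_delta    */
        uint32_t high       = delta >> TS_SHIFT;  /* -> event->array[0]      */

        /* a reader reassembles it the way rb_update_write_stamp() does */
        uint64_t rebuilt = ((uint64_t)high << TS_SHIFT) + time_delta;

        printf("delta=%llu rebuilt=%llu\n",
               (unsigned long long)delta, (unsigned long long)rebuilt);
        return 0;
}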
@@ -1558,28 +1611,31 @@ static void rb_inc_iter(struct ring_buffer_iter *iter) | |||
1558 | * data field. | 1611 | * data field. |
1559 | */ | 1612 | */ |
1560 | static void | 1613 | static void |
1561 | rb_update_event(struct ring_buffer_event *event, | 1614 | rb_update_event(struct ring_buffer_per_cpu *cpu_buffer, |
1562 | unsigned type, unsigned length) | 1615 | struct ring_buffer_event *event, unsigned length, |
1616 | int add_timestamp, u64 delta) | ||
1563 | { | 1617 | { |
1564 | event->type_len = type; | 1618 | /* Only a commit updates the timestamp */ |
1565 | 1619 | if (unlikely(!rb_event_is_commit(cpu_buffer, event))) | |
1566 | switch (type) { | 1620 | delta = 0; |
1567 | |||
1568 | case RINGBUF_TYPE_PADDING: | ||
1569 | case RINGBUF_TYPE_TIME_EXTEND: | ||
1570 | case RINGBUF_TYPE_TIME_STAMP: | ||
1571 | break; | ||
1572 | 1621 | ||
1573 | case 0: | 1622 | /* |
1574 | length -= RB_EVNT_HDR_SIZE; | 1623 | * If we need to add a timestamp, then we |
1575 | if (length > RB_MAX_SMALL_DATA || RB_FORCE_8BYTE_ALIGNMENT) | 1624 | * add it to the start of the resevered space. |
1576 | event->array[0] = length; | 1625 | */ |
1577 | else | 1626 | if (unlikely(add_timestamp)) { |
1578 | event->type_len = DIV_ROUND_UP(length, RB_ALIGNMENT); | 1627 | event = rb_add_time_stamp(event, delta); |
1579 | break; | 1628 | length -= RB_LEN_TIME_EXTEND; |
1580 | default: | 1629 | delta = 0; |
1581 | BUG(); | ||
1582 | } | 1630 | } |
1631 | |||
1632 | event->time_delta = delta; | ||
1633 | length -= RB_EVNT_HDR_SIZE; | ||
1634 | if (length > RB_MAX_SMALL_DATA || RB_FORCE_8BYTE_ALIGNMENT) { | ||
1635 | event->type_len = 0; | ||
1636 | event->array[0] = length; | ||
1637 | } else | ||
1638 | event->type_len = DIV_ROUND_UP(length, RB_ALIGNMENT); | ||
1583 | } | 1639 | } |
1584 | 1640 | ||
1585 | /* | 1641 | /* |
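With the rework above, a too-large delta no longer gets a reservation of its own; the time extend is carved out of the front of the space already reserved for the data event, and the data length shrinks accordingly. A rough standalone illustration of that layout, with an invented 8-byte EXTEND_LEN standing in for RB_LEN_TIME_EXTEND:

#include <stdio.h>
#include <string.h>

#define EXTEND_LEN 8u   /* invented; stands in for RB_LEN_TIME_EXTEND */

static void fill_reservation(char *res, unsigned int length, int add_timestamp)
{
        if (add_timestamp) {
                memset(res, 'T', EXTEND_LEN);   /* time extend header      */
                res += EXTEND_LEN;              /* skip_time_extend()      */
                length -= EXTEND_LEN;           /* data event shrinks      */
        }
        memset(res, 'D', length);               /* data header + payload   */
}

int main(void)
{
        char res[32];

        fill_reservation(res, sizeof(res), 1);
        printf("%.*s\n", (int)sizeof(res), res);  /* TTTTTTTTDDDD...D */
        return 0;
}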
@@ -1823,10 +1879,13 @@ rb_reset_tail(struct ring_buffer_per_cpu *cpu_buffer, | |||
1823 | local_sub(length, &tail_page->write); | 1879 | local_sub(length, &tail_page->write); |
1824 | } | 1880 | } |
1825 | 1881 | ||
1826 | static struct ring_buffer_event * | 1882 | /* |
1883 | * This is the slow path, force gcc not to inline it. | ||
1884 | */ | ||
1885 | static noinline struct ring_buffer_event * | ||
1827 | rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer, | 1886 | rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer, |
1828 | unsigned long length, unsigned long tail, | 1887 | unsigned long length, unsigned long tail, |
1829 | struct buffer_page *tail_page, u64 *ts) | 1888 | struct buffer_page *tail_page, u64 ts) |
1830 | { | 1889 | { |
1831 | struct buffer_page *commit_page = cpu_buffer->commit_page; | 1890 | struct buffer_page *commit_page = cpu_buffer->commit_page; |
1832 | struct ring_buffer *buffer = cpu_buffer->buffer; | 1891 | struct ring_buffer *buffer = cpu_buffer->buffer; |
@@ -1909,8 +1968,8 @@ rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer, | |||
1909 | * Nested commits always have zero deltas, so | 1968 | * Nested commits always have zero deltas, so |
1910 | * just reread the time stamp | 1969 | * just reread the time stamp |
1911 | */ | 1970 | */ |
1912 | *ts = rb_time_stamp(buffer); | 1971 | ts = rb_time_stamp(buffer); |
1913 | next_page->page->time_stamp = *ts; | 1972 | next_page->page->time_stamp = ts; |
1914 | } | 1973 | } |
1915 | 1974 | ||
1916 | out_again: | 1975 | out_again: |
@@ -1929,12 +1988,21 @@ rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer, | |||
1929 | 1988 | ||
1930 | static struct ring_buffer_event * | 1989 | static struct ring_buffer_event * |
1931 | __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, | 1990 | __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, |
1932 | unsigned type, unsigned long length, u64 *ts) | 1991 | unsigned long length, u64 ts, |
1992 | u64 delta, int add_timestamp) | ||
1933 | { | 1993 | { |
1934 | struct buffer_page *tail_page; | 1994 | struct buffer_page *tail_page; |
1935 | struct ring_buffer_event *event; | 1995 | struct ring_buffer_event *event; |
1936 | unsigned long tail, write; | 1996 | unsigned long tail, write; |
1937 | 1997 | ||
1998 | /* | ||
1999 | * If the time delta since the last event is too big to | ||
2000 | * hold in the time field of the event, then we append a | ||
2001 | * TIME EXTEND event ahead of the data event. | ||
2002 | */ | ||
2003 | if (unlikely(add_timestamp)) | ||
2004 | length += RB_LEN_TIME_EXTEND; | ||
2005 | |||
1938 | tail_page = cpu_buffer->tail_page; | 2006 | tail_page = cpu_buffer->tail_page; |
1939 | write = local_add_return(length, &tail_page->write); | 2007 | write = local_add_return(length, &tail_page->write); |
1940 | 2008 | ||
@@ -1943,7 +2011,7 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, | |||
1943 | tail = write - length; | 2011 | tail = write - length; |
1944 | 2012 | ||
1945 | /* See if we shot past the end of this buffer page */ | 2013 | /* See if we shot past the end of this buffer page */ |
1946 | if (write > BUF_PAGE_SIZE) | 2014 | if (unlikely(write > BUF_PAGE_SIZE)) |
1947 | return rb_move_tail(cpu_buffer, length, tail, | 2015 | return rb_move_tail(cpu_buffer, length, tail, |
1948 | tail_page, ts); | 2016 | tail_page, ts); |
1949 | 2017 | ||
@@ -1951,18 +2019,16 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, | |||
1951 | 2019 | ||
1952 | event = __rb_page_index(tail_page, tail); | 2020 | event = __rb_page_index(tail_page, tail); |
1953 | kmemcheck_annotate_bitfield(event, bitfield); | 2021 | kmemcheck_annotate_bitfield(event, bitfield); |
1954 | rb_update_event(event, type, length); | 2022 | rb_update_event(cpu_buffer, event, length, add_timestamp, delta); |
1955 | 2023 | ||
1956 | /* The passed in type is zero for DATA */ | 2024 | local_inc(&tail_page->entries); |
1957 | if (likely(!type)) | ||
1958 | local_inc(&tail_page->entries); | ||
1959 | 2025 | ||
1960 | /* | 2026 | /* |
1961 | * If this is the first commit on the page, then update | 2027 | * If this is the first commit on the page, then update |
1962 | * its timestamp. | 2028 | * its timestamp. |
1963 | */ | 2029 | */ |
1964 | if (!tail) | 2030 | if (!tail) |
1965 | tail_page->page->time_stamp = *ts; | 2031 | tail_page->page->time_stamp = ts; |
1966 | 2032 | ||
1967 | return event; | 2033 | return event; |
1968 | } | 2034 | } |
@@ -1977,7 +2043,7 @@ rb_try_to_discard(struct ring_buffer_per_cpu *cpu_buffer, | |||
1977 | unsigned long addr; | 2043 | unsigned long addr; |
1978 | 2044 | ||
1979 | new_index = rb_event_index(event); | 2045 | new_index = rb_event_index(event); |
1980 | old_index = new_index + rb_event_length(event); | 2046 | old_index = new_index + rb_event_ts_length(event); |
1981 | addr = (unsigned long)event; | 2047 | addr = (unsigned long)event; |
1982 | addr &= PAGE_MASK; | 2048 | addr &= PAGE_MASK; |
1983 | 2049 | ||
@@ -2003,76 +2069,13 @@ rb_try_to_discard(struct ring_buffer_per_cpu *cpu_buffer, | |||
2003 | return 0; | 2069 | return 0; |
2004 | } | 2070 | } |
2005 | 2071 | ||
2006 | static int | ||
2007 | rb_add_time_stamp(struct ring_buffer_per_cpu *cpu_buffer, | ||
2008 | u64 *ts, u64 *delta) | ||
2009 | { | ||
2010 | struct ring_buffer_event *event; | ||
2011 | int ret; | ||
2012 | |||
2013 | WARN_ONCE(*delta > (1ULL << 59), | ||
2014 | KERN_WARNING "Delta way too big! %llu ts=%llu write stamp = %llu\n", | ||
2015 | (unsigned long long)*delta, | ||
2016 | (unsigned long long)*ts, | ||
2017 | (unsigned long long)cpu_buffer->write_stamp); | ||
2018 | |||
2019 | /* | ||
2020 | * The delta is too big, we need to add a | ||
2021 | * new timestamp. | ||
2022 | */ | ||
2023 | event = __rb_reserve_next(cpu_buffer, | ||
2024 | RINGBUF_TYPE_TIME_EXTEND, | ||
2025 | RB_LEN_TIME_EXTEND, | ||
2026 | ts); | ||
2027 | if (!event) | ||
2028 | return -EBUSY; | ||
2029 | |||
2030 | if (PTR_ERR(event) == -EAGAIN) | ||
2031 | return -EAGAIN; | ||
2032 | |||
2033 | /* Only a committed time event can update the write stamp */ | ||
2034 | if (rb_event_is_commit(cpu_buffer, event)) { | ||
2035 | /* | ||
2036 | * If this is the first on the page, then it was | ||
2037 | * updated with the page itself. Try to discard it | ||
2038 | * and if we can't just make it zero. | ||
2039 | */ | ||
2040 | if (rb_event_index(event)) { | ||
2041 | event->time_delta = *delta & TS_MASK; | ||
2042 | event->array[0] = *delta >> TS_SHIFT; | ||
2043 | } else { | ||
2044 | /* try to discard, since we do not need this */ | ||
2045 | if (!rb_try_to_discard(cpu_buffer, event)) { | ||
2046 | /* nope, just zero it */ | ||
2047 | event->time_delta = 0; | ||
2048 | event->array[0] = 0; | ||
2049 | } | ||
2050 | } | ||
2051 | cpu_buffer->write_stamp = *ts; | ||
2052 | /* let the caller know this was the commit */ | ||
2053 | ret = 1; | ||
2054 | } else { | ||
2055 | /* Try to discard the event */ | ||
2056 | if (!rb_try_to_discard(cpu_buffer, event)) { | ||
2057 | /* Darn, this is just wasted space */ | ||
2058 | event->time_delta = 0; | ||
2059 | event->array[0] = 0; | ||
2060 | } | ||
2061 | ret = 0; | ||
2062 | } | ||
2063 | |||
2064 | *delta = 0; | ||
2065 | |||
2066 | return ret; | ||
2067 | } | ||
2068 | |||
2069 | static void rb_start_commit(struct ring_buffer_per_cpu *cpu_buffer) | 2072 | static void rb_start_commit(struct ring_buffer_per_cpu *cpu_buffer) |
2070 | { | 2073 | { |
2071 | local_inc(&cpu_buffer->committing); | 2074 | local_inc(&cpu_buffer->committing); |
2072 | local_inc(&cpu_buffer->commits); | 2075 | local_inc(&cpu_buffer->commits); |
2073 | } | 2076 | } |
2074 | 2077 | ||
2075 | static void rb_end_commit(struct ring_buffer_per_cpu *cpu_buffer) | 2078 | static inline void rb_end_commit(struct ring_buffer_per_cpu *cpu_buffer) |
2076 | { | 2079 | { |
2077 | unsigned long commits; | 2080 | unsigned long commits; |
2078 | 2081 | ||
@@ -2110,9 +2113,10 @@ rb_reserve_next_event(struct ring_buffer *buffer, | |||
2110 | unsigned long length) | 2113 | unsigned long length) |
2111 | { | 2114 | { |
2112 | struct ring_buffer_event *event; | 2115 | struct ring_buffer_event *event; |
2113 | u64 ts, delta = 0; | 2116 | u64 ts, delta; |
2114 | int commit = 0; | ||
2115 | int nr_loops = 0; | 2117 | int nr_loops = 0; |
2118 | int add_timestamp; | ||
2119 | u64 diff; | ||
2116 | 2120 | ||
2117 | rb_start_commit(cpu_buffer); | 2121 | rb_start_commit(cpu_buffer); |
2118 | 2122 | ||
@@ -2133,6 +2137,9 @@ rb_reserve_next_event(struct ring_buffer *buffer, | |||
2133 | 2137 | ||
2134 | length = rb_calculate_event_length(length); | 2138 | length = rb_calculate_event_length(length); |
2135 | again: | 2139 | again: |
2140 | add_timestamp = 0; | ||
2141 | delta = 0; | ||
2142 | |||
2136 | /* | 2143 | /* |
2137 | * We allow for interrupts to reenter here and do a trace. | 2144 | * We allow for interrupts to reenter here and do a trace. |
2138 | * If one does, it will cause this original code to loop | 2145 | * If one does, it will cause this original code to loop |
@@ -2146,56 +2153,32 @@ rb_reserve_next_event(struct ring_buffer *buffer, | |||
2146 | goto out_fail; | 2153 | goto out_fail; |
2147 | 2154 | ||
2148 | ts = rb_time_stamp(cpu_buffer->buffer); | 2155 | ts = rb_time_stamp(cpu_buffer->buffer); |
2156 | diff = ts - cpu_buffer->write_stamp; | ||
2149 | 2157 | ||
2150 | /* | 2158 | /* make sure this diff is calculated here */ |
2151 | * Only the first commit can update the timestamp. | 2159 | barrier(); |
2152 | * Yes there is a race here. If an interrupt comes in | ||
2153 | * just after the conditional and it traces too, then it | ||
2154 | * will also check the deltas. More than one timestamp may | ||
2155 | * also be made. But only the entry that did the actual | ||
2156 | * commit will be something other than zero. | ||
2157 | */ | ||
2158 | if (likely(cpu_buffer->tail_page == cpu_buffer->commit_page && | ||
2159 | rb_page_write(cpu_buffer->tail_page) == | ||
2160 | rb_commit_index(cpu_buffer))) { | ||
2161 | u64 diff; | ||
2162 | |||
2163 | diff = ts - cpu_buffer->write_stamp; | ||
2164 | |||
2165 | /* make sure this diff is calculated here */ | ||
2166 | barrier(); | ||
2167 | |||
2168 | /* Did the write stamp get updated already? */ | ||
2169 | if (unlikely(ts < cpu_buffer->write_stamp)) | ||
2170 | goto get_event; | ||
2171 | 2160 | ||
2161 | /* Did the write stamp get updated already? */ | ||
2162 | if (likely(ts >= cpu_buffer->write_stamp)) { | ||
2172 | delta = diff; | 2163 | delta = diff; |
2173 | if (unlikely(test_time_stamp(delta))) { | 2164 | if (unlikely(test_time_stamp(delta))) { |
2174 | 2165 | WARN_ONCE(delta > (1ULL << 59), | |
2175 | commit = rb_add_time_stamp(cpu_buffer, &ts, &delta); | 2166 | KERN_WARNING "Delta way too big! %llu ts=%llu write stamp = %llu\n", |
2176 | if (commit == -EBUSY) | 2167 | (unsigned long long)delta, |
2177 | goto out_fail; | 2168 | (unsigned long long)ts, |
2178 | 2169 | (unsigned long long)cpu_buffer->write_stamp); | |
2179 | if (commit == -EAGAIN) | 2170 | add_timestamp = 1; |
2180 | goto again; | ||
2181 | |||
2182 | RB_WARN_ON(cpu_buffer, commit < 0); | ||
2183 | } | 2171 | } |
2184 | } | 2172 | } |
2185 | 2173 | ||
2186 | get_event: | 2174 | event = __rb_reserve_next(cpu_buffer, length, ts, |
2187 | event = __rb_reserve_next(cpu_buffer, 0, length, &ts); | 2175 | delta, add_timestamp); |
2188 | if (unlikely(PTR_ERR(event) == -EAGAIN)) | 2176 | if (unlikely(PTR_ERR(event) == -EAGAIN)) |
2189 | goto again; | 2177 | goto again; |
2190 | 2178 | ||
2191 | if (!event) | 2179 | if (!event) |
2192 | goto out_fail; | 2180 | goto out_fail; |
2193 | 2181 | ||
2194 | if (!rb_event_is_commit(cpu_buffer, event)) | ||
2195 | delta = 0; | ||
2196 | |||
2197 | event->time_delta = delta; | ||
2198 | |||
2199 | return event; | 2182 | return event; |
2200 | 2183 | ||
2201 | out_fail: | 2184 | out_fail: |
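The rewritten rb_reserve_next_event() now makes one decision up front: compute the delta against write_stamp, and if it does not fit in the event's delta field, request a time extend and carry the full delta along to __rb_reserve_next(). A standalone sketch of that decision, assuming a 27-bit in-event delta as in mainline (DELTA_BITS is an invented name):

#include <stdint.h>
#include <stdio.h>

#define DELTA_BITS 27   /* assumed width of the in-event delta field */

struct decision {
        int add_timestamp;      /* prepend a TIME_EXTEND event?        */
        uint64_t delta;         /* delta to record with the data event */
};

static struct decision decide(uint64_t ts, uint64_t write_stamp)
{
        struct decision d = { 0, 0 };

        if (ts >= write_stamp) {                /* stamp not moved under us */
                d.delta = ts - write_stamp;
                if (d.delta >> DELTA_BITS)      /* test_time_stamp() analogue */
                        d.add_timestamp = 1;
        }
        return d;
}

int main(void)
{
        struct decision small = decide(1000, 900);
        struct decision huge  = decide(1ULL << 40, 0);

        printf("small: extend=%d delta=%llu\n", small.add_timestamp,
               (unsigned long long)small.delta);
        printf("huge:  extend=%d delta=%llu\n", huge.add_timestamp,
               (unsigned long long)huge.delta);
        return 0;
}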
@@ -2207,13 +2190,9 @@ rb_reserve_next_event(struct ring_buffer *buffer, | |||
2207 | 2190 | ||
2208 | #define TRACE_RECURSIVE_DEPTH 16 | 2191 | #define TRACE_RECURSIVE_DEPTH 16 |
2209 | 2192 | ||
2210 | static int trace_recursive_lock(void) | 2193 | /* Keep this code out of the fast path cache */ |
2194 | static noinline void trace_recursive_fail(void) | ||
2211 | { | 2195 | { |
2212 | current->trace_recursion++; | ||
2213 | |||
2214 | if (likely(current->trace_recursion < TRACE_RECURSIVE_DEPTH)) | ||
2215 | return 0; | ||
2216 | |||
2217 | /* Disable all tracing before we do anything else */ | 2196 | /* Disable all tracing before we do anything else */ |
2218 | tracing_off_permanent(); | 2197 | tracing_off_permanent(); |
2219 | 2198 | ||
@@ -2225,10 +2204,21 @@ static int trace_recursive_lock(void) | |||
2225 | in_nmi()); | 2204 | in_nmi()); |
2226 | 2205 | ||
2227 | WARN_ON_ONCE(1); | 2206 | WARN_ON_ONCE(1); |
2207 | } | ||
2208 | |||
2209 | static inline int trace_recursive_lock(void) | ||
2210 | { | ||
2211 | current->trace_recursion++; | ||
2212 | |||
2213 | if (likely(current->trace_recursion < TRACE_RECURSIVE_DEPTH)) | ||
2214 | return 0; | ||
2215 | |||
2216 | trace_recursive_fail(); | ||
2217 | |||
2228 | return -1; | 2218 | return -1; |
2229 | } | 2219 | } |
2230 | 2220 | ||
2231 | static void trace_recursive_unlock(void) | 2221 | static inline void trace_recursive_unlock(void) |
2232 | { | 2222 | { |
2233 | WARN_ON_ONCE(!current->trace_recursion); | 2223 | WARN_ON_ONCE(!current->trace_recursion); |
2234 | 2224 | ||
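The refactor above keeps trace_recursive_lock() tiny and inlinable by pushing the diagnostic path into a separate noinline function. The same shape in a standalone sketch (GCC-style attributes, invented names):

#include <stdio.h>

#define MAX_DEPTH 16

/* rare path: kept out of line so the fast path stays small */
static __attribute__((noinline)) void recursion_overflow(void)
{
        fprintf(stderr, "tracing recursion too deep, disabling\n");
}

/* fast path: one increment and one well-predicted branch */
static inline int recursion_lock(int *depth)
{
        if (__builtin_expect(++(*depth) < MAX_DEPTH, 1))
                return 0;
        recursion_overflow();
        return -1;
}

static inline void recursion_unlock(int *depth)
{
        --(*depth);
}

int main(void)
{
        int depth = 0;

        if (!recursion_lock(&depth)) {
                /* ... emit a trace entry here ... */
                recursion_unlock(&depth);
        }
        printf("depth back to %d\n", depth);
        return 0;
}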
@@ -2242,8 +2232,6 @@ static void trace_recursive_unlock(void) | |||
2242 | 2232 | ||
2243 | #endif | 2233 | #endif |
2244 | 2234 | ||
2245 | static DEFINE_PER_CPU(int, rb_need_resched); | ||
2246 | |||
2247 | /** | 2235 | /** |
2248 | * ring_buffer_lock_reserve - reserve a part of the buffer | 2236 | * ring_buffer_lock_reserve - reserve a part of the buffer |
2249 | * @buffer: the ring buffer to reserve from | 2237 | * @buffer: the ring buffer to reserve from |
@@ -2264,13 +2252,13 @@ ring_buffer_lock_reserve(struct ring_buffer *buffer, unsigned long length) | |||
2264 | { | 2252 | { |
2265 | struct ring_buffer_per_cpu *cpu_buffer; | 2253 | struct ring_buffer_per_cpu *cpu_buffer; |
2266 | struct ring_buffer_event *event; | 2254 | struct ring_buffer_event *event; |
2267 | int cpu, resched; | 2255 | int cpu; |
2268 | 2256 | ||
2269 | if (ring_buffer_flags != RB_BUFFERS_ON) | 2257 | if (ring_buffer_flags != RB_BUFFERS_ON) |
2270 | return NULL; | 2258 | return NULL; |
2271 | 2259 | ||
2272 | /* If we are tracing schedule, we don't want to recurse */ | 2260 | /* If we are tracing schedule, we don't want to recurse */ |
2273 | resched = ftrace_preempt_disable(); | 2261 | preempt_disable_notrace(); |
2274 | 2262 | ||
2275 | if (atomic_read(&buffer->record_disabled)) | 2263 | if (atomic_read(&buffer->record_disabled)) |
2276 | goto out_nocheck; | 2264 | goto out_nocheck; |
@@ -2295,21 +2283,13 @@ ring_buffer_lock_reserve(struct ring_buffer *buffer, unsigned long length) | |||
2295 | if (!event) | 2283 | if (!event) |
2296 | goto out; | 2284 | goto out; |
2297 | 2285 | ||
2298 | /* | ||
2299 | * Need to store resched state on this cpu. | ||
2300 | * Only the first needs to. | ||
2301 | */ | ||
2302 | |||
2303 | if (preempt_count() == 1) | ||
2304 | per_cpu(rb_need_resched, cpu) = resched; | ||
2305 | |||
2306 | return event; | 2286 | return event; |
2307 | 2287 | ||
2308 | out: | 2288 | out: |
2309 | trace_recursive_unlock(); | 2289 | trace_recursive_unlock(); |
2310 | 2290 | ||
2311 | out_nocheck: | 2291 | out_nocheck: |
2312 | ftrace_preempt_enable(resched); | 2292 | preempt_enable_notrace(); |
2313 | return NULL; | 2293 | return NULL; |
2314 | } | 2294 | } |
2315 | EXPORT_SYMBOL_GPL(ring_buffer_lock_reserve); | 2295 | EXPORT_SYMBOL_GPL(ring_buffer_lock_reserve); |
@@ -2318,12 +2298,28 @@ static void | |||
2318 | rb_update_write_stamp(struct ring_buffer_per_cpu *cpu_buffer, | 2298 | rb_update_write_stamp(struct ring_buffer_per_cpu *cpu_buffer, |
2319 | struct ring_buffer_event *event) | 2299 | struct ring_buffer_event *event) |
2320 | { | 2300 | { |
2301 | u64 delta; | ||
2302 | |||
2321 | /* | 2303 | /* |
2322 | * The event first in the commit queue updates the | 2304 | * The event first in the commit queue updates the |
2323 | * time stamp. | 2305 | * time stamp. |
2324 | */ | 2306 | */ |
2325 | if (rb_event_is_commit(cpu_buffer, event)) | 2307 | if (rb_event_is_commit(cpu_buffer, event)) { |
2326 | cpu_buffer->write_stamp += event->time_delta; | 2308 | /* |
2309 | * A commit event that is first on a page | ||
2310 | * updates the write timestamp with the page stamp | ||
2311 | */ | ||
2312 | if (!rb_event_index(event)) | ||
2313 | cpu_buffer->write_stamp = | ||
2314 | cpu_buffer->commit_page->page->time_stamp; | ||
2315 | else if (event->type_len == RINGBUF_TYPE_TIME_EXTEND) { | ||
2316 | delta = event->array[0]; | ||
2317 | delta <<= TS_SHIFT; | ||
2318 | delta += event->time_delta; | ||
2319 | cpu_buffer->write_stamp += delta; | ||
2320 | } else | ||
2321 | cpu_buffer->write_stamp += event->time_delta; | ||
2322 | } | ||
2327 | } | 2323 | } |
2328 | 2324 | ||
2329 | static void rb_commit(struct ring_buffer_per_cpu *cpu_buffer, | 2325 | static void rb_commit(struct ring_buffer_per_cpu *cpu_buffer, |
@@ -2355,13 +2351,7 @@ int ring_buffer_unlock_commit(struct ring_buffer *buffer, | |||
2355 | 2351 | ||
2356 | trace_recursive_unlock(); | 2352 | trace_recursive_unlock(); |
2357 | 2353 | ||
2358 | /* | 2354 | preempt_enable_notrace(); |
2359 | * Only the last preempt count needs to restore preemption. | ||
2360 | */ | ||
2361 | if (preempt_count() == 1) | ||
2362 | ftrace_preempt_enable(per_cpu(rb_need_resched, cpu)); | ||
2363 | else | ||
2364 | preempt_enable_no_resched_notrace(); | ||
2365 | 2355 | ||
2366 | return 0; | 2356 | return 0; |
2367 | } | 2357 | } |
@@ -2369,6 +2359,9 @@ EXPORT_SYMBOL_GPL(ring_buffer_unlock_commit); | |||
2369 | 2359 | ||
2370 | static inline void rb_event_discard(struct ring_buffer_event *event) | 2360 | static inline void rb_event_discard(struct ring_buffer_event *event) |
2371 | { | 2361 | { |
2362 | if (event->type_len == RINGBUF_TYPE_TIME_EXTEND) | ||
2363 | event = skip_time_extend(event); | ||
2364 | |||
2372 | /* array[0] holds the actual length for the discarded event */ | 2365 | /* array[0] holds the actual length for the discarded event */ |
2373 | event->array[0] = rb_event_data_length(event) - RB_EVNT_HDR_SIZE; | 2366 | event->array[0] = rb_event_data_length(event) - RB_EVNT_HDR_SIZE; |
2374 | event->type_len = RINGBUF_TYPE_PADDING; | 2367 | event->type_len = RINGBUF_TYPE_PADDING; |
@@ -2469,13 +2462,7 @@ void ring_buffer_discard_commit(struct ring_buffer *buffer, | |||
2469 | 2462 | ||
2470 | trace_recursive_unlock(); | 2463 | trace_recursive_unlock(); |
2471 | 2464 | ||
2472 | /* | 2465 | preempt_enable_notrace(); |
2473 | * Only the last preempt count needs to restore preemption. | ||
2474 | */ | ||
2475 | if (preempt_count() == 1) | ||
2476 | ftrace_preempt_enable(per_cpu(rb_need_resched, cpu)); | ||
2477 | else | ||
2478 | preempt_enable_no_resched_notrace(); | ||
2479 | 2466 | ||
2480 | } | 2467 | } |
2481 | EXPORT_SYMBOL_GPL(ring_buffer_discard_commit); | 2468 | EXPORT_SYMBOL_GPL(ring_buffer_discard_commit); |
@@ -2501,12 +2488,12 @@ int ring_buffer_write(struct ring_buffer *buffer, | |||
2501 | struct ring_buffer_event *event; | 2488 | struct ring_buffer_event *event; |
2502 | void *body; | 2489 | void *body; |
2503 | int ret = -EBUSY; | 2490 | int ret = -EBUSY; |
2504 | int cpu, resched; | 2491 | int cpu; |
2505 | 2492 | ||
2506 | if (ring_buffer_flags != RB_BUFFERS_ON) | 2493 | if (ring_buffer_flags != RB_BUFFERS_ON) |
2507 | return -EBUSY; | 2494 | return -EBUSY; |
2508 | 2495 | ||
2509 | resched = ftrace_preempt_disable(); | 2496 | preempt_disable_notrace(); |
2510 | 2497 | ||
2511 | if (atomic_read(&buffer->record_disabled)) | 2498 | if (atomic_read(&buffer->record_disabled)) |
2512 | goto out; | 2499 | goto out; |
@@ -2536,7 +2523,7 @@ int ring_buffer_write(struct ring_buffer *buffer, | |||
2536 | 2523 | ||
2537 | ret = 0; | 2524 | ret = 0; |
2538 | out: | 2525 | out: |
2539 | ftrace_preempt_enable(resched); | 2526 | preempt_enable_notrace(); |
2540 | 2527 | ||
2541 | return ret; | 2528 | return ret; |
2542 | } | 2529 | } |
@@ -2628,6 +2615,19 @@ void ring_buffer_record_enable_cpu(struct ring_buffer *buffer, int cpu) | |||
2628 | } | 2615 | } |
2629 | EXPORT_SYMBOL_GPL(ring_buffer_record_enable_cpu); | 2616 | EXPORT_SYMBOL_GPL(ring_buffer_record_enable_cpu); |
2630 | 2617 | ||
2618 | /* | ||
2619 | * The total entries in the ring buffer is the running counter | ||
2620 | * of entries entered into the ring buffer, minus the sum of | ||
2621 | * the entries read from the ring buffer and the number of | ||
2622 | * entries that were overwritten. | ||
2623 | */ | ||
2624 | static inline unsigned long | ||
2625 | rb_num_of_entries(struct ring_buffer_per_cpu *cpu_buffer) | ||
2626 | { | ||
2627 | return local_read(&cpu_buffer->entries) - | ||
2628 | (local_read(&cpu_buffer->overrun) + cpu_buffer->read); | ||
2629 | } | ||
2630 | |||
2631 | /** | 2631 | /** |
2632 | * ring_buffer_entries_cpu - get the number of entries in a cpu buffer | 2632 | * ring_buffer_entries_cpu - get the number of entries in a cpu buffer |
2633 | * @buffer: The ring buffer | 2633 | * @buffer: The ring buffer |
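rb_num_of_entries() centralizes the accounting that was previously open-coded in two places: entries written minus those overwritten or already read. A worked example with invented numbers:

#include <stdio.h>

int main(void)
{
        unsigned long entries = 1000;   /* written into the buffer       */
        unsigned long overrun = 120;    /* overwritten before being read */
        unsigned long read    = 300;    /* already consumed by a reader  */

        /* rb_num_of_entries() analogue: what is still unread */
        printf("unread entries: %lu\n", entries - (overrun + read));
        return 0;
}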
@@ -2636,16 +2636,13 @@ EXPORT_SYMBOL_GPL(ring_buffer_record_enable_cpu); | |||
2636 | unsigned long ring_buffer_entries_cpu(struct ring_buffer *buffer, int cpu) | 2636 | unsigned long ring_buffer_entries_cpu(struct ring_buffer *buffer, int cpu) |
2637 | { | 2637 | { |
2638 | struct ring_buffer_per_cpu *cpu_buffer; | 2638 | struct ring_buffer_per_cpu *cpu_buffer; |
2639 | unsigned long ret; | ||
2640 | 2639 | ||
2641 | if (!cpumask_test_cpu(cpu, buffer->cpumask)) | 2640 | if (!cpumask_test_cpu(cpu, buffer->cpumask)) |
2642 | return 0; | 2641 | return 0; |
2643 | 2642 | ||
2644 | cpu_buffer = buffer->buffers[cpu]; | 2643 | cpu_buffer = buffer->buffers[cpu]; |
2645 | ret = (local_read(&cpu_buffer->entries) - local_read(&cpu_buffer->overrun)) | ||
2646 | - cpu_buffer->read; | ||
2647 | 2644 | ||
2648 | return ret; | 2645 | return rb_num_of_entries(cpu_buffer); |
2649 | } | 2646 | } |
2650 | EXPORT_SYMBOL_GPL(ring_buffer_entries_cpu); | 2647 | EXPORT_SYMBOL_GPL(ring_buffer_entries_cpu); |
2651 | 2648 | ||
@@ -2706,8 +2703,7 @@ unsigned long ring_buffer_entries(struct ring_buffer *buffer) | |||
2706 | /* if you care about this being correct, lock the buffer */ | 2703 | /* if you care about this being correct, lock the buffer */ |
2707 | for_each_buffer_cpu(buffer, cpu) { | 2704 | for_each_buffer_cpu(buffer, cpu) { |
2708 | cpu_buffer = buffer->buffers[cpu]; | 2705 | cpu_buffer = buffer->buffers[cpu]; |
2709 | entries += (local_read(&cpu_buffer->entries) - | 2706 | entries += rb_num_of_entries(cpu_buffer); |
2710 | local_read(&cpu_buffer->overrun)) - cpu_buffer->read; | ||
2711 | } | 2707 | } |
2712 | 2708 | ||
2713 | return entries; | 2709 | return entries; |
@@ -3007,13 +3003,11 @@ static void rb_advance_reader(struct ring_buffer_per_cpu *cpu_buffer) | |||
3007 | 3003 | ||
3008 | static void rb_advance_iter(struct ring_buffer_iter *iter) | 3004 | static void rb_advance_iter(struct ring_buffer_iter *iter) |
3009 | { | 3005 | { |
3010 | struct ring_buffer *buffer; | ||
3011 | struct ring_buffer_per_cpu *cpu_buffer; | 3006 | struct ring_buffer_per_cpu *cpu_buffer; |
3012 | struct ring_buffer_event *event; | 3007 | struct ring_buffer_event *event; |
3013 | unsigned length; | 3008 | unsigned length; |
3014 | 3009 | ||
3015 | cpu_buffer = iter->cpu_buffer; | 3010 | cpu_buffer = iter->cpu_buffer; |
3016 | buffer = cpu_buffer->buffer; | ||
3017 | 3011 | ||
3018 | /* | 3012 | /* |
3019 | * Check if we are at the end of the buffer. | 3013 | * Check if we are at the end of the buffer. |
@@ -3064,12 +3058,12 @@ rb_buffer_peek(struct ring_buffer_per_cpu *cpu_buffer, u64 *ts, | |||
3064 | 3058 | ||
3065 | again: | 3059 | again: |
3066 | /* | 3060 | /* |
3067 | * We repeat when a timestamp is encountered. It is possible | 3061 | * We repeat when a time extend is encountered. |
3068 | * to get multiple timestamps from an interrupt entering just | 3062 | * Since the time extend is always attached to a data event, |
3069 | * as one timestamp is about to be written, or from discarded | 3063 | * we should never loop more than once. |
3070 | * commits. The most that we can have is the number on a single page. | 3064 | * (We never hit the following condition more than twice). |
3071 | */ | 3065 | */ |
3072 | if (RB_WARN_ON(cpu_buffer, ++nr_loops > RB_TIMESTAMPS_PER_PAGE)) | 3066 | if (RB_WARN_ON(cpu_buffer, ++nr_loops > 2)) |
3073 | return NULL; | 3067 | return NULL; |
3074 | 3068 | ||
3075 | reader = rb_get_reader_page(cpu_buffer); | 3069 | reader = rb_get_reader_page(cpu_buffer); |
@@ -3145,14 +3139,12 @@ rb_iter_peek(struct ring_buffer_iter *iter, u64 *ts) | |||
3145 | return NULL; | 3139 | return NULL; |
3146 | 3140 | ||
3147 | /* | 3141 | /* |
3148 | * We repeat when a timestamp is encountered. | 3142 | * We repeat when a time extend is encountered. |
3149 | * We can get multiple timestamps by nested interrupts or also | 3143 | * Since the time extend is always attached to a data event, |
3150 | * if filtering is on (discarding commits). Since discarding | 3144 | * we should never loop more than once. |
3151 | * commits can be frequent we can get a lot of timestamps. | 3145 | * (We never hit the following condition more than twice). |
3152 | * But we limit them by not adding timestamps if they begin | ||
3153 | * at the start of a page. | ||
3154 | */ | 3146 | */ |
3155 | if (RB_WARN_ON(cpu_buffer, ++nr_loops > RB_TIMESTAMPS_PER_PAGE)) | 3147 | if (RB_WARN_ON(cpu_buffer, ++nr_loops > 2)) |
3156 | return NULL; | 3148 | return NULL; |
3157 | 3149 | ||
3158 | if (rb_per_cpu_empty(cpu_buffer)) | 3150 | if (rb_per_cpu_empty(cpu_buffer)) |
@@ -3850,7 +3842,8 @@ int ring_buffer_read_page(struct ring_buffer *buffer, | |||
3850 | if (len > (commit - read)) | 3842 | if (len > (commit - read)) |
3851 | len = (commit - read); | 3843 | len = (commit - read); |
3852 | 3844 | ||
3853 | size = rb_event_length(event); | 3845 | /* Always keep the time extend and data together */ |
3846 | size = rb_event_ts_length(event); | ||
3854 | 3847 | ||
3855 | if (len < size) | 3848 | if (len < size) |
3856 | goto out_unlock; | 3849 | goto out_unlock; |
@@ -3868,8 +3861,12 @@ int ring_buffer_read_page(struct ring_buffer *buffer, | |||
3868 | rpos = reader->read; | 3861 | rpos = reader->read; |
3869 | pos += size; | 3862 | pos += size; |
3870 | 3863 | ||
3864 | if (rpos >= commit) | ||
3865 | break; | ||
3866 | |||
3871 | event = rb_reader_event(cpu_buffer); | 3867 | event = rb_reader_event(cpu_buffer); |
3872 | size = rb_event_length(event); | 3868 | /* Always keep the time extend and data together */ |
3869 | size = rb_event_ts_length(event); | ||
3873 | } while (len > size); | 3870 | } while (len > size); |
3874 | 3871 | ||
3875 | /* update bpage */ | 3872 | /* update bpage */ |
@@ -3986,6 +3983,7 @@ static const struct file_operations rb_simple_fops = { | |||
3986 | .open = tracing_open_generic, | 3983 | .open = tracing_open_generic, |
3987 | .read = rb_simple_read, | 3984 | .read = rb_simple_read, |
3988 | .write = rb_simple_write, | 3985 | .write = rb_simple_write, |
3986 | .llseek = default_llseek, | ||
3989 | }; | 3987 | }; |
3990 | 3988 | ||
3991 | 3989 | ||
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 086d36316805..82d9b8106cd0 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c | |||
@@ -101,10 +101,7 @@ static inline void ftrace_enable_cpu(void) | |||
101 | preempt_enable(); | 101 | preempt_enable(); |
102 | } | 102 | } |
103 | 103 | ||
104 | static cpumask_var_t __read_mostly tracing_buffer_mask; | 104 | cpumask_var_t __read_mostly tracing_buffer_mask; |
105 | |||
106 | #define for_each_tracing_cpu(cpu) \ | ||
107 | for_each_cpu(cpu, tracing_buffer_mask) | ||
108 | 105 | ||
109 | /* | 106 | /* |
110 | * ftrace_dump_on_oops - variable to dump ftrace buffer on oops | 107 | * ftrace_dump_on_oops - variable to dump ftrace buffer on oops |
@@ -344,7 +341,7 @@ static DECLARE_WAIT_QUEUE_HEAD(trace_wait); | |||
344 | /* trace_flags holds trace_options default values */ | 341 | /* trace_flags holds trace_options default values */ |
345 | unsigned long trace_flags = TRACE_ITER_PRINT_PARENT | TRACE_ITER_PRINTK | | 342 | unsigned long trace_flags = TRACE_ITER_PRINT_PARENT | TRACE_ITER_PRINTK | |
346 | TRACE_ITER_ANNOTATE | TRACE_ITER_CONTEXT_INFO | TRACE_ITER_SLEEP_TIME | | 343 | TRACE_ITER_ANNOTATE | TRACE_ITER_CONTEXT_INFO | TRACE_ITER_SLEEP_TIME | |
347 | TRACE_ITER_GRAPH_TIME; | 344 | TRACE_ITER_GRAPH_TIME | TRACE_ITER_RECORD_CMD; |
348 | 345 | ||
349 | static int trace_stop_count; | 346 | static int trace_stop_count; |
350 | static DEFINE_SPINLOCK(tracing_start_lock); | 347 | static DEFINE_SPINLOCK(tracing_start_lock); |
@@ -428,6 +425,7 @@ static const char *trace_options[] = { | |||
428 | "latency-format", | 425 | "latency-format", |
429 | "sleep-time", | 426 | "sleep-time", |
430 | "graph-time", | 427 | "graph-time", |
428 | "record-cmd", | ||
431 | NULL | 429 | NULL |
432 | }; | 430 | }; |
433 | 431 | ||
@@ -659,6 +657,10 @@ update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu) | |||
659 | return; | 657 | return; |
660 | 658 | ||
661 | WARN_ON_ONCE(!irqs_disabled()); | 659 | WARN_ON_ONCE(!irqs_disabled()); |
660 | if (!current_trace->use_max_tr) { | ||
661 | WARN_ON_ONCE(1); | ||
662 | return; | ||
663 | } | ||
662 | arch_spin_lock(&ftrace_max_lock); | 664 | arch_spin_lock(&ftrace_max_lock); |
663 | 665 | ||
664 | tr->buffer = max_tr.buffer; | 666 | tr->buffer = max_tr.buffer; |
@@ -685,6 +687,11 @@ update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu) | |||
685 | return; | 687 | return; |
686 | 688 | ||
687 | WARN_ON_ONCE(!irqs_disabled()); | 689 | WARN_ON_ONCE(!irqs_disabled()); |
690 | if (!current_trace->use_max_tr) { | ||
691 | WARN_ON_ONCE(1); | ||
692 | return; | ||
693 | } | ||
694 | |||
688 | arch_spin_lock(&ftrace_max_lock); | 695 | arch_spin_lock(&ftrace_max_lock); |
689 | 696 | ||
690 | ftrace_disable_cpu(); | 697 | ftrace_disable_cpu(); |
@@ -729,18 +736,11 @@ __acquires(kernel_lock) | |||
729 | return -1; | 736 | return -1; |
730 | } | 737 | } |
731 | 738 | ||
732 | if (strlen(type->name) > MAX_TRACER_SIZE) { | 739 | if (strlen(type->name) >= MAX_TRACER_SIZE) { |
733 | pr_info("Tracer has a name longer than %d\n", MAX_TRACER_SIZE); | 740 | pr_info("Tracer has a name longer than %d\n", MAX_TRACER_SIZE); |
734 | return -1; | 741 | return -1; |
735 | } | 742 | } |
736 | 743 | ||
737 | /* | ||
738 | * When this gets called we hold the BKL which means that | ||
739 | * preemption is disabled. Various trace selftests however | ||
740 | * need to disable and enable preemption for successful tests. | ||
741 | * So we drop the BKL here and grab it after the tests again. | ||
742 | */ | ||
743 | unlock_kernel(); | ||
744 | mutex_lock(&trace_types_lock); | 744 | mutex_lock(&trace_types_lock); |
745 | 745 | ||
746 | tracing_selftest_running = true; | 746 | tracing_selftest_running = true; |
@@ -822,7 +822,6 @@ __acquires(kernel_lock) | |||
822 | #endif | 822 | #endif |
823 | 823 | ||
824 | out_unlock: | 824 | out_unlock: |
825 | lock_kernel(); | ||
826 | return ret; | 825 | return ret; |
827 | } | 826 | } |
828 | 827 | ||
@@ -1331,61 +1330,6 @@ static void __trace_userstack(struct trace_array *tr, unsigned long flags) | |||
1331 | 1330 | ||
1332 | #endif /* CONFIG_STACKTRACE */ | 1331 | #endif /* CONFIG_STACKTRACE */ |
1333 | 1332 | ||
1334 | static void | ||
1335 | ftrace_trace_special(void *__tr, | ||
1336 | unsigned long arg1, unsigned long arg2, unsigned long arg3, | ||
1337 | int pc) | ||
1338 | { | ||
1339 | struct ftrace_event_call *call = &event_special; | ||
1340 | struct ring_buffer_event *event; | ||
1341 | struct trace_array *tr = __tr; | ||
1342 | struct ring_buffer *buffer = tr->buffer; | ||
1343 | struct special_entry *entry; | ||
1344 | |||
1345 | event = trace_buffer_lock_reserve(buffer, TRACE_SPECIAL, | ||
1346 | sizeof(*entry), 0, pc); | ||
1347 | if (!event) | ||
1348 | return; | ||
1349 | entry = ring_buffer_event_data(event); | ||
1350 | entry->arg1 = arg1; | ||
1351 | entry->arg2 = arg2; | ||
1352 | entry->arg3 = arg3; | ||
1353 | |||
1354 | if (!filter_check_discard(call, entry, buffer, event)) | ||
1355 | trace_buffer_unlock_commit(buffer, event, 0, pc); | ||
1356 | } | ||
1357 | |||
1358 | void | ||
1359 | __trace_special(void *__tr, void *__data, | ||
1360 | unsigned long arg1, unsigned long arg2, unsigned long arg3) | ||
1361 | { | ||
1362 | ftrace_trace_special(__tr, arg1, arg2, arg3, preempt_count()); | ||
1363 | } | ||
1364 | |||
1365 | void | ||
1366 | ftrace_special(unsigned long arg1, unsigned long arg2, unsigned long arg3) | ||
1367 | { | ||
1368 | struct trace_array *tr = &global_trace; | ||
1369 | struct trace_array_cpu *data; | ||
1370 | unsigned long flags; | ||
1371 | int cpu; | ||
1372 | int pc; | ||
1373 | |||
1374 | if (tracing_disabled) | ||
1375 | return; | ||
1376 | |||
1377 | pc = preempt_count(); | ||
1378 | local_irq_save(flags); | ||
1379 | cpu = raw_smp_processor_id(); | ||
1380 | data = tr->data[cpu]; | ||
1381 | |||
1382 | if (likely(atomic_inc_return(&data->disabled) == 1)) | ||
1383 | ftrace_trace_special(tr, arg1, arg2, arg3, pc); | ||
1384 | |||
1385 | atomic_dec(&data->disabled); | ||
1386 | local_irq_restore(flags); | ||
1387 | } | ||
1388 | |||
1389 | /** | 1333 | /** |
1390 | * trace_vbprintk - write binary msg to tracing buffer | 1334 | * trace_vbprintk - write binary msg to tracing buffer |
1391 | * | 1335 | * |
@@ -1404,7 +1348,6 @@ int trace_vbprintk(unsigned long ip, const char *fmt, va_list args) | |||
1404 | struct bprint_entry *entry; | 1348 | struct bprint_entry *entry; |
1405 | unsigned long flags; | 1349 | unsigned long flags; |
1406 | int disable; | 1350 | int disable; |
1407 | int resched; | ||
1408 | int cpu, len = 0, size, pc; | 1351 | int cpu, len = 0, size, pc; |
1409 | 1352 | ||
1410 | if (unlikely(tracing_selftest_running || tracing_disabled)) | 1353 | if (unlikely(tracing_selftest_running || tracing_disabled)) |
@@ -1414,7 +1357,7 @@ int trace_vbprintk(unsigned long ip, const char *fmt, va_list args) | |||
1414 | pause_graph_tracing(); | 1357 | pause_graph_tracing(); |
1415 | 1358 | ||
1416 | pc = preempt_count(); | 1359 | pc = preempt_count(); |
1417 | resched = ftrace_preempt_disable(); | 1360 | preempt_disable_notrace(); |
1418 | cpu = raw_smp_processor_id(); | 1361 | cpu = raw_smp_processor_id(); |
1419 | data = tr->data[cpu]; | 1362 | data = tr->data[cpu]; |
1420 | 1363 | ||
@@ -1452,7 +1395,7 @@ out_unlock: | |||
1452 | 1395 | ||
1453 | out: | 1396 | out: |
1454 | atomic_dec_return(&data->disabled); | 1397 | atomic_dec_return(&data->disabled); |
1455 | ftrace_preempt_enable(resched); | 1398 | preempt_enable_notrace(); |
1456 | unpause_graph_tracing(); | 1399 | unpause_graph_tracing(); |
1457 | 1400 | ||
1458 | return len; | 1401 | return len; |
@@ -1539,11 +1482,6 @@ int trace_vprintk(unsigned long ip, const char *fmt, va_list args) | |||
1539 | } | 1482 | } |
1540 | EXPORT_SYMBOL_GPL(trace_vprintk); | 1483 | EXPORT_SYMBOL_GPL(trace_vprintk); |
1541 | 1484 | ||
1542 | enum trace_file_type { | ||
1543 | TRACE_FILE_LAT_FMT = 1, | ||
1544 | TRACE_FILE_ANNOTATE = 2, | ||
1545 | }; | ||
1546 | |||
1547 | static void trace_iterator_increment(struct trace_iterator *iter) | 1485 | static void trace_iterator_increment(struct trace_iterator *iter) |
1548 | { | 1486 | { |
1549 | /* Don't allow ftrace to trace into the ring buffers */ | 1487 | /* Don't allow ftrace to trace into the ring buffers */ |
@@ -1641,7 +1579,7 @@ struct trace_entry *trace_find_next_entry(struct trace_iterator *iter, | |||
1641 | } | 1579 | } |
1642 | 1580 | ||
1643 | /* Find the next real entry, and increment the iterator to the next entry */ | 1581 | /* Find the next real entry, and increment the iterator to the next entry */ |
1644 | static void *find_next_entry_inc(struct trace_iterator *iter) | 1582 | void *trace_find_next_entry_inc(struct trace_iterator *iter) |
1645 | { | 1583 | { |
1646 | iter->ent = __find_next_entry(iter, &iter->cpu, | 1584 | iter->ent = __find_next_entry(iter, &iter->cpu, |
1647 | &iter->lost_events, &iter->ts); | 1585 | &iter->lost_events, &iter->ts); |
@@ -1676,19 +1614,19 @@ static void *s_next(struct seq_file *m, void *v, loff_t *pos) | |||
1676 | return NULL; | 1614 | return NULL; |
1677 | 1615 | ||
1678 | if (iter->idx < 0) | 1616 | if (iter->idx < 0) |
1679 | ent = find_next_entry_inc(iter); | 1617 | ent = trace_find_next_entry_inc(iter); |
1680 | else | 1618 | else |
1681 | ent = iter; | 1619 | ent = iter; |
1682 | 1620 | ||
1683 | while (ent && iter->idx < i) | 1621 | while (ent && iter->idx < i) |
1684 | ent = find_next_entry_inc(iter); | 1622 | ent = trace_find_next_entry_inc(iter); |
1685 | 1623 | ||
1686 | iter->pos = *pos; | 1624 | iter->pos = *pos; |
1687 | 1625 | ||
1688 | return ent; | 1626 | return ent; |
1689 | } | 1627 | } |
1690 | 1628 | ||
1691 | static void tracing_iter_reset(struct trace_iterator *iter, int cpu) | 1629 | void tracing_iter_reset(struct trace_iterator *iter, int cpu) |
1692 | { | 1630 | { |
1693 | struct trace_array *tr = iter->tr; | 1631 | struct trace_array *tr = iter->tr; |
1694 | struct ring_buffer_event *event; | 1632 | struct ring_buffer_event *event; |
@@ -2049,7 +1987,7 @@ int trace_empty(struct trace_iterator *iter) | |||
2049 | } | 1987 | } |
2050 | 1988 | ||
2051 | /* Called with trace_event_read_lock() held. */ | 1989 | /* Called with trace_event_read_lock() held. */ |
2052 | static enum print_line_t print_trace_line(struct trace_iterator *iter) | 1990 | enum print_line_t print_trace_line(struct trace_iterator *iter) |
2053 | { | 1991 | { |
2054 | enum print_line_t ret; | 1992 | enum print_line_t ret; |
2055 | 1993 | ||
@@ -2258,7 +2196,7 @@ int tracing_open_generic(struct inode *inode, struct file *filp) | |||
2258 | 2196 | ||
2259 | static int tracing_release(struct inode *inode, struct file *file) | 2197 | static int tracing_release(struct inode *inode, struct file *file) |
2260 | { | 2198 | { |
2261 | struct seq_file *m = (struct seq_file *)file->private_data; | 2199 | struct seq_file *m = file->private_data; |
2262 | struct trace_iterator *iter; | 2200 | struct trace_iterator *iter; |
2263 | int cpu; | 2201 | int cpu; |
2264 | 2202 | ||
@@ -2394,6 +2332,7 @@ static const struct file_operations show_traces_fops = { | |||
2394 | .open = show_traces_open, | 2332 | .open = show_traces_open, |
2395 | .read = seq_read, | 2333 | .read = seq_read, |
2396 | .release = seq_release, | 2334 | .release = seq_release, |
2335 | .llseek = seq_lseek, | ||
2397 | }; | 2336 | }; |
2398 | 2337 | ||
2399 | /* | 2338 | /* |
@@ -2487,6 +2426,7 @@ static const struct file_operations tracing_cpumask_fops = { | |||
2487 | .open = tracing_open_generic, | 2426 | .open = tracing_open_generic, |
2488 | .read = tracing_cpumask_read, | 2427 | .read = tracing_cpumask_read, |
2489 | .write = tracing_cpumask_write, | 2428 | .write = tracing_cpumask_write, |
2429 | .llseek = generic_file_llseek, | ||
2490 | }; | 2430 | }; |
2491 | 2431 | ||
2492 | static int tracing_trace_options_show(struct seq_file *m, void *v) | 2432 | static int tracing_trace_options_show(struct seq_file *m, void *v) |
@@ -2562,6 +2502,9 @@ static void set_tracer_flags(unsigned int mask, int enabled) | |||
2562 | trace_flags |= mask; | 2502 | trace_flags |= mask; |
2563 | else | 2503 | else |
2564 | trace_flags &= ~mask; | 2504 | trace_flags &= ~mask; |
2505 | |||
2506 | if (mask == TRACE_ITER_RECORD_CMD) | ||
2507 | trace_event_enable_cmd_record(enabled); | ||
2565 | } | 2508 | } |
2566 | 2509 | ||
2567 | static ssize_t | 2510 | static ssize_t |
@@ -2653,6 +2596,7 @@ tracing_readme_read(struct file *filp, char __user *ubuf, | |||
2653 | static const struct file_operations tracing_readme_fops = { | 2596 | static const struct file_operations tracing_readme_fops = { |
2654 | .open = tracing_open_generic, | 2597 | .open = tracing_open_generic, |
2655 | .read = tracing_readme_read, | 2598 | .read = tracing_readme_read, |
2599 | .llseek = generic_file_llseek, | ||
2656 | }; | 2600 | }; |
2657 | 2601 | ||
2658 | static ssize_t | 2602 | static ssize_t |
@@ -2703,6 +2647,7 @@ tracing_saved_cmdlines_read(struct file *file, char __user *ubuf, | |||
2703 | static const struct file_operations tracing_saved_cmdlines_fops = { | 2647 | static const struct file_operations tracing_saved_cmdlines_fops = { |
2704 | .open = tracing_open_generic, | 2648 | .open = tracing_open_generic, |
2705 | .read = tracing_saved_cmdlines_read, | 2649 | .read = tracing_saved_cmdlines_read, |
2650 | .llseek = generic_file_llseek, | ||
2706 | }; | 2651 | }; |
2707 | 2652 | ||
2708 | static ssize_t | 2653 | static ssize_t |
@@ -2798,6 +2743,9 @@ static int tracing_resize_ring_buffer(unsigned long size) | |||
2798 | if (ret < 0) | 2743 | if (ret < 0) |
2799 | return ret; | 2744 | return ret; |
2800 | 2745 | ||
2746 | if (!current_trace->use_max_tr) | ||
2747 | goto out; | ||
2748 | |||
2801 | ret = ring_buffer_resize(max_tr.buffer, size); | 2749 | ret = ring_buffer_resize(max_tr.buffer, size); |
2802 | if (ret < 0) { | 2750 | if (ret < 0) { |
2803 | int r; | 2751 | int r; |
@@ -2825,11 +2773,14 @@ static int tracing_resize_ring_buffer(unsigned long size) | |||
2825 | return ret; | 2773 | return ret; |
2826 | } | 2774 | } |
2827 | 2775 | ||
2776 | max_tr.entries = size; | ||
2777 | out: | ||
2828 | global_trace.entries = size; | 2778 | global_trace.entries = size; |
2829 | 2779 | ||
2830 | return ret; | 2780 | return ret; |
2831 | } | 2781 | } |
2832 | 2782 | ||
2783 | |||
2833 | /** | 2784 | /** |
2834 | * tracing_update_buffers - used by tracing facility to expand ring buffers | 2785 | * tracing_update_buffers - used by tracing facility to expand ring buffers |
2835 | * | 2786 | * |
@@ -2890,12 +2841,26 @@ static int tracing_set_tracer(const char *buf) | |||
2890 | trace_branch_disable(); | 2841 | trace_branch_disable(); |
2891 | if (current_trace && current_trace->reset) | 2842 | if (current_trace && current_trace->reset) |
2892 | current_trace->reset(tr); | 2843 | current_trace->reset(tr); |
2893 | 2844 | if (current_trace && current_trace->use_max_tr) { | |
2845 | /* | ||
2846 | * We don't free the ring buffer. instead, resize it because | ||
2847 | * The max_tr ring buffer has some state (e.g. ring->clock) and | ||
2848 | * we want preserve it. | ||
2849 | */ | ||
2850 | ring_buffer_resize(max_tr.buffer, 1); | ||
2851 | max_tr.entries = 1; | ||
2852 | } | ||
2894 | destroy_trace_option_files(topts); | 2853 | destroy_trace_option_files(topts); |
2895 | 2854 | ||
2896 | current_trace = t; | 2855 | current_trace = t; |
2897 | 2856 | ||
2898 | topts = create_trace_option_files(current_trace); | 2857 | topts = create_trace_option_files(current_trace); |
2858 | if (current_trace->use_max_tr) { | ||
2859 | ret = ring_buffer_resize(max_tr.buffer, global_trace.entries); | ||
2860 | if (ret < 0) | ||
2861 | goto out; | ||
2862 | max_tr.entries = global_trace.entries; | ||
2863 | } | ||
2899 | 2864 | ||
2900 | if (t->init) { | 2865 | if (t->init) { |
2901 | ret = tracer_init(t, tr); | 2866 | ret = tracer_init(t, tr); |
@@ -3032,6 +2997,7 @@ static int tracing_open_pipe(struct inode *inode, struct file *filp) | |||
3032 | if (iter->trace->pipe_open) | 2997 | if (iter->trace->pipe_open) |
3033 | iter->trace->pipe_open(iter); | 2998 | iter->trace->pipe_open(iter); |
3034 | 2999 | ||
3000 | nonseekable_open(inode, filp); | ||
3035 | out: | 3001 | out: |
3036 | mutex_unlock(&trace_types_lock); | 3002 | mutex_unlock(&trace_types_lock); |
3037 | return ret; | 3003 | return ret; |
@@ -3211,7 +3177,7 @@ waitagain: | |||
3211 | 3177 | ||
3212 | trace_event_read_lock(); | 3178 | trace_event_read_lock(); |
3213 | trace_access_lock(iter->cpu_file); | 3179 | trace_access_lock(iter->cpu_file); |
3214 | while (find_next_entry_inc(iter) != NULL) { | 3180 | while (trace_find_next_entry_inc(iter) != NULL) { |
3215 | enum print_line_t ret; | 3181 | enum print_line_t ret; |
3216 | int len = iter->seq.len; | 3182 | int len = iter->seq.len; |
3217 | 3183 | ||
@@ -3294,7 +3260,7 @@ tracing_fill_pipe_page(size_t rem, struct trace_iterator *iter) | |||
3294 | if (ret != TRACE_TYPE_NO_CONSUME) | 3260 | if (ret != TRACE_TYPE_NO_CONSUME) |
3295 | trace_consume(iter); | 3261 | trace_consume(iter); |
3296 | rem -= count; | 3262 | rem -= count; |
3297 | if (!find_next_entry_inc(iter)) { | 3263 | if (!trace_find_next_entry_inc(iter)) { |
3298 | rem = 0; | 3264 | rem = 0; |
3299 | iter->ent = NULL; | 3265 | iter->ent = NULL; |
3300 | break; | 3266 | break; |
@@ -3350,7 +3316,7 @@ static ssize_t tracing_splice_read_pipe(struct file *filp, | |||
3350 | if (ret <= 0) | 3316 | if (ret <= 0) |
3351 | goto out_err; | 3317 | goto out_err; |
3352 | 3318 | ||
3353 | if (!iter->ent && !find_next_entry_inc(iter)) { | 3319 | if (!iter->ent && !trace_find_next_entry_inc(iter)) { |
3354 | ret = -EFAULT; | 3320 | ret = -EFAULT; |
3355 | goto out_err; | 3321 | goto out_err; |
3356 | } | 3322 | } |
@@ -3477,7 +3443,6 @@ tracing_entries_write(struct file *filp, const char __user *ubuf, | |||
3477 | } | 3443 | } |
3478 | 3444 | ||
3479 | tracing_start(); | 3445 | tracing_start(); |
3480 | max_tr.entries = global_trace.entries; | ||
3481 | mutex_unlock(&trace_types_lock); | 3446 | mutex_unlock(&trace_types_lock); |
3482 | 3447 | ||
3483 | return cnt; | 3448 | return cnt; |
@@ -3498,6 +3463,7 @@ tracing_mark_write(struct file *filp, const char __user *ubuf, | |||
3498 | size_t cnt, loff_t *fpos) | 3463 | size_t cnt, loff_t *fpos) |
3499 | { | 3464 | { |
3500 | char *buf; | 3465 | char *buf; |
3466 | size_t written; | ||
3501 | 3467 | ||
3502 | if (tracing_disabled) | 3468 | if (tracing_disabled) |
3503 | return -EINVAL; | 3469 | return -EINVAL; |
@@ -3519,11 +3485,15 @@ tracing_mark_write(struct file *filp, const char __user *ubuf, | |||
3519 | } else | 3485 | } else |
3520 | buf[cnt] = '\0'; | 3486 | buf[cnt] = '\0'; |
3521 | 3487 | ||
3522 | cnt = mark_printk("%s", buf); | 3488 | written = mark_printk("%s", buf); |
3523 | kfree(buf); | 3489 | kfree(buf); |
3524 | *fpos += cnt; | 3490 | *fpos += written; |
3525 | 3491 | ||
3526 | return cnt; | 3492 | /* don't tell userspace we wrote more - it might confuse them */ |
3493 | if (written > cnt) | ||
3494 | written = cnt; | ||
3495 | |||
3496 | return written; | ||
3527 | } | 3497 | } |
3528 | 3498 | ||
3529 | static int tracing_clock_show(struct seq_file *m, void *v) | 3499 | static int tracing_clock_show(struct seq_file *m, void *v) |
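The tracing_mark_write() fix distinguishes what mark_printk() produced from what the caller handed in, and never reports more bytes consumed than were passed to write(). A tiny standalone illustration of that clamping:

#include <stdio.h>

static long complete_write(long requested, long produced)
{
        /* never report more bytes consumed than the caller passed in */
        return produced > requested ? requested : produced;
}

int main(void)
{
        printf("%ld\n", complete_write(10, 14));        /* prints 10 */
        printf("%ld\n", complete_write(10, 7));         /* prints 7  */
        return 0;
}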
@@ -3590,18 +3560,21 @@ static const struct file_operations tracing_max_lat_fops = { | |||
3590 | .open = tracing_open_generic, | 3560 | .open = tracing_open_generic, |
3591 | .read = tracing_max_lat_read, | 3561 | .read = tracing_max_lat_read, |
3592 | .write = tracing_max_lat_write, | 3562 | .write = tracing_max_lat_write, |
3563 | .llseek = generic_file_llseek, | ||
3593 | }; | 3564 | }; |
3594 | 3565 | ||
3595 | static const struct file_operations tracing_ctrl_fops = { | 3566 | static const struct file_operations tracing_ctrl_fops = { |
3596 | .open = tracing_open_generic, | 3567 | .open = tracing_open_generic, |
3597 | .read = tracing_ctrl_read, | 3568 | .read = tracing_ctrl_read, |
3598 | .write = tracing_ctrl_write, | 3569 | .write = tracing_ctrl_write, |
3570 | .llseek = generic_file_llseek, | ||
3599 | }; | 3571 | }; |
3600 | 3572 | ||
3601 | static const struct file_operations set_tracer_fops = { | 3573 | static const struct file_operations set_tracer_fops = { |
3602 | .open = tracing_open_generic, | 3574 | .open = tracing_open_generic, |
3603 | .read = tracing_set_trace_read, | 3575 | .read = tracing_set_trace_read, |
3604 | .write = tracing_set_trace_write, | 3576 | .write = tracing_set_trace_write, |
3577 | .llseek = generic_file_llseek, | ||
3605 | }; | 3578 | }; |
3606 | 3579 | ||
3607 | static const struct file_operations tracing_pipe_fops = { | 3580 | static const struct file_operations tracing_pipe_fops = { |
@@ -3610,17 +3583,20 @@ static const struct file_operations tracing_pipe_fops = { | |||
3610 | .read = tracing_read_pipe, | 3583 | .read = tracing_read_pipe, |
3611 | .splice_read = tracing_splice_read_pipe, | 3584 | .splice_read = tracing_splice_read_pipe, |
3612 | .release = tracing_release_pipe, | 3585 | .release = tracing_release_pipe, |
3586 | .llseek = no_llseek, | ||
3613 | }; | 3587 | }; |
3614 | 3588 | ||
3615 | static const struct file_operations tracing_entries_fops = { | 3589 | static const struct file_operations tracing_entries_fops = { |
3616 | .open = tracing_open_generic, | 3590 | .open = tracing_open_generic, |
3617 | .read = tracing_entries_read, | 3591 | .read = tracing_entries_read, |
3618 | .write = tracing_entries_write, | 3592 | .write = tracing_entries_write, |
3593 | .llseek = generic_file_llseek, | ||
3619 | }; | 3594 | }; |
3620 | 3595 | ||
3621 | static const struct file_operations tracing_mark_fops = { | 3596 | static const struct file_operations tracing_mark_fops = { |
3622 | .open = tracing_open_generic, | 3597 | .open = tracing_open_generic, |
3623 | .write = tracing_mark_write, | 3598 | .write = tracing_mark_write, |
3599 | .llseek = generic_file_llseek, | ||
3624 | }; | 3600 | }; |
3625 | 3601 | ||
3626 | static const struct file_operations trace_clock_fops = { | 3602 | static const struct file_operations trace_clock_fops = { |
@@ -3926,6 +3902,7 @@ tracing_stats_read(struct file *filp, char __user *ubuf, | |||
3926 | static const struct file_operations tracing_stats_fops = { | 3902 | static const struct file_operations tracing_stats_fops = { |
3927 | .open = tracing_open_generic, | 3903 | .open = tracing_open_generic, |
3928 | .read = tracing_stats_read, | 3904 | .read = tracing_stats_read, |
3905 | .llseek = generic_file_llseek, | ||
3929 | }; | 3906 | }; |
3930 | 3907 | ||
3931 | #ifdef CONFIG_DYNAMIC_FTRACE | 3908 | #ifdef CONFIG_DYNAMIC_FTRACE |
@@ -3962,6 +3939,7 @@ tracing_read_dyn_info(struct file *filp, char __user *ubuf, | |||
3962 | static const struct file_operations tracing_dyn_info_fops = { | 3939 | static const struct file_operations tracing_dyn_info_fops = { |
3963 | .open = tracing_open_generic, | 3940 | .open = tracing_open_generic, |
3964 | .read = tracing_read_dyn_info, | 3941 | .read = tracing_read_dyn_info, |
3942 | .llseek = generic_file_llseek, | ||
3965 | }; | 3943 | }; |
3966 | #endif | 3944 | #endif |
3967 | 3945 | ||
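The file_operations hunks above all gain an explicit .llseek: ordinary debugfs control files use generic_file_llseek, while trace_pipe is a consuming stream, so its open path calls nonseekable_open() and its table uses no_llseek. A hedged sketch of the two shapes; the example_* names are placeholders, not part of this change:

	/* Seekable control file: file positions are meaningful, honour lseek(). */
	static const struct file_operations example_ctrl_fops = {
		.open	= example_open,
		.read	= example_read,
		.write	= example_write,
		.llseek	= generic_file_llseek,
	};

	/* Consuming stream: reads destroy data, so seeking makes no sense. */
	static int example_pipe_open(struct inode *inode, struct file *filp)
	{
		nonseekable_open(inode, filp);	/* mark the struct file as non-seekable */
		return 0;
	}

	static const struct file_operations example_pipe_fops = {
		.open	= example_pipe_open,
		.read	= example_stream_read,
		.llseek	= no_llseek,		/* lseek() fails with -ESPIPE */
	};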
@@ -4018,13 +3996,9 @@ static void tracing_init_debugfs_percpu(long cpu) | |||
4018 | { | 3996 | { |
4019 | struct dentry *d_percpu = tracing_dentry_percpu(); | 3997 | struct dentry *d_percpu = tracing_dentry_percpu(); |
4020 | struct dentry *d_cpu; | 3998 | struct dentry *d_cpu; |
4021 | /* strlen(cpu) + MAX(log10(cpu)) + '\0' */ | 3999 | char cpu_dir[30]; /* 30 characters should be more than enough */ |
4022 | char cpu_dir[7]; | ||
4023 | 4000 | ||
4024 | if (cpu > 999 || cpu < 0) | 4001 | snprintf(cpu_dir, 30, "cpu%ld", cpu); |
4025 | return; | ||
4026 | |||
4027 | sprintf(cpu_dir, "cpu%ld", cpu); | ||
4028 | d_cpu = debugfs_create_dir(cpu_dir, d_percpu); | 4002 | d_cpu = debugfs_create_dir(cpu_dir, d_percpu); |
4029 | if (!d_cpu) { | 4003 | if (!d_cpu) { |
4030 | pr_warning("Could not create debugfs '%s' entry\n", cpu_dir); | 4004 | pr_warning("Could not create debugfs '%s' entry\n", cpu_dir); |
@@ -4115,6 +4089,7 @@ static const struct file_operations trace_options_fops = { | |||
4115 | .open = tracing_open_generic, | 4089 | .open = tracing_open_generic, |
4116 | .read = trace_options_read, | 4090 | .read = trace_options_read, |
4117 | .write = trace_options_write, | 4091 | .write = trace_options_write, |
4092 | .llseek = generic_file_llseek, | ||
4118 | }; | 4093 | }; |
4119 | 4094 | ||
4120 | static ssize_t | 4095 | static ssize_t |
@@ -4166,6 +4141,7 @@ static const struct file_operations trace_options_core_fops = { | |||
4166 | .open = tracing_open_generic, | 4141 | .open = tracing_open_generic, |
4167 | .read = trace_options_core_read, | 4142 | .read = trace_options_core_read, |
4168 | .write = trace_options_core_write, | 4143 | .write = trace_options_core_write, |
4144 | .llseek = generic_file_llseek, | ||
4169 | }; | 4145 | }; |
4170 | 4146 | ||
4171 | struct dentry *trace_create_file(const char *name, | 4147 | struct dentry *trace_create_file(const char *name, |
@@ -4355,9 +4331,6 @@ static __init int tracer_init_debugfs(void) | |||
4355 | trace_create_file("dyn_ftrace_total_info", 0444, d_tracer, | 4331 | trace_create_file("dyn_ftrace_total_info", 0444, d_tracer, |
4356 | &ftrace_update_tot_cnt, &tracing_dyn_info_fops); | 4332 | &ftrace_update_tot_cnt, &tracing_dyn_info_fops); |
4357 | #endif | 4333 | #endif |
4358 | #ifdef CONFIG_SYSPROF_TRACER | ||
4359 | init_tracer_sysprof_debugfs(d_tracer); | ||
4360 | #endif | ||
4361 | 4334 | ||
4362 | create_trace_options_dir(); | 4335 | create_trace_options_dir(); |
4363 | 4336 | ||
@@ -4414,7 +4387,7 @@ static struct notifier_block trace_die_notifier = { | |||
4414 | */ | 4387 | */ |
4415 | #define KERN_TRACE KERN_EMERG | 4388 | #define KERN_TRACE KERN_EMERG |
4416 | 4389 | ||
4417 | static void | 4390 | void |
4418 | trace_printk_seq(struct trace_seq *s) | 4391 | trace_printk_seq(struct trace_seq *s) |
4419 | { | 4392 | { |
4420 | /* Probably should print a warning here. */ | 4393 | /* Probably should print a warning here. */ |
@@ -4429,6 +4402,13 @@ trace_printk_seq(struct trace_seq *s) | |||
4429 | trace_seq_init(s); | 4402 | trace_seq_init(s); |
4430 | } | 4403 | } |
4431 | 4404 | ||
4405 | void trace_init_global_iter(struct trace_iterator *iter) | ||
4406 | { | ||
4407 | iter->tr = &global_trace; | ||
4408 | iter->trace = current_trace; | ||
4409 | iter->cpu_file = TRACE_PIPE_ALL_CPU; | ||
4410 | } | ||
4411 | |||
4432 | static void | 4412 | static void |
4433 | __ftrace_dump(bool disable_tracing, enum ftrace_dump_mode oops_dump_mode) | 4413 | __ftrace_dump(bool disable_tracing, enum ftrace_dump_mode oops_dump_mode) |
4434 | { | 4414 | { |
@@ -4454,8 +4434,10 @@ __ftrace_dump(bool disable_tracing, enum ftrace_dump_mode oops_dump_mode) | |||
4454 | if (disable_tracing) | 4434 | if (disable_tracing) |
4455 | ftrace_kill(); | 4435 | ftrace_kill(); |
4456 | 4436 | ||
4437 | trace_init_global_iter(&iter); | ||
4438 | |||
4457 | for_each_tracing_cpu(cpu) { | 4439 | for_each_tracing_cpu(cpu) { |
4458 | atomic_inc(&global_trace.data[cpu]->disabled); | 4440 | atomic_inc(&iter.tr->data[cpu]->disabled); |
4459 | } | 4441 | } |
4460 | 4442 | ||
4461 | old_userobj = trace_flags & TRACE_ITER_SYM_USEROBJ; | 4443 | old_userobj = trace_flags & TRACE_ITER_SYM_USEROBJ; |
@@ -4504,7 +4486,7 @@ __ftrace_dump(bool disable_tracing, enum ftrace_dump_mode oops_dump_mode) | |||
4504 | iter.iter_flags |= TRACE_FILE_LAT_FMT; | 4486 | iter.iter_flags |= TRACE_FILE_LAT_FMT; |
4505 | iter.pos = -1; | 4487 | iter.pos = -1; |
4506 | 4488 | ||
4507 | if (find_next_entry_inc(&iter) != NULL) { | 4489 | if (trace_find_next_entry_inc(&iter) != NULL) { |
4508 | int ret; | 4490 | int ret; |
4509 | 4491 | ||
4510 | ret = print_trace_line(&iter); | 4492 | ret = print_trace_line(&iter); |
@@ -4526,7 +4508,7 @@ __ftrace_dump(bool disable_tracing, enum ftrace_dump_mode oops_dump_mode) | |||
4526 | trace_flags |= old_userobj; | 4508 | trace_flags |= old_userobj; |
4527 | 4509 | ||
4528 | for_each_tracing_cpu(cpu) { | 4510 | for_each_tracing_cpu(cpu) { |
4529 | atomic_dec(&global_trace.data[cpu]->disabled); | 4511 | atomic_dec(&iter.tr->data[cpu]->disabled); |
4530 | } | 4512 | } |
4531 | tracing_on(); | 4513 | tracing_on(); |
4532 | } | 4514 | } |
@@ -4575,16 +4557,14 @@ __init static int tracer_alloc_buffers(void) | |||
4575 | 4557 | ||
4576 | 4558 | ||
4577 | #ifdef CONFIG_TRACER_MAX_TRACE | 4559 | #ifdef CONFIG_TRACER_MAX_TRACE |
4578 | max_tr.buffer = ring_buffer_alloc(ring_buf_size, | 4560 | max_tr.buffer = ring_buffer_alloc(1, TRACE_BUFFER_FLAGS); |
4579 | TRACE_BUFFER_FLAGS); | ||
4580 | if (!max_tr.buffer) { | 4561 | if (!max_tr.buffer) { |
4581 | printk(KERN_ERR "tracer: failed to allocate max ring buffer!\n"); | 4562 | printk(KERN_ERR "tracer: failed to allocate max ring buffer!\n"); |
4582 | WARN_ON(1); | 4563 | WARN_ON(1); |
4583 | ring_buffer_free(global_trace.buffer); | 4564 | ring_buffer_free(global_trace.buffer); |
4584 | goto out_free_cpumask; | 4565 | goto out_free_cpumask; |
4585 | } | 4566 | } |
4586 | max_tr.entries = ring_buffer_size(max_tr.buffer); | 4567 | max_tr.entries = 1; |
4587 | WARN_ON(max_tr.entries != global_trace.entries); | ||
4588 | #endif | 4568 | #endif |
4589 | 4569 | ||
4590 | /* Allocate the first page for all buffers */ | 4570 | /* Allocate the first page for all buffers */ |
@@ -4597,9 +4577,6 @@ __init static int tracer_alloc_buffers(void) | |||
4597 | 4577 | ||
4598 | register_tracer(&nop_trace); | 4578 | register_tracer(&nop_trace); |
4599 | current_trace = &nop_trace; | 4579 | current_trace = &nop_trace; |
4600 | #ifdef CONFIG_BOOT_TRACER | ||
4601 | register_tracer(&boot_tracer); | ||
4602 | #endif | ||
4603 | /* All seems OK, enable tracing */ | 4580 | /* All seems OK, enable tracing */ |
4604 | tracing_disabled = 0; | 4581 | tracing_disabled = 0; |
4605 | 4582 | ||
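The trace.c changes above add trace_init_global_iter() so the oops-dump path fills its trace_iterator from one place and then works through iter.tr instead of touching global_trace directly. Roughly how a dump loop fits the pieces this patch exports from trace.h together; this is a sketch, not the exact __ftrace_dump() body:

	static void example_dump(void)
	{
		/* static: this may run from an oops path with little stack left */
		static struct trace_iterator iter;
		int cpu;

		trace_init_global_iter(&iter);		/* sets iter.tr, iter.trace, iter.cpu_file */

		for_each_tracing_cpu(cpu)
			atomic_inc(&iter.tr->data[cpu]->disabled);

		while (trace_find_next_entry_inc(&iter)) {
			if (print_trace_line(&iter) != TRACE_TYPE_NO_CONSUME)
				trace_consume(&iter);
			trace_printk_seq(&iter.seq);	/* flush the rendered line to printk */
		}

		for_each_tracing_cpu(cpu)
			atomic_dec(&iter.tr->data[cpu]->disabled);
	}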
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 2cd96399463f..9021f8c0c0c3 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h | |||
@@ -9,10 +9,7 @@ | |||
9 | #include <linux/mmiotrace.h> | 9 | #include <linux/mmiotrace.h> |
10 | #include <linux/tracepoint.h> | 10 | #include <linux/tracepoint.h> |
11 | #include <linux/ftrace.h> | 11 | #include <linux/ftrace.h> |
12 | #include <trace/boot.h> | ||
13 | #include <linux/kmemtrace.h> | ||
14 | #include <linux/hw_breakpoint.h> | 12 | #include <linux/hw_breakpoint.h> |
15 | |||
16 | #include <linux/trace_seq.h> | 13 | #include <linux/trace_seq.h> |
17 | #include <linux/ftrace_event.h> | 14 | #include <linux/ftrace_event.h> |
18 | 15 | ||
@@ -25,30 +22,17 @@ enum trace_type { | |||
25 | TRACE_STACK, | 22 | TRACE_STACK, |
26 | TRACE_PRINT, | 23 | TRACE_PRINT, |
27 | TRACE_BPRINT, | 24 | TRACE_BPRINT, |
28 | TRACE_SPECIAL, | ||
29 | TRACE_MMIO_RW, | 25 | TRACE_MMIO_RW, |
30 | TRACE_MMIO_MAP, | 26 | TRACE_MMIO_MAP, |
31 | TRACE_BRANCH, | 27 | TRACE_BRANCH, |
32 | TRACE_BOOT_CALL, | ||
33 | TRACE_BOOT_RET, | ||
34 | TRACE_GRAPH_RET, | 28 | TRACE_GRAPH_RET, |
35 | TRACE_GRAPH_ENT, | 29 | TRACE_GRAPH_ENT, |
36 | TRACE_USER_STACK, | 30 | TRACE_USER_STACK, |
37 | TRACE_KMEM_ALLOC, | ||
38 | TRACE_KMEM_FREE, | ||
39 | TRACE_BLK, | 31 | TRACE_BLK, |
40 | TRACE_KSYM, | ||
41 | 32 | ||
42 | __TRACE_LAST_TYPE, | 33 | __TRACE_LAST_TYPE, |
43 | }; | 34 | }; |
44 | 35 | ||
45 | enum kmemtrace_type_id { | ||
46 | KMEMTRACE_TYPE_KMALLOC = 0, /* kmalloc() or kfree(). */ | ||
47 | KMEMTRACE_TYPE_CACHE, /* kmem_cache_*(). */ | ||
48 | KMEMTRACE_TYPE_PAGES, /* __get_free_pages() and friends. */ | ||
49 | }; | ||
50 | |||
51 | extern struct tracer boot_tracer; | ||
52 | 36 | ||
53 | #undef __field | 37 | #undef __field |
54 | #define __field(type, item) type item; | 38 | #define __field(type, item) type item; |
@@ -204,23 +188,15 @@ extern void __ftrace_bad_type(void); | |||
204 | IF_ASSIGN(var, ent, struct userstack_entry, TRACE_USER_STACK);\ | 188 | IF_ASSIGN(var, ent, struct userstack_entry, TRACE_USER_STACK);\ |
205 | IF_ASSIGN(var, ent, struct print_entry, TRACE_PRINT); \ | 189 | IF_ASSIGN(var, ent, struct print_entry, TRACE_PRINT); \ |
206 | IF_ASSIGN(var, ent, struct bprint_entry, TRACE_BPRINT); \ | 190 | IF_ASSIGN(var, ent, struct bprint_entry, TRACE_BPRINT); \ |
207 | IF_ASSIGN(var, ent, struct special_entry, 0); \ | ||
208 | IF_ASSIGN(var, ent, struct trace_mmiotrace_rw, \ | 191 | IF_ASSIGN(var, ent, struct trace_mmiotrace_rw, \ |
209 | TRACE_MMIO_RW); \ | 192 | TRACE_MMIO_RW); \ |
210 | IF_ASSIGN(var, ent, struct trace_mmiotrace_map, \ | 193 | IF_ASSIGN(var, ent, struct trace_mmiotrace_map, \ |
211 | TRACE_MMIO_MAP); \ | 194 | TRACE_MMIO_MAP); \ |
212 | IF_ASSIGN(var, ent, struct trace_boot_call, TRACE_BOOT_CALL);\ | ||
213 | IF_ASSIGN(var, ent, struct trace_boot_ret, TRACE_BOOT_RET);\ | ||
214 | IF_ASSIGN(var, ent, struct trace_branch, TRACE_BRANCH); \ | 195 | IF_ASSIGN(var, ent, struct trace_branch, TRACE_BRANCH); \ |
215 | IF_ASSIGN(var, ent, struct ftrace_graph_ent_entry, \ | 196 | IF_ASSIGN(var, ent, struct ftrace_graph_ent_entry, \ |
216 | TRACE_GRAPH_ENT); \ | 197 | TRACE_GRAPH_ENT); \ |
217 | IF_ASSIGN(var, ent, struct ftrace_graph_ret_entry, \ | 198 | IF_ASSIGN(var, ent, struct ftrace_graph_ret_entry, \ |
218 | TRACE_GRAPH_RET); \ | 199 | TRACE_GRAPH_RET); \ |
219 | IF_ASSIGN(var, ent, struct kmemtrace_alloc_entry, \ | ||
220 | TRACE_KMEM_ALLOC); \ | ||
221 | IF_ASSIGN(var, ent, struct kmemtrace_free_entry, \ | ||
222 | TRACE_KMEM_FREE); \ | ||
223 | IF_ASSIGN(var, ent, struct ksym_trace_entry, TRACE_KSYM);\ | ||
224 | __ftrace_bad_type(); \ | 200 | __ftrace_bad_type(); \ |
225 | } while (0) | 201 | } while (0) |
226 | 202 | ||
@@ -298,6 +274,7 @@ struct tracer { | |||
298 | struct tracer *next; | 274 | struct tracer *next; |
299 | int print_max; | 275 | int print_max; |
300 | struct tracer_flags *flags; | 276 | struct tracer_flags *flags; |
277 | int use_max_tr; | ||
301 | }; | 278 | }; |
302 | 279 | ||
303 | 280 | ||
@@ -318,7 +295,6 @@ struct dentry *trace_create_file(const char *name, | |||
318 | const struct file_operations *fops); | 295 | const struct file_operations *fops); |
319 | 296 | ||
320 | struct dentry *tracing_init_dentry(void); | 297 | struct dentry *tracing_init_dentry(void); |
321 | void init_tracer_sysprof_debugfs(struct dentry *d_tracer); | ||
322 | 298 | ||
323 | struct ring_buffer_event; | 299 | struct ring_buffer_event; |
324 | 300 | ||
@@ -338,6 +314,14 @@ struct trace_entry *tracing_get_trace_entry(struct trace_array *tr, | |||
338 | struct trace_entry *trace_find_next_entry(struct trace_iterator *iter, | 314 | struct trace_entry *trace_find_next_entry(struct trace_iterator *iter, |
339 | int *ent_cpu, u64 *ent_ts); | 315 | int *ent_cpu, u64 *ent_ts); |
340 | 316 | ||
317 | int trace_empty(struct trace_iterator *iter); | ||
318 | |||
319 | void *trace_find_next_entry_inc(struct trace_iterator *iter); | ||
320 | |||
321 | void trace_init_global_iter(struct trace_iterator *iter); | ||
322 | |||
323 | void tracing_iter_reset(struct trace_iterator *iter, int cpu); | ||
324 | |||
341 | void default_wait_pipe(struct trace_iterator *iter); | 325 | void default_wait_pipe(struct trace_iterator *iter); |
342 | void poll_wait_pipe(struct trace_iterator *iter); | 326 | void poll_wait_pipe(struct trace_iterator *iter); |
343 | 327 | ||
@@ -355,15 +339,14 @@ void tracing_sched_wakeup_trace(struct trace_array *tr, | |||
355 | struct task_struct *wakee, | 339 | struct task_struct *wakee, |
356 | struct task_struct *cur, | 340 | struct task_struct *cur, |
357 | unsigned long flags, int pc); | 341 | unsigned long flags, int pc); |
358 | void trace_special(struct trace_array *tr, | ||
359 | struct trace_array_cpu *data, | ||
360 | unsigned long arg1, | ||
361 | unsigned long arg2, | ||
362 | unsigned long arg3, int pc); | ||
363 | void trace_function(struct trace_array *tr, | 342 | void trace_function(struct trace_array *tr, |
364 | unsigned long ip, | 343 | unsigned long ip, |
365 | unsigned long parent_ip, | 344 | unsigned long parent_ip, |
366 | unsigned long flags, int pc); | 345 | unsigned long flags, int pc); |
346 | void trace_graph_function(struct trace_array *tr, | ||
347 | unsigned long ip, | ||
348 | unsigned long parent_ip, | ||
349 | unsigned long flags, int pc); | ||
367 | void trace_default_header(struct seq_file *m); | 350 | void trace_default_header(struct seq_file *m); |
368 | void print_trace_header(struct seq_file *m, struct trace_iterator *iter); | 351 | void print_trace_header(struct seq_file *m, struct trace_iterator *iter); |
369 | int trace_empty(struct trace_iterator *iter); | 352 | int trace_empty(struct trace_iterator *iter); |
@@ -380,8 +363,15 @@ void tracing_start_sched_switch_record(void); | |||
380 | int register_tracer(struct tracer *type); | 363 | int register_tracer(struct tracer *type); |
381 | void unregister_tracer(struct tracer *type); | 364 | void unregister_tracer(struct tracer *type); |
382 | int is_tracing_stopped(void); | 365 | int is_tracing_stopped(void); |
366 | enum trace_file_type { | ||
367 | TRACE_FILE_LAT_FMT = 1, | ||
368 | TRACE_FILE_ANNOTATE = 2, | ||
369 | }; | ||
370 | |||
371 | extern cpumask_var_t __read_mostly tracing_buffer_mask; | ||
383 | 372 | ||
384 | extern int process_new_ksym_entry(char *ksymname, int op, unsigned long addr); | 373 | #define for_each_tracing_cpu(cpu) \ |
374 | for_each_cpu(cpu, tracing_buffer_mask) | ||
385 | 375 | ||
386 | extern unsigned long nsecs_to_usecs(unsigned long nsecs); | 376 | extern unsigned long nsecs_to_usecs(unsigned long nsecs); |
387 | 377 | ||
@@ -452,12 +442,8 @@ extern int trace_selftest_startup_nop(struct tracer *trace, | |||
452 | struct trace_array *tr); | 442 | struct trace_array *tr); |
453 | extern int trace_selftest_startup_sched_switch(struct tracer *trace, | 443 | extern int trace_selftest_startup_sched_switch(struct tracer *trace, |
454 | struct trace_array *tr); | 444 | struct trace_array *tr); |
455 | extern int trace_selftest_startup_sysprof(struct tracer *trace, | ||
456 | struct trace_array *tr); | ||
457 | extern int trace_selftest_startup_branch(struct tracer *trace, | 445 | extern int trace_selftest_startup_branch(struct tracer *trace, |
458 | struct trace_array *tr); | 446 | struct trace_array *tr); |
459 | extern int trace_selftest_startup_ksym(struct tracer *trace, | ||
460 | struct trace_array *tr); | ||
461 | #endif /* CONFIG_FTRACE_STARTUP_TEST */ | 447 | #endif /* CONFIG_FTRACE_STARTUP_TEST */ |
462 | 448 | ||
463 | extern void *head_page(struct trace_array_cpu *data); | 449 | extern void *head_page(struct trace_array_cpu *data); |
@@ -471,6 +457,8 @@ trace_array_vprintk(struct trace_array *tr, | |||
471 | unsigned long ip, const char *fmt, va_list args); | 457 | unsigned long ip, const char *fmt, va_list args); |
472 | int trace_array_printk(struct trace_array *tr, | 458 | int trace_array_printk(struct trace_array *tr, |
473 | unsigned long ip, const char *fmt, ...); | 459 | unsigned long ip, const char *fmt, ...); |
460 | void trace_printk_seq(struct trace_seq *s); | ||
461 | enum print_line_t print_trace_line(struct trace_iterator *iter); | ||
474 | 462 | ||
475 | extern unsigned long trace_flags; | 463 | extern unsigned long trace_flags; |
476 | 464 | ||
@@ -617,6 +605,7 @@ enum trace_iterator_flags { | |||
617 | TRACE_ITER_LATENCY_FMT = 0x20000, | 605 | TRACE_ITER_LATENCY_FMT = 0x20000, |
618 | TRACE_ITER_SLEEP_TIME = 0x40000, | 606 | TRACE_ITER_SLEEP_TIME = 0x40000, |
619 | TRACE_ITER_GRAPH_TIME = 0x80000, | 607 | TRACE_ITER_GRAPH_TIME = 0x80000, |
608 | TRACE_ITER_RECORD_CMD = 0x100000, | ||
620 | }; | 609 | }; |
621 | 610 | ||
622 | /* | 611 | /* |
@@ -628,54 +617,6 @@ enum trace_iterator_flags { | |||
628 | 617 | ||
629 | extern struct tracer nop_trace; | 618 | extern struct tracer nop_trace; |
630 | 619 | ||
631 | /** | ||
632 | * ftrace_preempt_disable - disable preemption scheduler safe | ||
633 | * | ||
634 | * When tracing can happen inside the scheduler, there exists | ||
635 | * cases that the tracing might happen before the need_resched | ||
636 | * flag is checked. If this happens and the tracer calls | ||
637 | * preempt_enable (after a disable), a schedule might take place | ||
638 | * causing an infinite recursion. | ||
639 | * | ||
640 | * To prevent this, we read the need_resched flag before | ||
641 | * disabling preemption. When we want to enable preemption we | ||
642 | * check the flag, if it is set, then we call preempt_enable_no_resched. | ||
643 | * Otherwise, we call preempt_enable. | ||
644 | * | ||
645 | * The rational for doing the above is that if need_resched is set | ||
646 | * and we have yet to reschedule, we are either in an atomic location | ||
647 | * (where we do not need to check for scheduling) or we are inside | ||
648 | * the scheduler and do not want to resched. | ||
649 | */ | ||
650 | static inline int ftrace_preempt_disable(void) | ||
651 | { | ||
652 | int resched; | ||
653 | |||
654 | resched = need_resched(); | ||
655 | preempt_disable_notrace(); | ||
656 | |||
657 | return resched; | ||
658 | } | ||
659 | |||
660 | /** | ||
661 | * ftrace_preempt_enable - enable preemption scheduler safe | ||
662 | * @resched: the return value from ftrace_preempt_disable | ||
663 | * | ||
664 | * This is a scheduler safe way to enable preemption and not miss | ||
665 | * any preemption checks. The disabled saved the state of preemption. | ||
666 | * If resched is set, then we are either inside an atomic or | ||
667 | * are inside the scheduler (we would have already scheduled | ||
668 | * otherwise). In this case, we do not want to call normal | ||
669 | * preempt_enable, but preempt_enable_no_resched instead. | ||
670 | */ | ||
671 | static inline void ftrace_preempt_enable(int resched) | ||
672 | { | ||
673 | if (resched) | ||
674 | preempt_enable_no_resched_notrace(); | ||
675 | else | ||
676 | preempt_enable_notrace(); | ||
677 | } | ||
678 | |||
679 | #ifdef CONFIG_BRANCH_TRACER | 620 | #ifdef CONFIG_BRANCH_TRACER |
680 | extern int enable_branch_tracing(struct trace_array *tr); | 621 | extern int enable_branch_tracing(struct trace_array *tr); |
681 | extern void disable_branch_tracing(void); | 622 | extern void disable_branch_tracing(void); |
@@ -766,6 +707,8 @@ struct filter_pred { | |||
766 | int pop_n; | 707 | int pop_n; |
767 | }; | 708 | }; |
768 | 709 | ||
710 | extern struct list_head ftrace_common_fields; | ||
711 | |||
769 | extern enum regex_type | 712 | extern enum regex_type |
770 | filter_parse_regex(char *buff, int len, char **search, int *not); | 713 | filter_parse_regex(char *buff, int len, char **search, int *not); |
771 | extern void print_event_filter(struct ftrace_event_call *call, | 714 | extern void print_event_filter(struct ftrace_event_call *call, |
@@ -795,6 +738,8 @@ filter_check_discard(struct ftrace_event_call *call, void *rec, | |||
795 | return 0; | 738 | return 0; |
796 | } | 739 | } |
797 | 740 | ||
741 | extern void trace_event_enable_cmd_record(bool enable); | ||
742 | |||
798 | extern struct mutex event_mutex; | 743 | extern struct mutex event_mutex; |
799 | extern struct list_head ftrace_events; | 744 | extern struct list_head ftrace_events; |
800 | 745 | ||
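Among the trace.h changes, tracing_buffer_mask and the for_each_tracing_cpu() helper move into the header so other trace files can walk only the CPUs that actually own trace buffers. A small usage sketch; example_reset_buffers() is illustrative only:

	#include "trace.h"

	static void example_reset_buffers(struct trace_array *tr)
	{
		int cpu;

		/* expands to for_each_cpu(cpu, tracing_buffer_mask) */
		for_each_tracing_cpu(cpu)
			ring_buffer_reset_cpu(tr->buffer, cpu);
	}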
diff --git a/kernel/trace/trace_boot.c b/kernel/trace/trace_boot.c deleted file mode 100644 index c21d5f3956ad..000000000000 --- a/kernel/trace/trace_boot.c +++ /dev/null | |||
@@ -1,185 +0,0 @@ | |||
1 | /* | ||
2 | * ring buffer based initcalls tracer | ||
3 | * | ||
4 | * Copyright (C) 2008 Frederic Weisbecker <fweisbec@gmail.com> | ||
5 | * | ||
6 | */ | ||
7 | |||
8 | #include <linux/init.h> | ||
9 | #include <linux/debugfs.h> | ||
10 | #include <linux/ftrace.h> | ||
11 | #include <linux/kallsyms.h> | ||
12 | #include <linux/time.h> | ||
13 | |||
14 | #include "trace.h" | ||
15 | #include "trace_output.h" | ||
16 | |||
17 | static struct trace_array *boot_trace; | ||
18 | static bool pre_initcalls_finished; | ||
19 | |||
20 | /* Tells the boot tracer that the pre_smp_initcalls are finished. | ||
21 | * So we are ready . | ||
22 | * It doesn't enable sched events tracing however. | ||
23 | * You have to call enable_boot_trace to do so. | ||
24 | */ | ||
25 | void start_boot_trace(void) | ||
26 | { | ||
27 | pre_initcalls_finished = true; | ||
28 | } | ||
29 | |||
30 | void enable_boot_trace(void) | ||
31 | { | ||
32 | if (boot_trace && pre_initcalls_finished) | ||
33 | tracing_start_sched_switch_record(); | ||
34 | } | ||
35 | |||
36 | void disable_boot_trace(void) | ||
37 | { | ||
38 | if (boot_trace && pre_initcalls_finished) | ||
39 | tracing_stop_sched_switch_record(); | ||
40 | } | ||
41 | |||
42 | static int boot_trace_init(struct trace_array *tr) | ||
43 | { | ||
44 | boot_trace = tr; | ||
45 | |||
46 | if (!tr) | ||
47 | return 0; | ||
48 | |||
49 | tracing_reset_online_cpus(tr); | ||
50 | |||
51 | tracing_sched_switch_assign_trace(tr); | ||
52 | return 0; | ||
53 | } | ||
54 | |||
55 | static enum print_line_t | ||
56 | initcall_call_print_line(struct trace_iterator *iter) | ||
57 | { | ||
58 | struct trace_entry *entry = iter->ent; | ||
59 | struct trace_seq *s = &iter->seq; | ||
60 | struct trace_boot_call *field; | ||
61 | struct boot_trace_call *call; | ||
62 | u64 ts; | ||
63 | unsigned long nsec_rem; | ||
64 | int ret; | ||
65 | |||
66 | trace_assign_type(field, entry); | ||
67 | call = &field->boot_call; | ||
68 | ts = iter->ts; | ||
69 | nsec_rem = do_div(ts, NSEC_PER_SEC); | ||
70 | |||
71 | ret = trace_seq_printf(s, "[%5ld.%09ld] calling %s @ %i\n", | ||
72 | (unsigned long)ts, nsec_rem, call->func, call->caller); | ||
73 | |||
74 | if (!ret) | ||
75 | return TRACE_TYPE_PARTIAL_LINE; | ||
76 | else | ||
77 | return TRACE_TYPE_HANDLED; | ||
78 | } | ||
79 | |||
80 | static enum print_line_t | ||
81 | initcall_ret_print_line(struct trace_iterator *iter) | ||
82 | { | ||
83 | struct trace_entry *entry = iter->ent; | ||
84 | struct trace_seq *s = &iter->seq; | ||
85 | struct trace_boot_ret *field; | ||
86 | struct boot_trace_ret *init_ret; | ||
87 | u64 ts; | ||
88 | unsigned long nsec_rem; | ||
89 | int ret; | ||
90 | |||
91 | trace_assign_type(field, entry); | ||
92 | init_ret = &field->boot_ret; | ||
93 | ts = iter->ts; | ||
94 | nsec_rem = do_div(ts, NSEC_PER_SEC); | ||
95 | |||
96 | ret = trace_seq_printf(s, "[%5ld.%09ld] initcall %s " | ||
97 | "returned %d after %llu msecs\n", | ||
98 | (unsigned long) ts, | ||
99 | nsec_rem, | ||
100 | init_ret->func, init_ret->result, init_ret->duration); | ||
101 | |||
102 | if (!ret) | ||
103 | return TRACE_TYPE_PARTIAL_LINE; | ||
104 | else | ||
105 | return TRACE_TYPE_HANDLED; | ||
106 | } | ||
107 | |||
108 | static enum print_line_t initcall_print_line(struct trace_iterator *iter) | ||
109 | { | ||
110 | struct trace_entry *entry = iter->ent; | ||
111 | |||
112 | switch (entry->type) { | ||
113 | case TRACE_BOOT_CALL: | ||
114 | return initcall_call_print_line(iter); | ||
115 | case TRACE_BOOT_RET: | ||
116 | return initcall_ret_print_line(iter); | ||
117 | default: | ||
118 | return TRACE_TYPE_UNHANDLED; | ||
119 | } | ||
120 | } | ||
121 | |||
122 | struct tracer boot_tracer __read_mostly = | ||
123 | { | ||
124 | .name = "initcall", | ||
125 | .init = boot_trace_init, | ||
126 | .reset = tracing_reset_online_cpus, | ||
127 | .print_line = initcall_print_line, | ||
128 | }; | ||
129 | |||
130 | void trace_boot_call(struct boot_trace_call *bt, initcall_t fn) | ||
131 | { | ||
132 | struct ftrace_event_call *call = &event_boot_call; | ||
133 | struct ring_buffer_event *event; | ||
134 | struct ring_buffer *buffer; | ||
135 | struct trace_boot_call *entry; | ||
136 | struct trace_array *tr = boot_trace; | ||
137 | |||
138 | if (!tr || !pre_initcalls_finished) | ||
139 | return; | ||
140 | |||
141 | /* Get its name now since this function could | ||
142 | * disappear because it is in the .init section. | ||
143 | */ | ||
144 | sprint_symbol(bt->func, (unsigned long)fn); | ||
145 | preempt_disable(); | ||
146 | |||
147 | buffer = tr->buffer; | ||
148 | event = trace_buffer_lock_reserve(buffer, TRACE_BOOT_CALL, | ||
149 | sizeof(*entry), 0, 0); | ||
150 | if (!event) | ||
151 | goto out; | ||
152 | entry = ring_buffer_event_data(event); | ||
153 | entry->boot_call = *bt; | ||
154 | if (!filter_check_discard(call, entry, buffer, event)) | ||
155 | trace_buffer_unlock_commit(buffer, event, 0, 0); | ||
156 | out: | ||
157 | preempt_enable(); | ||
158 | } | ||
159 | |||
160 | void trace_boot_ret(struct boot_trace_ret *bt, initcall_t fn) | ||
161 | { | ||
162 | struct ftrace_event_call *call = &event_boot_ret; | ||
163 | struct ring_buffer_event *event; | ||
164 | struct ring_buffer *buffer; | ||
165 | struct trace_boot_ret *entry; | ||
166 | struct trace_array *tr = boot_trace; | ||
167 | |||
168 | if (!tr || !pre_initcalls_finished) | ||
169 | return; | ||
170 | |||
171 | sprint_symbol(bt->func, (unsigned long)fn); | ||
172 | preempt_disable(); | ||
173 | |||
174 | buffer = tr->buffer; | ||
175 | event = trace_buffer_lock_reserve(buffer, TRACE_BOOT_RET, | ||
176 | sizeof(*entry), 0, 0); | ||
177 | if (!event) | ||
178 | goto out; | ||
179 | entry = ring_buffer_event_data(event); | ||
180 | entry->boot_ret = *bt; | ||
181 | if (!filter_check_discard(call, entry, buffer, event)) | ||
182 | trace_buffer_unlock_commit(buffer, event, 0, 0); | ||
183 | out: | ||
184 | preempt_enable(); | ||
185 | } | ||
diff --git a/kernel/trace/trace_clock.c b/kernel/trace/trace_clock.c index 9d589d8dcd1a..685a67d55db0 100644 --- a/kernel/trace/trace_clock.c +++ b/kernel/trace/trace_clock.c | |||
@@ -32,16 +32,15 @@ | |||
32 | u64 notrace trace_clock_local(void) | 32 | u64 notrace trace_clock_local(void) |
33 | { | 33 | { |
34 | u64 clock; | 34 | u64 clock; |
35 | int resched; | ||
36 | 35 | ||
37 | /* | 36 | /* |
38 | * sched_clock() is an architecture implemented, fast, scalable, | 37 | * sched_clock() is an architecture implemented, fast, scalable, |
39 | * lockless clock. It is not guaranteed to be coherent across | 38 | * lockless clock. It is not guaranteed to be coherent across |
40 | * CPUs, nor across CPU idle events. | 39 | * CPUs, nor across CPU idle events. |
41 | */ | 40 | */ |
42 | resched = ftrace_preempt_disable(); | 41 | preempt_disable_notrace(); |
43 | clock = sched_clock(); | 42 | clock = sched_clock(); |
44 | ftrace_preempt_enable(resched); | 43 | preempt_enable_notrace(); |
45 | 44 | ||
46 | return clock; | 45 | return clock; |
47 | } | 46 | } |
@@ -56,7 +55,7 @@ u64 notrace trace_clock_local(void) | |||
56 | */ | 55 | */ |
57 | u64 notrace trace_clock(void) | 56 | u64 notrace trace_clock(void) |
58 | { | 57 | { |
59 | return cpu_clock(raw_smp_processor_id()); | 58 | return local_clock(); |
60 | } | 59 | } |
61 | 60 | ||
62 | 61 | ||
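With the ftrace_preempt_disable()/ftrace_preempt_enable() helpers removed from trace.h, trace_clock_local() above brackets sched_clock() with the plain _notrace preemption calls instead. The point of the _notrace variants is that they do not themselves call back into the function tracer, so the clock can be read safely from tracing paths. A sketch of the resulting pattern:

	#include <linux/preempt.h>
	#include <linux/sched.h>	/* sched_clock() */

	static u64 example_local_stamp(void)
	{
		u64 clock;

		/* _notrace: does not recurse into the tracer via the preempt hooks */
		preempt_disable_notrace();
		clock = sched_clock();		/* fast, per-CPU, not cross-CPU coherent */
		preempt_enable_notrace();

		return clock;
	}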
diff --git a/kernel/trace/trace_entries.h b/kernel/trace/trace_entries.h index dc008c1240da..e3dfecaf13e6 100644 --- a/kernel/trace/trace_entries.h +++ b/kernel/trace/trace_entries.h | |||
@@ -151,23 +151,6 @@ FTRACE_ENTRY_DUP(wakeup, ctx_switch_entry, | |||
151 | ); | 151 | ); |
152 | 152 | ||
153 | /* | 153 | /* |
154 | * Special (free-form) trace entry: | ||
155 | */ | ||
156 | FTRACE_ENTRY(special, special_entry, | ||
157 | |||
158 | TRACE_SPECIAL, | ||
159 | |||
160 | F_STRUCT( | ||
161 | __field( unsigned long, arg1 ) | ||
162 | __field( unsigned long, arg2 ) | ||
163 | __field( unsigned long, arg3 ) | ||
164 | ), | ||
165 | |||
166 | F_printk("(%08lx) (%08lx) (%08lx)", | ||
167 | __entry->arg1, __entry->arg2, __entry->arg3) | ||
168 | ); | ||
169 | |||
170 | /* | ||
171 | * Stack-trace entry: | 154 | * Stack-trace entry: |
172 | */ | 155 | */ |
173 | 156 | ||
@@ -271,33 +254,6 @@ FTRACE_ENTRY(mmiotrace_map, trace_mmiotrace_map, | |||
271 | __entry->map_id, __entry->opcode) | 254 | __entry->map_id, __entry->opcode) |
272 | ); | 255 | ); |
273 | 256 | ||
274 | FTRACE_ENTRY(boot_call, trace_boot_call, | ||
275 | |||
276 | TRACE_BOOT_CALL, | ||
277 | |||
278 | F_STRUCT( | ||
279 | __field_struct( struct boot_trace_call, boot_call ) | ||
280 | __field_desc( pid_t, boot_call, caller ) | ||
281 | __array_desc( char, boot_call, func, KSYM_SYMBOL_LEN) | ||
282 | ), | ||
283 | |||
284 | F_printk("%d %s", __entry->caller, __entry->func) | ||
285 | ); | ||
286 | |||
287 | FTRACE_ENTRY(boot_ret, trace_boot_ret, | ||
288 | |||
289 | TRACE_BOOT_RET, | ||
290 | |||
291 | F_STRUCT( | ||
292 | __field_struct( struct boot_trace_ret, boot_ret ) | ||
293 | __array_desc( char, boot_ret, func, KSYM_SYMBOL_LEN) | ||
294 | __field_desc( int, boot_ret, result ) | ||
295 | __field_desc( unsigned long, boot_ret, duration ) | ||
296 | ), | ||
297 | |||
298 | F_printk("%s %d %lx", | ||
299 | __entry->func, __entry->result, __entry->duration) | ||
300 | ); | ||
301 | 257 | ||
302 | #define TRACE_FUNC_SIZE 30 | 258 | #define TRACE_FUNC_SIZE 30 |
303 | #define TRACE_FILE_SIZE 20 | 259 | #define TRACE_FILE_SIZE 20 |
@@ -318,53 +274,3 @@ FTRACE_ENTRY(branch, trace_branch, | |||
318 | __entry->func, __entry->file, __entry->correct) | 274 | __entry->func, __entry->file, __entry->correct) |
319 | ); | 275 | ); |
320 | 276 | ||
321 | FTRACE_ENTRY(kmem_alloc, kmemtrace_alloc_entry, | ||
322 | |||
323 | TRACE_KMEM_ALLOC, | ||
324 | |||
325 | F_STRUCT( | ||
326 | __field( enum kmemtrace_type_id, type_id ) | ||
327 | __field( unsigned long, call_site ) | ||
328 | __field( const void *, ptr ) | ||
329 | __field( size_t, bytes_req ) | ||
330 | __field( size_t, bytes_alloc ) | ||
331 | __field( gfp_t, gfp_flags ) | ||
332 | __field( int, node ) | ||
333 | ), | ||
334 | |||
335 | F_printk("type:%u call_site:%lx ptr:%p req:%zi alloc:%zi" | ||
336 | " flags:%x node:%d", | ||
337 | __entry->type_id, __entry->call_site, __entry->ptr, | ||
338 | __entry->bytes_req, __entry->bytes_alloc, | ||
339 | __entry->gfp_flags, __entry->node) | ||
340 | ); | ||
341 | |||
342 | FTRACE_ENTRY(kmem_free, kmemtrace_free_entry, | ||
343 | |||
344 | TRACE_KMEM_FREE, | ||
345 | |||
346 | F_STRUCT( | ||
347 | __field( enum kmemtrace_type_id, type_id ) | ||
348 | __field( unsigned long, call_site ) | ||
349 | __field( const void *, ptr ) | ||
350 | ), | ||
351 | |||
352 | F_printk("type:%u call_site:%lx ptr:%p", | ||
353 | __entry->type_id, __entry->call_site, __entry->ptr) | ||
354 | ); | ||
355 | |||
356 | FTRACE_ENTRY(ksym_trace, ksym_trace_entry, | ||
357 | |||
358 | TRACE_KSYM, | ||
359 | |||
360 | F_STRUCT( | ||
361 | __field( unsigned long, ip ) | ||
362 | __field( unsigned char, type ) | ||
363 | __array( char , cmd, TASK_COMM_LEN ) | ||
364 | __field( unsigned long, addr ) | ||
365 | ), | ||
366 | |||
367 | F_printk("ip: %pF type: %d ksym_name: %pS cmd: %s", | ||
368 | (void *)__entry->ip, (unsigned int)__entry->type, | ||
369 | (void *)__entry->addr, __entry->cmd) | ||
370 | ); | ||
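For reference, the entries that remain in trace_entries.h keep using the same declaration pattern as the ones removed above. A purely hypothetical example of that pattern; TRACE_EXAMPLE and the fields are invented and not part of this change:

	FTRACE_ENTRY(example, trace_example_entry,

		TRACE_EXAMPLE,		/* hypothetical value in enum trace_type */

		F_STRUCT(
			__field(	unsigned long,	ip	)
			__field(	int,		value	)
		),

		F_printk("%pS value=%d", (void *)__entry->ip, __entry->value)
	);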
diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c index 8a2b73f7c068..39c059ca670e 100644 --- a/kernel/trace/trace_event_perf.c +++ b/kernel/trace/trace_event_perf.c | |||
@@ -9,9 +9,7 @@ | |||
9 | #include <linux/kprobes.h> | 9 | #include <linux/kprobes.h> |
10 | #include "trace.h" | 10 | #include "trace.h" |
11 | 11 | ||
12 | EXPORT_SYMBOL_GPL(perf_arch_fetch_caller_regs); | 12 | static char __percpu *perf_trace_buf[PERF_NR_CONTEXTS]; |
13 | |||
14 | static char *perf_trace_buf[4]; | ||
15 | 13 | ||
16 | /* | 14 | /* |
17 | * Force it to be aligned to unsigned long to avoid misaligned accesses | 15 | * Force it to be aligned to unsigned long to avoid misaligned accesses |
@@ -26,7 +24,7 @@ static int total_ref_count; | |||
26 | static int perf_trace_event_init(struct ftrace_event_call *tp_event, | 24 | static int perf_trace_event_init(struct ftrace_event_call *tp_event, |
27 | struct perf_event *p_event) | 25 | struct perf_event *p_event) |
28 | { | 26 | { |
29 | struct hlist_head *list; | 27 | struct hlist_head __percpu *list; |
30 | int ret = -ENOMEM; | 28 | int ret = -ENOMEM; |
31 | int cpu; | 29 | int cpu; |
32 | 30 | ||
@@ -44,11 +42,11 @@ static int perf_trace_event_init(struct ftrace_event_call *tp_event, | |||
44 | tp_event->perf_events = list; | 42 | tp_event->perf_events = list; |
45 | 43 | ||
46 | if (!total_ref_count) { | 44 | if (!total_ref_count) { |
47 | char *buf; | 45 | char __percpu *buf; |
48 | int i; | 46 | int i; |
49 | 47 | ||
50 | for (i = 0; i < 4; i++) { | 48 | for (i = 0; i < PERF_NR_CONTEXTS; i++) { |
51 | buf = (char *)alloc_percpu(perf_trace_t); | 49 | buf = (char __percpu *)alloc_percpu(perf_trace_t); |
52 | if (!buf) | 50 | if (!buf) |
53 | goto fail; | 51 | goto fail; |
54 | 52 | ||
@@ -56,13 +54,7 @@ static int perf_trace_event_init(struct ftrace_event_call *tp_event, | |||
56 | } | 54 | } |
57 | } | 55 | } |
58 | 56 | ||
59 | if (tp_event->class->reg) | 57 | ret = tp_event->class->reg(tp_event, TRACE_REG_PERF_REGISTER); |
60 | ret = tp_event->class->reg(tp_event, TRACE_REG_PERF_REGISTER); | ||
61 | else | ||
62 | ret = tracepoint_probe_register(tp_event->name, | ||
63 | tp_event->class->perf_probe, | ||
64 | tp_event); | ||
65 | |||
66 | if (ret) | 58 | if (ret) |
67 | goto fail; | 59 | goto fail; |
68 | 60 | ||
@@ -73,7 +65,7 @@ fail: | |||
73 | if (!total_ref_count) { | 65 | if (!total_ref_count) { |
74 | int i; | 66 | int i; |
75 | 67 | ||
76 | for (i = 0; i < 4; i++) { | 68 | for (i = 0; i < PERF_NR_CONTEXTS; i++) { |
77 | free_percpu(perf_trace_buf[i]); | 69 | free_percpu(perf_trace_buf[i]); |
78 | perf_trace_buf[i] = NULL; | 70 | perf_trace_buf[i] = NULL; |
79 | } | 71 | } |
@@ -96,11 +88,11 @@ int perf_trace_init(struct perf_event *p_event) | |||
96 | mutex_lock(&event_mutex); | 88 | mutex_lock(&event_mutex); |
97 | list_for_each_entry(tp_event, &ftrace_events, list) { | 89 | list_for_each_entry(tp_event, &ftrace_events, list) { |
98 | if (tp_event->event.type == event_id && | 90 | if (tp_event->event.type == event_id && |
99 | tp_event->class && | 91 | tp_event->class && tp_event->class->reg && |
100 | (tp_event->class->perf_probe || | ||
101 | tp_event->class->reg) && | ||
102 | try_module_get(tp_event->mod)) { | 92 | try_module_get(tp_event->mod)) { |
103 | ret = perf_trace_event_init(tp_event, p_event); | 93 | ret = perf_trace_event_init(tp_event, p_event); |
94 | if (ret) | ||
95 | module_put(tp_event->mod); | ||
104 | break; | 96 | break; |
105 | } | 97 | } |
106 | } | 98 | } |
@@ -109,22 +101,26 @@ int perf_trace_init(struct perf_event *p_event) | |||
109 | return ret; | 101 | return ret; |
110 | } | 102 | } |
111 | 103 | ||
112 | int perf_trace_enable(struct perf_event *p_event) | 104 | int perf_trace_add(struct perf_event *p_event, int flags) |
113 | { | 105 | { |
114 | struct ftrace_event_call *tp_event = p_event->tp_event; | 106 | struct ftrace_event_call *tp_event = p_event->tp_event; |
107 | struct hlist_head __percpu *pcpu_list; | ||
115 | struct hlist_head *list; | 108 | struct hlist_head *list; |
116 | 109 | ||
117 | list = tp_event->perf_events; | 110 | pcpu_list = tp_event->perf_events; |
118 | if (WARN_ON_ONCE(!list)) | 111 | if (WARN_ON_ONCE(!pcpu_list)) |
119 | return -EINVAL; | 112 | return -EINVAL; |
120 | 113 | ||
121 | list = this_cpu_ptr(list); | 114 | if (!(flags & PERF_EF_START)) |
115 | p_event->hw.state = PERF_HES_STOPPED; | ||
116 | |||
117 | list = this_cpu_ptr(pcpu_list); | ||
122 | hlist_add_head_rcu(&p_event->hlist_entry, list); | 118 | hlist_add_head_rcu(&p_event->hlist_entry, list); |
123 | 119 | ||
124 | return 0; | 120 | return 0; |
125 | } | 121 | } |
126 | 122 | ||
127 | void perf_trace_disable(struct perf_event *p_event) | 123 | void perf_trace_del(struct perf_event *p_event, int flags) |
128 | { | 124 | { |
129 | hlist_del_rcu(&p_event->hlist_entry); | 125 | hlist_del_rcu(&p_event->hlist_entry); |
130 | } | 126 | } |
@@ -138,29 +134,25 @@ void perf_trace_destroy(struct perf_event *p_event) | |||
138 | if (--tp_event->perf_refcount > 0) | 134 | if (--tp_event->perf_refcount > 0) |
139 | goto out; | 135 | goto out; |
140 | 136 | ||
141 | if (tp_event->class->reg) | 137 | tp_event->class->reg(tp_event, TRACE_REG_PERF_UNREGISTER); |
142 | tp_event->class->reg(tp_event, TRACE_REG_PERF_UNREGISTER); | ||
143 | else | ||
144 | tracepoint_probe_unregister(tp_event->name, | ||
145 | tp_event->class->perf_probe, | ||
146 | tp_event); | ||
147 | 138 | ||
148 | /* | 139 | /* |
149 | * Ensure our callback won't be called anymore. See | 140 | * Ensure our callback won't be called anymore. The buffers |
150 | * tracepoint_probe_unregister() and __DO_TRACE(). | 141 | * will be freed after that. |
151 | */ | 142 | */ |
152 | synchronize_sched(); | 143 | tracepoint_synchronize_unregister(); |
153 | 144 | ||
154 | free_percpu(tp_event->perf_events); | 145 | free_percpu(tp_event->perf_events); |
155 | tp_event->perf_events = NULL; | 146 | tp_event->perf_events = NULL; |
156 | 147 | ||
157 | if (!--total_ref_count) { | 148 | if (!--total_ref_count) { |
158 | for (i = 0; i < 4; i++) { | 149 | for (i = 0; i < PERF_NR_CONTEXTS; i++) { |
159 | free_percpu(perf_trace_buf[i]); | 150 | free_percpu(perf_trace_buf[i]); |
160 | perf_trace_buf[i] = NULL; | 151 | perf_trace_buf[i] = NULL; |
161 | } | 152 | } |
162 | } | 153 | } |
163 | out: | 154 | out: |
155 | module_put(tp_event->mod); | ||
164 | mutex_unlock(&event_mutex); | 156 | mutex_unlock(&event_mutex); |
165 | } | 157 | } |
166 | 158 | ||
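The perf glue above now types its buffers and hlist heads as __percpu and resolves the current CPU's head with this_cpu_ptr() before linking an event in under RCU. A self-contained sketch of that per-CPU list pattern; the example_* names are made up for illustration:

	#include <linux/percpu.h>
	#include <linux/rculist.h>

	struct example_event {
		struct hlist_node	hlist_entry;
	};

	static struct hlist_head __percpu *example_heads;

	static int example_init(void)
	{
		/* alloc_percpu() zeroes the memory, so each hlist_head starts empty */
		example_heads = alloc_percpu(struct hlist_head);
		return example_heads ? 0 : -ENOMEM;
	}

	/* Caller is expected to have preemption disabled (e.g. a pmu ->add path). */
	static void example_add(struct example_event *ev)
	{
		struct hlist_head *head = this_cpu_ptr(example_heads);

		hlist_add_head_rcu(&ev->hlist_entry, head);	/* readers walk under rcu_read_lock() */
	}

	static void example_del(struct example_event *ev)
	{
		hlist_del_rcu(&ev->hlist_entry);
	}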
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 53cffc0b0801..0725eeab1937 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c | |||
@@ -28,6 +28,7 @@ | |||
28 | DEFINE_MUTEX(event_mutex); | 28 | DEFINE_MUTEX(event_mutex); |
29 | 29 | ||
30 | LIST_HEAD(ftrace_events); | 30 | LIST_HEAD(ftrace_events); |
31 | LIST_HEAD(ftrace_common_fields); | ||
31 | 32 | ||
32 | struct list_head * | 33 | struct list_head * |
33 | trace_get_fields(struct ftrace_event_call *event_call) | 34 | trace_get_fields(struct ftrace_event_call *event_call) |
@@ -37,15 +38,11 @@ trace_get_fields(struct ftrace_event_call *event_call) | |||
37 | return event_call->class->get_fields(event_call); | 38 | return event_call->class->get_fields(event_call); |
38 | } | 39 | } |
39 | 40 | ||
40 | int trace_define_field(struct ftrace_event_call *call, const char *type, | 41 | static int __trace_define_field(struct list_head *head, const char *type, |
41 | const char *name, int offset, int size, int is_signed, | 42 | const char *name, int offset, int size, |
42 | int filter_type) | 43 | int is_signed, int filter_type) |
43 | { | 44 | { |
44 | struct ftrace_event_field *field; | 45 | struct ftrace_event_field *field; |
45 | struct list_head *head; | ||
46 | |||
47 | if (WARN_ON(!call->class)) | ||
48 | return 0; | ||
49 | 46 | ||
50 | field = kzalloc(sizeof(*field), GFP_KERNEL); | 47 | field = kzalloc(sizeof(*field), GFP_KERNEL); |
51 | if (!field) | 48 | if (!field) |
@@ -68,7 +65,6 @@ int trace_define_field(struct ftrace_event_call *call, const char *type, | |||
68 | field->size = size; | 65 | field->size = size; |
69 | field->is_signed = is_signed; | 66 | field->is_signed = is_signed; |
70 | 67 | ||
71 | head = trace_get_fields(call); | ||
72 | list_add(&field->link, head); | 68 | list_add(&field->link, head); |
73 | 69 | ||
74 | return 0; | 70 | return 0; |
@@ -80,17 +76,32 @@ err: | |||
80 | 76 | ||
81 | return -ENOMEM; | 77 | return -ENOMEM; |
82 | } | 78 | } |
79 | |||
80 | int trace_define_field(struct ftrace_event_call *call, const char *type, | ||
81 | const char *name, int offset, int size, int is_signed, | ||
82 | int filter_type) | ||
83 | { | ||
84 | struct list_head *head; | ||
85 | |||
86 | if (WARN_ON(!call->class)) | ||
87 | return 0; | ||
88 | |||
89 | head = trace_get_fields(call); | ||
90 | return __trace_define_field(head, type, name, offset, size, | ||
91 | is_signed, filter_type); | ||
92 | } | ||
83 | EXPORT_SYMBOL_GPL(trace_define_field); | 93 | EXPORT_SYMBOL_GPL(trace_define_field); |
84 | 94 | ||
85 | #define __common_field(type, item) \ | 95 | #define __common_field(type, item) \ |
86 | ret = trace_define_field(call, #type, "common_" #item, \ | 96 | ret = __trace_define_field(&ftrace_common_fields, #type, \ |
87 | offsetof(typeof(ent), item), \ | 97 | "common_" #item, \ |
88 | sizeof(ent.item), \ | 98 | offsetof(typeof(ent), item), \ |
89 | is_signed_type(type), FILTER_OTHER); \ | 99 | sizeof(ent.item), \ |
100 | is_signed_type(type), FILTER_OTHER); \ | ||
90 | if (ret) \ | 101 | if (ret) \ |
91 | return ret; | 102 | return ret; |
92 | 103 | ||
93 | static int trace_define_common_fields(struct ftrace_event_call *call) | 104 | static int trace_define_common_fields(void) |
94 | { | 105 | { |
95 | int ret; | 106 | int ret; |
96 | struct trace_entry ent; | 107 | struct trace_entry ent; |
@@ -130,6 +141,55 @@ int trace_event_raw_init(struct ftrace_event_call *call) | |||
130 | } | 141 | } |
131 | EXPORT_SYMBOL_GPL(trace_event_raw_init); | 142 | EXPORT_SYMBOL_GPL(trace_event_raw_init); |
132 | 143 | ||
144 | int ftrace_event_reg(struct ftrace_event_call *call, enum trace_reg type) | ||
145 | { | ||
146 | switch (type) { | ||
147 | case TRACE_REG_REGISTER: | ||
148 | return tracepoint_probe_register(call->name, | ||
149 | call->class->probe, | ||
150 | call); | ||
151 | case TRACE_REG_UNREGISTER: | ||
152 | tracepoint_probe_unregister(call->name, | ||
153 | call->class->probe, | ||
154 | call); | ||
155 | return 0; | ||
156 | |||
157 | #ifdef CONFIG_PERF_EVENTS | ||
158 | case TRACE_REG_PERF_REGISTER: | ||
159 | return tracepoint_probe_register(call->name, | ||
160 | call->class->perf_probe, | ||
161 | call); | ||
162 | case TRACE_REG_PERF_UNREGISTER: | ||
163 | tracepoint_probe_unregister(call->name, | ||
164 | call->class->perf_probe, | ||
165 | call); | ||
166 | return 0; | ||
167 | #endif | ||
168 | } | ||
169 | return 0; | ||
170 | } | ||
171 | EXPORT_SYMBOL_GPL(ftrace_event_reg); | ||
172 | |||
173 | void trace_event_enable_cmd_record(bool enable) | ||
174 | { | ||
175 | struct ftrace_event_call *call; | ||
176 | |||
177 | mutex_lock(&event_mutex); | ||
178 | list_for_each_entry(call, &ftrace_events, list) { | ||
179 | if (!(call->flags & TRACE_EVENT_FL_ENABLED)) | ||
180 | continue; | ||
181 | |||
182 | if (enable) { | ||
183 | tracing_start_cmdline_record(); | ||
184 | call->flags |= TRACE_EVENT_FL_RECORDED_CMD; | ||
185 | } else { | ||
186 | tracing_stop_cmdline_record(); | ||
187 | call->flags &= ~TRACE_EVENT_FL_RECORDED_CMD; | ||
188 | } | ||
189 | } | ||
190 | mutex_unlock(&event_mutex); | ||
191 | } | ||
192 | |||
133 | static int ftrace_event_enable_disable(struct ftrace_event_call *call, | 193 | static int ftrace_event_enable_disable(struct ftrace_event_call *call, |
134 | int enable) | 194 | int enable) |
135 | { | 195 | { |
@@ -139,24 +199,20 @@ static int ftrace_event_enable_disable(struct ftrace_event_call *call, | |||
139 | case 0: | 199 | case 0: |
140 | if (call->flags & TRACE_EVENT_FL_ENABLED) { | 200 | if (call->flags & TRACE_EVENT_FL_ENABLED) { |
141 | call->flags &= ~TRACE_EVENT_FL_ENABLED; | 201 | call->flags &= ~TRACE_EVENT_FL_ENABLED; |
142 | tracing_stop_cmdline_record(); | 202 | if (call->flags & TRACE_EVENT_FL_RECORDED_CMD) { |
143 | if (call->class->reg) | 203 | tracing_stop_cmdline_record(); |
144 | call->class->reg(call, TRACE_REG_UNREGISTER); | 204 | call->flags &= ~TRACE_EVENT_FL_RECORDED_CMD; |
145 | else | 205 | } |
146 | tracepoint_probe_unregister(call->name, | 206 | call->class->reg(call, TRACE_REG_UNREGISTER); |
147 | call->class->probe, | ||
148 | call); | ||
149 | } | 207 | } |
150 | break; | 208 | break; |
151 | case 1: | 209 | case 1: |
152 | if (!(call->flags & TRACE_EVENT_FL_ENABLED)) { | 210 | if (!(call->flags & TRACE_EVENT_FL_ENABLED)) { |
153 | tracing_start_cmdline_record(); | 211 | if (trace_flags & TRACE_ITER_RECORD_CMD) { |
154 | if (call->class->reg) | 212 | tracing_start_cmdline_record(); |
155 | ret = call->class->reg(call, TRACE_REG_REGISTER); | 213 | call->flags |= TRACE_EVENT_FL_RECORDED_CMD; |
156 | else | 214 | } |
157 | ret = tracepoint_probe_register(call->name, | 215 | ret = call->class->reg(call, TRACE_REG_REGISTER); |
158 | call->class->probe, | ||
159 | call); | ||
160 | if (ret) { | 216 | if (ret) { |
161 | tracing_stop_cmdline_record(); | 217 | tracing_stop_cmdline_record(); |
162 | pr_info("event trace: Could not enable event " | 218 | pr_info("event trace: Could not enable event " |
@@ -194,8 +250,7 @@ static int __ftrace_set_clr_event(const char *match, const char *sub, | |||
194 | mutex_lock(&event_mutex); | 250 | mutex_lock(&event_mutex); |
195 | list_for_each_entry(call, &ftrace_events, list) { | 251 | list_for_each_entry(call, &ftrace_events, list) { |
196 | 252 | ||
197 | if (!call->name || !call->class || | 253 | if (!call->name || !call->class || !call->class->reg) |
198 | (!call->class->probe && !call->class->reg)) | ||
199 | continue; | 254 | continue; |
200 | 255 | ||
201 | if (match && | 256 | if (match && |
@@ -321,7 +376,7 @@ t_next(struct seq_file *m, void *v, loff_t *pos) | |||
321 | * The ftrace subsystem is for showing formats only. | 376 | * The ftrace subsystem is for showing formats only. |
322 | * They can not be enabled or disabled via the event files. | 377 | * They can not be enabled or disabled via the event files. |
323 | */ | 378 | */ |
324 | if (call->class && (call->class->probe || call->class->reg)) | 379 | if (call->class && call->class->reg) |
325 | return call; | 380 | return call; |
326 | } | 381 | } |
327 | 382 | ||
@@ -474,8 +529,7 @@ system_enable_read(struct file *filp, char __user *ubuf, size_t cnt, | |||
474 | 529 | ||
475 | mutex_lock(&event_mutex); | 530 | mutex_lock(&event_mutex); |
476 | list_for_each_entry(call, &ftrace_events, list) { | 531 | list_for_each_entry(call, &ftrace_events, list) { |
477 | if (!call->name || !call->class || | 532 | if (!call->name || !call->class || !call->class->reg) |
478 | (!call->class->probe && !call->class->reg)) | ||
479 | continue; | 533 | continue; |
480 | 534 | ||
481 | if (system && strcmp(call->class->system, system) != 0) | 535 | if (system && strcmp(call->class->system, system) != 0) |
@@ -544,85 +598,146 @@ out: | |||
544 | return ret; | 598 | return ret; |
545 | } | 599 | } |
546 | 600 | ||
547 | static ssize_t | 601 | enum { |
548 | event_format_read(struct file *filp, char __user *ubuf, size_t cnt, | 602 | FORMAT_HEADER = 1, |
549 | loff_t *ppos) | 603 | FORMAT_FIELD_SEPERATOR = 2, |
604 | FORMAT_PRINTFMT = 3, | ||
605 | }; | ||
606 | |||
607 | static void *f_next(struct seq_file *m, void *v, loff_t *pos) | ||
550 | { | 608 | { |
551 | struct ftrace_event_call *call = filp->private_data; | 609 | struct ftrace_event_call *call = m->private; |
552 | struct ftrace_event_field *field; | 610 | struct ftrace_event_field *field; |
553 | struct list_head *head; | 611 | struct list_head *common_head = &ftrace_common_fields; |
554 | struct trace_seq *s; | 612 | struct list_head *head = trace_get_fields(call); |
555 | int common_field_count = 5; | ||
556 | char *buf; | ||
557 | int r = 0; | ||
558 | 613 | ||
559 | if (*ppos) | 614 | (*pos)++; |
560 | return 0; | ||
561 | 615 | ||
562 | s = kmalloc(sizeof(*s), GFP_KERNEL); | 616 | switch ((unsigned long)v) { |
563 | if (!s) | 617 | case FORMAT_HEADER: |
564 | return -ENOMEM; | 618 | if (unlikely(list_empty(common_head))) |
619 | return NULL; | ||
565 | 620 | ||
566 | trace_seq_init(s); | 621 | field = list_entry(common_head->prev, |
622 | struct ftrace_event_field, link); | ||
623 | return field; | ||
567 | 624 | ||
568 | trace_seq_printf(s, "name: %s\n", call->name); | 625 | case FORMAT_FIELD_SEPERATOR: |
569 | trace_seq_printf(s, "ID: %d\n", call->event.type); | 626 | if (unlikely(list_empty(head))) |
570 | trace_seq_printf(s, "format:\n"); | 627 | return NULL; |
571 | 628 | ||
572 | head = trace_get_fields(call); | 629 | field = list_entry(head->prev, struct ftrace_event_field, link); |
573 | list_for_each_entry_reverse(field, head, link) { | 630 | return field; |
574 | /* | ||
575 | * Smartly shows the array type(except dynamic array). | ||
576 | * Normal: | ||
577 | * field:TYPE VAR | ||
578 | * If TYPE := TYPE[LEN], it is shown: | ||
579 | * field:TYPE VAR[LEN] | ||
580 | */ | ||
581 | const char *array_descriptor = strchr(field->type, '['); | ||
582 | 631 | ||
583 | if (!strncmp(field->type, "__data_loc", 10)) | 632 | case FORMAT_PRINTFMT: |
584 | array_descriptor = NULL; | 633 | /* all done */ |
634 | return NULL; | ||
635 | } | ||
585 | 636 | ||
586 | if (!array_descriptor) { | 637 | field = v; |
587 | r = trace_seq_printf(s, "\tfield:%s %s;\toffset:%u;" | 638 | if (field->link.prev == common_head) |
588 | "\tsize:%u;\tsigned:%d;\n", | 639 | return (void *)FORMAT_FIELD_SEPERATOR; |
589 | field->type, field->name, field->offset, | 640 | else if (field->link.prev == head) |
590 | field->size, !!field->is_signed); | 641 | return (void *)FORMAT_PRINTFMT; |
591 | } else { | ||
592 | r = trace_seq_printf(s, "\tfield:%.*s %s%s;\toffset:%u;" | ||
593 | "\tsize:%u;\tsigned:%d;\n", | ||
594 | (int)(array_descriptor - field->type), | ||
595 | field->type, field->name, | ||
596 | array_descriptor, field->offset, | ||
597 | field->size, !!field->is_signed); | ||
598 | } | ||
599 | 642 | ||
600 | if (--common_field_count == 0) | 643 | field = list_entry(field->link.prev, struct ftrace_event_field, link); |
601 | r = trace_seq_printf(s, "\n"); | ||
602 | 644 | ||
603 | if (!r) | 645 | return field; |
604 | break; | 646 | } |
605 | } | ||
606 | 647 | ||
607 | if (r) | 648 | static void *f_start(struct seq_file *m, loff_t *pos) |
608 | r = trace_seq_printf(s, "\nprint fmt: %s\n", | 649 | { |
609 | call->print_fmt); | 650 | loff_t l = 0; |
651 | void *p; | ||
610 | 652 | ||
611 | if (!r) { | 653 | /* Start by showing the header */ |
612 | /* | 654 | if (!*pos) |
613 | * ug! The format output is bigger than a PAGE!! | 655 | return (void *)FORMAT_HEADER; |
614 | */ | 656 | |
615 | buf = "FORMAT TOO BIG\n"; | 657 | p = (void *)FORMAT_HEADER; |
616 | r = simple_read_from_buffer(ubuf, cnt, ppos, | 658 | do { |
617 | buf, strlen(buf)); | 659 | p = f_next(m, p, &l); |
618 | goto out; | 660 | } while (p && l < *pos); |
661 | |||
662 | return p; | ||
663 | } | ||
664 | |||
665 | static int f_show(struct seq_file *m, void *v) | ||
666 | { | ||
667 | struct ftrace_event_call *call = m->private; | ||
668 | struct ftrace_event_field *field; | ||
669 | const char *array_descriptor; | ||
670 | |||
671 | switch ((unsigned long)v) { | ||
672 | case FORMAT_HEADER: | ||
673 | seq_printf(m, "name: %s\n", call->name); | ||
674 | seq_printf(m, "ID: %d\n", call->event.type); | ||
675 | seq_printf(m, "format:\n"); | ||
676 | return 0; | ||
677 | |||
678 | case FORMAT_FIELD_SEPERATOR: | ||
679 | seq_putc(m, '\n'); | ||
680 | return 0; | ||
681 | |||
682 | case FORMAT_PRINTFMT: | ||
683 | seq_printf(m, "\nprint fmt: %s\n", | ||
684 | call->print_fmt); | ||
685 | return 0; | ||
619 | } | 686 | } |
620 | 687 | ||
621 | r = simple_read_from_buffer(ubuf, cnt, ppos, | 688 | field = v; |
622 | s->buffer, s->len); | 689 | |
623 | out: | 690 | /* |
624 | kfree(s); | 691 | * Smartly shows the array type(except dynamic array). |
625 | return r; | 692 | * Normal: |
693 | * field:TYPE VAR | ||
694 | * If TYPE := TYPE[LEN], it is shown: | ||
695 | * field:TYPE VAR[LEN] | ||
696 | */ | ||
697 | array_descriptor = strchr(field->type, '['); | ||
698 | |||
699 | if (!strncmp(field->type, "__data_loc", 10)) | ||
700 | array_descriptor = NULL; | ||
701 | |||
702 | if (!array_descriptor) | ||
703 | seq_printf(m, "\tfield:%s %s;\toffset:%u;\tsize:%u;\tsigned:%d;\n", | ||
704 | field->type, field->name, field->offset, | ||
705 | field->size, !!field->is_signed); | ||
706 | else | ||
707 | seq_printf(m, "\tfield:%.*s %s%s;\toffset:%u;\tsize:%u;\tsigned:%d;\n", | ||
708 | (int)(array_descriptor - field->type), | ||
709 | field->type, field->name, | ||
710 | array_descriptor, field->offset, | ||
711 | field->size, !!field->is_signed); | ||
712 | |||
713 | return 0; | ||
714 | } | ||
715 | |||
716 | static void f_stop(struct seq_file *m, void *p) | ||
717 | { | ||
718 | } | ||
719 | |||
720 | static const struct seq_operations trace_format_seq_ops = { | ||
721 | .start = f_start, | ||
722 | .next = f_next, | ||
723 | .stop = f_stop, | ||
724 | .show = f_show, | ||
725 | }; | ||
726 | |||
727 | static int trace_format_open(struct inode *inode, struct file *file) | ||
728 | { | ||
729 | struct ftrace_event_call *call = inode->i_private; | ||
730 | struct seq_file *m; | ||
731 | int ret; | ||
732 | |||
733 | ret = seq_open(file, &trace_format_seq_ops); | ||
734 | if (ret < 0) | ||
735 | return ret; | ||
736 | |||
737 | m = file->private_data; | ||
738 | m->private = call; | ||
739 | |||
740 | return 0; | ||
626 | } | 741 | } |
627 | 742 | ||
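The hunk above replaces the one-shot trace_seq page buffer (and its "FORMAT TOO BIG" escape hatch) with the seq_file iterator interface, so the format output is streamed and may grow past a page. Below is a minimal sketch of the same start/next/show/stop pattern applied to an ordinary linked list; my_item, my_list and my_fops are illustrative names, not part of the ftrace code.

/* Sketch only: seq_file iteration over a plain list. */
#include <linux/fs.h>
#include <linux/list.h>
#include <linux/seq_file.h>

struct my_item {
        struct list_head link;
        const char *name;
};

static LIST_HEAD(my_list);

static void *my_start(struct seq_file *m, loff_t *pos)
{
        return seq_list_start(&my_list, *pos);  /* walk to *pos */
}

static void *my_next(struct seq_file *m, void *v, loff_t *pos)
{
        return seq_list_next(v, &my_list, pos);
}

static void my_stop(struct seq_file *m, void *v)
{
}

static int my_show(struct seq_file *m, void *v)
{
        struct my_item *item = list_entry(v, struct my_item, link);

        seq_printf(m, "item: %s\n", item->name);
        return 0;       /* seq_file copes with output larger than one page */
}

static const struct seq_operations my_seq_ops = {
        .start  = my_start,
        .next   = my_next,
        .stop   = my_stop,
        .show   = my_show,
};

static int my_open(struct inode *inode, struct file *file)
{
        return seq_open(file, &my_seq_ops);
}

static const struct file_operations my_fops = {
        .open           = my_open,
        .read           = seq_read,
        .llseek         = seq_lseek,
        .release        = seq_release,
};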
628 | static ssize_t | 743 | static ssize_t |
@@ -817,39 +932,47 @@ static const struct file_operations ftrace_enable_fops = { | |||
817 | .open = tracing_open_generic, | 932 | .open = tracing_open_generic, |
818 | .read = event_enable_read, | 933 | .read = event_enable_read, |
819 | .write = event_enable_write, | 934 | .write = event_enable_write, |
935 | .llseek = default_llseek, | ||
820 | }; | 936 | }; |
821 | 937 | ||
822 | static const struct file_operations ftrace_event_format_fops = { | 938 | static const struct file_operations ftrace_event_format_fops = { |
823 | .open = tracing_open_generic, | 939 | .open = trace_format_open, |
824 | .read = event_format_read, | 940 | .read = seq_read, |
941 | .llseek = seq_lseek, | ||
942 | .release = seq_release, | ||
825 | }; | 943 | }; |
826 | 944 | ||
827 | static const struct file_operations ftrace_event_id_fops = { | 945 | static const struct file_operations ftrace_event_id_fops = { |
828 | .open = tracing_open_generic, | 946 | .open = tracing_open_generic, |
829 | .read = event_id_read, | 947 | .read = event_id_read, |
948 | .llseek = default_llseek, | ||
830 | }; | 949 | }; |
831 | 950 | ||
832 | static const struct file_operations ftrace_event_filter_fops = { | 951 | static const struct file_operations ftrace_event_filter_fops = { |
833 | .open = tracing_open_generic, | 952 | .open = tracing_open_generic, |
834 | .read = event_filter_read, | 953 | .read = event_filter_read, |
835 | .write = event_filter_write, | 954 | .write = event_filter_write, |
955 | .llseek = default_llseek, | ||
836 | }; | 956 | }; |
837 | 957 | ||
838 | static const struct file_operations ftrace_subsystem_filter_fops = { | 958 | static const struct file_operations ftrace_subsystem_filter_fops = { |
839 | .open = tracing_open_generic, | 959 | .open = tracing_open_generic, |
840 | .read = subsystem_filter_read, | 960 | .read = subsystem_filter_read, |
841 | .write = subsystem_filter_write, | 961 | .write = subsystem_filter_write, |
962 | .llseek = default_llseek, | ||
842 | }; | 963 | }; |
843 | 964 | ||
844 | static const struct file_operations ftrace_system_enable_fops = { | 965 | static const struct file_operations ftrace_system_enable_fops = { |
845 | .open = tracing_open_generic, | 966 | .open = tracing_open_generic, |
846 | .read = system_enable_read, | 967 | .read = system_enable_read, |
847 | .write = system_enable_write, | 968 | .write = system_enable_write, |
969 | .llseek = default_llseek, | ||
848 | }; | 970 | }; |
849 | 971 | ||
850 | static const struct file_operations ftrace_show_header_fops = { | 972 | static const struct file_operations ftrace_show_header_fops = { |
851 | .open = tracing_open_generic, | 973 | .open = tracing_open_generic, |
852 | .read = show_header, | 974 | .read = show_header, |
975 | .llseek = default_llseek, | ||
853 | }; | 976 | }; |
854 | 977 | ||
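Each file_operations above also gains an explicit .llseek; around this time the tree was being converted so that no file relied on an implicit seek default. A hedged sketch of a tiny read-only file that names default_llseek outright (my_read and my_ro_fops are illustrative):

/* Sketch only: a read-only file with an explicit llseek handler. */
#include <linux/fs.h>

static ssize_t my_read(struct file *file, char __user *ubuf,
                       size_t cnt, loff_t *ppos)
{
        static const char msg[] = "hello\n";

        return simple_read_from_buffer(ubuf, cnt, ppos, msg, sizeof(msg) - 1);
}

static const struct file_operations my_ro_fops = {
        .read   = my_read,
        .llseek = default_llseek,       /* seekable, no implicit fallback */
};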
855 | static struct dentry *event_trace_events_dir(void) | 978 | static struct dentry *event_trace_events_dir(void) |
@@ -963,35 +1086,31 @@ event_create_dir(struct ftrace_event_call *call, struct dentry *d_events, | |||
963 | return -1; | 1086 | return -1; |
964 | } | 1087 | } |
965 | 1088 | ||
966 | if (call->class->probe || call->class->reg) | 1089 | if (call->class->reg) |
967 | trace_create_file("enable", 0644, call->dir, call, | 1090 | trace_create_file("enable", 0644, call->dir, call, |
968 | enable); | 1091 | enable); |
969 | 1092 | ||
970 | #ifdef CONFIG_PERF_EVENTS | 1093 | #ifdef CONFIG_PERF_EVENTS |
971 | if (call->event.type && (call->class->perf_probe || call->class->reg)) | 1094 | if (call->event.type && call->class->reg) |
972 | trace_create_file("id", 0444, call->dir, call, | 1095 | trace_create_file("id", 0444, call->dir, call, |
973 | id); | 1096 | id); |
974 | #endif | 1097 | #endif |
975 | 1098 | ||
976 | if (call->class->define_fields) { | 1099 | /* |
977 | /* | 1100 | * Other events may have the same class. Only update |
978 | * Other events may have the same class. Only update | 1101 | * the fields if they are not already defined. |
979 | * the fields if they are not already defined. | 1102 | */ |
980 | */ | 1103 | head = trace_get_fields(call); |
981 | head = trace_get_fields(call); | 1104 | if (list_empty(head)) { |
982 | if (list_empty(head)) { | 1105 | ret = call->class->define_fields(call); |
983 | ret = trace_define_common_fields(call); | 1106 | if (ret < 0) { |
984 | if (!ret) | 1107 | pr_warning("Could not initialize trace point" |
985 | ret = call->class->define_fields(call); | 1108 | " events/%s\n", call->name); |
986 | if (ret < 0) { | 1109 | return ret; |
987 | pr_warning("Could not initialize trace point" | ||
988 | " events/%s\n", call->name); | ||
989 | return ret; | ||
990 | } | ||
991 | } | 1110 | } |
992 | trace_create_file("filter", 0644, call->dir, call, | ||
993 | filter); | ||
994 | } | 1111 | } |
1112 | trace_create_file("filter", 0644, call->dir, call, | ||
1113 | filter); | ||
995 | 1114 | ||
996 | trace_create_file("format", 0444, call->dir, call, | 1115 | trace_create_file("format", 0444, call->dir, call, |
997 | format); | 1116 | format); |
@@ -999,11 +1118,17 @@ event_create_dir(struct ftrace_event_call *call, struct dentry *d_events, | |||
999 | return 0; | 1118 | return 0; |
1000 | } | 1119 | } |
1001 | 1120 | ||
1002 | static int __trace_add_event_call(struct ftrace_event_call *call) | 1121 | static int |
1122 | __trace_add_event_call(struct ftrace_event_call *call, struct module *mod, | ||
1123 | const struct file_operations *id, | ||
1124 | const struct file_operations *enable, | ||
1125 | const struct file_operations *filter, | ||
1126 | const struct file_operations *format) | ||
1003 | { | 1127 | { |
1004 | struct dentry *d_events; | 1128 | struct dentry *d_events; |
1005 | int ret; | 1129 | int ret; |
1006 | 1130 | ||
1131 | /* The linker may leave blanks */ | ||
1007 | if (!call->name) | 1132 | if (!call->name) |
1008 | return -EINVAL; | 1133 | return -EINVAL; |
1009 | 1134 | ||
@@ -1011,8 +1136,8 @@ static int __trace_add_event_call(struct ftrace_event_call *call) | |||
1011 | ret = call->class->raw_init(call); | 1136 | ret = call->class->raw_init(call); |
1012 | if (ret < 0) { | 1137 | if (ret < 0) { |
1013 | if (ret != -ENOSYS) | 1138 | if (ret != -ENOSYS) |
1014 | pr_warning("Could not initialize trace " | 1139 | pr_warning("Could not initialize trace events/%s\n", |
1015 | "events/%s\n", call->name); | 1140 | call->name); |
1016 | return ret; | 1141 | return ret; |
1017 | } | 1142 | } |
1018 | } | 1143 | } |
@@ -1021,11 +1146,10 @@ static int __trace_add_event_call(struct ftrace_event_call *call) | |||
1021 | if (!d_events) | 1146 | if (!d_events) |
1022 | return -ENOENT; | 1147 | return -ENOENT; |
1023 | 1148 | ||
1024 | ret = event_create_dir(call, d_events, &ftrace_event_id_fops, | 1149 | ret = event_create_dir(call, d_events, id, enable, filter, format); |
1025 | &ftrace_enable_fops, &ftrace_event_filter_fops, | ||
1026 | &ftrace_event_format_fops); | ||
1027 | if (!ret) | 1150 | if (!ret) |
1028 | list_add(&call->list, &ftrace_events); | 1151 | list_add(&call->list, &ftrace_events); |
1152 | call->mod = mod; | ||
1029 | 1153 | ||
1030 | return ret; | 1154 | return ret; |
1031 | } | 1155 | } |
@@ -1035,7 +1159,10 @@ int trace_add_event_call(struct ftrace_event_call *call) | |||
1035 | { | 1159 | { |
1036 | int ret; | 1160 | int ret; |
1037 | mutex_lock(&event_mutex); | 1161 | mutex_lock(&event_mutex); |
1038 | ret = __trace_add_event_call(call); | 1162 | ret = __trace_add_event_call(call, NULL, &ftrace_event_id_fops, |
1163 | &ftrace_enable_fops, | ||
1164 | &ftrace_event_filter_fops, | ||
1165 | &ftrace_event_format_fops); | ||
1039 | mutex_unlock(&event_mutex); | 1166 | mutex_unlock(&event_mutex); |
1040 | return ret; | 1167 | return ret; |
1041 | } | 1168 | } |
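trace_add_event_call() now delegates to a single __trace_add_event_call() that is handed the file_operations to install, so built-in events and module events (which need module-aware fops wrappers) can share one path. A small sketch of the same parameterization using the generic debugfs API; the names and files here are illustrative, not the tracing helpers:

/* Sketch: one helper creates the per-event files, the caller chooses the fops. */
#include <linux/debugfs.h>
#include <linux/errno.h>
#include <linux/fs.h>

struct my_event;        /* opaque for the sketch */

static int my_event_create_files(struct my_event *ev, struct dentry *dir,
                                 const struct file_operations *enable_fops,
                                 const struct file_operations *format_fops)
{
        if (!debugfs_create_file("enable", 0644, dir, ev, enable_fops))
                return -ENOMEM;
        if (!debugfs_create_file("format", 0444, dir, ev, format_fops))
                return -ENOMEM;
        return 0;
}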
@@ -1152,8 +1279,6 @@ static void trace_module_add_events(struct module *mod) | |||
1152 | { | 1279 | { |
1153 | struct ftrace_module_file_ops *file_ops = NULL; | 1280 | struct ftrace_module_file_ops *file_ops = NULL; |
1154 | struct ftrace_event_call *call, *start, *end; | 1281 | struct ftrace_event_call *call, *start, *end; |
1155 | struct dentry *d_events; | ||
1156 | int ret; | ||
1157 | 1282 | ||
1158 | start = mod->trace_events; | 1283 | start = mod->trace_events; |
1159 | end = mod->trace_events + mod->num_trace_events; | 1284 | end = mod->trace_events + mod->num_trace_events; |
@@ -1161,38 +1286,14 @@ static void trace_module_add_events(struct module *mod) | |||
1161 | if (start == end) | 1286 | if (start == end) |
1162 | return; | 1287 | return; |
1163 | 1288 | ||
1164 | d_events = event_trace_events_dir(); | 1289 | file_ops = trace_create_file_ops(mod); |
1165 | if (!d_events) | 1290 | if (!file_ops) |
1166 | return; | 1291 | return; |
1167 | 1292 | ||
1168 | for_each_event(call, start, end) { | 1293 | for_each_event(call, start, end) { |
1169 | /* The linker may leave blanks */ | 1294 | __trace_add_event_call(call, mod, |
1170 | if (!call->name) | ||
1171 | continue; | ||
1172 | if (call->class->raw_init) { | ||
1173 | ret = call->class->raw_init(call); | ||
1174 | if (ret < 0) { | ||
1175 | if (ret != -ENOSYS) | ||
1176 | pr_warning("Could not initialize trace " | ||
1177 | "point events/%s\n", call->name); | ||
1178 | continue; | ||
1179 | } | ||
1180 | } | ||
1181 | /* | ||
1182 | * This module has events, create file ops for this module | ||
1183 | * if not already done. | ||
1184 | */ | ||
1185 | if (!file_ops) { | ||
1186 | file_ops = trace_create_file_ops(mod); | ||
1187 | if (!file_ops) | ||
1188 | return; | ||
1189 | } | ||
1190 | call->mod = mod; | ||
1191 | ret = event_create_dir(call, d_events, | ||
1192 | &file_ops->id, &file_ops->enable, | 1295 | &file_ops->id, &file_ops->enable, |
1193 | &file_ops->filter, &file_ops->format); | 1296 | &file_ops->filter, &file_ops->format); |
1194 | if (!ret) | ||
1195 | list_add(&call->list, &ftrace_events); | ||
1196 | } | 1297 | } |
1197 | } | 1298 | } |
1198 | 1299 | ||
@@ -1319,25 +1420,14 @@ static __init int event_trace_init(void) | |||
1319 | trace_create_file("enable", 0644, d_events, | 1420 | trace_create_file("enable", 0644, d_events, |
1320 | NULL, &ftrace_system_enable_fops); | 1421 | NULL, &ftrace_system_enable_fops); |
1321 | 1422 | ||
1423 | if (trace_define_common_fields()) | ||
1424 | pr_warning("tracing: Failed to allocate common fields"); | ||
1425 | |||
1322 | for_each_event(call, __start_ftrace_events, __stop_ftrace_events) { | 1426 | for_each_event(call, __start_ftrace_events, __stop_ftrace_events) { |
1323 | /* The linker may leave blanks */ | 1427 | __trace_add_event_call(call, NULL, &ftrace_event_id_fops, |
1324 | if (!call->name) | ||
1325 | continue; | ||
1326 | if (call->class->raw_init) { | ||
1327 | ret = call->class->raw_init(call); | ||
1328 | if (ret < 0) { | ||
1329 | if (ret != -ENOSYS) | ||
1330 | pr_warning("Could not initialize trace " | ||
1331 | "point events/%s\n", call->name); | ||
1332 | continue; | ||
1333 | } | ||
1334 | } | ||
1335 | ret = event_create_dir(call, d_events, &ftrace_event_id_fops, | ||
1336 | &ftrace_enable_fops, | 1428 | &ftrace_enable_fops, |
1337 | &ftrace_event_filter_fops, | 1429 | &ftrace_event_filter_fops, |
1338 | &ftrace_event_format_fops); | 1430 | &ftrace_event_format_fops); |
1339 | if (!ret) | ||
1340 | list_add(&call->list, &ftrace_events); | ||
1341 | } | 1431 | } |
1342 | 1432 | ||
1343 | while (true) { | 1433 | while (true) { |
@@ -1524,12 +1614,11 @@ function_test_events_call(unsigned long ip, unsigned long parent_ip) | |||
1524 | struct ftrace_entry *entry; | 1614 | struct ftrace_entry *entry; |
1525 | unsigned long flags; | 1615 | unsigned long flags; |
1526 | long disabled; | 1616 | long disabled; |
1527 | int resched; | ||
1528 | int cpu; | 1617 | int cpu; |
1529 | int pc; | 1618 | int pc; |
1530 | 1619 | ||
1531 | pc = preempt_count(); | 1620 | pc = preempt_count(); |
1532 | resched = ftrace_preempt_disable(); | 1621 | preempt_disable_notrace(); |
1533 | cpu = raw_smp_processor_id(); | 1622 | cpu = raw_smp_processor_id(); |
1534 | disabled = atomic_inc_return(&per_cpu(ftrace_test_event_disable, cpu)); | 1623 | disabled = atomic_inc_return(&per_cpu(ftrace_test_event_disable, cpu)); |
1535 | 1624 | ||
@@ -1551,7 +1640,7 @@ function_test_events_call(unsigned long ip, unsigned long parent_ip) | |||
1551 | 1640 | ||
1552 | out: | 1641 | out: |
1553 | atomic_dec(&per_cpu(ftrace_test_event_disable, cpu)); | 1642 | atomic_dec(&per_cpu(ftrace_test_event_disable, cpu)); |
1554 | ftrace_preempt_enable(resched); | 1643 | preempt_enable_notrace(); |
1555 | } | 1644 | } |
1556 | 1645 | ||
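Throughout these files the resched-tracking pair ftrace_preempt_disable()/ftrace_preempt_enable() is replaced by plain preempt_disable_notrace()/preempt_enable_notrace(); the extra bookkeeping is no longer needed. A hedged sketch of the resulting callback shape (my_trace_disable is an illustrative per-CPU guard, not ftrace state):

/* Sketch: notrace preempt disabling plus a per-CPU recursion counter. */
#include <linux/kernel.h>
#include <linux/percpu.h>
#include <linux/preempt.h>
#include <linux/smp.h>
#include <asm/atomic.h>

static DEFINE_PER_CPU(atomic_t, my_trace_disable);

static void my_trace_callback(unsigned long ip, unsigned long parent_ip)
{
        long disabled;
        int cpu;

        preempt_disable_notrace();      /* replaces ftrace_preempt_disable() */
        cpu = raw_smp_processor_id();
        disabled = atomic_inc_return(&per_cpu(my_trace_disable, cpu));

        if (likely(disabled == 1)) {
                /* record the event for ip/parent_ip here */
        }

        atomic_dec(&per_cpu(my_trace_disable, cpu));
        preempt_enable_notrace();       /* replaces ftrace_preempt_enable() */
}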
1557 | static struct ftrace_ops trace_ops __initdata = | 1646 | static struct ftrace_ops trace_ops __initdata = |
diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index 57bb1bb32999..36d40104b17f 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c | |||
@@ -497,12 +497,10 @@ void print_subsystem_event_filter(struct event_subsystem *system, | |||
497 | } | 497 | } |
498 | 498 | ||
499 | static struct ftrace_event_field * | 499 | static struct ftrace_event_field * |
500 | find_event_field(struct ftrace_event_call *call, char *name) | 500 | __find_event_field(struct list_head *head, char *name) |
501 | { | 501 | { |
502 | struct ftrace_event_field *field; | 502 | struct ftrace_event_field *field; |
503 | struct list_head *head; | ||
504 | 503 | ||
505 | head = trace_get_fields(call); | ||
506 | list_for_each_entry(field, head, link) { | 504 | list_for_each_entry(field, head, link) { |
507 | if (!strcmp(field->name, name)) | 505 | if (!strcmp(field->name, name)) |
508 | return field; | 506 | return field; |
@@ -511,6 +509,20 @@ find_event_field(struct ftrace_event_call *call, char *name) | |||
511 | return NULL; | 509 | return NULL; |
512 | } | 510 | } |
513 | 511 | ||
512 | static struct ftrace_event_field * | ||
513 | find_event_field(struct ftrace_event_call *call, char *name) | ||
514 | { | ||
515 | struct ftrace_event_field *field; | ||
516 | struct list_head *head; | ||
517 | |||
518 | field = __find_event_field(&ftrace_common_fields, name); | ||
519 | if (field) | ||
520 | return field; | ||
521 | |||
522 | head = trace_get_fields(call); | ||
523 | return __find_event_field(head, name); | ||
524 | } | ||
525 | |||
514 | static void filter_free_pred(struct filter_pred *pred) | 526 | static void filter_free_pred(struct filter_pred *pred) |
515 | { | 527 | { |
516 | if (!pred) | 528 | if (!pred) |
@@ -627,9 +639,6 @@ static int init_subsystem_preds(struct event_subsystem *system) | |||
627 | int err; | 639 | int err; |
628 | 640 | ||
629 | list_for_each_entry(call, &ftrace_events, list) { | 641 | list_for_each_entry(call, &ftrace_events, list) { |
630 | if (!call->class || !call->class->define_fields) | ||
631 | continue; | ||
632 | |||
633 | if (strcmp(call->class->system, system->name) != 0) | 642 | if (strcmp(call->class->system, system->name) != 0) |
634 | continue; | 643 | continue; |
635 | 644 | ||
@@ -646,9 +655,6 @@ static void filter_free_subsystem_preds(struct event_subsystem *system) | |||
646 | struct ftrace_event_call *call; | 655 | struct ftrace_event_call *call; |
647 | 656 | ||
648 | list_for_each_entry(call, &ftrace_events, list) { | 657 | list_for_each_entry(call, &ftrace_events, list) { |
649 | if (!call->class || !call->class->define_fields) | ||
650 | continue; | ||
651 | |||
652 | if (strcmp(call->class->system, system->name) != 0) | 658 | if (strcmp(call->class->system, system->name) != 0) |
653 | continue; | 659 | continue; |
654 | 660 | ||
@@ -1251,9 +1257,6 @@ static int replace_system_preds(struct event_subsystem *system, | |||
1251 | list_for_each_entry(call, &ftrace_events, list) { | 1257 | list_for_each_entry(call, &ftrace_events, list) { |
1252 | struct event_filter *filter = call->filter; | 1258 | struct event_filter *filter = call->filter; |
1253 | 1259 | ||
1254 | if (!call->class || !call->class->define_fields) | ||
1255 | continue; | ||
1256 | |||
1257 | if (strcmp(call->class->system, system->name) != 0) | 1260 | if (strcmp(call->class->system, system->name) != 0) |
1258 | continue; | 1261 | continue; |
1259 | 1262 | ||
diff --git a/kernel/trace/trace_export.c b/kernel/trace/trace_export.c index 8536e2a65969..4ba44deaac25 100644 --- a/kernel/trace/trace_export.c +++ b/kernel/trace/trace_export.c | |||
@@ -125,12 +125,6 @@ ftrace_define_fields_##name(struct ftrace_event_call *event_call) \ | |||
125 | 125 | ||
126 | #include "trace_entries.h" | 126 | #include "trace_entries.h" |
127 | 127 | ||
128 | static int ftrace_raw_init_event(struct ftrace_event_call *call) | ||
129 | { | ||
130 | INIT_LIST_HEAD(&call->class->fields); | ||
131 | return 0; | ||
132 | } | ||
133 | |||
134 | #undef __entry | 128 | #undef __entry |
135 | #define __entry REC | 129 | #define __entry REC |
136 | 130 | ||
@@ -158,7 +152,7 @@ static int ftrace_raw_init_event(struct ftrace_event_call *call) | |||
158 | struct ftrace_event_class event_class_ftrace_##call = { \ | 152 | struct ftrace_event_class event_class_ftrace_##call = { \ |
159 | .system = __stringify(TRACE_SYSTEM), \ | 153 | .system = __stringify(TRACE_SYSTEM), \ |
160 | .define_fields = ftrace_define_fields_##call, \ | 154 | .define_fields = ftrace_define_fields_##call, \ |
161 | .raw_init = ftrace_raw_init_event, \ | 155 | .fields = LIST_HEAD_INIT(event_class_ftrace_##call.fields),\ |
162 | }; \ | 156 | }; \ |
163 | \ | 157 | \ |
164 | struct ftrace_event_call __used \ | 158 | struct ftrace_event_call __used \ |
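The ftrace event classes drop their raw_init callback, whose only job was INIT_LIST_HEAD() on the field list, and initialize the list statically instead. A short sketch of the two equivalent forms (my_class is illustrative, not the ftrace structure):

/* Sketch: compile-time vs. runtime list-head initialization. */
#include <linux/list.h>

struct my_class {
        const char *system;
        struct list_head fields;
};

/* Compile-time initialization, no init callback needed: */
static struct my_class my_class_static = {
        .system = "demo",
        .fields = LIST_HEAD_INIT(my_class_static.fields),
};

/* The runtime equivalent that the removed raw_init callback performed: */
static void my_class_init(struct my_class *c)
{
        INIT_LIST_HEAD(&c->fields);
}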
diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c index b3f3776b0cd6..16aee4d44e8f 100644 --- a/kernel/trace/trace_functions.c +++ b/kernel/trace/trace_functions.c | |||
@@ -54,14 +54,14 @@ function_trace_call_preempt_only(unsigned long ip, unsigned long parent_ip) | |||
54 | struct trace_array_cpu *data; | 54 | struct trace_array_cpu *data; |
55 | unsigned long flags; | 55 | unsigned long flags; |
56 | long disabled; | 56 | long disabled; |
57 | int cpu, resched; | 57 | int cpu; |
58 | int pc; | 58 | int pc; |
59 | 59 | ||
60 | if (unlikely(!ftrace_function_enabled)) | 60 | if (unlikely(!ftrace_function_enabled)) |
61 | return; | 61 | return; |
62 | 62 | ||
63 | pc = preempt_count(); | 63 | pc = preempt_count(); |
64 | resched = ftrace_preempt_disable(); | 64 | preempt_disable_notrace(); |
65 | local_save_flags(flags); | 65 | local_save_flags(flags); |
66 | cpu = raw_smp_processor_id(); | 66 | cpu = raw_smp_processor_id(); |
67 | data = tr->data[cpu]; | 67 | data = tr->data[cpu]; |
@@ -71,7 +71,7 @@ function_trace_call_preempt_only(unsigned long ip, unsigned long parent_ip) | |||
71 | trace_function(tr, ip, parent_ip, flags, pc); | 71 | trace_function(tr, ip, parent_ip, flags, pc); |
72 | 72 | ||
73 | atomic_dec(&data->disabled); | 73 | atomic_dec(&data->disabled); |
74 | ftrace_preempt_enable(resched); | 74 | preempt_enable_notrace(); |
75 | } | 75 | } |
76 | 76 | ||
77 | static void | 77 | static void |
diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index 79f4bac99a94..76b05980225c 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c | |||
@@ -15,15 +15,19 @@ | |||
15 | #include "trace.h" | 15 | #include "trace.h" |
16 | #include "trace_output.h" | 16 | #include "trace_output.h" |
17 | 17 | ||
18 | /* When set, irq functions will be ignored */ | ||
19 | static int ftrace_graph_skip_irqs; | ||
20 | |||
18 | struct fgraph_cpu_data { | 21 | struct fgraph_cpu_data { |
19 | pid_t last_pid; | 22 | pid_t last_pid; |
20 | int depth; | 23 | int depth; |
24 | int depth_irq; | ||
21 | int ignore; | 25 | int ignore; |
22 | unsigned long enter_funcs[FTRACE_RETFUNC_DEPTH]; | 26 | unsigned long enter_funcs[FTRACE_RETFUNC_DEPTH]; |
23 | }; | 27 | }; |
24 | 28 | ||
25 | struct fgraph_data { | 29 | struct fgraph_data { |
26 | struct fgraph_cpu_data *cpu_data; | 30 | struct fgraph_cpu_data __percpu *cpu_data; |
27 | 31 | ||
28 | /* Place to preserve last processed entry. */ | 32 | /* Place to preserve last processed entry. */ |
29 | struct ftrace_graph_ent_entry ent; | 33 | struct ftrace_graph_ent_entry ent; |
@@ -41,6 +45,7 @@ struct fgraph_data { | |||
41 | #define TRACE_GRAPH_PRINT_PROC 0x8 | 45 | #define TRACE_GRAPH_PRINT_PROC 0x8 |
42 | #define TRACE_GRAPH_PRINT_DURATION 0x10 | 46 | #define TRACE_GRAPH_PRINT_DURATION 0x10 |
43 | #define TRACE_GRAPH_PRINT_ABS_TIME 0x20 | 47 | #define TRACE_GRAPH_PRINT_ABS_TIME 0x20 |
48 | #define TRACE_GRAPH_PRINT_IRQS 0x40 | ||
44 | 49 | ||
45 | static struct tracer_opt trace_opts[] = { | 50 | static struct tracer_opt trace_opts[] = { |
46 | /* Display overruns? (for self-debug purpose) */ | 51 | /* Display overruns? (for self-debug purpose) */ |
@@ -55,13 +60,15 @@ static struct tracer_opt trace_opts[] = { | |||
55 | { TRACER_OPT(funcgraph-duration, TRACE_GRAPH_PRINT_DURATION) }, | 60 | { TRACER_OPT(funcgraph-duration, TRACE_GRAPH_PRINT_DURATION) }, |
56 | /* Display absolute time of an entry */ | 61 | /* Display absolute time of an entry */ |
57 | { TRACER_OPT(funcgraph-abstime, TRACE_GRAPH_PRINT_ABS_TIME) }, | 62 | { TRACER_OPT(funcgraph-abstime, TRACE_GRAPH_PRINT_ABS_TIME) }, |
63 | /* Display interrupts */ | ||
64 | { TRACER_OPT(funcgraph-irqs, TRACE_GRAPH_PRINT_IRQS) }, | ||
58 | { } /* Empty entry */ | 65 | { } /* Empty entry */ |
59 | }; | 66 | }; |
60 | 67 | ||
61 | static struct tracer_flags tracer_flags = { | 68 | static struct tracer_flags tracer_flags = { |
62 | /* Don't display overruns and proc by default */ | 69 | /* Don't display overruns and proc by default */ |
63 | .val = TRACE_GRAPH_PRINT_CPU | TRACE_GRAPH_PRINT_OVERHEAD | | 70 | .val = TRACE_GRAPH_PRINT_CPU | TRACE_GRAPH_PRINT_OVERHEAD | |
64 | TRACE_GRAPH_PRINT_DURATION, | 71 | TRACE_GRAPH_PRINT_DURATION | TRACE_GRAPH_PRINT_IRQS, |
65 | .opts = trace_opts | 72 | .opts = trace_opts |
66 | }; | 73 | }; |
67 | 74 | ||
@@ -204,6 +211,14 @@ int __trace_graph_entry(struct trace_array *tr, | |||
204 | return 1; | 211 | return 1; |
205 | } | 212 | } |
206 | 213 | ||
214 | static inline int ftrace_graph_ignore_irqs(void) | ||
215 | { | ||
216 | if (!ftrace_graph_skip_irqs) | ||
217 | return 0; | ||
218 | |||
219 | return in_irq(); | ||
220 | } | ||
221 | |||
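ftrace_graph_ignore_irqs() drops graph entries recorded while the CPU is servicing a hard interrupt, but only when the new funcgraph-irqs option is cleared. A minimal sketch of the same in_irq() gate (skip_irqs is an assumed local flag mirroring ftrace_graph_skip_irqs):

/* Sketch: gate a trace hook on hard-irq context. */
#include <linux/hardirq.h>

static int skip_irqs;

static int my_entry_hook(unsigned long func)
{
        if (skip_irqs && in_irq())
                return 0;       /* drop events generated in hard-irq context */

        /* record func here */
        return 1;
}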
207 | int trace_graph_entry(struct ftrace_graph_ent *trace) | 222 | int trace_graph_entry(struct ftrace_graph_ent *trace) |
208 | { | 223 | { |
209 | struct trace_array *tr = graph_array; | 224 | struct trace_array *tr = graph_array; |
@@ -218,7 +233,8 @@ int trace_graph_entry(struct ftrace_graph_ent *trace) | |||
218 | return 0; | 233 | return 0; |
219 | 234 | ||
220 | /* trace it when it is-nested-in or is a function enabled. */ | 235 | /* trace it when it is-nested-in or is a function enabled. */ |
221 | if (!(trace->depth || ftrace_graph_addr(trace->func))) | 236 | if (!(trace->depth || ftrace_graph_addr(trace->func)) || |
237 | ftrace_graph_ignore_irqs()) | ||
222 | return 0; | 238 | return 0; |
223 | 239 | ||
224 | local_irq_save(flags); | 240 | local_irq_save(flags); |
@@ -246,6 +262,34 @@ int trace_graph_thresh_entry(struct ftrace_graph_ent *trace) | |||
246 | return trace_graph_entry(trace); | 262 | return trace_graph_entry(trace); |
247 | } | 263 | } |
248 | 264 | ||
265 | static void | ||
266 | __trace_graph_function(struct trace_array *tr, | ||
267 | unsigned long ip, unsigned long flags, int pc) | ||
268 | { | ||
269 | u64 time = trace_clock_local(); | ||
270 | struct ftrace_graph_ent ent = { | ||
271 | .func = ip, | ||
272 | .depth = 0, | ||
273 | }; | ||
274 | struct ftrace_graph_ret ret = { | ||
275 | .func = ip, | ||
276 | .depth = 0, | ||
277 | .calltime = time, | ||
278 | .rettime = time, | ||
279 | }; | ||
280 | |||
281 | __trace_graph_entry(tr, &ent, flags, pc); | ||
282 | __trace_graph_return(tr, &ret, flags, pc); | ||
283 | } | ||
284 | |||
285 | void | ||
286 | trace_graph_function(struct trace_array *tr, | ||
287 | unsigned long ip, unsigned long parent_ip, | ||
288 | unsigned long flags, int pc) | ||
289 | { | ||
290 | __trace_graph_function(tr, ip, flags, pc); | ||
291 | } | ||
292 | |||
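trace_graph_function() (moved here from the irqsoff tracer) makes a single function hit presentable in graph output: it writes an entry record and a return record sharing one timestamp, which the printer renders as a zero-duration leaf. A sketch of the same idea with placeholder writers (record_entry/record_return and the fake structs are hypothetical, not the ring-buffer API):

/* Sketch: emit a synthetic zero-duration leaf as an entry/return pair. */
#include <linux/types.h>

struct fake_ent { unsigned long func; int depth; };
struct fake_ret { unsigned long func; int depth; u64 calltime, rettime; };

static void record_entry(struct fake_ent *e)  { /* write to a buffer */ }
static void record_return(struct fake_ret *r) { /* write to a buffer */ }

static void emit_leaf(unsigned long ip, u64 now)
{
        struct fake_ent ent = { .func = ip, .depth = 0 };
        struct fake_ret ret = {
                .func = ip, .depth = 0,
                .calltime = now, .rettime = now,        /* zero duration */
        };

        record_entry(&ent);
        record_return(&ret);
}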
249 | void __trace_graph_return(struct trace_array *tr, | 293 | void __trace_graph_return(struct trace_array *tr, |
250 | struct ftrace_graph_ret *trace, | 294 | struct ftrace_graph_ret *trace, |
251 | unsigned long flags, | 295 | unsigned long flags, |
@@ -507,7 +551,15 @@ get_return_for_leaf(struct trace_iterator *iter, | |||
507 | * if the output fails. | 551 | * if the output fails. |
508 | */ | 552 | */ |
509 | data->ent = *curr; | 553 | data->ent = *curr; |
510 | data->ret = *next; | 554 | /* |
555 | * If the next event is not a return type, then | ||
556 | * we only care about what type it is. Otherwise we can | ||
557 | * safely copy the entire event. | ||
558 | */ | ||
559 | if (next->ent.type == TRACE_GRAPH_RET) | ||
560 | data->ret = *next; | ||
561 | else | ||
562 | data->ret.ent.type = next->ent.type; | ||
511 | } | 563 | } |
512 | } | 564 | } |
513 | 565 | ||
@@ -641,7 +693,9 @@ trace_print_graph_duration(unsigned long long duration, struct trace_seq *s) | |||
641 | 693 | ||
642 | /* Print nsecs (we don't want to exceed 7 numbers) */ | 694 | /* Print nsecs (we don't want to exceed 7 numbers) */ |
643 | if (len < 7) { | 695 | if (len < 7) { |
644 | snprintf(nsecs_str, 8 - len, "%03lu", nsecs_rem); | 696 | size_t slen = min_t(size_t, sizeof(nsecs_str), 8UL - len); |
697 | |||
698 | snprintf(nsecs_str, slen, "%03lu", nsecs_rem); | ||
645 | ret = trace_seq_printf(s, ".%s", nsecs_str); | 699 | ret = trace_seq_printf(s, ".%s", nsecs_str); |
646 | if (!ret) | 700 | if (!ret) |
647 | return TRACE_TYPE_PARTIAL_LINE; | 701 | return TRACE_TYPE_PARTIAL_LINE; |
@@ -846,6 +900,108 @@ print_graph_prologue(struct trace_iterator *iter, struct trace_seq *s, | |||
846 | return 0; | 900 | return 0; |
847 | } | 901 | } |
848 | 902 | ||
903 | /* | ||
904 | * Entry check for irq code | ||
905 | * | ||
906 | * returns 1 if | ||
907 | * - we are inside irq code | ||
908 | * - we just entered irq code | ||
909 | * | ||
910 | * returns 0 if | ||
911 | * - funcgraph-interrupts option is set | ||
912 | * - we are not inside irq code | ||
913 | */ | ||
914 | static int | ||
915 | check_irq_entry(struct trace_iterator *iter, u32 flags, | ||
916 | unsigned long addr, int depth) | ||
917 | { | ||
918 | int cpu = iter->cpu; | ||
919 | int *depth_irq; | ||
920 | struct fgraph_data *data = iter->private; | ||
921 | |||
922 | /* | ||
923 | * If we are either displaying irqs, or we got called as | ||
924 | * a graph event and private data does not exist, | ||
925 | * then we bypass the irq check. | ||
926 | */ | ||
927 | if ((flags & TRACE_GRAPH_PRINT_IRQS) || | ||
928 | (!data)) | ||
929 | return 0; | ||
930 | |||
931 | depth_irq = &(per_cpu_ptr(data->cpu_data, cpu)->depth_irq); | ||
932 | |||
933 | /* | ||
934 | * We are inside the irq code | ||
935 | */ | ||
936 | if (*depth_irq >= 0) | ||
937 | return 1; | ||
938 | |||
939 | if ((addr < (unsigned long)__irqentry_text_start) || | ||
940 | (addr >= (unsigned long)__irqentry_text_end)) | ||
941 | return 0; | ||
942 | |||
943 | /* | ||
944 | * We are entering irq code. | ||
945 | */ | ||
946 | *depth_irq = depth; | ||
947 | return 1; | ||
948 | } | ||
949 | |||
950 | /* | ||
951 | * Return check for irq code | ||
952 | * | ||
953 | * returns 1 if | ||
954 | * - we are inside irq code | ||
955 | * - we just left irq code | ||
956 | * | ||
957 | * returns 0 if | ||
958 | * - funcgraph-interrupts option is set | ||
959 | * - we are not inside irq code | ||
960 | */ | ||
961 | static int | ||
962 | check_irq_return(struct trace_iterator *iter, u32 flags, int depth) | ||
963 | { | ||
964 | int cpu = iter->cpu; | ||
965 | int *depth_irq; | ||
966 | struct fgraph_data *data = iter->private; | ||
967 | |||
968 | /* | ||
969 | * If we are either displaying irqs, or we got called as | ||
970 | * a graph event and private data does not exist, | ||
971 | * then we bypass the irq check. | ||
972 | */ | ||
973 | if ((flags & TRACE_GRAPH_PRINT_IRQS) || | ||
974 | (!data)) | ||
975 | return 0; | ||
976 | |||
977 | depth_irq = &(per_cpu_ptr(data->cpu_data, cpu)->depth_irq); | ||
978 | |||
979 | /* | ||
980 | * We are not inside the irq code. | ||
981 | */ | ||
982 | if (*depth_irq == -1) | ||
983 | return 0; | ||
984 | |||
985 | /* | ||
986 | * We are inside the irq code, and this is the return entry. | ||
987 | * Let's not trace it and clear the entry depth, since | ||
988 | * we are out of irq code. | ||
989 | * | ||
990 | * This condition ensures that we 'leave the irq code' once | ||
991 | * we are out of the entry depth. Thus protecting us from | ||
992 | * the RETURN entry loss. | ||
993 | */ | ||
994 | if (*depth_irq >= depth) { | ||
995 | *depth_irq = -1; | ||
996 | return 1; | ||
997 | } | ||
998 | |||
999 | /* | ||
1000 | * We are inside the irq code, and this is not the entry. | ||
1001 | */ | ||
1002 | return 1; | ||
1003 | } | ||
1004 | |||
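check_irq_entry()/check_irq_return() keep a per-CPU depth_irq marker: -1 means "not inside irq code", otherwise it records the call depth at which the irq entry function was seen, and everything up to the matching (or shallower) return is suppressed. A compact sketch of that depth-matching state machine outside the iterator context (entry_depth and the helpers are illustrative, not the ftrace per-CPU state):

/* Sketch of the depth-matching suppression used above. */
static int entry_depth = -1;    /* -1: not inside a suppressed region */

static int suppress_entry(int is_irq_func, int depth)
{
        if (entry_depth >= 0)
                return 1;               /* already inside, keep suppressing */
        if (!is_irq_func)
                return 0;
        entry_depth = depth;            /* remember where the region started */
        return 1;
}

static int suppress_return(int depth)
{
        if (entry_depth == -1)
                return 0;
        if (entry_depth >= depth)       /* left the region (or lost a return) */
                entry_depth = -1;
        return 1;
}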
849 | static enum print_line_t | 1005 | static enum print_line_t |
850 | print_graph_entry(struct ftrace_graph_ent_entry *field, struct trace_seq *s, | 1006 | print_graph_entry(struct ftrace_graph_ent_entry *field, struct trace_seq *s, |
851 | struct trace_iterator *iter, u32 flags) | 1007 | struct trace_iterator *iter, u32 flags) |
@@ -856,6 +1012,9 @@ print_graph_entry(struct ftrace_graph_ent_entry *field, struct trace_seq *s, | |||
856 | static enum print_line_t ret; | 1012 | static enum print_line_t ret; |
857 | int cpu = iter->cpu; | 1013 | int cpu = iter->cpu; |
858 | 1014 | ||
1015 | if (check_irq_entry(iter, flags, call->func, call->depth)) | ||
1016 | return TRACE_TYPE_HANDLED; | ||
1017 | |||
859 | if (print_graph_prologue(iter, s, TRACE_GRAPH_ENT, call->func, flags)) | 1018 | if (print_graph_prologue(iter, s, TRACE_GRAPH_ENT, call->func, flags)) |
860 | return TRACE_TYPE_PARTIAL_LINE; | 1019 | return TRACE_TYPE_PARTIAL_LINE; |
861 | 1020 | ||
@@ -893,6 +1052,9 @@ print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s, | |||
893 | int ret; | 1052 | int ret; |
894 | int i; | 1053 | int i; |
895 | 1054 | ||
1055 | if (check_irq_return(iter, flags, trace->depth)) | ||
1056 | return TRACE_TYPE_HANDLED; | ||
1057 | |||
896 | if (data) { | 1058 | if (data) { |
897 | struct fgraph_cpu_data *cpu_data; | 1059 | struct fgraph_cpu_data *cpu_data; |
898 | int cpu = iter->cpu; | 1060 | int cpu = iter->cpu; |
@@ -1045,7 +1207,7 @@ print_graph_comment(struct trace_seq *s, struct trace_entry *ent, | |||
1045 | 1207 | ||
1046 | 1208 | ||
1047 | enum print_line_t | 1209 | enum print_line_t |
1048 | print_graph_function_flags(struct trace_iterator *iter, u32 flags) | 1210 | __print_graph_function_flags(struct trace_iterator *iter, u32 flags) |
1049 | { | 1211 | { |
1050 | struct ftrace_graph_ent_entry *field; | 1212 | struct ftrace_graph_ent_entry *field; |
1051 | struct fgraph_data *data = iter->private; | 1213 | struct fgraph_data *data = iter->private; |
@@ -1108,7 +1270,18 @@ print_graph_function_flags(struct trace_iterator *iter, u32 flags) | |||
1108 | static enum print_line_t | 1270 | static enum print_line_t |
1109 | print_graph_function(struct trace_iterator *iter) | 1271 | print_graph_function(struct trace_iterator *iter) |
1110 | { | 1272 | { |
1111 | return print_graph_function_flags(iter, tracer_flags.val); | 1273 | return __print_graph_function_flags(iter, tracer_flags.val); |
1274 | } | ||
1275 | |||
1276 | enum print_line_t print_graph_function_flags(struct trace_iterator *iter, | ||
1277 | u32 flags) | ||
1278 | { | ||
1279 | if (trace_flags & TRACE_ITER_LATENCY_FMT) | ||
1280 | flags |= TRACE_GRAPH_PRINT_DURATION; | ||
1281 | else | ||
1282 | flags |= TRACE_GRAPH_PRINT_ABS_TIME; | ||
1283 | |||
1284 | return __print_graph_function_flags(iter, flags); | ||
1112 | } | 1285 | } |
1113 | 1286 | ||
1114 | static enum print_line_t | 1287 | static enum print_line_t |
@@ -1140,7 +1313,7 @@ static void print_lat_header(struct seq_file *s, u32 flags) | |||
1140 | seq_printf(s, "#%.*s|||| / \n", size, spaces); | 1313 | seq_printf(s, "#%.*s|||| / \n", size, spaces); |
1141 | } | 1314 | } |
1142 | 1315 | ||
1143 | void print_graph_headers_flags(struct seq_file *s, u32 flags) | 1316 | static void __print_graph_headers_flags(struct seq_file *s, u32 flags) |
1144 | { | 1317 | { |
1145 | int lat = trace_flags & TRACE_ITER_LATENCY_FMT; | 1318 | int lat = trace_flags & TRACE_ITER_LATENCY_FMT; |
1146 | 1319 | ||
@@ -1181,6 +1354,23 @@ void print_graph_headers(struct seq_file *s) | |||
1181 | print_graph_headers_flags(s, tracer_flags.val); | 1354 | print_graph_headers_flags(s, tracer_flags.val); |
1182 | } | 1355 | } |
1183 | 1356 | ||
1357 | void print_graph_headers_flags(struct seq_file *s, u32 flags) | ||
1358 | { | ||
1359 | struct trace_iterator *iter = s->private; | ||
1360 | |||
1361 | if (trace_flags & TRACE_ITER_LATENCY_FMT) { | ||
1362 | /* print nothing if the buffers are empty */ | ||
1363 | if (trace_empty(iter)) | ||
1364 | return; | ||
1365 | |||
1366 | print_trace_header(s, iter); | ||
1367 | flags |= TRACE_GRAPH_PRINT_DURATION; | ||
1368 | } else | ||
1369 | flags |= TRACE_GRAPH_PRINT_ABS_TIME; | ||
1370 | |||
1371 | __print_graph_headers_flags(s, flags); | ||
1372 | } | ||
1373 | |||
1184 | void graph_trace_open(struct trace_iterator *iter) | 1374 | void graph_trace_open(struct trace_iterator *iter) |
1185 | { | 1375 | { |
1186 | /* pid and depth on the last trace processed */ | 1376 | /* pid and depth on the last trace processed */ |
@@ -1201,9 +1391,12 @@ void graph_trace_open(struct trace_iterator *iter) | |||
1201 | pid_t *pid = &(per_cpu_ptr(data->cpu_data, cpu)->last_pid); | 1391 | pid_t *pid = &(per_cpu_ptr(data->cpu_data, cpu)->last_pid); |
1202 | int *depth = &(per_cpu_ptr(data->cpu_data, cpu)->depth); | 1392 | int *depth = &(per_cpu_ptr(data->cpu_data, cpu)->depth); |
1203 | int *ignore = &(per_cpu_ptr(data->cpu_data, cpu)->ignore); | 1393 | int *ignore = &(per_cpu_ptr(data->cpu_data, cpu)->ignore); |
1394 | int *depth_irq = &(per_cpu_ptr(data->cpu_data, cpu)->depth_irq); | ||
1395 | |||
1204 | *pid = -1; | 1396 | *pid = -1; |
1205 | *depth = 0; | 1397 | *depth = 0; |
1206 | *ignore = 0; | 1398 | *ignore = 0; |
1399 | *depth_irq = -1; | ||
1207 | } | 1400 | } |
1208 | 1401 | ||
1209 | iter->private = data; | 1402 | iter->private = data; |
@@ -1226,6 +1419,14 @@ void graph_trace_close(struct trace_iterator *iter) | |||
1226 | } | 1419 | } |
1227 | } | 1420 | } |
1228 | 1421 | ||
1422 | static int func_graph_set_flag(u32 old_flags, u32 bit, int set) | ||
1423 | { | ||
1424 | if (bit == TRACE_GRAPH_PRINT_IRQS) | ||
1425 | ftrace_graph_skip_irqs = !set; | ||
1426 | |||
1427 | return 0; | ||
1428 | } | ||
1429 | |||
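func_graph_set_flag() is the tracer's new set_flag hook, so toggling funcgraph-irqs in trace_options takes effect immediately through ftrace_graph_skip_irqs. A hedged sketch of the tracer_opt/tracer_flags/set_flag trio for a made-up option (this only builds inside kernel/trace/, since the structures live in the local trace.h; my_* names are illustrative):

/* Sketch: wire a tracer option bit to a runtime knob. */
#include <linux/ftrace.h>
#include "trace.h"

#define MY_OPT_VERBOSE  0x1

static int my_verbose;

static struct tracer_opt my_opts[] = {
        { TRACER_OPT(my-verbose, MY_OPT_VERBOSE) },
        { } /* terminator */
};

static struct tracer_flags my_flags = {
        .val    = 0,
        .opts   = my_opts,
};

static int my_set_flag(u32 old_flags, u32 bit, int set)
{
        if (bit == MY_OPT_VERBOSE)
                my_verbose = set;       /* reacts as soon as the bit flips */
        return 0;
}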
1229 | static struct trace_event_functions graph_functions = { | 1430 | static struct trace_event_functions graph_functions = { |
1230 | .trace = print_graph_function_event, | 1431 | .trace = print_graph_function_event, |
1231 | }; | 1432 | }; |
@@ -1252,6 +1453,7 @@ static struct tracer graph_trace __read_mostly = { | |||
1252 | .print_line = print_graph_function, | 1453 | .print_line = print_graph_function, |
1253 | .print_header = print_graph_headers, | 1454 | .print_header = print_graph_headers, |
1254 | .flags = &tracer_flags, | 1455 | .flags = &tracer_flags, |
1456 | .set_flag = func_graph_set_flag, | ||
1255 | #ifdef CONFIG_FTRACE_SELFTEST | 1457 | #ifdef CONFIG_FTRACE_SELFTEST |
1256 | .selftest = trace_selftest_startup_function_graph, | 1458 | .selftest = trace_selftest_startup_function_graph, |
1257 | #endif | 1459 | #endif |
diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c index 6fd486e0cef4..5cf8c602b880 100644 --- a/kernel/trace/trace_irqsoff.c +++ b/kernel/trace/trace_irqsoff.c | |||
@@ -87,14 +87,22 @@ static __cacheline_aligned_in_smp unsigned long max_sequence; | |||
87 | 87 | ||
88 | #ifdef CONFIG_FUNCTION_TRACER | 88 | #ifdef CONFIG_FUNCTION_TRACER |
89 | /* | 89 | /* |
90 | * irqsoff uses its own tracer function to keep the overhead down: | 90 | * Prologue for the preempt and irqs off function tracers. |
91 | * | ||
92 | * Returns 1 if it is OK to continue, and data->disabled is | ||
93 | * incremented. | ||
94 | * 0 if the trace is to be ignored, and data->disabled | ||
95 | * is kept the same. | ||
96 | * | ||
97 | * Note, this function is also used outside this ifdef but | ||
98 | * inside the #ifdef of the function graph tracer below. | ||
99 | * This is OK, since the function graph tracer is | ||
100 | * dependent on the function tracer. | ||
91 | */ | 101 | */ |
92 | static void | 102 | static int func_prolog_dec(struct trace_array *tr, |
93 | irqsoff_tracer_call(unsigned long ip, unsigned long parent_ip) | 103 | struct trace_array_cpu **data, |
104 | unsigned long *flags) | ||
94 | { | 105 | { |
95 | struct trace_array *tr = irqsoff_trace; | ||
96 | struct trace_array_cpu *data; | ||
97 | unsigned long flags; | ||
98 | long disabled; | 106 | long disabled; |
99 | int cpu; | 107 | int cpu; |
100 | 108 | ||
@@ -106,18 +114,38 @@ irqsoff_tracer_call(unsigned long ip, unsigned long parent_ip) | |||
106 | */ | 114 | */ |
107 | cpu = raw_smp_processor_id(); | 115 | cpu = raw_smp_processor_id(); |
108 | if (likely(!per_cpu(tracing_cpu, cpu))) | 116 | if (likely(!per_cpu(tracing_cpu, cpu))) |
109 | return; | 117 | return 0; |
110 | 118 | ||
111 | local_save_flags(flags); | 119 | local_save_flags(*flags); |
112 | /* slight chance to get a false positive on tracing_cpu */ | 120 | /* slight chance to get a false positive on tracing_cpu */ |
113 | if (!irqs_disabled_flags(flags)) | 121 | if (!irqs_disabled_flags(*flags)) |
114 | return; | 122 | return 0; |
115 | 123 | ||
116 | data = tr->data[cpu]; | 124 | *data = tr->data[cpu]; |
117 | disabled = atomic_inc_return(&data->disabled); | 125 | disabled = atomic_inc_return(&(*data)->disabled); |
118 | 126 | ||
119 | if (likely(disabled == 1)) | 127 | if (likely(disabled == 1)) |
120 | trace_function(tr, ip, parent_ip, flags, preempt_count()); | 128 | return 1; |
129 | |||
130 | atomic_dec(&(*data)->disabled); | ||
131 | |||
132 | return 0; | ||
133 | } | ||
134 | |||
135 | /* | ||
136 | * irqsoff uses its own tracer function to keep the overhead down: | ||
137 | */ | ||
138 | static void | ||
139 | irqsoff_tracer_call(unsigned long ip, unsigned long parent_ip) | ||
140 | { | ||
141 | struct trace_array *tr = irqsoff_trace; | ||
142 | struct trace_array_cpu *data; | ||
143 | unsigned long flags; | ||
144 | |||
145 | if (!func_prolog_dec(tr, &data, &flags)) | ||
146 | return; | ||
147 | |||
148 | trace_function(tr, ip, parent_ip, flags, preempt_count()); | ||
121 | 149 | ||
122 | atomic_dec(&data->disabled); | 150 | atomic_dec(&data->disabled); |
123 | } | 151 | } |
@@ -155,30 +183,16 @@ static int irqsoff_graph_entry(struct ftrace_graph_ent *trace) | |||
155 | struct trace_array *tr = irqsoff_trace; | 183 | struct trace_array *tr = irqsoff_trace; |
156 | struct trace_array_cpu *data; | 184 | struct trace_array_cpu *data; |
157 | unsigned long flags; | 185 | unsigned long flags; |
158 | long disabled; | ||
159 | int ret; | 186 | int ret; |
160 | int cpu; | ||
161 | int pc; | 187 | int pc; |
162 | 188 | ||
163 | cpu = raw_smp_processor_id(); | 189 | if (!func_prolog_dec(tr, &data, &flags)) |
164 | if (likely(!per_cpu(tracing_cpu, cpu))) | ||
165 | return 0; | 190 | return 0; |
166 | 191 | ||
167 | local_save_flags(flags); | 192 | pc = preempt_count(); |
168 | /* slight chance to get a false positive on tracing_cpu */ | 193 | ret = __trace_graph_entry(tr, trace, flags, pc); |
169 | if (!irqs_disabled_flags(flags)) | ||
170 | return 0; | ||
171 | |||
172 | data = tr->data[cpu]; | ||
173 | disabled = atomic_inc_return(&data->disabled); | ||
174 | |||
175 | if (likely(disabled == 1)) { | ||
176 | pc = preempt_count(); | ||
177 | ret = __trace_graph_entry(tr, trace, flags, pc); | ||
178 | } else | ||
179 | ret = 0; | ||
180 | |||
181 | atomic_dec(&data->disabled); | 194 | atomic_dec(&data->disabled); |
195 | |||
182 | return ret; | 196 | return ret; |
183 | } | 197 | } |
184 | 198 | ||
@@ -187,27 +201,13 @@ static void irqsoff_graph_return(struct ftrace_graph_ret *trace) | |||
187 | struct trace_array *tr = irqsoff_trace; | 201 | struct trace_array *tr = irqsoff_trace; |
188 | struct trace_array_cpu *data; | 202 | struct trace_array_cpu *data; |
189 | unsigned long flags; | 203 | unsigned long flags; |
190 | long disabled; | ||
191 | int cpu; | ||
192 | int pc; | 204 | int pc; |
193 | 205 | ||
194 | cpu = raw_smp_processor_id(); | 206 | if (!func_prolog_dec(tr, &data, &flags)) |
195 | if (likely(!per_cpu(tracing_cpu, cpu))) | ||
196 | return; | 207 | return; |
197 | 208 | ||
198 | local_save_flags(flags); | 209 | pc = preempt_count(); |
199 | /* slight chance to get a false positive on tracing_cpu */ | 210 | __trace_graph_return(tr, trace, flags, pc); |
200 | if (!irqs_disabled_flags(flags)) | ||
201 | return; | ||
202 | |||
203 | data = tr->data[cpu]; | ||
204 | disabled = atomic_inc_return(&data->disabled); | ||
205 | |||
206 | if (likely(disabled == 1)) { | ||
207 | pc = preempt_count(); | ||
208 | __trace_graph_return(tr, trace, flags, pc); | ||
209 | } | ||
210 | |||
211 | atomic_dec(&data->disabled); | 211 | atomic_dec(&data->disabled); |
212 | } | 212 | } |
213 | 213 | ||
@@ -229,75 +229,33 @@ static void irqsoff_trace_close(struct trace_iterator *iter) | |||
229 | 229 | ||
230 | static enum print_line_t irqsoff_print_line(struct trace_iterator *iter) | 230 | static enum print_line_t irqsoff_print_line(struct trace_iterator *iter) |
231 | { | 231 | { |
232 | u32 flags = GRAPH_TRACER_FLAGS; | ||
233 | |||
234 | if (trace_flags & TRACE_ITER_LATENCY_FMT) | ||
235 | flags |= TRACE_GRAPH_PRINT_DURATION; | ||
236 | else | ||
237 | flags |= TRACE_GRAPH_PRINT_ABS_TIME; | ||
238 | |||
239 | /* | 232 | /* |
240 | * In graph mode call the graph tracer output function, | 233 | * In graph mode call the graph tracer output function, |
241 | * otherwise go with the TRACE_FN event handler | 234 | * otherwise go with the TRACE_FN event handler |
242 | */ | 235 | */ |
243 | if (is_graph()) | 236 | if (is_graph()) |
244 | return print_graph_function_flags(iter, flags); | 237 | return print_graph_function_flags(iter, GRAPH_TRACER_FLAGS); |
245 | 238 | ||
246 | return TRACE_TYPE_UNHANDLED; | 239 | return TRACE_TYPE_UNHANDLED; |
247 | } | 240 | } |
248 | 241 | ||
249 | static void irqsoff_print_header(struct seq_file *s) | 242 | static void irqsoff_print_header(struct seq_file *s) |
250 | { | 243 | { |
251 | if (is_graph()) { | 244 | if (is_graph()) |
252 | struct trace_iterator *iter = s->private; | 245 | print_graph_headers_flags(s, GRAPH_TRACER_FLAGS); |
253 | u32 flags = GRAPH_TRACER_FLAGS; | 246 | else |
254 | |||
255 | if (trace_flags & TRACE_ITER_LATENCY_FMT) { | ||
256 | /* print nothing if the buffers are empty */ | ||
257 | if (trace_empty(iter)) | ||
258 | return; | ||
259 | |||
260 | print_trace_header(s, iter); | ||
261 | flags |= TRACE_GRAPH_PRINT_DURATION; | ||
262 | } else | ||
263 | flags |= TRACE_GRAPH_PRINT_ABS_TIME; | ||
264 | |||
265 | print_graph_headers_flags(s, flags); | ||
266 | } else | ||
267 | trace_default_header(s); | 247 | trace_default_header(s); |
268 | } | 248 | } |
269 | 249 | ||
270 | static void | 250 | static void |
271 | trace_graph_function(struct trace_array *tr, | ||
272 | unsigned long ip, unsigned long flags, int pc) | ||
273 | { | ||
274 | u64 time = trace_clock_local(); | ||
275 | struct ftrace_graph_ent ent = { | ||
276 | .func = ip, | ||
277 | .depth = 0, | ||
278 | }; | ||
279 | struct ftrace_graph_ret ret = { | ||
280 | .func = ip, | ||
281 | .depth = 0, | ||
282 | .calltime = time, | ||
283 | .rettime = time, | ||
284 | }; | ||
285 | |||
286 | __trace_graph_entry(tr, &ent, flags, pc); | ||
287 | __trace_graph_return(tr, &ret, flags, pc); | ||
288 | } | ||
289 | |||
290 | static void | ||
291 | __trace_function(struct trace_array *tr, | 251 | __trace_function(struct trace_array *tr, |
292 | unsigned long ip, unsigned long parent_ip, | 252 | unsigned long ip, unsigned long parent_ip, |
293 | unsigned long flags, int pc) | 253 | unsigned long flags, int pc) |
294 | { | 254 | { |
295 | if (!is_graph()) | 255 | if (is_graph()) |
256 | trace_graph_function(tr, ip, parent_ip, flags, pc); | ||
257 | else | ||
296 | trace_function(tr, ip, parent_ip, flags, pc); | 258 | trace_function(tr, ip, parent_ip, flags, pc); |
297 | else { | ||
298 | trace_graph_function(tr, parent_ip, flags, pc); | ||
299 | trace_graph_function(tr, ip, flags, pc); | ||
300 | } | ||
301 | } | 259 | } |
302 | 260 | ||
303 | #else | 261 | #else |
@@ -649,6 +607,7 @@ static struct tracer irqsoff_tracer __read_mostly = | |||
649 | #endif | 607 | #endif |
650 | .open = irqsoff_trace_open, | 608 | .open = irqsoff_trace_open, |
651 | .close = irqsoff_trace_close, | 609 | .close = irqsoff_trace_close, |
610 | .use_max_tr = 1, | ||
652 | }; | 611 | }; |
653 | # define register_irqsoff(trace) register_tracer(&trace) | 612 | # define register_irqsoff(trace) register_tracer(&trace) |
654 | #else | 613 | #else |
@@ -681,6 +640,7 @@ static struct tracer preemptoff_tracer __read_mostly = | |||
681 | #endif | 640 | #endif |
682 | .open = irqsoff_trace_open, | 641 | .open = irqsoff_trace_open, |
683 | .close = irqsoff_trace_close, | 642 | .close = irqsoff_trace_close, |
643 | .use_max_tr = 1, | ||
684 | }; | 644 | }; |
685 | # define register_preemptoff(trace) register_tracer(&trace) | 645 | # define register_preemptoff(trace) register_tracer(&trace) |
686 | #else | 646 | #else |
@@ -715,6 +675,7 @@ static struct tracer preemptirqsoff_tracer __read_mostly = | |||
715 | #endif | 675 | #endif |
716 | .open = irqsoff_trace_open, | 676 | .open = irqsoff_trace_open, |
717 | .close = irqsoff_trace_close, | 677 | .close = irqsoff_trace_close, |
678 | .use_max_tr = 1, | ||
718 | }; | 679 | }; |
719 | 680 | ||
720 | # define register_preemptirqsoff(trace) register_tracer(&trace) | 681 | # define register_preemptirqsoff(trace) register_tracer(&trace) |
diff --git a/kernel/trace/trace_kdb.c b/kernel/trace/trace_kdb.c new file mode 100644 index 000000000000..3c5c5dfea0b3 --- /dev/null +++ b/kernel/trace/trace_kdb.c | |||
@@ -0,0 +1,135 @@ | |||
1 | /* | ||
2 | * kdb helper for dumping the ftrace buffer | ||
3 | * | ||
4 | * Copyright (C) 2010 Jason Wessel <jason.wessel@windriver.com> | ||
5 | * | ||
6 | * ftrace_dump_buf based on ftrace_dump: | ||
7 | * Copyright (C) 2007-2008 Steven Rostedt <srostedt@redhat.com> | ||
8 | * Copyright (C) 2008 Ingo Molnar <mingo@redhat.com> | ||
9 | * | ||
10 | */ | ||
11 | #include <linux/init.h> | ||
12 | #include <linux/kgdb.h> | ||
13 | #include <linux/kdb.h> | ||
14 | #include <linux/ftrace.h> | ||
15 | |||
16 | #include "trace.h" | ||
17 | #include "trace_output.h" | ||
18 | |||
19 | static void ftrace_dump_buf(int skip_lines, long cpu_file) | ||
20 | { | ||
21 | /* use static because iter can be a bit big for the stack */ | ||
22 | static struct trace_iterator iter; | ||
23 | unsigned int old_userobj; | ||
24 | int cnt = 0, cpu; | ||
25 | |||
26 | trace_init_global_iter(&iter); | ||
27 | |||
28 | for_each_tracing_cpu(cpu) { | ||
29 | atomic_inc(&iter.tr->data[cpu]->disabled); | ||
30 | } | ||
31 | |||
32 | old_userobj = trace_flags; | ||
33 | |||
34 | /* don't look at user memory in panic mode */ | ||
35 | trace_flags &= ~TRACE_ITER_SYM_USEROBJ; | ||
36 | |||
37 | kdb_printf("Dumping ftrace buffer:\n"); | ||
38 | |||
39 | /* reset all but tr, trace, and overruns */ | ||
40 | memset(&iter.seq, 0, | ||
41 | sizeof(struct trace_iterator) - | ||
42 | offsetof(struct trace_iterator, seq)); | ||
43 | iter.iter_flags |= TRACE_FILE_LAT_FMT; | ||
44 | iter.pos = -1; | ||
45 | |||
46 | if (cpu_file == TRACE_PIPE_ALL_CPU) { | ||
47 | for_each_tracing_cpu(cpu) { | ||
48 | iter.buffer_iter[cpu] = | ||
49 | ring_buffer_read_prepare(iter.tr->buffer, cpu); | ||
50 | ring_buffer_read_start(iter.buffer_iter[cpu]); | ||
51 | tracing_iter_reset(&iter, cpu); | ||
52 | } | ||
53 | } else { | ||
54 | iter.cpu_file = cpu_file; | ||
55 | iter.buffer_iter[cpu_file] = | ||
56 | ring_buffer_read_prepare(iter.tr->buffer, cpu_file); | ||
57 | ring_buffer_read_start(iter.buffer_iter[cpu_file]); | ||
58 | tracing_iter_reset(&iter, cpu_file); | ||
59 | } | ||
60 | if (!trace_empty(&iter)) | ||
61 | trace_find_next_entry_inc(&iter); | ||
62 | while (!trace_empty(&iter)) { | ||
63 | if (!cnt) | ||
64 | kdb_printf("---------------------------------\n"); | ||
65 | cnt++; | ||
66 | |||
67 | if (trace_find_next_entry_inc(&iter) != NULL && !skip_lines) | ||
68 | print_trace_line(&iter); | ||
69 | if (!skip_lines) | ||
70 | trace_printk_seq(&iter.seq); | ||
71 | else | ||
72 | skip_lines--; | ||
73 | if (KDB_FLAG(CMD_INTERRUPT)) | ||
74 | goto out; | ||
75 | } | ||
76 | |||
77 | if (!cnt) | ||
78 | kdb_printf(" (ftrace buffer empty)\n"); | ||
79 | else | ||
80 | kdb_printf("---------------------------------\n"); | ||
81 | |||
82 | out: | ||
83 | trace_flags = old_userobj; | ||
84 | |||
85 | for_each_tracing_cpu(cpu) { | ||
86 | atomic_dec(&iter.tr->data[cpu]->disabled); | ||
87 | } | ||
88 | |||
89 | for_each_tracing_cpu(cpu) | ||
90 | if (iter.buffer_iter[cpu]) | ||
91 | ring_buffer_read_finish(iter.buffer_iter[cpu]); | ||
92 | } | ||
93 | |||
94 | /* | ||
95 | * kdb_ftdump - Dump the ftrace log buffer | ||
96 | */ | ||
97 | static int kdb_ftdump(int argc, const char **argv) | ||
98 | { | ||
99 | int skip_lines = 0; | ||
100 | long cpu_file; | ||
101 | char *cp; | ||
102 | |||
103 | if (argc > 2) | ||
104 | return KDB_ARGCOUNT; | ||
105 | |||
106 | if (argc) { | ||
107 | skip_lines = simple_strtol(argv[1], &cp, 0); | ||
108 | if (*cp) | ||
109 | skip_lines = 0; | ||
110 | } | ||
111 | |||
112 | if (argc == 2) { | ||
113 | cpu_file = simple_strtol(argv[2], &cp, 0); | ||
114 | if (*cp || cpu_file >= NR_CPUS || cpu_file < 0 || | ||
115 | !cpu_online(cpu_file)) | ||
116 | return KDB_BADINT; | ||
117 | } else { | ||
118 | cpu_file = TRACE_PIPE_ALL_CPU; | ||
119 | } | ||
120 | |||
121 | kdb_trap_printk++; | ||
122 | ftrace_dump_buf(skip_lines, cpu_file); | ||
123 | kdb_trap_printk--; | ||
124 | |||
125 | return 0; | ||
126 | } | ||
127 | |||
128 | static __init int kdb_ftrace_register(void) | ||
129 | { | ||
130 | kdb_register_repeat("ftdump", kdb_ftdump, "[skip_#lines] [cpu]", | ||
131 | "Dump ftrace log", 0, KDB_REPEAT_NONE); | ||
132 | return 0; | ||
133 | } | ||
134 | |||
135 | late_initcall(kdb_ftrace_register); | ||
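The new trace_kdb.c wires the dump routine into kdb at late_initcall time; given the argument parsing above, typing e.g. "ftdump 25 0" at the kdb prompt skips the first 25 lines and dumps only CPU 0's buffer. A hedged sketch of registering a trivial command the same way (my_cmd is illustrative; only kdb_register_repeat(), kdb_printf() and KDB_REPEAT_NONE come from the kdb API):

/* Sketch: register a minimal kdb command at boot. */
#include <linux/init.h>
#include <linux/kdb.h>

static int my_cmd(int argc, const char **argv)
{
        kdb_printf("hello from kdb, argc=%d\n", argc);
        return 0;
}

static __init int my_kdb_register(void)
{
        kdb_register_repeat("mycmd", my_cmd, "[arg]",
                            "Print a test message", 0, KDB_REPEAT_NONE);
        return 0;
}
late_initcall(my_kdb_register);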
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index f52b5f50299d..2dec9bcde8b4 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c | |||
@@ -30,6 +30,7 @@ | |||
30 | #include <linux/ptrace.h> | 30 | #include <linux/ptrace.h> |
31 | #include <linux/perf_event.h> | 31 | #include <linux/perf_event.h> |
32 | #include <linux/stringify.h> | 32 | #include <linux/stringify.h> |
33 | #include <linux/limits.h> | ||
33 | #include <asm/bitsperlong.h> | 34 | #include <asm/bitsperlong.h> |
34 | 35 | ||
35 | #include "trace.h" | 36 | #include "trace.h" |
@@ -38,6 +39,7 @@ | |||
38 | #define MAX_TRACE_ARGS 128 | 39 | #define MAX_TRACE_ARGS 128 |
39 | #define MAX_ARGSTR_LEN 63 | 40 | #define MAX_ARGSTR_LEN 63 |
40 | #define MAX_EVENT_NAME_LEN 64 | 41 | #define MAX_EVENT_NAME_LEN 64 |
42 | #define MAX_STRING_SIZE PATH_MAX | ||
41 | #define KPROBE_EVENT_SYSTEM "kprobes" | 43 | #define KPROBE_EVENT_SYSTEM "kprobes" |
42 | 44 | ||
43 | /* Reserved field names */ | 45 | /* Reserved field names */ |
@@ -58,14 +60,16 @@ const char *reserved_field_names[] = { | |||
58 | }; | 60 | }; |
59 | 61 | ||
60 | /* Printing function type */ | 62 | /* Printing function type */ |
61 | typedef int (*print_type_func_t)(struct trace_seq *, const char *, void *); | 63 | typedef int (*print_type_func_t)(struct trace_seq *, const char *, void *, |
64 | void *); | ||
62 | #define PRINT_TYPE_FUNC_NAME(type) print_type_##type | 65 | #define PRINT_TYPE_FUNC_NAME(type) print_type_##type |
63 | #define PRINT_TYPE_FMT_NAME(type) print_type_format_##type | 66 | #define PRINT_TYPE_FMT_NAME(type) print_type_format_##type |
64 | 67 | ||
65 | /* Printing in basic type function template */ | 68 | /* Printing in basic type function template */ |
66 | #define DEFINE_BASIC_PRINT_TYPE_FUNC(type, fmt, cast) \ | 69 | #define DEFINE_BASIC_PRINT_TYPE_FUNC(type, fmt, cast) \ |
67 | static __kprobes int PRINT_TYPE_FUNC_NAME(type)(struct trace_seq *s, \ | 70 | static __kprobes int PRINT_TYPE_FUNC_NAME(type)(struct trace_seq *s, \ |
68 | const char *name, void *data)\ | 71 | const char *name, \ |
72 | void *data, void *ent)\ | ||
69 | { \ | 73 | { \ |
70 | return trace_seq_printf(s, " %s=" fmt, name, (cast)*(type *)data);\ | 74 | return trace_seq_printf(s, " %s=" fmt, name, (cast)*(type *)data);\ |
71 | } \ | 75 | } \ |
@@ -80,6 +84,49 @@ DEFINE_BASIC_PRINT_TYPE_FUNC(s16, "%d", int) | |||
80 | DEFINE_BASIC_PRINT_TYPE_FUNC(s32, "%ld", long) | 84 | DEFINE_BASIC_PRINT_TYPE_FUNC(s32, "%ld", long) |
81 | DEFINE_BASIC_PRINT_TYPE_FUNC(s64, "%lld", long long) | 85 | DEFINE_BASIC_PRINT_TYPE_FUNC(s64, "%lld", long long) |
82 | 86 | ||
87 | /* data_rloc: data relative location, compatible with u32 */ | ||
88 | #define make_data_rloc(len, roffs) \ | ||
89 | (((u32)(len) << 16) | ((u32)(roffs) & 0xffff)) | ||
90 | #define get_rloc_len(dl) ((u32)(dl) >> 16) | ||
91 | #define get_rloc_offs(dl) ((u32)(dl) & 0xffff) | ||
92 | |||
93 | static inline void *get_rloc_data(u32 *dl) | ||
94 | { | ||
95 | return (u8 *)dl + get_rloc_offs(*dl); | ||
96 | } | ||
97 | |||
98 | /* For data_loc conversion */ | ||
99 | static inline void *get_loc_data(u32 *dl, void *ent) | ||
100 | { | ||
101 | return (u8 *)ent + get_rloc_offs(*dl); | ||
102 | } | ||
103 | |||
104 | /* | ||
105 | * Convert data_rloc to data_loc: | ||
106 | * data_rloc stores the offset from data_rloc itself, but data_loc | ||
107 | * stores the offset from event entry. | ||
108 | */ | ||
109 | #define convert_rloc_to_loc(dl, offs) ((u32)(dl) + (offs)) | ||
110 | |||
111 | /* For defining macros, define string/string_size types */ | ||
112 | typedef u32 string; | ||
113 | typedef u32 string_size; | ||
114 | |||
115 | /* Print type function for string type */ | ||
116 | static __kprobes int PRINT_TYPE_FUNC_NAME(string)(struct trace_seq *s, | ||
117 | const char *name, | ||
118 | void *data, void *ent) | ||
119 | { | ||
120 | int len = *(u32 *)data >> 16; | ||
121 | |||
122 | if (!len) | ||
123 | return trace_seq_printf(s, " %s=(fault)", name); | ||
124 | else | ||
125 | return trace_seq_printf(s, " %s=\"%s\"", name, | ||
126 | (const char *)get_loc_data(data, ent)); | ||
127 | } | ||
128 | static const char PRINT_TYPE_FMT_NAME(string)[] = "\\\"%s\\\""; | ||
129 | |||
83 | /* Data fetch function type */ | 130 | /* Data fetch function type */ |
84 | typedef void (*fetch_func_t)(struct pt_regs *, void *, void *); | 131 | typedef void (*fetch_func_t)(struct pt_regs *, void *, void *); |
85 | 132 | ||
@@ -94,32 +141,38 @@ static __kprobes void call_fetch(struct fetch_param *fprm, | |||
94 | return fprm->fn(regs, fprm->data, dest); | 141 | return fprm->fn(regs, fprm->data, dest); |
95 | } | 142 | } |
96 | 143 | ||
97 | #define FETCH_FUNC_NAME(kind, type) fetch_##kind##_##type | 144 | #define FETCH_FUNC_NAME(method, type) fetch_##method##_##type |
98 | /* | 145 | /* |
99 | * Define macro for basic types - we don't need to define s* types, because | 146 | * Define macro for basic types - we don't need to define s* types, because |
100 | * we have to care only about bitwidth at recording time. | 147 | * we have to care only about bitwidth at recording time. |
101 | */ | 148 | */ |
102 | #define DEFINE_BASIC_FETCH_FUNCS(kind) \ | 149 | #define DEFINE_BASIC_FETCH_FUNCS(method) \ |
103 | DEFINE_FETCH_##kind(u8) \ | 150 | DEFINE_FETCH_##method(u8) \ |
104 | DEFINE_FETCH_##kind(u16) \ | 151 | DEFINE_FETCH_##method(u16) \ |
105 | DEFINE_FETCH_##kind(u32) \ | 152 | DEFINE_FETCH_##method(u32) \ |
106 | DEFINE_FETCH_##kind(u64) | 153 | DEFINE_FETCH_##method(u64) |
107 | 154 | ||
108 | #define CHECK_BASIC_FETCH_FUNCS(kind, fn) \ | 155 | #define CHECK_FETCH_FUNCS(method, fn) \ |
109 | ((FETCH_FUNC_NAME(kind, u8) == fn) || \ | 156 | (((FETCH_FUNC_NAME(method, u8) == fn) || \ |
110 | (FETCH_FUNC_NAME(kind, u16) == fn) || \ | 157 | (FETCH_FUNC_NAME(method, u16) == fn) || \ |
111 | (FETCH_FUNC_NAME(kind, u32) == fn) || \ | 158 | (FETCH_FUNC_NAME(method, u32) == fn) || \ |
112 | (FETCH_FUNC_NAME(kind, u64) == fn)) | 159 | (FETCH_FUNC_NAME(method, u64) == fn) || \ |
160 | (FETCH_FUNC_NAME(method, string) == fn) || \ | ||
161 | (FETCH_FUNC_NAME(method, string_size) == fn)) \ | ||
162 | && (fn != NULL)) | ||
113 | 163 | ||
114 | /* Data fetch function templates */ | 164 | /* Data fetch function templates */ |
115 | #define DEFINE_FETCH_reg(type) \ | 165 | #define DEFINE_FETCH_reg(type) \ |
116 | static __kprobes void FETCH_FUNC_NAME(reg, type)(struct pt_regs *regs, \ | 166 | static __kprobes void FETCH_FUNC_NAME(reg, type)(struct pt_regs *regs, \ |
117 | void *offset, void *dest) \ | 167 | void *offset, void *dest) \ |
118 | { \ | 168 | { \ |
119 | *(type *)dest = (type)regs_get_register(regs, \ | 169 | *(type *)dest = (type)regs_get_register(regs, \ |
120 | (unsigned int)((unsigned long)offset)); \ | 170 | (unsigned int)((unsigned long)offset)); \ |
121 | } | 171 | } |
122 | DEFINE_BASIC_FETCH_FUNCS(reg) | 172 | DEFINE_BASIC_FETCH_FUNCS(reg) |
173 | /* No string on the register */ | ||
174 | #define fetch_reg_string NULL | ||
175 | #define fetch_reg_string_size NULL | ||
123 | 176 | ||
124 | #define DEFINE_FETCH_stack(type) \ | 177 | #define DEFINE_FETCH_stack(type) \ |
125 | static __kprobes void FETCH_FUNC_NAME(stack, type)(struct pt_regs *regs,\ | 178 | static __kprobes void FETCH_FUNC_NAME(stack, type)(struct pt_regs *regs,\ |
@@ -129,6 +182,9 @@ static __kprobes void FETCH_FUNC_NAME(stack, type)(struct pt_regs *regs,\ | |||
129 | (unsigned int)((unsigned long)offset)); \ | 182 | (unsigned int)((unsigned long)offset)); \ |
130 | } | 183 | } |
131 | DEFINE_BASIC_FETCH_FUNCS(stack) | 184 | DEFINE_BASIC_FETCH_FUNCS(stack) |
185 | /* No string on the stack entry */ | ||
186 | #define fetch_stack_string NULL | ||
187 | #define fetch_stack_string_size NULL | ||
132 | 188 | ||
133 | #define DEFINE_FETCH_retval(type) \ | 189 | #define DEFINE_FETCH_retval(type) \ |
134 | static __kprobes void FETCH_FUNC_NAME(retval, type)(struct pt_regs *regs,\ | 190 | static __kprobes void FETCH_FUNC_NAME(retval, type)(struct pt_regs *regs,\ |
@@ -137,6 +193,9 @@ static __kprobes void FETCH_FUNC_NAME(retval, type)(struct pt_regs *regs,\ | |||
137 | *(type *)dest = (type)regs_return_value(regs); \ | 193 | *(type *)dest = (type)regs_return_value(regs); \ |
138 | } | 194 | } |
139 | DEFINE_BASIC_FETCH_FUNCS(retval) | 195 | DEFINE_BASIC_FETCH_FUNCS(retval) |
196 | /* No string on the retval */ | ||
197 | #define fetch_retval_string NULL | ||
198 | #define fetch_retval_string_size NULL | ||
140 | 199 | ||
141 | #define DEFINE_FETCH_memory(type) \ | 200 | #define DEFINE_FETCH_memory(type) \ |
142 | static __kprobes void FETCH_FUNC_NAME(memory, type)(struct pt_regs *regs,\ | 201 | static __kprobes void FETCH_FUNC_NAME(memory, type)(struct pt_regs *regs,\ |
@@ -149,6 +208,62 @@ static __kprobes void FETCH_FUNC_NAME(memory, type)(struct pt_regs *regs,\ | |||
149 | *(type *)dest = retval; \ | 208 | *(type *)dest = retval; \ |
150 | } | 209 | } |
151 | DEFINE_BASIC_FETCH_FUNCS(memory) | 210 | DEFINE_BASIC_FETCH_FUNCS(memory) |
211 | /* | ||
212 | * Fetch a null-terminated string. Caller MUST set *(u32 *)dest with max | ||
213 | * length and relative data location. | ||
214 | */ | ||
215 | static __kprobes void FETCH_FUNC_NAME(memory, string)(struct pt_regs *regs, | ||
216 | void *addr, void *dest) | ||
217 | { | ||
218 | long ret; | ||
219 | int maxlen = get_rloc_len(*(u32 *)dest); | ||
220 | u8 *dst = get_rloc_data(dest); | ||
221 | u8 *src = addr; | ||
222 | mm_segment_t old_fs = get_fs(); | ||
223 | if (!maxlen) | ||
224 | return; | ||
225 | /* | ||
226 | * Try to get string again, since the string can be changed while | ||
227 | * probing. | ||
228 | */ | ||
229 | set_fs(KERNEL_DS); | ||
230 | pagefault_disable(); | ||
231 | do | ||
232 | ret = __copy_from_user_inatomic(dst++, src++, 1); | ||
233 | while (dst[-1] && ret == 0 && src - (u8 *)addr < maxlen); | ||
234 | dst[-1] = '\0'; | ||
235 | pagefault_enable(); | ||
236 | set_fs(old_fs); | ||
237 | |||
238 | if (ret < 0) { /* Failed to fetch string */ | ||
239 | ((u8 *)get_rloc_data(dest))[0] = '\0'; | ||
240 | *(u32 *)dest = make_data_rloc(0, get_rloc_offs(*(u32 *)dest)); | ||
241 | } else | ||
242 | *(u32 *)dest = make_data_rloc(src - (u8 *)addr, | ||
243 | get_rloc_offs(*(u32 *)dest)); | ||
244 | } | ||
245 | /* Return the length of the string, including the terminating null byte */ | ||
246 | static __kprobes void FETCH_FUNC_NAME(memory, string_size)(struct pt_regs *regs, | ||
247 | void *addr, void *dest) | ||
248 | { | ||
249 | int ret, len = 0; | ||
250 | u8 c; | ||
251 | mm_segment_t old_fs = get_fs(); | ||
252 | |||
253 | set_fs(KERNEL_DS); | ||
254 | pagefault_disable(); | ||
255 | do { | ||
256 | ret = __copy_from_user_inatomic(&c, (u8 *)addr + len, 1); | ||
257 | len++; | ||
258 | } while (c && ret == 0 && len < MAX_STRING_SIZE); | ||
259 | pagefault_enable(); | ||
260 | set_fs(old_fs); | ||
261 | |||
262 | if (ret < 0) /* Failed to check the length */ | ||
263 | *(u32 *)dest = 0; | ||
264 | else | ||
265 | *(u32 *)dest = len; | ||
266 | } | ||
152 | 267 | ||
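A hedged sketch of the caller contract for the two string fetchers above; the real caller is store_trace_args() further down, and data, arg_offset, regs, addr and the numbers here are only assumed for illustration:

        u32 *dl = (u32 *)(data + arg_offset);   /* dl points into the trace record */
        *dl = make_data_rloc(64, 16);           /* up to 64 bytes, written at (u8 *)dl + 16 */
        FETCH_FUNC_NAME(memory, string)(regs, addr, dl);
        /* success: the length part becomes the copied size including the NUL
         *          (capped at 64 if truncated), the offset part is unchanged
         * fault:   the length part becomes 0 and the first data byte is '\0' */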
153 | /* Memory fetching by symbol */ | 268 | /* Memory fetching by symbol */ |
154 | struct symbol_cache { | 269 | struct symbol_cache { |
@@ -203,6 +318,8 @@ static __kprobes void FETCH_FUNC_NAME(symbol, type)(struct pt_regs *regs,\ | |||
203 | *(type *)dest = 0; \ | 318 | *(type *)dest = 0; \ |
204 | } | 319 | } |
205 | DEFINE_BASIC_FETCH_FUNCS(symbol) | 320 | DEFINE_BASIC_FETCH_FUNCS(symbol) |
321 | DEFINE_FETCH_symbol(string) | ||
322 | DEFINE_FETCH_symbol(string_size) | ||
206 | 323 | ||
207 | /* Dereference memory access function */ | 324 | /* Dereference memory access function */ |
208 | struct deref_fetch_param { | 325 | struct deref_fetch_param { |
@@ -224,12 +341,14 @@ static __kprobes void FETCH_FUNC_NAME(deref, type)(struct pt_regs *regs,\ | |||
224 | *(type *)dest = 0; \ | 341 | *(type *)dest = 0; \ |
225 | } | 342 | } |
226 | DEFINE_BASIC_FETCH_FUNCS(deref) | 343 | DEFINE_BASIC_FETCH_FUNCS(deref) |
344 | DEFINE_FETCH_deref(string) | ||
345 | DEFINE_FETCH_deref(string_size) | ||
227 | 346 | ||
228 | static __kprobes void free_deref_fetch_param(struct deref_fetch_param *data) | 347 | static __kprobes void free_deref_fetch_param(struct deref_fetch_param *data) |
229 | { | 348 | { |
230 | if (CHECK_BASIC_FETCH_FUNCS(deref, data->orig.fn)) | 349 | if (CHECK_FETCH_FUNCS(deref, data->orig.fn)) |
231 | free_deref_fetch_param(data->orig.data); | 350 | free_deref_fetch_param(data->orig.data); |
232 | else if (CHECK_BASIC_FETCH_FUNCS(symbol, data->orig.fn)) | 351 | else if (CHECK_FETCH_FUNCS(symbol, data->orig.fn)) |
233 | free_symbol_cache(data->orig.data); | 352 | free_symbol_cache(data->orig.data); |
234 | kfree(data); | 353 | kfree(data); |
235 | } | 354 | } |
@@ -240,23 +359,43 @@ static __kprobes void free_deref_fetch_param(struct deref_fetch_param *data) | |||
240 | #define DEFAULT_FETCH_TYPE _DEFAULT_FETCH_TYPE(BITS_PER_LONG) | 359 | #define DEFAULT_FETCH_TYPE _DEFAULT_FETCH_TYPE(BITS_PER_LONG) |
241 | #define DEFAULT_FETCH_TYPE_STR __stringify(DEFAULT_FETCH_TYPE) | 360 | #define DEFAULT_FETCH_TYPE_STR __stringify(DEFAULT_FETCH_TYPE) |
242 | 361 | ||
243 | #define ASSIGN_FETCH_FUNC(kind, type) \ | 362 | /* Fetch types */ |
244 | .kind = FETCH_FUNC_NAME(kind, type) | 363 | enum { |
245 | 364 | FETCH_MTD_reg = 0, | |
246 | #define ASSIGN_FETCH_TYPE(ptype, ftype, sign) \ | 365 | FETCH_MTD_stack, |
247 | {.name = #ptype, \ | 366 | FETCH_MTD_retval, |
248 | .size = sizeof(ftype), \ | 367 | FETCH_MTD_memory, |
249 | .is_signed = sign, \ | 368 | FETCH_MTD_symbol, |
250 | .print = PRINT_TYPE_FUNC_NAME(ptype), \ | 369 | FETCH_MTD_deref, |
251 | .fmt = PRINT_TYPE_FMT_NAME(ptype), \ | 370 | FETCH_MTD_END, |
252 | ASSIGN_FETCH_FUNC(reg, ftype), \ | 371 | }; |
253 | ASSIGN_FETCH_FUNC(stack, ftype), \ | 372 | |
254 | ASSIGN_FETCH_FUNC(retval, ftype), \ | 373 | #define ASSIGN_FETCH_FUNC(method, type) \ |
255 | ASSIGN_FETCH_FUNC(memory, ftype), \ | 374 | [FETCH_MTD_##method] = FETCH_FUNC_NAME(method, type) |
256 | ASSIGN_FETCH_FUNC(symbol, ftype), \ | 375 | |
257 | ASSIGN_FETCH_FUNC(deref, ftype), \ | 376 | #define __ASSIGN_FETCH_TYPE(_name, ptype, ftype, _size, sign, _fmttype) \ |
377 | {.name = _name, \ | ||
378 | .size = _size, \ | ||
379 | .is_signed = sign, \ | ||
380 | .print = PRINT_TYPE_FUNC_NAME(ptype), \ | ||
381 | .fmt = PRINT_TYPE_FMT_NAME(ptype), \ | ||
382 | .fmttype = _fmttype, \ | ||
383 | .fetch = { \ | ||
384 | ASSIGN_FETCH_FUNC(reg, ftype), \ | ||
385 | ASSIGN_FETCH_FUNC(stack, ftype), \ | ||
386 | ASSIGN_FETCH_FUNC(retval, ftype), \ | ||
387 | ASSIGN_FETCH_FUNC(memory, ftype), \ | ||
388 | ASSIGN_FETCH_FUNC(symbol, ftype), \ | ||
389 | ASSIGN_FETCH_FUNC(deref, ftype), \ | ||
390 | } \ | ||
258 | } | 391 | } |
259 | 392 | ||
393 | #define ASSIGN_FETCH_TYPE(ptype, ftype, sign) \ | ||
394 | __ASSIGN_FETCH_TYPE(#ptype, ptype, ftype, sizeof(ftype), sign, #ptype) | ||
395 | |||
396 | #define FETCH_TYPE_STRING 0 | ||
397 | #define FETCH_TYPE_STRSIZE 1 | ||
398 | |||
260 | /* Fetch type information table */ | 399 | /* Fetch type information table */ |
261 | static const struct fetch_type { | 400 | static const struct fetch_type { |
262 | const char *name; /* Name of type */ | 401 | const char *name; /* Name of type */ |
@@ -264,14 +403,16 @@ static const struct fetch_type { | |||
264 | int is_signed; /* Signed flag */ | 403 | int is_signed; /* Signed flag */ |
265 | print_type_func_t print; /* Print functions */ | 404 | print_type_func_t print; /* Print functions */ |
266 | const char *fmt; /* Format string */ | 405 | const char *fmt; /* Format string */ |
406 | const char *fmttype; /* Name in format file */ | ||
267 | /* Fetch functions */ | 407 | /* Fetch functions */ |
268 | fetch_func_t reg; | 408 | fetch_func_t fetch[FETCH_MTD_END]; |
269 | fetch_func_t stack; | ||
270 | fetch_func_t retval; | ||
271 | fetch_func_t memory; | ||
272 | fetch_func_t symbol; | ||
273 | fetch_func_t deref; | ||
274 | } fetch_type_table[] = { | 409 | } fetch_type_table[] = { |
410 | /* Special types */ | ||
411 | [FETCH_TYPE_STRING] = __ASSIGN_FETCH_TYPE("string", string, string, | ||
412 | sizeof(u32), 1, "__data_loc char[]"), | ||
413 | [FETCH_TYPE_STRSIZE] = __ASSIGN_FETCH_TYPE("string_size", u32, | ||
414 | string_size, sizeof(u32), 0, "u32"), | ||
415 | /* Basic types */ | ||
275 | ASSIGN_FETCH_TYPE(u8, u8, 0), | 416 | ASSIGN_FETCH_TYPE(u8, u8, 0), |
276 | ASSIGN_FETCH_TYPE(u16, u16, 0), | 417 | ASSIGN_FETCH_TYPE(u16, u16, 0), |
277 | ASSIGN_FETCH_TYPE(u32, u32, 0), | 418 | ASSIGN_FETCH_TYPE(u32, u32, 0), |
@@ -302,12 +443,28 @@ static __kprobes void fetch_stack_address(struct pt_regs *regs, | |||
302 | *(unsigned long *)dest = kernel_stack_pointer(regs); | 443 | *(unsigned long *)dest = kernel_stack_pointer(regs); |
303 | } | 444 | } |
304 | 445 | ||
446 | static fetch_func_t get_fetch_size_function(const struct fetch_type *type, | ||
447 | fetch_func_t orig_fn) | ||
448 | { | ||
449 | int i; | ||
450 | |||
451 | if (type != &fetch_type_table[FETCH_TYPE_STRING]) | ||
452 | return NULL; /* Only string type needs size function */ | ||
453 | for (i = 0; i < FETCH_MTD_END; i++) | ||
454 | if (type->fetch[i] == orig_fn) | ||
455 | return fetch_type_table[FETCH_TYPE_STRSIZE].fetch[i]; | ||
456 | |||
457 | WARN_ON(1); /* This should not happen */ | ||
458 | return NULL; | ||
459 | } | ||
460 | |||
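A small illustration of what this helper pairs up, assuming a string argument fetched through the memory method:

        const struct fetch_type *t = &fetch_type_table[FETCH_TYPE_STRING];
        fetch_func_t fn   = t->fetch[FETCH_MTD_memory];      /* fetch_memory_string */
        fetch_func_t size = get_fetch_size_function(t, fn);  /* fetch_memory_string_size */
        /* any non-string type, or an fn not found in the table, yields NULL */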
305 | /** | 461 | /** |
306 | * Kprobe event core functions | 462 | * Kprobe event core functions |
307 | */ | 463 | */ |
308 | 464 | ||
309 | struct probe_arg { | 465 | struct probe_arg { |
310 | struct fetch_param fetch; | 466 | struct fetch_param fetch; |
467 | struct fetch_param fetch_size; | ||
311 | unsigned int offset; /* Offset from argument entry */ | 468 | unsigned int offset; /* Offset from argument entry */ |
312 | const char *name; /* Name of this argument */ | 469 | const char *name; /* Name of this argument */ |
313 | const char *comm; /* Command of this argument */ | 470 | const char *comm; /* Command of this argument */ |
@@ -356,8 +513,8 @@ static int kprobe_dispatcher(struct kprobe *kp, struct pt_regs *regs); | |||
356 | static int kretprobe_dispatcher(struct kretprobe_instance *ri, | 513 | static int kretprobe_dispatcher(struct kretprobe_instance *ri, |
357 | struct pt_regs *regs); | 514 | struct pt_regs *regs); |
358 | 515 | ||
359 | /* Check the name is good for event/group */ | 516 | /* Check the name is good for event/group/fields */ |
360 | static int check_event_name(const char *name) | 517 | static int is_good_name(const char *name) |
361 | { | 518 | { |
362 | if (!isalpha(*name) && *name != '_') | 519 | if (!isalpha(*name) && *name != '_') |
363 | return 0; | 520 | return 0; |
@@ -399,7 +556,7 @@ static struct trace_probe *alloc_trace_probe(const char *group, | |||
399 | else | 556 | else |
400 | tp->rp.kp.pre_handler = kprobe_dispatcher; | 557 | tp->rp.kp.pre_handler = kprobe_dispatcher; |
401 | 558 | ||
402 | if (!event || !check_event_name(event)) { | 559 | if (!event || !is_good_name(event)) { |
403 | ret = -EINVAL; | 560 | ret = -EINVAL; |
404 | goto error; | 561 | goto error; |
405 | } | 562 | } |
@@ -409,7 +566,7 @@ static struct trace_probe *alloc_trace_probe(const char *group, | |||
409 | if (!tp->call.name) | 566 | if (!tp->call.name) |
410 | goto error; | 567 | goto error; |
411 | 568 | ||
412 | if (!group || !check_event_name(group)) { | 569 | if (!group || !is_good_name(group)) { |
413 | ret = -EINVAL; | 570 | ret = -EINVAL; |
414 | goto error; | 571 | goto error; |
415 | } | 572 | } |
@@ -429,9 +586,9 @@ error: | |||
429 | 586 | ||
430 | static void free_probe_arg(struct probe_arg *arg) | 587 | static void free_probe_arg(struct probe_arg *arg) |
431 | { | 588 | { |
432 | if (CHECK_BASIC_FETCH_FUNCS(deref, arg->fetch.fn)) | 589 | if (CHECK_FETCH_FUNCS(deref, arg->fetch.fn)) |
433 | free_deref_fetch_param(arg->fetch.data); | 590 | free_deref_fetch_param(arg->fetch.data); |
434 | else if (CHECK_BASIC_FETCH_FUNCS(symbol, arg->fetch.fn)) | 591 | else if (CHECK_FETCH_FUNCS(symbol, arg->fetch.fn)) |
435 | free_symbol_cache(arg->fetch.data); | 592 | free_symbol_cache(arg->fetch.data); |
436 | kfree(arg->name); | 593 | kfree(arg->name); |
437 | kfree(arg->comm); | 594 | kfree(arg->comm); |
@@ -490,7 +647,7 @@ static int register_trace_probe(struct trace_probe *tp) | |||
490 | } | 647 | } |
491 | ret = register_probe_event(tp); | 648 | ret = register_probe_event(tp); |
492 | if (ret) { | 649 | if (ret) { |
493 | pr_warning("Faild to register probe event(%d)\n", ret); | 650 | pr_warning("Failed to register probe event(%d)\n", ret); |
494 | goto end; | 651 | goto end; |
495 | } | 652 | } |
496 | 653 | ||
@@ -548,7 +705,7 @@ static int parse_probe_vars(char *arg, const struct fetch_type *t, | |||
548 | 705 | ||
549 | if (strcmp(arg, "retval") == 0) { | 706 | if (strcmp(arg, "retval") == 0) { |
550 | if (is_return) | 707 | if (is_return) |
551 | f->fn = t->retval; | 708 | f->fn = t->fetch[FETCH_MTD_retval]; |
552 | else | 709 | else |
553 | ret = -EINVAL; | 710 | ret = -EINVAL; |
554 | } else if (strncmp(arg, "stack", 5) == 0) { | 711 | } else if (strncmp(arg, "stack", 5) == 0) { |
@@ -562,7 +719,7 @@ static int parse_probe_vars(char *arg, const struct fetch_type *t, | |||
562 | if (ret || param > PARAM_MAX_STACK) | 719 | if (ret || param > PARAM_MAX_STACK) |
563 | ret = -EINVAL; | 720 | ret = -EINVAL; |
564 | else { | 721 | else { |
565 | f->fn = t->stack; | 722 | f->fn = t->fetch[FETCH_MTD_stack]; |
566 | f->data = (void *)param; | 723 | f->data = (void *)param; |
567 | } | 724 | } |
568 | } else | 725 | } else |
@@ -588,7 +745,7 @@ static int __parse_probe_arg(char *arg, const struct fetch_type *t, | |||
588 | case '%': /* named register */ | 745 | case '%': /* named register */ |
589 | ret = regs_query_register_offset(arg + 1); | 746 | ret = regs_query_register_offset(arg + 1); |
590 | if (ret >= 0) { | 747 | if (ret >= 0) { |
591 | f->fn = t->reg; | 748 | f->fn = t->fetch[FETCH_MTD_reg]; |
592 | f->data = (void *)(unsigned long)ret; | 749 | f->data = (void *)(unsigned long)ret; |
593 | ret = 0; | 750 | ret = 0; |
594 | } | 751 | } |
@@ -598,7 +755,7 @@ static int __parse_probe_arg(char *arg, const struct fetch_type *t, | |||
598 | ret = strict_strtoul(arg + 1, 0, &param); | 755 | ret = strict_strtoul(arg + 1, 0, &param); |
599 | if (ret) | 756 | if (ret) |
600 | break; | 757 | break; |
601 | f->fn = t->memory; | 758 | f->fn = t->fetch[FETCH_MTD_memory]; |
602 | f->data = (void *)param; | 759 | f->data = (void *)param; |
603 | } else { | 760 | } else { |
604 | ret = split_symbol_offset(arg + 1, &offset); | 761 | ret = split_symbol_offset(arg + 1, &offset); |
@@ -606,7 +763,7 @@ static int __parse_probe_arg(char *arg, const struct fetch_type *t, | |||
606 | break; | 763 | break; |
607 | f->data = alloc_symbol_cache(arg + 1, offset); | 764 | f->data = alloc_symbol_cache(arg + 1, offset); |
608 | if (f->data) | 765 | if (f->data) |
609 | f->fn = t->symbol; | 766 | f->fn = t->fetch[FETCH_MTD_symbol]; |
610 | } | 767 | } |
611 | break; | 768 | break; |
612 | case '+': /* deref memory */ | 769 | case '+': /* deref memory */ |
@@ -636,14 +793,17 @@ static int __parse_probe_arg(char *arg, const struct fetch_type *t, | |||
636 | if (ret) | 793 | if (ret) |
637 | kfree(dprm); | 794 | kfree(dprm); |
638 | else { | 795 | else { |
639 | f->fn = t->deref; | 796 | f->fn = t->fetch[FETCH_MTD_deref]; |
640 | f->data = (void *)dprm; | 797 | f->data = (void *)dprm; |
641 | } | 798 | } |
642 | } | 799 | } |
643 | break; | 800 | break; |
644 | } | 801 | } |
645 | if (!ret && !f->fn) | 802 | if (!ret && !f->fn) { /* Parsed, but no fetch method found */ |
803 | pr_info("%s type has no corresponding fetch method.\n", | ||
804 | t->name); | ||
646 | ret = -EINVAL; | 805 | ret = -EINVAL; |
806 | } | ||
647 | return ret; | 807 | return ret; |
648 | } | 808 | } |
649 | 809 | ||
@@ -652,6 +812,7 @@ static int parse_probe_arg(char *arg, struct trace_probe *tp, | |||
652 | struct probe_arg *parg, int is_return) | 812 | struct probe_arg *parg, int is_return) |
653 | { | 813 | { |
654 | const char *t; | 814 | const char *t; |
815 | int ret; | ||
655 | 816 | ||
656 | if (strlen(arg) > MAX_ARGSTR_LEN) { | 817 | if (strlen(arg) > MAX_ARGSTR_LEN) { |
657 | pr_info("Argument is too long.: %s\n", arg); | 818 | pr_info("Argument is too long.: %s\n", arg); |
@@ -674,7 +835,13 @@ static int parse_probe_arg(char *arg, struct trace_probe *tp, | |||
674 | } | 835 | } |
675 | parg->offset = tp->size; | 836 | parg->offset = tp->size; |
676 | tp->size += parg->type->size; | 837 | tp->size += parg->type->size; |
677 | return __parse_probe_arg(arg, parg->type, &parg->fetch, is_return); | 838 | ret = __parse_probe_arg(arg, parg->type, &parg->fetch, is_return); |
839 | if (ret >= 0) { | ||
840 | parg->fetch_size.fn = get_fetch_size_function(parg->type, | ||
841 | parg->fetch.fn); | ||
842 | parg->fetch_size.data = parg->fetch.data; | ||
843 | } | ||
844 | return ret; | ||
678 | } | 845 | } |
679 | 846 | ||
680 | /* Return 1 if name is reserved or already used by another argument */ | 847 | /* Return 1 if name is reserved or already used by another argument */ |
@@ -715,7 +882,7 @@ static int create_trace_probe(int argc, char **argv) | |||
715 | int i, ret = 0; | 882 | int i, ret = 0; |
716 | int is_return = 0, is_delete = 0; | 883 | int is_return = 0, is_delete = 0; |
717 | char *symbol = NULL, *event = NULL, *group = NULL; | 884 | char *symbol = NULL, *event = NULL, *group = NULL; |
718 | char *arg, *tmp; | 885 | char *arg; |
719 | unsigned long offset = 0; | 886 | unsigned long offset = 0; |
720 | void *addr = NULL; | 887 | void *addr = NULL; |
721 | char buf[MAX_EVENT_NAME_LEN]; | 888 | char buf[MAX_EVENT_NAME_LEN]; |
@@ -757,14 +924,17 @@ static int create_trace_probe(int argc, char **argv) | |||
757 | pr_info("Delete command needs an event name.\n"); | 924 | pr_info("Delete command needs an event name.\n"); |
758 | return -EINVAL; | 925 | return -EINVAL; |
759 | } | 926 | } |
927 | mutex_lock(&probe_lock); | ||
760 | tp = find_probe_event(event, group); | 928 | tp = find_probe_event(event, group); |
761 | if (!tp) { | 929 | if (!tp) { |
930 | mutex_unlock(&probe_lock); | ||
762 | pr_info("Event %s/%s doesn't exist.\n", group, event); | 931 | pr_info("Event %s/%s doesn't exist.\n", group, event); |
763 | return -ENOENT; | 932 | return -ENOENT; |
764 | } | 933 | } |
765 | /* delete an event */ | 934 | /* delete an event */ |
766 | unregister_trace_probe(tp); | 935 | unregister_trace_probe(tp); |
767 | free_trace_probe(tp); | 936 | free_trace_probe(tp); |
937 | mutex_unlock(&probe_lock); | ||
768 | return 0; | 938 | return 0; |
769 | } | 939 | } |
770 | 940 | ||
@@ -821,26 +991,36 @@ static int create_trace_probe(int argc, char **argv) | |||
821 | /* parse arguments */ | 991 | /* parse arguments */ |
822 | ret = 0; | 992 | ret = 0; |
823 | for (i = 0; i < argc && i < MAX_TRACE_ARGS; i++) { | 993 | for (i = 0; i < argc && i < MAX_TRACE_ARGS; i++) { |
994 | /* Increment count for freeing args in error case */ | ||
995 | tp->nr_args++; | ||
996 | |||
824 | /* Parse argument name */ | 997 | /* Parse argument name */ |
825 | arg = strchr(argv[i], '='); | 998 | arg = strchr(argv[i], '='); |
826 | if (arg) | 999 | if (arg) { |
827 | *arg++ = '\0'; | 1000 | *arg++ = '\0'; |
828 | else | 1001 | tp->args[i].name = kstrdup(argv[i], GFP_KERNEL); |
1002 | } else { | ||
829 | arg = argv[i]; | 1003 | arg = argv[i]; |
1004 | /* If argument name is omitted, set "argN" */ | ||
1005 | snprintf(buf, MAX_EVENT_NAME_LEN, "arg%d", i + 1); | ||
1006 | tp->args[i].name = kstrdup(buf, GFP_KERNEL); | ||
1007 | } | ||
830 | 1008 | ||
831 | tp->args[i].name = kstrdup(argv[i], GFP_KERNEL); | ||
832 | if (!tp->args[i].name) { | 1009 | if (!tp->args[i].name) { |
833 | pr_info("Failed to allocate argument%d name '%s'.\n", | 1010 | pr_info("Failed to allocate argument[%d] name.\n", i); |
834 | i, argv[i]); | ||
835 | ret = -ENOMEM; | 1011 | ret = -ENOMEM; |
836 | goto error; | 1012 | goto error; |
837 | } | 1013 | } |
838 | tmp = strchr(tp->args[i].name, ':'); | 1014 | |
839 | if (tmp) | 1015 | if (!is_good_name(tp->args[i].name)) { |
840 | *tmp = '_'; /* convert : to _ */ | 1016 | pr_info("Invalid argument[%d] name: %s\n", |
1017 | i, tp->args[i].name); | ||
1018 | ret = -EINVAL; | ||
1019 | goto error; | ||
1020 | } | ||
841 | 1021 | ||
842 | if (conflict_field_name(tp->args[i].name, tp->args, i)) { | 1022 | if (conflict_field_name(tp->args[i].name, tp->args, i)) { |
843 | pr_info("Argument%d name '%s' conflicts with " | 1023 | pr_info("Argument[%d] name '%s' conflicts with " |
844 | "another field.\n", i, argv[i]); | 1024 | "another field.\n", i, argv[i]); |
845 | ret = -EINVAL; | 1025 | ret = -EINVAL; |
846 | goto error; | 1026 | goto error; |
@@ -849,12 +1029,9 @@ static int create_trace_probe(int argc, char **argv) | |||
849 | /* Parse fetch argument */ | 1029 | /* Parse fetch argument */ |
850 | ret = parse_probe_arg(arg, tp, &tp->args[i], is_return); | 1030 | ret = parse_probe_arg(arg, tp, &tp->args[i], is_return); |
851 | if (ret) { | 1031 | if (ret) { |
852 | pr_info("Parse error at argument%d. (%d)\n", i, ret); | 1032 | pr_info("Parse error at argument[%d]. (%d)\n", i, ret); |
853 | kfree(tp->args[i].name); | ||
854 | goto error; | 1033 | goto error; |
855 | } | 1034 | } |
856 | |||
857 | tp->nr_args++; | ||
858 | } | 1035 | } |
859 | 1036 | ||
860 | ret = register_trace_probe(tp); | 1037 | ret = register_trace_probe(tp); |
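The argument-name handling a few hunks above changes behaviour in a way worth spelling out; the probe definitions below are hypothetical:

        /*
         *   p:myprobe do_fork dev=%bx      -> argument keeps the name "dev"
         *   p:myprobe do_fork %ax          -> unnamed, auto-named "arg1"
         *   p:myprobe do_fork bad:name=%ax -> rejected with -EINVAL (the old code
         *                                     silently rewrote it to "bad_name")
         */

nr_args is also incremented before each argument is parsed, so a definition that fails half-way still has its partially initialised arguments freed on the error path.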
@@ -1043,6 +1220,54 @@ static const struct file_operations kprobe_profile_ops = { | |||
1043 | .release = seq_release, | 1220 | .release = seq_release, |
1044 | }; | 1221 | }; |
1045 | 1222 | ||
1223 | /* Sum up total data length for dynamic arrays (strings) */ | ||
1224 | static __kprobes int __get_data_size(struct trace_probe *tp, | ||
1225 | struct pt_regs *regs) | ||
1226 | { | ||
1227 | int i, ret = 0; | ||
1228 | u32 len; | ||
1229 | |||
1230 | for (i = 0; i < tp->nr_args; i++) | ||
1231 | if (unlikely(tp->args[i].fetch_size.fn)) { | ||
1232 | call_fetch(&tp->args[i].fetch_size, regs, &len); | ||
1233 | ret += len; | ||
1234 | } | ||
1235 | |||
1236 | return ret; | ||
1237 | } | ||
1238 | |||
1239 | /* Store the value of each argument */ | ||
1240 | static __kprobes void store_trace_args(int ent_size, struct trace_probe *tp, | ||
1241 | struct pt_regs *regs, | ||
1242 | u8 *data, int maxlen) | ||
1243 | { | ||
1244 | int i; | ||
1245 | u32 end = tp->size; | ||
1246 | u32 *dl; /* Data (relative) location */ | ||
1247 | |||
1248 | for (i = 0; i < tp->nr_args; i++) { | ||
1249 | if (unlikely(tp->args[i].fetch_size.fn)) { | ||
1250 | /* | ||
1251 | * First, we set the relative location and | ||
1252 | * maximum data length to *dl | ||
1253 | */ | ||
1254 | dl = (u32 *)(data + tp->args[i].offset); | ||
1255 | *dl = make_data_rloc(maxlen, end - tp->args[i].offset); | ||
1256 | /* Then try to fetch string or dynamic array data */ | ||
1257 | call_fetch(&tp->args[i].fetch, regs, dl); | ||
1258 | /* Reduce maximum length */ | ||
1259 | end += get_rloc_len(*dl); | ||
1260 | maxlen -= get_rloc_len(*dl); | ||
1261 | /* Trick here, convert data_rloc to data_loc */ | ||
1262 | *dl = convert_rloc_to_loc(*dl, | ||
1263 | ent_size + tp->args[i].offset); | ||
1264 | } else | ||
1265 | /* Just fetching data normally */ | ||
1266 | call_fetch(&tp->args[i].fetch, regs, | ||
1267 | data + tp->args[i].offset); | ||
1268 | } | ||
1269 | } | ||
1270 | |||
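Putting __get_data_size() and store_trace_args() together: for a probe with one u32 argument and one string argument, the record would be laid out roughly as below, where H stands for the (illustrative) size of the entry header:

        /*
         *   | entry header | u32 arg1 | u32 arg2 (data_loc) | "string bytes\0"
         *   0              H          H+4                   H+8
         *
         * After convert_rloc_to_loc(), arg2 holds (len << 16) | (H + 8): the
         * string's length plus its offset counted from the event entry, which
         * is what get_loc_data()/__get_str() expect on the output side.
         */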
1046 | /* Kprobe handler */ | 1271 | /* Kprobe handler */ |
1047 | static __kprobes void kprobe_trace_func(struct kprobe *kp, struct pt_regs *regs) | 1272 | static __kprobes void kprobe_trace_func(struct kprobe *kp, struct pt_regs *regs) |
1048 | { | 1273 | { |
@@ -1050,8 +1275,7 @@ static __kprobes void kprobe_trace_func(struct kprobe *kp, struct pt_regs *regs) | |||
1050 | struct kprobe_trace_entry_head *entry; | 1275 | struct kprobe_trace_entry_head *entry; |
1051 | struct ring_buffer_event *event; | 1276 | struct ring_buffer_event *event; |
1052 | struct ring_buffer *buffer; | 1277 | struct ring_buffer *buffer; |
1053 | u8 *data; | 1278 | int size, dsize, pc; |
1054 | int size, i, pc; | ||
1055 | unsigned long irq_flags; | 1279 | unsigned long irq_flags; |
1056 | struct ftrace_event_call *call = &tp->call; | 1280 | struct ftrace_event_call *call = &tp->call; |
1057 | 1281 | ||
@@ -1060,7 +1284,8 @@ static __kprobes void kprobe_trace_func(struct kprobe *kp, struct pt_regs *regs) | |||
1060 | local_save_flags(irq_flags); | 1284 | local_save_flags(irq_flags); |
1061 | pc = preempt_count(); | 1285 | pc = preempt_count(); |
1062 | 1286 | ||
1063 | size = sizeof(*entry) + tp->size; | 1287 | dsize = __get_data_size(tp, regs); |
1288 | size = sizeof(*entry) + tp->size + dsize; | ||
1064 | 1289 | ||
1065 | event = trace_current_buffer_lock_reserve(&buffer, call->event.type, | 1290 | event = trace_current_buffer_lock_reserve(&buffer, call->event.type, |
1066 | size, irq_flags, pc); | 1291 | size, irq_flags, pc); |
@@ -1069,9 +1294,7 @@ static __kprobes void kprobe_trace_func(struct kprobe *kp, struct pt_regs *regs) | |||
1069 | 1294 | ||
1070 | entry = ring_buffer_event_data(event); | 1295 | entry = ring_buffer_event_data(event); |
1071 | entry->ip = (unsigned long)kp->addr; | 1296 | entry->ip = (unsigned long)kp->addr; |
1072 | data = (u8 *)&entry[1]; | 1297 | store_trace_args(sizeof(*entry), tp, regs, (u8 *)&entry[1], dsize); |
1073 | for (i = 0; i < tp->nr_args; i++) | ||
1074 | call_fetch(&tp->args[i].fetch, regs, data + tp->args[i].offset); | ||
1075 | 1298 | ||
1076 | if (!filter_current_check_discard(buffer, call, entry, event)) | 1299 | if (!filter_current_check_discard(buffer, call, entry, event)) |
1077 | trace_nowake_buffer_unlock_commit(buffer, event, irq_flags, pc); | 1300 | trace_nowake_buffer_unlock_commit(buffer, event, irq_flags, pc); |
@@ -1085,15 +1308,15 @@ static __kprobes void kretprobe_trace_func(struct kretprobe_instance *ri, | |||
1085 | struct kretprobe_trace_entry_head *entry; | 1308 | struct kretprobe_trace_entry_head *entry; |
1086 | struct ring_buffer_event *event; | 1309 | struct ring_buffer_event *event; |
1087 | struct ring_buffer *buffer; | 1310 | struct ring_buffer *buffer; |
1088 | u8 *data; | 1311 | int size, pc, dsize; |
1089 | int size, i, pc; | ||
1090 | unsigned long irq_flags; | 1312 | unsigned long irq_flags; |
1091 | struct ftrace_event_call *call = &tp->call; | 1313 | struct ftrace_event_call *call = &tp->call; |
1092 | 1314 | ||
1093 | local_save_flags(irq_flags); | 1315 | local_save_flags(irq_flags); |
1094 | pc = preempt_count(); | 1316 | pc = preempt_count(); |
1095 | 1317 | ||
1096 | size = sizeof(*entry) + tp->size; | 1318 | dsize = __get_data_size(tp, regs); |
1319 | size = sizeof(*entry) + tp->size + dsize; | ||
1097 | 1320 | ||
1098 | event = trace_current_buffer_lock_reserve(&buffer, call->event.type, | 1321 | event = trace_current_buffer_lock_reserve(&buffer, call->event.type, |
1099 | size, irq_flags, pc); | 1322 | size, irq_flags, pc); |
@@ -1103,9 +1326,7 @@ static __kprobes void kretprobe_trace_func(struct kretprobe_instance *ri, | |||
1103 | entry = ring_buffer_event_data(event); | 1326 | entry = ring_buffer_event_data(event); |
1104 | entry->func = (unsigned long)tp->rp.kp.addr; | 1327 | entry->func = (unsigned long)tp->rp.kp.addr; |
1105 | entry->ret_ip = (unsigned long)ri->ret_addr; | 1328 | entry->ret_ip = (unsigned long)ri->ret_addr; |
1106 | data = (u8 *)&entry[1]; | 1329 | store_trace_args(sizeof(*entry), tp, regs, (u8 *)&entry[1], dsize); |
1107 | for (i = 0; i < tp->nr_args; i++) | ||
1108 | call_fetch(&tp->args[i].fetch, regs, data + tp->args[i].offset); | ||
1109 | 1330 | ||
1110 | if (!filter_current_check_discard(buffer, call, entry, event)) | 1331 | if (!filter_current_check_discard(buffer, call, entry, event)) |
1111 | trace_nowake_buffer_unlock_commit(buffer, event, irq_flags, pc); | 1332 | trace_nowake_buffer_unlock_commit(buffer, event, irq_flags, pc); |
@@ -1137,7 +1358,7 @@ print_kprobe_event(struct trace_iterator *iter, int flags, | |||
1137 | data = (u8 *)&field[1]; | 1358 | data = (u8 *)&field[1]; |
1138 | for (i = 0; i < tp->nr_args; i++) | 1359 | for (i = 0; i < tp->nr_args; i++) |
1139 | if (!tp->args[i].type->print(s, tp->args[i].name, | 1360 | if (!tp->args[i].type->print(s, tp->args[i].name, |
1140 | data + tp->args[i].offset)) | 1361 | data + tp->args[i].offset, field)) |
1141 | goto partial; | 1362 | goto partial; |
1142 | 1363 | ||
1143 | if (!trace_seq_puts(s, "\n")) | 1364 | if (!trace_seq_puts(s, "\n")) |
@@ -1179,7 +1400,7 @@ print_kretprobe_event(struct trace_iterator *iter, int flags, | |||
1179 | data = (u8 *)&field[1]; | 1400 | data = (u8 *)&field[1]; |
1180 | for (i = 0; i < tp->nr_args; i++) | 1401 | for (i = 0; i < tp->nr_args; i++) |
1181 | if (!tp->args[i].type->print(s, tp->args[i].name, | 1402 | if (!tp->args[i].type->print(s, tp->args[i].name, |
1182 | data + tp->args[i].offset)) | 1403 | data + tp->args[i].offset, field)) |
1183 | goto partial; | 1404 | goto partial; |
1184 | 1405 | ||
1185 | if (!trace_seq_puts(s, "\n")) | 1406 | if (!trace_seq_puts(s, "\n")) |
@@ -1214,11 +1435,6 @@ static void probe_event_disable(struct ftrace_event_call *call) | |||
1214 | } | 1435 | } |
1215 | } | 1436 | } |
1216 | 1437 | ||
1217 | static int probe_event_raw_init(struct ftrace_event_call *event_call) | ||
1218 | { | ||
1219 | return 0; | ||
1220 | } | ||
1221 | |||
1222 | #undef DEFINE_FIELD | 1438 | #undef DEFINE_FIELD |
1223 | #define DEFINE_FIELD(type, item, name, is_signed) \ | 1439 | #define DEFINE_FIELD(type, item, name, is_signed) \ |
1224 | do { \ | 1440 | do { \ |
@@ -1239,7 +1455,7 @@ static int kprobe_event_define_fields(struct ftrace_event_call *event_call) | |||
1239 | DEFINE_FIELD(unsigned long, ip, FIELD_STRING_IP, 0); | 1455 | DEFINE_FIELD(unsigned long, ip, FIELD_STRING_IP, 0); |
1240 | /* Set argument names as fields */ | 1456 | /* Set argument names as fields */ |
1241 | for (i = 0; i < tp->nr_args; i++) { | 1457 | for (i = 0; i < tp->nr_args; i++) { |
1242 | ret = trace_define_field(event_call, tp->args[i].type->name, | 1458 | ret = trace_define_field(event_call, tp->args[i].type->fmttype, |
1243 | tp->args[i].name, | 1459 | tp->args[i].name, |
1244 | sizeof(field) + tp->args[i].offset, | 1460 | sizeof(field) + tp->args[i].offset, |
1245 | tp->args[i].type->size, | 1461 | tp->args[i].type->size, |
@@ -1261,7 +1477,7 @@ static int kretprobe_event_define_fields(struct ftrace_event_call *event_call) | |||
1261 | DEFINE_FIELD(unsigned long, ret_ip, FIELD_STRING_RETIP, 0); | 1477 | DEFINE_FIELD(unsigned long, ret_ip, FIELD_STRING_RETIP, 0); |
1262 | /* Set argument names as fields */ | 1478 | /* Set argument names as fields */ |
1263 | for (i = 0; i < tp->nr_args; i++) { | 1479 | for (i = 0; i < tp->nr_args; i++) { |
1264 | ret = trace_define_field(event_call, tp->args[i].type->name, | 1480 | ret = trace_define_field(event_call, tp->args[i].type->fmttype, |
1265 | tp->args[i].name, | 1481 | tp->args[i].name, |
1266 | sizeof(field) + tp->args[i].offset, | 1482 | sizeof(field) + tp->args[i].offset, |
1267 | tp->args[i].type->size, | 1483 | tp->args[i].type->size, |
@@ -1301,8 +1517,13 @@ static int __set_print_fmt(struct trace_probe *tp, char *buf, int len) | |||
1301 | pos += snprintf(buf + pos, LEN_OR_ZERO, "\", %s", arg); | 1517 | pos += snprintf(buf + pos, LEN_OR_ZERO, "\", %s", arg); |
1302 | 1518 | ||
1303 | for (i = 0; i < tp->nr_args; i++) { | 1519 | for (i = 0; i < tp->nr_args; i++) { |
1304 | pos += snprintf(buf + pos, LEN_OR_ZERO, ", REC->%s", | 1520 | if (strcmp(tp->args[i].type->name, "string") == 0) |
1305 | tp->args[i].name); | 1521 | pos += snprintf(buf + pos, LEN_OR_ZERO, |
1522 | ", __get_str(%s)", | ||
1523 | tp->args[i].name); | ||
1524 | else | ||
1525 | pos += snprintf(buf + pos, LEN_OR_ZERO, ", REC->%s", | ||
1526 | tp->args[i].name); | ||
1306 | } | 1527 | } |
1307 | 1528 | ||
1308 | #undef LEN_OR_ZERO | 1529 | #undef LEN_OR_ZERO |
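For illustration, with a hypothetical string argument named "path" the loop above now appends ", __get_str(path)" to the argument list instead of ", REC->path". Combined with PRINT_TYPE_FMT_NAME(string) defined near the top of the file, the generated print fmt ends up containing roughly

        "... path=\"%s\" ...", ..., __get_str(path), ...

so the string is resolved through its data_loc rather than read as an inline field.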
@@ -1339,11 +1560,11 @@ static __kprobes void kprobe_perf_func(struct kprobe *kp, | |||
1339 | struct ftrace_event_call *call = &tp->call; | 1560 | struct ftrace_event_call *call = &tp->call; |
1340 | struct kprobe_trace_entry_head *entry; | 1561 | struct kprobe_trace_entry_head *entry; |
1341 | struct hlist_head *head; | 1562 | struct hlist_head *head; |
1342 | u8 *data; | 1563 | int size, __size, dsize; |
1343 | int size, __size, i; | ||
1344 | int rctx; | 1564 | int rctx; |
1345 | 1565 | ||
1346 | __size = sizeof(*entry) + tp->size; | 1566 | dsize = __get_data_size(tp, regs); |
1567 | __size = sizeof(*entry) + tp->size + dsize; | ||
1347 | size = ALIGN(__size + sizeof(u32), sizeof(u64)); | 1568 | size = ALIGN(__size + sizeof(u32), sizeof(u64)); |
1348 | size -= sizeof(u32); | 1569 | size -= sizeof(u32); |
1349 | if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE, | 1570 | if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE, |
@@ -1355,9 +1576,8 @@ static __kprobes void kprobe_perf_func(struct kprobe *kp, | |||
1355 | return; | 1576 | return; |
1356 | 1577 | ||
1357 | entry->ip = (unsigned long)kp->addr; | 1578 | entry->ip = (unsigned long)kp->addr; |
1358 | data = (u8 *)&entry[1]; | 1579 | memset(&entry[1], 0, dsize); |
1359 | for (i = 0; i < tp->nr_args; i++) | 1580 | store_trace_args(sizeof(*entry), tp, regs, (u8 *)&entry[1], dsize); |
1360 | call_fetch(&tp->args[i].fetch, regs, data + tp->args[i].offset); | ||
1361 | 1581 | ||
1362 | head = this_cpu_ptr(call->perf_events); | 1582 | head = this_cpu_ptr(call->perf_events); |
1363 | perf_trace_buf_submit(entry, size, rctx, entry->ip, 1, regs, head); | 1583 | perf_trace_buf_submit(entry, size, rctx, entry->ip, 1, regs, head); |
@@ -1371,11 +1591,11 @@ static __kprobes void kretprobe_perf_func(struct kretprobe_instance *ri, | |||
1371 | struct ftrace_event_call *call = &tp->call; | 1591 | struct ftrace_event_call *call = &tp->call; |
1372 | struct kretprobe_trace_entry_head *entry; | 1592 | struct kretprobe_trace_entry_head *entry; |
1373 | struct hlist_head *head; | 1593 | struct hlist_head *head; |
1374 | u8 *data; | 1594 | int size, __size, dsize; |
1375 | int size, __size, i; | ||
1376 | int rctx; | 1595 | int rctx; |
1377 | 1596 | ||
1378 | __size = sizeof(*entry) + tp->size; | 1597 | dsize = __get_data_size(tp, regs); |
1598 | __size = sizeof(*entry) + tp->size + dsize; | ||
1379 | size = ALIGN(__size + sizeof(u32), sizeof(u64)); | 1599 | size = ALIGN(__size + sizeof(u32), sizeof(u64)); |
1380 | size -= sizeof(u32); | 1600 | size -= sizeof(u32); |
1381 | if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE, | 1601 | if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE, |
@@ -1388,9 +1608,7 @@ static __kprobes void kretprobe_perf_func(struct kretprobe_instance *ri, | |||
1388 | 1608 | ||
1389 | entry->func = (unsigned long)tp->rp.kp.addr; | 1609 | entry->func = (unsigned long)tp->rp.kp.addr; |
1390 | entry->ret_ip = (unsigned long)ri->ret_addr; | 1610 | entry->ret_ip = (unsigned long)ri->ret_addr; |
1391 | data = (u8 *)&entry[1]; | 1611 | store_trace_args(sizeof(*entry), tp, regs, (u8 *)&entry[1], dsize); |
1392 | for (i = 0; i < tp->nr_args; i++) | ||
1393 | call_fetch(&tp->args[i].fetch, regs, data + tp->args[i].offset); | ||
1394 | 1612 | ||
1395 | head = this_cpu_ptr(call->perf_events); | 1613 | head = this_cpu_ptr(call->perf_events); |
1396 | perf_trace_buf_submit(entry, size, rctx, entry->ret_ip, 1, regs, head); | 1614 | perf_trace_buf_submit(entry, size, rctx, entry->ret_ip, 1, regs, head); |
@@ -1486,15 +1704,12 @@ static int register_probe_event(struct trace_probe *tp) | |||
1486 | int ret; | 1704 | int ret; |
1487 | 1705 | ||
1488 | /* Initialize ftrace_event_call */ | 1706 | /* Initialize ftrace_event_call */ |
1707 | INIT_LIST_HEAD(&call->class->fields); | ||
1489 | if (probe_is_return(tp)) { | 1708 | if (probe_is_return(tp)) { |
1490 | INIT_LIST_HEAD(&call->class->fields); | ||
1491 | call->event.funcs = &kretprobe_funcs; | 1709 | call->event.funcs = &kretprobe_funcs; |
1492 | call->class->raw_init = probe_event_raw_init; | ||
1493 | call->class->define_fields = kretprobe_event_define_fields; | 1710 | call->class->define_fields = kretprobe_event_define_fields; |
1494 | } else { | 1711 | } else { |
1495 | INIT_LIST_HEAD(&call->class->fields); | ||
1496 | call->event.funcs = &kprobe_funcs; | 1712 | call->event.funcs = &kprobe_funcs; |
1497 | call->class->raw_init = probe_event_raw_init; | ||
1498 | call->class->define_fields = kprobe_event_define_fields; | 1713 | call->class->define_fields = kprobe_event_define_fields; |
1499 | } | 1714 | } |
1500 | if (set_print_fmt(tp) < 0) | 1715 | if (set_print_fmt(tp) < 0) |
diff --git a/kernel/trace/trace_ksym.c b/kernel/trace/trace_ksym.c deleted file mode 100644 index 8eaf00749b65..000000000000 --- a/kernel/trace/trace_ksym.c +++ /dev/null | |||
@@ -1,508 +0,0 @@ | |||
1 | /* | ||
2 | * trace_ksym.c - Kernel Symbol Tracer | ||
3 | * | ||
4 | * This program is free software; you can redistribute it and/or modify | ||
5 | * it under the terms of the GNU General Public License as published by | ||
6 | * the Free Software Foundation; either version 2 of the License, or | ||
7 | * (at your option) any later version. | ||
8 | * | ||
9 | * This program is distributed in the hope that it will be useful, | ||
10 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
12 | * GNU General Public License for more details. | ||
13 | * | ||
14 | * You should have received a copy of the GNU General Public License | ||
15 | * along with this program; if not, write to the Free Software | ||
16 | * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. | ||
17 | * | ||
18 | * Copyright (C) IBM Corporation, 2009 | ||
19 | */ | ||
20 | |||
21 | #include <linux/kallsyms.h> | ||
22 | #include <linux/uaccess.h> | ||
23 | #include <linux/debugfs.h> | ||
24 | #include <linux/ftrace.h> | ||
25 | #include <linux/module.h> | ||
26 | #include <linux/slab.h> | ||
27 | #include <linux/fs.h> | ||
28 | |||
29 | #include "trace_output.h" | ||
30 | #include "trace.h" | ||
31 | |||
32 | #include <linux/hw_breakpoint.h> | ||
33 | #include <asm/hw_breakpoint.h> | ||
34 | |||
35 | #include <asm/atomic.h> | ||
36 | |||
37 | #define KSYM_TRACER_OP_LEN 3 /* rw- */ | ||
38 | |||
39 | struct trace_ksym { | ||
40 | struct perf_event **ksym_hbp; | ||
41 | struct perf_event_attr attr; | ||
42 | #ifdef CONFIG_PROFILE_KSYM_TRACER | ||
43 | atomic64_t counter; | ||
44 | #endif | ||
45 | struct hlist_node ksym_hlist; | ||
46 | }; | ||
47 | |||
48 | static struct trace_array *ksym_trace_array; | ||
49 | |||
50 | static unsigned int ksym_tracing_enabled; | ||
51 | |||
52 | static HLIST_HEAD(ksym_filter_head); | ||
53 | |||
54 | static DEFINE_MUTEX(ksym_tracer_mutex); | ||
55 | |||
56 | #ifdef CONFIG_PROFILE_KSYM_TRACER | ||
57 | |||
58 | #define MAX_UL_INT 0xffffffff | ||
59 | |||
60 | void ksym_collect_stats(unsigned long hbp_hit_addr) | ||
61 | { | ||
62 | struct hlist_node *node; | ||
63 | struct trace_ksym *entry; | ||
64 | |||
65 | rcu_read_lock(); | ||
66 | hlist_for_each_entry_rcu(entry, node, &ksym_filter_head, ksym_hlist) { | ||
67 | if (entry->attr.bp_addr == hbp_hit_addr) { | ||
68 | atomic64_inc(&entry->counter); | ||
69 | break; | ||
70 | } | ||
71 | } | ||
72 | rcu_read_unlock(); | ||
73 | } | ||
74 | #endif /* CONFIG_PROFILE_KSYM_TRACER */ | ||
75 | |||
76 | void ksym_hbp_handler(struct perf_event *hbp, int nmi, | ||
77 | struct perf_sample_data *data, | ||
78 | struct pt_regs *regs) | ||
79 | { | ||
80 | struct ring_buffer_event *event; | ||
81 | struct ksym_trace_entry *entry; | ||
82 | struct ring_buffer *buffer; | ||
83 | int pc; | ||
84 | |||
85 | if (!ksym_tracing_enabled) | ||
86 | return; | ||
87 | |||
88 | buffer = ksym_trace_array->buffer; | ||
89 | |||
90 | pc = preempt_count(); | ||
91 | |||
92 | event = trace_buffer_lock_reserve(buffer, TRACE_KSYM, | ||
93 | sizeof(*entry), 0, pc); | ||
94 | if (!event) | ||
95 | return; | ||
96 | |||
97 | entry = ring_buffer_event_data(event); | ||
98 | entry->ip = instruction_pointer(regs); | ||
99 | entry->type = hw_breakpoint_type(hbp); | ||
100 | entry->addr = hw_breakpoint_addr(hbp); | ||
101 | strlcpy(entry->cmd, current->comm, TASK_COMM_LEN); | ||
102 | |||
103 | #ifdef CONFIG_PROFILE_KSYM_TRACER | ||
104 | ksym_collect_stats(hw_breakpoint_addr(hbp)); | ||
105 | #endif /* CONFIG_PROFILE_KSYM_TRACER */ | ||
106 | |||
107 | trace_buffer_unlock_commit(buffer, event, 0, pc); | ||
108 | } | ||
109 | |||
110 | /* Valid access types are represented as | ||
111 | * | ||
112 | * rw- : Set Read/Write Access Breakpoint | ||
113 | * -w- : Set Write Access Breakpoint | ||
114 | * --- : Clear Breakpoints | ||
115 | * --x : Set Execution Break points (Not available yet) | ||
116 | * | ||
117 | */ | ||
118 | static int ksym_trace_get_access_type(char *str) | ||
119 | { | ||
120 | int access = 0; | ||
121 | |||
122 | if (str[0] == 'r') | ||
123 | access |= HW_BREAKPOINT_R; | ||
124 | |||
125 | if (str[1] == 'w') | ||
126 | access |= HW_BREAKPOINT_W; | ||
127 | |||
128 | if (str[2] == 'x') | ||
129 | access |= HW_BREAKPOINT_X; | ||
130 | |||
131 | switch (access) { | ||
132 | case HW_BREAKPOINT_R: | ||
133 | case HW_BREAKPOINT_W: | ||
134 | case HW_BREAKPOINT_W | HW_BREAKPOINT_R: | ||
135 | return access; | ||
136 | default: | ||
137 | return -EINVAL; | ||
138 | } | ||
139 | } | ||
140 | |||
141 | /* | ||
142 | * There can be several possible malformed requests and we attempt to capture | ||
143 | * all of them. We enumerate some of the rules | ||
144 | * 1. We will not allow kernel symbols with ':' since it is used as a delimiter. | ||
145 | * i.e. multiple ':' symbols disallowed. Possible uses are of the form | ||
146 | * <module>:<ksym_name>:<op>. | ||
147 | * 2. No delimiter symbol ':' in the input string | ||
148 | * 3. Spurious operator symbols or symbols not in their respective positions | ||
149 | * 4. <ksym_name>:--- i.e. clear breakpoint request when ksym_name not in file | ||
150 | * 5. Kernel symbol not a part of /proc/kallsyms | ||
151 | * 6. Duplicate requests | ||
152 | */ | ||
153 | static int parse_ksym_trace_str(char *input_string, char **ksymname, | ||
154 | unsigned long *addr) | ||
155 | { | ||
156 | int ret; | ||
157 | |||
158 | *ksymname = strsep(&input_string, ":"); | ||
159 | *addr = kallsyms_lookup_name(*ksymname); | ||
160 | |||
161 | /* Check for malformed request: (2), (1) and (5) */ | ||
162 | if ((!input_string) || | ||
163 | (strlen(input_string) != KSYM_TRACER_OP_LEN) || | ||
164 | (*addr == 0)) | ||
165 | return -EINVAL; | ||
166 | |||
167 | ret = ksym_trace_get_access_type(input_string); | ||
168 | |||
169 | return ret; | ||
170 | } | ||
171 | |||
172 | int process_new_ksym_entry(char *ksymname, int op, unsigned long addr) | ||
173 | { | ||
174 | struct trace_ksym *entry; | ||
175 | int ret = -ENOMEM; | ||
176 | |||
177 | entry = kzalloc(sizeof(struct trace_ksym), GFP_KERNEL); | ||
178 | if (!entry) | ||
179 | return -ENOMEM; | ||
180 | |||
181 | hw_breakpoint_init(&entry->attr); | ||
182 | |||
183 | entry->attr.bp_type = op; | ||
184 | entry->attr.bp_addr = addr; | ||
185 | entry->attr.bp_len = HW_BREAKPOINT_LEN_4; | ||
186 | |||
187 | entry->ksym_hbp = register_wide_hw_breakpoint(&entry->attr, | ||
188 | ksym_hbp_handler); | ||
189 | |||
190 | if (IS_ERR(entry->ksym_hbp)) { | ||
191 | ret = PTR_ERR(entry->ksym_hbp); | ||
192 | if (ret == -ENOSPC) { | ||
193 | printk(KERN_ERR "ksym_tracer: Maximum limit reached." | ||
194 | " No new requests for tracing can be accepted now.\n"); | ||
195 | } else { | ||
196 | printk(KERN_INFO "ksym_tracer request failed. Try again" | ||
197 | " later!!\n"); | ||
198 | } | ||
199 | goto err; | ||
200 | } | ||
201 | |||
202 | hlist_add_head_rcu(&(entry->ksym_hlist), &ksym_filter_head); | ||
203 | |||
204 | return 0; | ||
205 | |||
206 | err: | ||
207 | kfree(entry); | ||
208 | |||
209 | return ret; | ||
210 | } | ||
211 | |||
212 | static ssize_t ksym_trace_filter_read(struct file *filp, char __user *ubuf, | ||
213 | size_t count, loff_t *ppos) | ||
214 | { | ||
215 | struct trace_ksym *entry; | ||
216 | struct hlist_node *node; | ||
217 | struct trace_seq *s; | ||
218 | ssize_t cnt = 0; | ||
219 | int ret; | ||
220 | |||
221 | s = kmalloc(sizeof(*s), GFP_KERNEL); | ||
222 | if (!s) | ||
223 | return -ENOMEM; | ||
224 | trace_seq_init(s); | ||
225 | |||
226 | mutex_lock(&ksym_tracer_mutex); | ||
227 | |||
228 | hlist_for_each_entry(entry, node, &ksym_filter_head, ksym_hlist) { | ||
229 | ret = trace_seq_printf(s, "%pS:", | ||
230 | (void *)(unsigned long)entry->attr.bp_addr); | ||
231 | if (entry->attr.bp_type == HW_BREAKPOINT_R) | ||
232 | ret = trace_seq_puts(s, "r--\n"); | ||
233 | else if (entry->attr.bp_type == HW_BREAKPOINT_W) | ||
234 | ret = trace_seq_puts(s, "-w-\n"); | ||
235 | else if (entry->attr.bp_type == (HW_BREAKPOINT_W | HW_BREAKPOINT_R)) | ||
236 | ret = trace_seq_puts(s, "rw-\n"); | ||
237 | WARN_ON_ONCE(!ret); | ||
238 | } | ||
239 | |||
240 | cnt = simple_read_from_buffer(ubuf, count, ppos, s->buffer, s->len); | ||
241 | |||
242 | mutex_unlock(&ksym_tracer_mutex); | ||
243 | |||
244 | kfree(s); | ||
245 | |||
246 | return cnt; | ||
247 | } | ||
248 | |||
249 | static void __ksym_trace_reset(void) | ||
250 | { | ||
251 | struct trace_ksym *entry; | ||
252 | struct hlist_node *node, *node1; | ||
253 | |||
254 | mutex_lock(&ksym_tracer_mutex); | ||
255 | hlist_for_each_entry_safe(entry, node, node1, &ksym_filter_head, | ||
256 | ksym_hlist) { | ||
257 | unregister_wide_hw_breakpoint(entry->ksym_hbp); | ||
258 | hlist_del_rcu(&(entry->ksym_hlist)); | ||
259 | synchronize_rcu(); | ||
260 | kfree(entry); | ||
261 | } | ||
262 | mutex_unlock(&ksym_tracer_mutex); | ||
263 | } | ||
264 | |||
265 | static ssize_t ksym_trace_filter_write(struct file *file, | ||
266 | const char __user *buffer, | ||
267 | size_t count, loff_t *ppos) | ||
268 | { | ||
269 | struct trace_ksym *entry; | ||
270 | struct hlist_node *node; | ||
271 | char *buf, *input_string, *ksymname = NULL; | ||
272 | unsigned long ksym_addr = 0; | ||
273 | int ret, op, changed = 0; | ||
274 | |||
275 | buf = kzalloc(count + 1, GFP_KERNEL); | ||
276 | if (!buf) | ||
277 | return -ENOMEM; | ||
278 | |||
279 | ret = -EFAULT; | ||
280 | if (copy_from_user(buf, buffer, count)) | ||
281 | goto out; | ||
282 | |||
283 | buf[count] = '\0'; | ||
284 | input_string = strstrip(buf); | ||
285 | |||
286 | /* | ||
287 | * Clear all breakpoints if: | ||
288 | * 1: echo > ksym_trace_filter | ||
289 | * 2: echo 0 > ksym_trace_filter | ||
290 | * 3: echo "*:---" > ksym_trace_filter | ||
291 | */ | ||
292 | if (!input_string[0] || !strcmp(input_string, "0") || | ||
293 | !strcmp(input_string, "*:---")) { | ||
294 | __ksym_trace_reset(); | ||
295 | ret = 0; | ||
296 | goto out; | ||
297 | } | ||
298 | |||
299 | ret = op = parse_ksym_trace_str(input_string, &ksymname, &ksym_addr); | ||
300 | if (ret < 0) | ||
301 | goto out; | ||
302 | |||
303 | mutex_lock(&ksym_tracer_mutex); | ||
304 | |||
305 | ret = -EINVAL; | ||
306 | hlist_for_each_entry(entry, node, &ksym_filter_head, ksym_hlist) { | ||
307 | if (entry->attr.bp_addr == ksym_addr) { | ||
308 | /* Check for malformed request: (6) */ | ||
309 | if (entry->attr.bp_type != op) | ||
310 | changed = 1; | ||
311 | else | ||
312 | goto out_unlock; | ||
313 | break; | ||
314 | } | ||
315 | } | ||
316 | if (changed) { | ||
317 | unregister_wide_hw_breakpoint(entry->ksym_hbp); | ||
318 | entry->attr.bp_type = op; | ||
319 | ret = 0; | ||
320 | if (op > 0) { | ||
321 | entry->ksym_hbp = | ||
322 | register_wide_hw_breakpoint(&entry->attr, | ||
323 | ksym_hbp_handler); | ||
324 | if (IS_ERR(entry->ksym_hbp)) | ||
325 | ret = PTR_ERR(entry->ksym_hbp); | ||
326 | else | ||
327 | goto out_unlock; | ||
328 | } | ||
329 | /* Error or "symbol:---" case: drop it */ | ||
330 | hlist_del_rcu(&(entry->ksym_hlist)); | ||
331 | synchronize_rcu(); | ||
332 | kfree(entry); | ||
333 | goto out_unlock; | ||
334 | } else { | ||
335 | /* Check for malformed request: (4) */ | ||
336 | if (op) | ||
337 | ret = process_new_ksym_entry(ksymname, op, ksym_addr); | ||
338 | } | ||
339 | out_unlock: | ||
340 | mutex_unlock(&ksym_tracer_mutex); | ||
341 | out: | ||
342 | kfree(buf); | ||
343 | return !ret ? count : ret; | ||
344 | } | ||
345 | |||
346 | static const struct file_operations ksym_tracing_fops = { | ||
347 | .open = tracing_open_generic, | ||
348 | .read = ksym_trace_filter_read, | ||
349 | .write = ksym_trace_filter_write, | ||
350 | }; | ||
351 | |||
352 | static void ksym_trace_reset(struct trace_array *tr) | ||
353 | { | ||
354 | ksym_tracing_enabled = 0; | ||
355 | __ksym_trace_reset(); | ||
356 | } | ||
357 | |||
358 | static int ksym_trace_init(struct trace_array *tr) | ||
359 | { | ||
360 | int cpu, ret = 0; | ||
361 | |||
362 | for_each_online_cpu(cpu) | ||
363 | tracing_reset(tr, cpu); | ||
364 | ksym_tracing_enabled = 1; | ||
365 | ksym_trace_array = tr; | ||
366 | |||
367 | return ret; | ||
368 | } | ||
369 | |||
370 | static void ksym_trace_print_header(struct seq_file *m) | ||
371 | { | ||
372 | seq_puts(m, | ||
373 | "# TASK-PID CPU# Symbol " | ||
374 | "Type Function\n"); | ||
375 | seq_puts(m, | ||
376 | "# | | | " | ||
377 | " | |\n"); | ||
378 | } | ||
379 | |||
380 | static enum print_line_t ksym_trace_output(struct trace_iterator *iter) | ||
381 | { | ||
382 | struct trace_entry *entry = iter->ent; | ||
383 | struct trace_seq *s = &iter->seq; | ||
384 | struct ksym_trace_entry *field; | ||
385 | char str[KSYM_SYMBOL_LEN]; | ||
386 | int ret; | ||
387 | |||
388 | if (entry->type != TRACE_KSYM) | ||
389 | return TRACE_TYPE_UNHANDLED; | ||
390 | |||
391 | trace_assign_type(field, entry); | ||
392 | |||
393 | ret = trace_seq_printf(s, "%11s-%-5d [%03d] %pS", field->cmd, | ||
394 | entry->pid, iter->cpu, (char *)field->addr); | ||
395 | if (!ret) | ||
396 | return TRACE_TYPE_PARTIAL_LINE; | ||
397 | |||
398 | switch (field->type) { | ||
399 | case HW_BREAKPOINT_R: | ||
400 | ret = trace_seq_printf(s, " R "); | ||
401 | break; | ||
402 | case HW_BREAKPOINT_W: | ||
403 | ret = trace_seq_printf(s, " W "); | ||
404 | break; | ||
405 | case HW_BREAKPOINT_R | HW_BREAKPOINT_W: | ||
406 | ret = trace_seq_printf(s, " RW "); | ||
407 | break; | ||
408 | default: | ||
409 | return TRACE_TYPE_PARTIAL_LINE; | ||
410 | } | ||
411 | |||
412 | if (!ret) | ||
413 | return TRACE_TYPE_PARTIAL_LINE; | ||
414 | |||
415 | sprint_symbol(str, field->ip); | ||
416 | ret = trace_seq_printf(s, "%s\n", str); | ||
417 | if (!ret) | ||
418 | return TRACE_TYPE_PARTIAL_LINE; | ||
419 | |||
420 | return TRACE_TYPE_HANDLED; | ||
421 | } | ||
422 | |||
423 | struct tracer ksym_tracer __read_mostly = | ||
424 | { | ||
425 | .name = "ksym_tracer", | ||
426 | .init = ksym_trace_init, | ||
427 | .reset = ksym_trace_reset, | ||
428 | #ifdef CONFIG_FTRACE_SELFTEST | ||
429 | .selftest = trace_selftest_startup_ksym, | ||
430 | #endif | ||
431 | .print_header = ksym_trace_print_header, | ||
432 | .print_line = ksym_trace_output | ||
433 | }; | ||
434 | |||
435 | #ifdef CONFIG_PROFILE_KSYM_TRACER | ||
436 | static int ksym_profile_show(struct seq_file *m, void *v) | ||
437 | { | ||
438 | struct hlist_node *node; | ||
439 | struct trace_ksym *entry; | ||
440 | int access_type = 0; | ||
441 | char fn_name[KSYM_NAME_LEN]; | ||
442 | |||
443 | seq_puts(m, " Access Type "); | ||
444 | seq_puts(m, " Symbol Counter\n"); | ||
445 | seq_puts(m, " ----------- "); | ||
446 | seq_puts(m, " ------ -------\n"); | ||
447 | |||
448 | rcu_read_lock(); | ||
449 | hlist_for_each_entry_rcu(entry, node, &ksym_filter_head, ksym_hlist) { | ||
450 | |||
451 | access_type = entry->attr.bp_type; | ||
452 | |||
453 | switch (access_type) { | ||
454 | case HW_BREAKPOINT_R: | ||
455 | seq_puts(m, " R "); | ||
456 | break; | ||
457 | case HW_BREAKPOINT_W: | ||
458 | seq_puts(m, " W "); | ||
459 | break; | ||
460 | case HW_BREAKPOINT_R | HW_BREAKPOINT_W: | ||
461 | seq_puts(m, " RW "); | ||
462 | break; | ||
463 | default: | ||
464 | seq_puts(m, " NA "); | ||
465 | } | ||
466 | |||
467 | if (lookup_symbol_name(entry->attr.bp_addr, fn_name) >= 0) | ||
468 | seq_printf(m, " %-36s", fn_name); | ||
469 | else | ||
470 | seq_printf(m, " %-36s", "<NA>"); | ||
471 | seq_printf(m, " %15llu\n", | ||
472 | (unsigned long long)atomic64_read(&entry->counter)); | ||
473 | } | ||
474 | rcu_read_unlock(); | ||
475 | |||
476 | return 0; | ||
477 | } | ||
478 | |||
479 | static int ksym_profile_open(struct inode *node, struct file *file) | ||
480 | { | ||
481 | return single_open(file, ksym_profile_show, NULL); | ||
482 | } | ||
483 | |||
484 | static const struct file_operations ksym_profile_fops = { | ||
485 | .open = ksym_profile_open, | ||
486 | .read = seq_read, | ||
487 | .llseek = seq_lseek, | ||
488 | .release = single_release, | ||
489 | }; | ||
490 | #endif /* CONFIG_PROFILE_KSYM_TRACER */ | ||
491 | |||
492 | __init static int init_ksym_trace(void) | ||
493 | { | ||
494 | struct dentry *d_tracer; | ||
495 | |||
496 | d_tracer = tracing_init_dentry(); | ||
497 | |||
498 | trace_create_file("ksym_trace_filter", 0644, d_tracer, | ||
499 | NULL, &ksym_tracing_fops); | ||
500 | |||
501 | #ifdef CONFIG_PROFILE_KSYM_TRACER | ||
502 | trace_create_file("ksym_profile", 0444, d_tracer, | ||
503 | NULL, &ksym_profile_fops); | ||
504 | #endif | ||
505 | |||
506 | return register_tracer(&ksym_tracer); | ||
507 | } | ||
508 | device_initcall(init_ksym_trace); | ||
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index 57c1b4596470..02272baa2206 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c | |||
@@ -16,9 +16,6 @@ | |||
16 | 16 | ||
17 | DECLARE_RWSEM(trace_event_mutex); | 17 | DECLARE_RWSEM(trace_event_mutex); |
18 | 18 | ||
19 | DEFINE_PER_CPU(struct trace_seq, ftrace_event_seq); | ||
20 | EXPORT_PER_CPU_SYMBOL(ftrace_event_seq); | ||
21 | |||
22 | static struct hlist_head event_hash[EVENT_HASHSIZE] __read_mostly; | 19 | static struct hlist_head event_hash[EVENT_HASHSIZE] __read_mostly; |
23 | 20 | ||
24 | static int next_event_type = __TRACE_LAST_TYPE + 1; | 21 | static int next_event_type = __TRACE_LAST_TYPE + 1; |
@@ -1069,65 +1066,6 @@ static struct trace_event trace_wake_event = { | |||
1069 | .funcs = &trace_wake_funcs, | 1066 | .funcs = &trace_wake_funcs, |
1070 | }; | 1067 | }; |
1071 | 1068 | ||
1072 | /* TRACE_SPECIAL */ | ||
1073 | static enum print_line_t trace_special_print(struct trace_iterator *iter, | ||
1074 | int flags, struct trace_event *event) | ||
1075 | { | ||
1076 | struct special_entry *field; | ||
1077 | |||
1078 | trace_assign_type(field, iter->ent); | ||
1079 | |||
1080 | if (!trace_seq_printf(&iter->seq, "# %ld %ld %ld\n", | ||
1081 | field->arg1, | ||
1082 | field->arg2, | ||
1083 | field->arg3)) | ||
1084 | return TRACE_TYPE_PARTIAL_LINE; | ||
1085 | |||
1086 | return TRACE_TYPE_HANDLED; | ||
1087 | } | ||
1088 | |||
1089 | static enum print_line_t trace_special_hex(struct trace_iterator *iter, | ||
1090 | int flags, struct trace_event *event) | ||
1091 | { | ||
1092 | struct special_entry *field; | ||
1093 | struct trace_seq *s = &iter->seq; | ||
1094 | |||
1095 | trace_assign_type(field, iter->ent); | ||
1096 | |||
1097 | SEQ_PUT_HEX_FIELD_RET(s, field->arg1); | ||
1098 | SEQ_PUT_HEX_FIELD_RET(s, field->arg2); | ||
1099 | SEQ_PUT_HEX_FIELD_RET(s, field->arg3); | ||
1100 | |||
1101 | return TRACE_TYPE_HANDLED; | ||
1102 | } | ||
1103 | |||
1104 | static enum print_line_t trace_special_bin(struct trace_iterator *iter, | ||
1105 | int flags, struct trace_event *event) | ||
1106 | { | ||
1107 | struct special_entry *field; | ||
1108 | struct trace_seq *s = &iter->seq; | ||
1109 | |||
1110 | trace_assign_type(field, iter->ent); | ||
1111 | |||
1112 | SEQ_PUT_FIELD_RET(s, field->arg1); | ||
1113 | SEQ_PUT_FIELD_RET(s, field->arg2); | ||
1114 | SEQ_PUT_FIELD_RET(s, field->arg3); | ||
1115 | |||
1116 | return TRACE_TYPE_HANDLED; | ||
1117 | } | ||
1118 | |||
1119 | static struct trace_event_functions trace_special_funcs = { | ||
1120 | .trace = trace_special_print, | ||
1121 | .raw = trace_special_print, | ||
1122 | .hex = trace_special_hex, | ||
1123 | .binary = trace_special_bin, | ||
1124 | }; | ||
1125 | |||
1126 | static struct trace_event trace_special_event = { | ||
1127 | .type = TRACE_SPECIAL, | ||
1128 | .funcs = &trace_special_funcs, | ||
1129 | }; | ||
1130 | |||
1131 | /* TRACE_STACK */ | 1069 | /* TRACE_STACK */ |
1132 | 1070 | ||
1133 | static enum print_line_t trace_stack_print(struct trace_iterator *iter, | 1071 | static enum print_line_t trace_stack_print(struct trace_iterator *iter, |
@@ -1161,9 +1099,6 @@ static enum print_line_t trace_stack_print(struct trace_iterator *iter, | |||
1161 | 1099 | ||
1162 | static struct trace_event_functions trace_stack_funcs = { | 1100 | static struct trace_event_functions trace_stack_funcs = { |
1163 | .trace = trace_stack_print, | 1101 | .trace = trace_stack_print, |
1164 | .raw = trace_special_print, | ||
1165 | .hex = trace_special_hex, | ||
1166 | .binary = trace_special_bin, | ||
1167 | }; | 1102 | }; |
1168 | 1103 | ||
1169 | static struct trace_event trace_stack_event = { | 1104 | static struct trace_event trace_stack_event = { |
@@ -1194,9 +1129,6 @@ static enum print_line_t trace_user_stack_print(struct trace_iterator *iter, | |||
1194 | 1129 | ||
1195 | static struct trace_event_functions trace_user_stack_funcs = { | 1130 | static struct trace_event_functions trace_user_stack_funcs = { |
1196 | .trace = trace_user_stack_print, | 1131 | .trace = trace_user_stack_print, |
1197 | .raw = trace_special_print, | ||
1198 | .hex = trace_special_hex, | ||
1199 | .binary = trace_special_bin, | ||
1200 | }; | 1132 | }; |
1201 | 1133 | ||
1202 | static struct trace_event trace_user_stack_event = { | 1134 | static struct trace_event trace_user_stack_event = { |
@@ -1314,7 +1246,6 @@ static struct trace_event *events[] __initdata = { | |||
1314 | &trace_fn_event, | 1246 | &trace_fn_event, |
1315 | &trace_ctx_event, | 1247 | &trace_ctx_event, |
1316 | &trace_wake_event, | 1248 | &trace_wake_event, |
1317 | &trace_special_event, | ||
1318 | &trace_stack_event, | 1249 | &trace_stack_event, |
1319 | &trace_user_stack_event, | 1250 | &trace_user_stack_event, |
1320 | &trace_bprint_event, | 1251 | &trace_bprint_event, |
diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c index 0e73bc2ef8c5..7319559ed59f 100644 --- a/kernel/trace/trace_sched_wakeup.c +++ b/kernel/trace/trace_sched_wakeup.c | |||
@@ -31,50 +31,99 @@ static int wakeup_rt; | |||
31 | static arch_spinlock_t wakeup_lock = | 31 | static arch_spinlock_t wakeup_lock = |
32 | (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED; | 32 | (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED; |
33 | 33 | ||
34 | static void wakeup_reset(struct trace_array *tr); | ||
34 | static void __wakeup_reset(struct trace_array *tr); | 35 | static void __wakeup_reset(struct trace_array *tr); |
36 | static int wakeup_graph_entry(struct ftrace_graph_ent *trace); | ||
37 | static void wakeup_graph_return(struct ftrace_graph_ret *trace); | ||
35 | 38 | ||
36 | static int save_lat_flag; | 39 | static int save_lat_flag; |
37 | 40 | ||
41 | #define TRACE_DISPLAY_GRAPH 1 | ||
42 | |||
43 | static struct tracer_opt trace_opts[] = { | ||
44 | #ifdef CONFIG_FUNCTION_GRAPH_TRACER | ||
45 | /* display latency trace as call graph */ | ||
46 | { TRACER_OPT(display-graph, TRACE_DISPLAY_GRAPH) }, | ||
47 | #endif | ||
48 | { } /* Empty entry */ | ||
49 | }; | ||
50 | |||
51 | static struct tracer_flags tracer_flags = { | ||
52 | .val = 0, | ||
53 | .opts = trace_opts, | ||
54 | }; | ||
55 | |||
56 | #define is_graph() (tracer_flags.val & TRACE_DISPLAY_GRAPH) | ||
57 | |||
38 | #ifdef CONFIG_FUNCTION_TRACER | 58 | #ifdef CONFIG_FUNCTION_TRACER |
59 | |||
39 | /* | 60 | /* |
40 | * irqsoff uses its own tracer function to keep the overhead down: | 61 | * Prologue for the wakeup function tracers. |
62 | * | ||
63 | * Returns 1 if it is OK to continue; in that case preemption | ||
64 | * is disabled and data->disabled is incremented. | ||
65 | * Returns 0 if the trace is to be ignored; in that case preemption | ||
66 | * is not disabled and data->disabled is | ||
67 | * left unchanged. | ||
68 | * | ||
69 | * Note, this function is also used outside this ifdef but | ||
70 | * inside the #ifdef of the function graph tracer below. | ||
71 | * This is OK, since the function graph tracer is | ||
72 | * dependent on the function tracer. | ||
41 | */ | 73 | */ |
42 | static void | 74 | static int |
43 | wakeup_tracer_call(unsigned long ip, unsigned long parent_ip) | 75 | func_prolog_preempt_disable(struct trace_array *tr, |
76 | struct trace_array_cpu **data, | ||
77 | int *pc) | ||
44 | { | 78 | { |
45 | struct trace_array *tr = wakeup_trace; | ||
46 | struct trace_array_cpu *data; | ||
47 | unsigned long flags; | ||
48 | long disabled; | 79 | long disabled; |
49 | int resched; | ||
50 | int cpu; | 80 | int cpu; |
51 | int pc; | ||
52 | 81 | ||
53 | if (likely(!wakeup_task)) | 82 | if (likely(!wakeup_task)) |
54 | return; | 83 | return 0; |
55 | 84 | ||
56 | pc = preempt_count(); | 85 | *pc = preempt_count(); |
57 | resched = ftrace_preempt_disable(); | 86 | preempt_disable_notrace(); |
58 | 87 | ||
59 | cpu = raw_smp_processor_id(); | 88 | cpu = raw_smp_processor_id(); |
60 | if (cpu != wakeup_current_cpu) | 89 | if (cpu != wakeup_current_cpu) |
61 | goto out_enable; | 90 | goto out_enable; |
62 | 91 | ||
63 | data = tr->data[cpu]; | 92 | *data = tr->data[cpu]; |
64 | disabled = atomic_inc_return(&data->disabled); | 93 | disabled = atomic_inc_return(&(*data)->disabled); |
65 | if (unlikely(disabled != 1)) | 94 | if (unlikely(disabled != 1)) |
66 | goto out; | 95 | goto out; |
67 | 96 | ||
68 | local_irq_save(flags); | 97 | return 1; |
69 | 98 | ||
70 | trace_function(tr, ip, parent_ip, flags, pc); | 99 | out: |
100 | atomic_dec(&(*data)->disabled); | ||
101 | |||
102 | out_enable: | ||
103 | preempt_enable_notrace(); | ||
104 | return 0; | ||
105 | } | ||
71 | 106 | ||
107 | /* | ||
108 | * wakeup uses its own tracer function to keep the overhead down: | ||
109 | */ | ||
110 | static void | ||
111 | wakeup_tracer_call(unsigned long ip, unsigned long parent_ip) | ||
112 | { | ||
113 | struct trace_array *tr = wakeup_trace; | ||
114 | struct trace_array_cpu *data; | ||
115 | unsigned long flags; | ||
116 | int pc; | ||
117 | |||
118 | if (!func_prolog_preempt_disable(tr, &data, &pc)) | ||
119 | return; | ||
120 | |||
121 | local_irq_save(flags); | ||
122 | trace_function(tr, ip, parent_ip, flags, pc); | ||
72 | local_irq_restore(flags); | 123 | local_irq_restore(flags); |
73 | 124 | ||
74 | out: | ||
75 | atomic_dec(&data->disabled); | 125 | atomic_dec(&data->disabled); |
76 | out_enable: | 126 | preempt_enable_notrace(); |
77 | ftrace_preempt_enable(resched); | ||
78 | } | 127 | } |
79 | 128 | ||
80 | static struct ftrace_ops trace_ops __read_mostly = | 129 | static struct ftrace_ops trace_ops __read_mostly = |
@@ -83,6 +132,156 @@ static struct ftrace_ops trace_ops __read_mostly = | |||
83 | }; | 132 | }; |
84 | #endif /* CONFIG_FUNCTION_TRACER */ | 133 | #endif /* CONFIG_FUNCTION_TRACER */ |
85 | 134 | ||
135 | static int start_func_tracer(int graph) | ||
136 | { | ||
137 | int ret; | ||
138 | |||
139 | if (!graph) | ||
140 | ret = register_ftrace_function(&trace_ops); | ||
141 | else | ||
142 | ret = register_ftrace_graph(&wakeup_graph_return, | ||
143 | &wakeup_graph_entry); | ||
144 | |||
145 | if (!ret && tracing_is_enabled()) | ||
146 | tracer_enabled = 1; | ||
147 | else | ||
148 | tracer_enabled = 0; | ||
149 | |||
150 | return ret; | ||
151 | } | ||
152 | |||
153 | static void stop_func_tracer(int graph) | ||
154 | { | ||
155 | tracer_enabled = 0; | ||
156 | |||
157 | if (!graph) | ||
158 | unregister_ftrace_function(&trace_ops); | ||
159 | else | ||
160 | unregister_ftrace_graph(); | ||
161 | } | ||
162 | |||
163 | #ifdef CONFIG_FUNCTION_GRAPH_TRACER | ||
164 | static int wakeup_set_flag(u32 old_flags, u32 bit, int set) | ||
165 | { | ||
166 | |||
167 | if (!(bit & TRACE_DISPLAY_GRAPH)) | ||
168 | return -EINVAL; | ||
169 | |||
170 | if (!(is_graph() ^ set)) | ||
171 | return 0; | ||
172 | |||
173 | stop_func_tracer(!set); | ||
174 | |||
175 | wakeup_reset(wakeup_trace); | ||
176 | tracing_max_latency = 0; | ||
177 | |||
178 | return start_func_tracer(set); | ||
179 | } | ||
180 | |||
181 | static int wakeup_graph_entry(struct ftrace_graph_ent *trace) | ||
182 | { | ||
183 | struct trace_array *tr = wakeup_trace; | ||
184 | struct trace_array_cpu *data; | ||
185 | unsigned long flags; | ||
186 | int pc, ret = 0; | ||
187 | |||
188 | if (!func_prolog_preempt_disable(tr, &data, &pc)) | ||
189 | return 0; | ||
190 | |||
191 | local_save_flags(flags); | ||
192 | ret = __trace_graph_entry(tr, trace, flags, pc); | ||
193 | atomic_dec(&data->disabled); | ||
194 | preempt_enable_notrace(); | ||
195 | |||
196 | return ret; | ||
197 | } | ||
198 | |||
199 | static void wakeup_graph_return(struct ftrace_graph_ret *trace) | ||
200 | { | ||
201 | struct trace_array *tr = wakeup_trace; | ||
202 | struct trace_array_cpu *data; | ||
203 | unsigned long flags; | ||
204 | int pc; | ||
205 | |||
206 | if (!func_prolog_preempt_disable(tr, &data, &pc)) | ||
207 | return; | ||
208 | |||
209 | local_save_flags(flags); | ||
210 | __trace_graph_return(tr, trace, flags, pc); | ||
211 | atomic_dec(&data->disabled); | ||
212 | |||
213 | preempt_enable_notrace(); | ||
214 | return; | ||
215 | } | ||
216 | |||
217 | static void wakeup_trace_open(struct trace_iterator *iter) | ||
218 | { | ||
219 | if (is_graph()) | ||
220 | graph_trace_open(iter); | ||
221 | } | ||
222 | |||
223 | static void wakeup_trace_close(struct trace_iterator *iter) | ||
224 | { | ||
225 | if (iter->private) | ||
226 | graph_trace_close(iter); | ||
227 | } | ||
228 | |||
229 | #define GRAPH_TRACER_FLAGS (TRACE_GRAPH_PRINT_PROC) | ||
230 | |||
231 | static enum print_line_t wakeup_print_line(struct trace_iterator *iter) | ||
232 | { | ||
233 | /* | ||
234 | * In graph mode call the graph tracer output function, | ||
235 | * otherwise go with the TRACE_FN event handler | ||
236 | */ | ||
237 | if (is_graph()) | ||
238 | return print_graph_function_flags(iter, GRAPH_TRACER_FLAGS); | ||
239 | |||
240 | return TRACE_TYPE_UNHANDLED; | ||
241 | } | ||
242 | |||
243 | static void wakeup_print_header(struct seq_file *s) | ||
244 | { | ||
245 | if (is_graph()) | ||
246 | print_graph_headers_flags(s, GRAPH_TRACER_FLAGS); | ||
247 | else | ||
248 | trace_default_header(s); | ||
249 | } | ||
250 | |||
251 | static void | ||
252 | __trace_function(struct trace_array *tr, | ||
253 | unsigned long ip, unsigned long parent_ip, | ||
254 | unsigned long flags, int pc) | ||
255 | { | ||
256 | if (is_graph()) | ||
257 | trace_graph_function(tr, ip, parent_ip, flags, pc); | ||
258 | else | ||
259 | trace_function(tr, ip, parent_ip, flags, pc); | ||
260 | } | ||
261 | #else | ||
262 | #define __trace_function trace_function | ||
263 | |||
264 | static int wakeup_set_flag(u32 old_flags, u32 bit, int set) | ||
265 | { | ||
266 | return -EINVAL; | ||
267 | } | ||
268 | |||
269 | static int wakeup_graph_entry(struct ftrace_graph_ent *trace) | ||
270 | { | ||
271 | return -1; | ||
272 | } | ||
273 | |||
274 | static enum print_line_t wakeup_print_line(struct trace_iterator *iter) | ||
275 | { | ||
276 | return TRACE_TYPE_UNHANDLED; | ||
277 | } | ||
278 | |||
279 | static void wakeup_graph_return(struct ftrace_graph_ret *trace) { } | ||
280 | static void wakeup_print_header(struct seq_file *s) { } | ||
281 | static void wakeup_trace_open(struct trace_iterator *iter) { } | ||
282 | static void wakeup_trace_close(struct trace_iterator *iter) { } | ||
283 | #endif /* CONFIG_FUNCTION_GRAPH_TRACER */ | ||
284 | |||
86 | /* | 285 | /* |
87 | * Should this new latency be reported/recorded? | 286 | * Should this new latency be reported/recorded? |
88 | */ | 287 | */ |
@@ -153,7 +352,7 @@ probe_wakeup_sched_switch(void *ignore, | |||
153 | /* The task we are waiting for is waking up */ | 352 | /* The task we are waiting for is waking up */ |
154 | data = wakeup_trace->data[wakeup_cpu]; | 353 | data = wakeup_trace->data[wakeup_cpu]; |
155 | 354 | ||
156 | trace_function(wakeup_trace, CALLER_ADDR0, CALLER_ADDR1, flags, pc); | 355 | __trace_function(wakeup_trace, CALLER_ADDR0, CALLER_ADDR1, flags, pc); |
157 | tracing_sched_switch_trace(wakeup_trace, prev, next, flags, pc); | 356 | tracing_sched_switch_trace(wakeup_trace, prev, next, flags, pc); |
158 | 357 | ||
159 | T0 = data->preempt_timestamp; | 358 | T0 = data->preempt_timestamp; |
@@ -253,7 +452,7 @@ probe_wakeup(void *ignore, struct task_struct *p, int success) | |||
253 | * is not called by an assembly function (whereas schedule is) | 452 | * is not called by an assembly function (whereas schedule is) |
254 | * it should be safe to use it here. | 453 | * it should be safe to use it here. |
255 | */ | 454 | */ |
256 | trace_function(wakeup_trace, CALLER_ADDR1, CALLER_ADDR2, flags, pc); | 455 | __trace_function(wakeup_trace, CALLER_ADDR1, CALLER_ADDR2, flags, pc); |
257 | 456 | ||
258 | out_locked: | 457 | out_locked: |
259 | arch_spin_unlock(&wakeup_lock); | 458 | arch_spin_unlock(&wakeup_lock); |
@@ -304,12 +503,8 @@ static void start_wakeup_tracer(struct trace_array *tr) | |||
304 | */ | 503 | */ |
305 | smp_wmb(); | 504 | smp_wmb(); |
306 | 505 | ||
307 | register_ftrace_function(&trace_ops); | 506 | if (start_func_tracer(is_graph())) |
308 | 507 | printk(KERN_ERR "failed to start wakeup tracer\n"); | |
309 | if (tracing_is_enabled()) | ||
310 | tracer_enabled = 1; | ||
311 | else | ||
312 | tracer_enabled = 0; | ||
313 | 508 | ||
314 | return; | 509 | return; |
315 | fail_deprobe_wake_new: | 510 | fail_deprobe_wake_new: |
@@ -321,7 +516,7 @@ fail_deprobe: | |||
321 | static void stop_wakeup_tracer(struct trace_array *tr) | 516 | static void stop_wakeup_tracer(struct trace_array *tr) |
322 | { | 517 | { |
323 | tracer_enabled = 0; | 518 | tracer_enabled = 0; |
324 | unregister_ftrace_function(&trace_ops); | 519 | stop_func_tracer(is_graph()); |
325 | unregister_trace_sched_switch(probe_wakeup_sched_switch, NULL); | 520 | unregister_trace_sched_switch(probe_wakeup_sched_switch, NULL); |
326 | unregister_trace_sched_wakeup_new(probe_wakeup, NULL); | 521 | unregister_trace_sched_wakeup_new(probe_wakeup, NULL); |
327 | unregister_trace_sched_wakeup(probe_wakeup, NULL); | 522 | unregister_trace_sched_wakeup(probe_wakeup, NULL); |
@@ -380,9 +575,16 @@ static struct tracer wakeup_tracer __read_mostly = | |||
380 | .start = wakeup_tracer_start, | 575 | .start = wakeup_tracer_start, |
381 | .stop = wakeup_tracer_stop, | 576 | .stop = wakeup_tracer_stop, |
382 | .print_max = 1, | 577 | .print_max = 1, |
578 | .print_header = wakeup_print_header, | ||
579 | .print_line = wakeup_print_line, | ||
580 | .flags = &tracer_flags, | ||
581 | .set_flag = wakeup_set_flag, | ||
383 | #ifdef CONFIG_FTRACE_SELFTEST | 582 | #ifdef CONFIG_FTRACE_SELFTEST |
384 | .selftest = trace_selftest_startup_wakeup, | 583 | .selftest = trace_selftest_startup_wakeup, |
385 | #endif | 584 | #endif |
585 | .open = wakeup_trace_open, | ||
586 | .close = wakeup_trace_close, | ||
587 | .use_max_tr = 1, | ||
386 | }; | 588 | }; |
387 | 589 | ||
388 | static struct tracer wakeup_rt_tracer __read_mostly = | 590 | static struct tracer wakeup_rt_tracer __read_mostly = |
@@ -394,9 +596,16 @@ static struct tracer wakeup_rt_tracer __read_mostly = | |||
394 | .stop = wakeup_tracer_stop, | 596 | .stop = wakeup_tracer_stop, |
395 | .wait_pipe = poll_wait_pipe, | 597 | .wait_pipe = poll_wait_pipe, |
396 | .print_max = 1, | 598 | .print_max = 1, |
599 | .print_header = wakeup_print_header, | ||
600 | .print_line = wakeup_print_line, | ||
601 | .flags = &tracer_flags, | ||
602 | .set_flag = wakeup_set_flag, | ||
397 | #ifdef CONFIG_FTRACE_SELFTEST | 603 | #ifdef CONFIG_FTRACE_SELFTEST |
398 | .selftest = trace_selftest_startup_wakeup, | 604 | .selftest = trace_selftest_startup_wakeup, |
399 | #endif | 605 | #endif |
606 | .open = wakeup_trace_open, | ||
607 | .close = wakeup_trace_close, | ||
608 | .use_max_tr = 1, | ||
400 | }; | 609 | }; |
401 | 610 | ||
402 | __init static int init_wakeup_tracer(void) | 611 | __init static int init_wakeup_tracer(void) |
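
The func_prolog_preempt_disable() helper factored out in the trace_sched_wakeup.c hunk above follows a common tracer idiom: disable preemption, bump a per-cpu "disabled" counter, and only emit a trace record when that counter transitions to exactly 1, so a tracer that recurses into itself (or races with another trace path on the same CPU) silently backs out. Below is a minimal user-space sketch of that guard, under the assumption that C11 atomics stand in for the kernel's atomic_inc_return()/atomic_dec() and preempt_disable_notrace() helpers; trace_one_event() and do_emit() are illustrative names, not kernel functions.

#include <stdatomic.h>
#include <stdio.h>

static atomic_int disabled;             /* stands in for data->disabled */

static void do_emit(const char *what)
{
        printf("trace: %s\n", what);
}

static void trace_one_event(const char *what)
{
        /* prologue: bail out unless we are the only trace path in here */
        if (atomic_fetch_add(&disabled, 1) + 1 != 1)
                goto out;

        do_emit(what);                  /* body runs with recursion excluded */
out:
        atomic_fetch_sub(&disabled, 1); /* epilogue: always undo the increment */
}

int main(void)
{
        trace_one_event("wakeup");
        return 0;
}

The same prologue is shared by the plain function tracer and the graph entry/return callbacks above, which is why it was pulled into one helper rather than duplicated per callback.
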
diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c index 250e7f9bd2f0..155a415b3209 100644 --- a/kernel/trace/trace_selftest.c +++ b/kernel/trace/trace_selftest.c | |||
@@ -13,11 +13,9 @@ static inline int trace_valid_entry(struct trace_entry *entry) | |||
13 | case TRACE_WAKE: | 13 | case TRACE_WAKE: |
14 | case TRACE_STACK: | 14 | case TRACE_STACK: |
15 | case TRACE_PRINT: | 15 | case TRACE_PRINT: |
16 | case TRACE_SPECIAL: | ||
17 | case TRACE_BRANCH: | 16 | case TRACE_BRANCH: |
18 | case TRACE_GRAPH_ENT: | 17 | case TRACE_GRAPH_ENT: |
19 | case TRACE_GRAPH_RET: | 18 | case TRACE_GRAPH_RET: |
20 | case TRACE_KSYM: | ||
21 | return 1; | 19 | return 1; |
22 | } | 20 | } |
23 | return 0; | 21 | return 0; |
@@ -691,38 +689,6 @@ trace_selftest_startup_sched_switch(struct tracer *trace, struct trace_array *tr | |||
691 | } | 689 | } |
692 | #endif /* CONFIG_CONTEXT_SWITCH_TRACER */ | 690 | #endif /* CONFIG_CONTEXT_SWITCH_TRACER */ |
693 | 691 | ||
694 | #ifdef CONFIG_SYSPROF_TRACER | ||
695 | int | ||
696 | trace_selftest_startup_sysprof(struct tracer *trace, struct trace_array *tr) | ||
697 | { | ||
698 | unsigned long count; | ||
699 | int ret; | ||
700 | |||
701 | /* start the tracing */ | ||
702 | ret = tracer_init(trace, tr); | ||
703 | if (ret) { | ||
704 | warn_failed_init_tracer(trace, ret); | ||
705 | return ret; | ||
706 | } | ||
707 | |||
708 | /* Sleep for a 1/10 of a second */ | ||
709 | msleep(100); | ||
710 | /* stop the tracing. */ | ||
711 | tracing_stop(); | ||
712 | /* check the trace buffer */ | ||
713 | ret = trace_test_buffer(tr, &count); | ||
714 | trace->reset(tr); | ||
715 | tracing_start(); | ||
716 | |||
717 | if (!ret && !count) { | ||
718 | printk(KERN_CONT ".. no entries found .."); | ||
719 | ret = -1; | ||
720 | } | ||
721 | |||
722 | return ret; | ||
723 | } | ||
724 | #endif /* CONFIG_SYSPROF_TRACER */ | ||
725 | |||
726 | #ifdef CONFIG_BRANCH_TRACER | 692 | #ifdef CONFIG_BRANCH_TRACER |
727 | int | 693 | int |
728 | trace_selftest_startup_branch(struct tracer *trace, struct trace_array *tr) | 694 | trace_selftest_startup_branch(struct tracer *trace, struct trace_array *tr) |
@@ -755,56 +721,3 @@ trace_selftest_startup_branch(struct tracer *trace, struct trace_array *tr) | |||
755 | } | 721 | } |
756 | #endif /* CONFIG_BRANCH_TRACER */ | 722 | #endif /* CONFIG_BRANCH_TRACER */ |
757 | 723 | ||
758 | #ifdef CONFIG_KSYM_TRACER | ||
759 | static int ksym_selftest_dummy; | ||
760 | |||
761 | int | ||
762 | trace_selftest_startup_ksym(struct tracer *trace, struct trace_array *tr) | ||
763 | { | ||
764 | unsigned long count; | ||
765 | int ret; | ||
766 | |||
767 | /* start the tracing */ | ||
768 | ret = tracer_init(trace, tr); | ||
769 | if (ret) { | ||
770 | warn_failed_init_tracer(trace, ret); | ||
771 | return ret; | ||
772 | } | ||
773 | |||
774 | ksym_selftest_dummy = 0; | ||
775 | /* Register the read-write tracing request */ | ||
776 | |||
777 | ret = process_new_ksym_entry("ksym_selftest_dummy", | ||
778 | HW_BREAKPOINT_R | HW_BREAKPOINT_W, | ||
779 | (unsigned long)(&ksym_selftest_dummy)); | ||
780 | |||
781 | if (ret < 0) { | ||
782 | printk(KERN_CONT "ksym_trace read-write startup test failed\n"); | ||
783 | goto ret_path; | ||
784 | } | ||
785 | /* Perform a read and a write operation over the dummy variable to | ||
786 | * trigger the tracer | ||
787 | */ | ||
788 | if (ksym_selftest_dummy == 0) | ||
789 | ksym_selftest_dummy++; | ||
790 | |||
791 | /* stop the tracing. */ | ||
792 | tracing_stop(); | ||
793 | /* check the trace buffer */ | ||
794 | ret = trace_test_buffer(tr, &count); | ||
795 | trace->reset(tr); | ||
796 | tracing_start(); | ||
797 | |||
798 | /* read & write operations - one each is performed on the dummy variable | ||
799 | * triggering two entries in the trace buffer | ||
800 | */ | ||
801 | if (!ret && count != 2) { | ||
802 | printk(KERN_CONT "Ksym tracer startup test failed"); | ||
803 | ret = -1; | ||
804 | } | ||
805 | |||
806 | ret_path: | ||
807 | return ret; | ||
808 | } | ||
809 | #endif /* CONFIG_KSYM_TRACER */ | ||
810 | |||
diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c index f4bc9b27de5f..4c5dead0c239 100644 --- a/kernel/trace/trace_stack.c +++ b/kernel/trace/trace_stack.c | |||
@@ -110,12 +110,12 @@ static inline void check_stack(void) | |||
110 | static void | 110 | static void |
111 | stack_trace_call(unsigned long ip, unsigned long parent_ip) | 111 | stack_trace_call(unsigned long ip, unsigned long parent_ip) |
112 | { | 112 | { |
113 | int cpu, resched; | 113 | int cpu; |
114 | 114 | ||
115 | if (unlikely(!ftrace_enabled || stack_trace_disabled)) | 115 | if (unlikely(!ftrace_enabled || stack_trace_disabled)) |
116 | return; | 116 | return; |
117 | 117 | ||
118 | resched = ftrace_preempt_disable(); | 118 | preempt_disable_notrace(); |
119 | 119 | ||
120 | cpu = raw_smp_processor_id(); | 120 | cpu = raw_smp_processor_id(); |
121 | /* no atomic needed, we only modify this variable by this cpu */ | 121 | /* no atomic needed, we only modify this variable by this cpu */ |
@@ -127,7 +127,7 @@ stack_trace_call(unsigned long ip, unsigned long parent_ip) | |||
127 | out: | 127 | out: |
128 | per_cpu(trace_active, cpu)--; | 128 | per_cpu(trace_active, cpu)--; |
129 | /* prevent recursion in schedule */ | 129 | /* prevent recursion in schedule */ |
130 | ftrace_preempt_enable(resched); | 130 | preempt_enable_notrace(); |
131 | } | 131 | } |
132 | 132 | ||
133 | static struct ftrace_ops trace_ops __read_mostly = | 133 | static struct ftrace_ops trace_ops __read_mostly = |
@@ -195,6 +195,7 @@ static const struct file_operations stack_max_size_fops = { | |||
195 | .open = tracing_open_generic, | 195 | .open = tracing_open_generic, |
196 | .read = stack_max_size_read, | 196 | .read = stack_max_size_read, |
197 | .write = stack_max_size_write, | 197 | .write = stack_max_size_write, |
198 | .llseek = default_llseek, | ||
198 | }; | 199 | }; |
199 | 200 | ||
200 | static void * | 201 | static void * |
@@ -249,7 +250,7 @@ static int trace_lookup_stack(struct seq_file *m, long i) | |||
249 | { | 250 | { |
250 | unsigned long addr = stack_dump_trace[i]; | 251 | unsigned long addr = stack_dump_trace[i]; |
251 | 252 | ||
252 | return seq_printf(m, "%pF\n", (void *)addr); | 253 | return seq_printf(m, "%pS\n", (void *)addr); |
253 | } | 254 | } |
254 | 255 | ||
255 | static void print_disabled(struct seq_file *m) | 256 | static void print_disabled(struct seq_file *m) |
diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c index 34e35804304b..bac752f0cfb5 100644 --- a/kernel/trace/trace_syscalls.c +++ b/kernel/trace/trace_syscalls.c | |||
@@ -23,6 +23,9 @@ static int syscall_exit_register(struct ftrace_event_call *event, | |||
23 | static int syscall_enter_define_fields(struct ftrace_event_call *call); | 23 | static int syscall_enter_define_fields(struct ftrace_event_call *call); |
24 | static int syscall_exit_define_fields(struct ftrace_event_call *call); | 24 | static int syscall_exit_define_fields(struct ftrace_event_call *call); |
25 | 25 | ||
26 | /* All syscall exit events have the same fields */ | ||
27 | static LIST_HEAD(syscall_exit_fields); | ||
28 | |||
26 | static struct list_head * | 29 | static struct list_head * |
27 | syscall_get_enter_fields(struct ftrace_event_call *call) | 30 | syscall_get_enter_fields(struct ftrace_event_call *call) |
28 | { | 31 | { |
@@ -34,9 +37,7 @@ syscall_get_enter_fields(struct ftrace_event_call *call) | |||
34 | static struct list_head * | 37 | static struct list_head * |
35 | syscall_get_exit_fields(struct ftrace_event_call *call) | 38 | syscall_get_exit_fields(struct ftrace_event_call *call) |
36 | { | 39 | { |
37 | struct syscall_metadata *entry = call->data; | 40 | return &syscall_exit_fields; |
38 | |||
39 | return &entry->exit_fields; | ||
40 | } | 41 | } |
41 | 42 | ||
42 | struct trace_event_functions enter_syscall_print_funcs = { | 43 | struct trace_event_functions enter_syscall_print_funcs = { |
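
The trace_syscalls.c hunk above relies on the fact that every syscall-exit event records the same fields (the syscall number and its return value), so one shared list can serve all of them instead of a per-event exit_fields list. A rough stand-alone sketch of that sharing, with made-up structure names standing in for the ftrace field descriptors:

#include <stddef.h>
#include <stdio.h>

/* hypothetical stand-ins for the ftrace field descriptors */
struct field_desc { const char *name; const char *type; };

/* one shared table, analogous to syscall_exit_fields */
static const struct field_desc syscall_exit_fields[] = {
        { "nr",  "int"  },
        { "ret", "long" },
};

struct event {
        const char *name;
        const struct field_desc *fields;
        size_t nfields;
};

int main(void)
{
        /* every exit event points at the same descriptor table */
        struct event exit_open  = { "sys_exit_open",  syscall_exit_fields, 2 };
        struct event exit_close = { "sys_exit_close", syscall_exit_fields, 2 };

        printf("%s and %s share %p\n", exit_open.name, exit_close.name,
               (const void *)exit_open.fields);
        return 0;
}
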
diff --git a/kernel/trace/trace_sysprof.c b/kernel/trace/trace_sysprof.c deleted file mode 100644 index a7974a552ca9..000000000000 --- a/kernel/trace/trace_sysprof.c +++ /dev/null | |||
@@ -1,329 +0,0 @@ | |||
1 | /* | ||
2 | * trace stack traces | ||
3 | * | ||
4 | * Copyright (C) 2004-2008, Soeren Sandmann | ||
5 | * Copyright (C) 2007 Steven Rostedt <srostedt@redhat.com> | ||
6 | * Copyright (C) 2008 Ingo Molnar <mingo@redhat.com> | ||
7 | */ | ||
8 | #include <linux/kallsyms.h> | ||
9 | #include <linux/debugfs.h> | ||
10 | #include <linux/hrtimer.h> | ||
11 | #include <linux/uaccess.h> | ||
12 | #include <linux/ftrace.h> | ||
13 | #include <linux/module.h> | ||
14 | #include <linux/irq.h> | ||
15 | #include <linux/fs.h> | ||
16 | |||
17 | #include <asm/stacktrace.h> | ||
18 | |||
19 | #include "trace.h" | ||
20 | |||
21 | static struct trace_array *sysprof_trace; | ||
22 | static int __read_mostly tracer_enabled; | ||
23 | |||
24 | /* | ||
25 | * 1 msec sample interval by default: | ||
26 | */ | ||
27 | static unsigned long sample_period = 1000000; | ||
28 | static const unsigned int sample_max_depth = 512; | ||
29 | |||
30 | static DEFINE_MUTEX(sample_timer_lock); | ||
31 | /* | ||
32 | * Per CPU hrtimers that do the profiling: | ||
33 | */ | ||
34 | static DEFINE_PER_CPU(struct hrtimer, stack_trace_hrtimer); | ||
35 | |||
36 | struct stack_frame { | ||
37 | const void __user *next_fp; | ||
38 | unsigned long return_address; | ||
39 | }; | ||
40 | |||
41 | static int copy_stack_frame(const void __user *fp, struct stack_frame *frame) | ||
42 | { | ||
43 | int ret; | ||
44 | |||
45 | if (!access_ok(VERIFY_READ, fp, sizeof(*frame))) | ||
46 | return 0; | ||
47 | |||
48 | ret = 1; | ||
49 | pagefault_disable(); | ||
50 | if (__copy_from_user_inatomic(frame, fp, sizeof(*frame))) | ||
51 | ret = 0; | ||
52 | pagefault_enable(); | ||
53 | |||
54 | return ret; | ||
55 | } | ||
56 | |||
57 | struct backtrace_info { | ||
58 | struct trace_array_cpu *data; | ||
59 | struct trace_array *tr; | ||
60 | int pos; | ||
61 | }; | ||
62 | |||
63 | static void | ||
64 | backtrace_warning_symbol(void *data, char *msg, unsigned long symbol) | ||
65 | { | ||
66 | /* Ignore warnings */ | ||
67 | } | ||
68 | |||
69 | static void backtrace_warning(void *data, char *msg) | ||
70 | { | ||
71 | /* Ignore warnings */ | ||
72 | } | ||
73 | |||
74 | static int backtrace_stack(void *data, char *name) | ||
75 | { | ||
76 | /* Don't bother with IRQ stacks for now */ | ||
77 | return -1; | ||
78 | } | ||
79 | |||
80 | static void backtrace_address(void *data, unsigned long addr, int reliable) | ||
81 | { | ||
82 | struct backtrace_info *info = data; | ||
83 | |||
84 | if (info->pos < sample_max_depth && reliable) { | ||
85 | __trace_special(info->tr, info->data, 1, addr, 0); | ||
86 | |||
87 | info->pos++; | ||
88 | } | ||
89 | } | ||
90 | |||
91 | static const struct stacktrace_ops backtrace_ops = { | ||
92 | .warning = backtrace_warning, | ||
93 | .warning_symbol = backtrace_warning_symbol, | ||
94 | .stack = backtrace_stack, | ||
95 | .address = backtrace_address, | ||
96 | .walk_stack = print_context_stack, | ||
97 | }; | ||
98 | |||
99 | static int | ||
100 | trace_kernel(struct pt_regs *regs, struct trace_array *tr, | ||
101 | struct trace_array_cpu *data) | ||
102 | { | ||
103 | struct backtrace_info info; | ||
104 | unsigned long bp; | ||
105 | char *stack; | ||
106 | |||
107 | info.tr = tr; | ||
108 | info.data = data; | ||
109 | info.pos = 1; | ||
110 | |||
111 | __trace_special(info.tr, info.data, 1, regs->ip, 0); | ||
112 | |||
113 | stack = ((char *)regs + sizeof(struct pt_regs)); | ||
114 | #ifdef CONFIG_FRAME_POINTER | ||
115 | bp = regs->bp; | ||
116 | #else | ||
117 | bp = 0; | ||
118 | #endif | ||
119 | |||
120 | dump_trace(NULL, regs, (void *)stack, bp, &backtrace_ops, &info); | ||
121 | |||
122 | return info.pos; | ||
123 | } | ||
124 | |||
125 | static void timer_notify(struct pt_regs *regs, int cpu) | ||
126 | { | ||
127 | struct trace_array_cpu *data; | ||
128 | struct stack_frame frame; | ||
129 | struct trace_array *tr; | ||
130 | const void __user *fp; | ||
131 | int is_user; | ||
132 | int i; | ||
133 | |||
134 | if (!regs) | ||
135 | return; | ||
136 | |||
137 | tr = sysprof_trace; | ||
138 | data = tr->data[cpu]; | ||
139 | is_user = user_mode(regs); | ||
140 | |||
141 | if (!current || current->pid == 0) | ||
142 | return; | ||
143 | |||
144 | if (is_user && current->state != TASK_RUNNING) | ||
145 | return; | ||
146 | |||
147 | __trace_special(tr, data, 0, 0, current->pid); | ||
148 | |||
149 | if (!is_user) | ||
150 | i = trace_kernel(regs, tr, data); | ||
151 | else | ||
152 | i = 0; | ||
153 | |||
154 | /* | ||
155 | * Trace user stack if we are not a kernel thread | ||
156 | */ | ||
157 | if (current->mm && i < sample_max_depth) { | ||
158 | regs = (struct pt_regs *)current->thread.sp0 - 1; | ||
159 | |||
160 | fp = (void __user *)regs->bp; | ||
161 | |||
162 | __trace_special(tr, data, 2, regs->ip, 0); | ||
163 | |||
164 | while (i < sample_max_depth) { | ||
165 | frame.next_fp = NULL; | ||
166 | frame.return_address = 0; | ||
167 | if (!copy_stack_frame(fp, &frame)) | ||
168 | break; | ||
169 | if ((unsigned long)fp < regs->sp) | ||
170 | break; | ||
171 | |||
172 | __trace_special(tr, data, 2, frame.return_address, | ||
173 | (unsigned long)fp); | ||
174 | fp = frame.next_fp; | ||
175 | |||
176 | i++; | ||
177 | } | ||
178 | |||
179 | } | ||
180 | |||
181 | /* | ||
182 | * Special trace entry if we overflow the max depth: | ||
183 | */ | ||
184 | if (i == sample_max_depth) | ||
185 | __trace_special(tr, data, -1, -1, -1); | ||
186 | |||
187 | __trace_special(tr, data, 3, current->pid, i); | ||
188 | } | ||
189 | |||
190 | static enum hrtimer_restart stack_trace_timer_fn(struct hrtimer *hrtimer) | ||
191 | { | ||
192 | /* trace here */ | ||
193 | timer_notify(get_irq_regs(), smp_processor_id()); | ||
194 | |||
195 | hrtimer_forward_now(hrtimer, ns_to_ktime(sample_period)); | ||
196 | |||
197 | return HRTIMER_RESTART; | ||
198 | } | ||
199 | |||
200 | static void start_stack_timer(void *unused) | ||
201 | { | ||
202 | struct hrtimer *hrtimer = &__get_cpu_var(stack_trace_hrtimer); | ||
203 | |||
204 | hrtimer_init(hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); | ||
205 | hrtimer->function = stack_trace_timer_fn; | ||
206 | |||
207 | hrtimer_start(hrtimer, ns_to_ktime(sample_period), | ||
208 | HRTIMER_MODE_REL_PINNED); | ||
209 | } | ||
210 | |||
211 | static void start_stack_timers(void) | ||
212 | { | ||
213 | on_each_cpu(start_stack_timer, NULL, 1); | ||
214 | } | ||
215 | |||
216 | static void stop_stack_timer(int cpu) | ||
217 | { | ||
218 | struct hrtimer *hrtimer = &per_cpu(stack_trace_hrtimer, cpu); | ||
219 | |||
220 | hrtimer_cancel(hrtimer); | ||
221 | } | ||
222 | |||
223 | static void stop_stack_timers(void) | ||
224 | { | ||
225 | int cpu; | ||
226 | |||
227 | for_each_online_cpu(cpu) | ||
228 | stop_stack_timer(cpu); | ||
229 | } | ||
230 | |||
231 | static void stop_stack_trace(struct trace_array *tr) | ||
232 | { | ||
233 | mutex_lock(&sample_timer_lock); | ||
234 | stop_stack_timers(); | ||
235 | tracer_enabled = 0; | ||
236 | mutex_unlock(&sample_timer_lock); | ||
237 | } | ||
238 | |||
239 | static int stack_trace_init(struct trace_array *tr) | ||
240 | { | ||
241 | sysprof_trace = tr; | ||
242 | |||
243 | tracing_start_cmdline_record(); | ||
244 | |||
245 | mutex_lock(&sample_timer_lock); | ||
246 | start_stack_timers(); | ||
247 | tracer_enabled = 1; | ||
248 | mutex_unlock(&sample_timer_lock); | ||
249 | return 0; | ||
250 | } | ||
251 | |||
252 | static void stack_trace_reset(struct trace_array *tr) | ||
253 | { | ||
254 | tracing_stop_cmdline_record(); | ||
255 | stop_stack_trace(tr); | ||
256 | } | ||
257 | |||
258 | static struct tracer stack_trace __read_mostly = | ||
259 | { | ||
260 | .name = "sysprof", | ||
261 | .init = stack_trace_init, | ||
262 | .reset = stack_trace_reset, | ||
263 | #ifdef CONFIG_FTRACE_SELFTEST | ||
264 | .selftest = trace_selftest_startup_sysprof, | ||
265 | #endif | ||
266 | }; | ||
267 | |||
268 | __init static int init_stack_trace(void) | ||
269 | { | ||
270 | return register_tracer(&stack_trace); | ||
271 | } | ||
272 | device_initcall(init_stack_trace); | ||
273 | |||
274 | #define MAX_LONG_DIGITS 22 | ||
275 | |||
276 | static ssize_t | ||
277 | sysprof_sample_read(struct file *filp, char __user *ubuf, | ||
278 | size_t cnt, loff_t *ppos) | ||
279 | { | ||
280 | char buf[MAX_LONG_DIGITS]; | ||
281 | int r; | ||
282 | |||
283 | r = sprintf(buf, "%ld\n", nsecs_to_usecs(sample_period)); | ||
284 | |||
285 | return simple_read_from_buffer(ubuf, cnt, ppos, buf, r); | ||
286 | } | ||
287 | |||
288 | static ssize_t | ||
289 | sysprof_sample_write(struct file *filp, const char __user *ubuf, | ||
290 | size_t cnt, loff_t *ppos) | ||
291 | { | ||
292 | char buf[MAX_LONG_DIGITS]; | ||
293 | unsigned long val; | ||
294 | |||
295 | if (cnt > MAX_LONG_DIGITS-1) | ||
296 | cnt = MAX_LONG_DIGITS-1; | ||
297 | |||
298 | if (copy_from_user(&buf, ubuf, cnt)) | ||
299 | return -EFAULT; | ||
300 | |||
301 | buf[cnt] = 0; | ||
302 | |||
303 | val = simple_strtoul(buf, NULL, 10); | ||
304 | /* | ||
305 | * Enforce a minimum sample period of 100 usecs: | ||
306 | */ | ||
307 | if (val < 100) | ||
308 | val = 100; | ||
309 | |||
310 | mutex_lock(&sample_timer_lock); | ||
311 | stop_stack_timers(); | ||
312 | sample_period = val * 1000; | ||
313 | start_stack_timers(); | ||
314 | mutex_unlock(&sample_timer_lock); | ||
315 | |||
316 | return cnt; | ||
317 | } | ||
318 | |||
319 | static const struct file_operations sysprof_sample_fops = { | ||
320 | .read = sysprof_sample_read, | ||
321 | .write = sysprof_sample_write, | ||
322 | }; | ||
323 | |||
324 | void init_tracer_sysprof_debugfs(struct dentry *d_tracer) | ||
325 | { | ||
326 | |||
327 | trace_create_file("sysprof_sample_period", 0644, | ||
328 | d_tracer, NULL, &sysprof_sample_fops); | ||
329 | } | ||
diff --git a/kernel/trace/trace_workqueue.c b/kernel/trace/trace_workqueue.c index a7cc3793baf6..209b379a4721 100644 --- a/kernel/trace/trace_workqueue.c +++ b/kernel/trace/trace_workqueue.c | |||
@@ -263,6 +263,11 @@ int __init trace_workqueue_early_init(void) | |||
263 | { | 263 | { |
264 | int ret, cpu; | 264 | int ret, cpu; |
265 | 265 | ||
266 | for_each_possible_cpu(cpu) { | ||
267 | spin_lock_init(&workqueue_cpu_stat(cpu)->lock); | ||
268 | INIT_LIST_HEAD(&workqueue_cpu_stat(cpu)->list); | ||
269 | } | ||
270 | |||
266 | ret = register_trace_workqueue_insertion(probe_workqueue_insertion, NULL); | 271 | ret = register_trace_workqueue_insertion(probe_workqueue_insertion, NULL); |
267 | if (ret) | 272 | if (ret) |
268 | goto out; | 273 | goto out; |
@@ -279,11 +284,6 @@ int __init trace_workqueue_early_init(void) | |||
279 | if (ret) | 284 | if (ret) |
280 | goto no_creation; | 285 | goto no_creation; |
281 | 286 | ||
282 | for_each_possible_cpu(cpu) { | ||
283 | spin_lock_init(&workqueue_cpu_stat(cpu)->lock); | ||
284 | INIT_LIST_HEAD(&workqueue_cpu_stat(cpu)->list); | ||
285 | } | ||
286 | |||
287 | return 0; | 287 | return 0; |
288 | 288 | ||
289 | no_creation: | 289 | no_creation: |
diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c index c77f3eceea25..e95ee7f31d43 100644 --- a/kernel/tracepoint.c +++ b/kernel/tracepoint.c | |||
@@ -25,6 +25,7 @@ | |||
25 | #include <linux/err.h> | 25 | #include <linux/err.h> |
26 | #include <linux/slab.h> | 26 | #include <linux/slab.h> |
27 | #include <linux/sched.h> | 27 | #include <linux/sched.h> |
28 | #include <linux/jump_label.h> | ||
28 | 29 | ||
29 | extern struct tracepoint __start___tracepoints[]; | 30 | extern struct tracepoint __start___tracepoints[]; |
30 | extern struct tracepoint __stop___tracepoints[]; | 31 | extern struct tracepoint __stop___tracepoints[]; |
@@ -263,7 +264,13 @@ static void set_tracepoint(struct tracepoint_entry **entry, | |||
263 | * is used. | 264 | * is used. |
264 | */ | 265 | */ |
265 | rcu_assign_pointer(elem->funcs, (*entry)->funcs); | 266 | rcu_assign_pointer(elem->funcs, (*entry)->funcs); |
266 | elem->state = active; | 267 | if (!elem->state && active) { |
268 | jump_label_enable(&elem->state); | ||
269 | elem->state = active; | ||
270 | } else if (elem->state && !active) { | ||
271 | jump_label_disable(&elem->state); | ||
272 | elem->state = active; | ||
273 | } | ||
267 | } | 274 | } |
268 | 275 | ||
269 | /* | 276 | /* |
@@ -277,7 +284,10 @@ static void disable_tracepoint(struct tracepoint *elem) | |||
277 | if (elem->unregfunc && elem->state) | 284 | if (elem->unregfunc && elem->state) |
278 | elem->unregfunc(); | 285 | elem->unregfunc(); |
279 | 286 | ||
280 | elem->state = 0; | 287 | if (elem->state) { |
288 | jump_label_disable(&elem->state); | ||
289 | elem->state = 0; | ||
290 | } | ||
281 | rcu_assign_pointer(elem->funcs, NULL); | 291 | rcu_assign_pointer(elem->funcs, NULL); |
282 | } | 292 | } |
283 | 293 | ||
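
The tracepoint.c changes above pair the existing elem->state flag with jump_label_enable()/jump_label_disable(), and are careful to touch the jump label only on an actual 0->1 or 1->0 transition, since the enable/disable calls are expected to stay balanced. A small stand-alone model of that transition-only toggling; set_enabled(), patch_site_on() and patch_site_off() are illustrative placeholders for the jump-label calls, not kernel APIs.

#include <stdbool.h>
#include <stdio.h>

static bool state;                      /* analogous to elem->state */

static void patch_site_on(void)  { puts("jump_label_enable()");  }
static void patch_site_off(void) { puts("jump_label_disable()"); }

static void set_enabled(bool active)
{
        /* only touch the patch site when the state actually changes */
        if (!state && active)
                patch_site_on();
        else if (state && !active)
                patch_site_off();
        state = active;
}

int main(void)
{
        set_enabled(true);      /* patches the call site */
        set_enabled(true);      /* no-op: already enabled */
        set_enabled(false);     /* unpatches the call site */
        return 0;
}
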
diff --git a/kernel/tsacct.c b/kernel/tsacct.c index 0a67e041edf8..24dc60d9fa1f 100644 --- a/kernel/tsacct.c +++ b/kernel/tsacct.c | |||
@@ -63,12 +63,10 @@ void bacct_add_tsk(struct taskstats *stats, struct task_struct *tsk) | |||
63 | stats->ac_ppid = pid_alive(tsk) ? | 63 | stats->ac_ppid = pid_alive(tsk) ? |
64 | rcu_dereference(tsk->real_parent)->tgid : 0; | 64 | rcu_dereference(tsk->real_parent)->tgid : 0; |
65 | rcu_read_unlock(); | 65 | rcu_read_unlock(); |
66 | stats->ac_utime = cputime_to_msecs(tsk->utime) * USEC_PER_MSEC; | 66 | stats->ac_utime = cputime_to_usecs(tsk->utime); |
67 | stats->ac_stime = cputime_to_msecs(tsk->stime) * USEC_PER_MSEC; | 67 | stats->ac_stime = cputime_to_usecs(tsk->stime); |
68 | stats->ac_utimescaled = | 68 | stats->ac_utimescaled = cputime_to_usecs(tsk->utimescaled); |
69 | cputime_to_msecs(tsk->utimescaled) * USEC_PER_MSEC; | 69 | stats->ac_stimescaled = cputime_to_usecs(tsk->stimescaled); |
70 | stats->ac_stimescaled = | ||
71 | cputime_to_msecs(tsk->stimescaled) * USEC_PER_MSEC; | ||
72 | stats->ac_minflt = tsk->min_flt; | 70 | stats->ac_minflt = tsk->min_flt; |
73 | stats->ac_majflt = tsk->maj_flt; | 71 | stats->ac_majflt = tsk->maj_flt; |
74 | 72 | ||
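
The tsacct.c hunk replaces the round trip through milliseconds with a direct microsecond conversion: going via cputime_to_msecs() first truncates everything below one millisecond and then multiplies the truncated value back up. Assuming, purely for illustration, a cputime unit of one microsecond, the precision difference looks like this:

#include <stdio.h>

#define USEC_PER_MSEC 1000ULL

int main(void)
{
        unsigned long long cputime_us = 1234567;    /* 1.234567 s of CPU time */

        /* old style: microseconds derived from a truncated millisecond value */
        unsigned long long via_msec = (cputime_us / USEC_PER_MSEC) * USEC_PER_MSEC;

        /* new style: convert straight to microseconds */
        unsigned long long direct = cputime_us;

        printf("via msec: %llu us, direct: %llu us, lost: %llu us\n",
               via_msec, direct, direct - via_msec);    /* lost: 567 us */
        return 0;
}
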
diff --git a/kernel/user.c b/kernel/user.c index 7e72614b736d..2c7d8d5914b1 100644 --- a/kernel/user.c +++ b/kernel/user.c | |||
@@ -91,6 +91,7 @@ static struct user_struct *uid_hash_find(uid_t uid, struct hlist_head *hashent) | |||
91 | * upon function exit. | 91 | * upon function exit. |
92 | */ | 92 | */ |
93 | static void free_user(struct user_struct *up, unsigned long flags) | 93 | static void free_user(struct user_struct *up, unsigned long flags) |
94 | __releases(&uidhash_lock) | ||
94 | { | 95 | { |
95 | uid_hash_remove(up); | 96 | uid_hash_remove(up); |
96 | spin_unlock_irqrestore(&uidhash_lock, flags); | 97 | spin_unlock_irqrestore(&uidhash_lock, flags); |
diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c index b2d70d38dff4..25915832291a 100644 --- a/kernel/user_namespace.c +++ b/kernel/user_namespace.c | |||
@@ -9,6 +9,7 @@ | |||
9 | #include <linux/nsproxy.h> | 9 | #include <linux/nsproxy.h> |
10 | #include <linux/slab.h> | 10 | #include <linux/slab.h> |
11 | #include <linux/user_namespace.h> | 11 | #include <linux/user_namespace.h> |
12 | #include <linux/highuid.h> | ||
12 | #include <linux/cred.h> | 13 | #include <linux/cred.h> |
13 | 14 | ||
14 | /* | 15 | /* |
@@ -82,3 +83,46 @@ void free_user_ns(struct kref *kref) | |||
82 | schedule_work(&ns->destroyer); | 83 | schedule_work(&ns->destroyer); |
83 | } | 84 | } |
84 | EXPORT_SYMBOL(free_user_ns); | 85 | EXPORT_SYMBOL(free_user_ns); |
86 | |||
87 | uid_t user_ns_map_uid(struct user_namespace *to, const struct cred *cred, uid_t uid) | ||
88 | { | ||
89 | struct user_namespace *tmp; | ||
90 | |||
91 | if (likely(to == cred->user->user_ns)) | ||
92 | return uid; | ||
93 | |||
94 | |||
95 | /* Is cred->user the creator of the target user_ns | ||
96 | * or the creator of one of it's parents? | ||
97 | */ | ||
98 | for ( tmp = to; tmp != &init_user_ns; | ||
99 | tmp = tmp->creator->user_ns ) { | ||
100 | if (cred->user == tmp->creator) { | ||
101 | return (uid_t)0; | ||
102 | } | ||
103 | } | ||
104 | |||
105 | /* No useful relationship so no mapping */ | ||
106 | return overflowuid; | ||
107 | } | ||
108 | |||
109 | gid_t user_ns_map_gid(struct user_namespace *to, const struct cred *cred, gid_t gid) | ||
110 | { | ||
111 | struct user_namespace *tmp; | ||
112 | |||
113 | if (likely(to == cred->user->user_ns)) | ||
114 | return gid; | ||
115 | |||
116 | /* Is cred->user the creator of the target user_ns | ||
117 | * or the creator of one of its parents? | ||
118 | */ | ||
119 | for ( tmp = to; tmp != &init_user_ns; | ||
120 | tmp = tmp->creator->user_ns ) { | ||
121 | if (cred->user == tmp->creator) { | ||
122 | return (gid_t)0; | ||
123 | } | ||
124 | } | ||
125 | |||
126 | /* No useful relationship so no mapping */ | ||
127 | return overflowgid; | ||
128 | } | ||
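
user_ns_map_uid()/user_ns_map_gid() added above answer one narrow question: a uid shown inside its own namespace is returned unchanged, it maps to 0 when the viewer is the creator of the target namespace (or of one of its ancestors), and otherwise it falls back to the overflow id. Below is a simplified stand-alone model of that walk; struct user_ns, map_uid() and the creator_uid field are toy stand-ins, not the kernel's cred/user_struct plumbing.

#include <stdio.h>

#define OVERFLOWUID 65534u

/* toy model: each namespace remembers who created it and where that user lives */
struct user_ns {
        struct user_ns *parent;         /* creator's namespace; NULL for the initial ns */
        unsigned int creator_uid;       /* uid of the creating user, in the parent ns */
};

static unsigned int map_uid(const struct user_ns *to, const struct user_ns *viewer_ns,
                            unsigned int viewer_uid, unsigned int uid)
{
        const struct user_ns *tmp;

        if (to == viewer_ns)                    /* same namespace: no mapping needed */
                return uid;

        /* is the viewer the creator of the target ns or of one of its ancestors? */
        for (tmp = to; tmp->parent != NULL; tmp = tmp->parent)
                if (tmp->parent == viewer_ns && tmp->creator_uid == viewer_uid)
                        return 0;

        return OVERFLOWUID;                     /* no useful relationship */
}

int main(void)
{
        struct user_ns init_ns  = { NULL, 0 };
        struct user_ns child_ns = { &init_ns, 1000 };

        printf("%u\n", map_uid(&child_ns, &init_ns, 1000, 42)); /* 0: creator's view */
        printf("%u\n", map_uid(&child_ns, &init_ns, 2000, 42)); /* 65534: unrelated */
        return 0;
}
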
diff --git a/kernel/wait.c b/kernel/wait.c index c4bd3d825f35..b0310eb6cc1e 100644 --- a/kernel/wait.c +++ b/kernel/wait.c | |||
@@ -92,7 +92,7 @@ prepare_to_wait_exclusive(wait_queue_head_t *q, wait_queue_t *wait, int state) | |||
92 | } | 92 | } |
93 | EXPORT_SYMBOL(prepare_to_wait_exclusive); | 93 | EXPORT_SYMBOL(prepare_to_wait_exclusive); |
94 | 94 | ||
95 | /* | 95 | /** |
96 | * finish_wait - clean up after waiting in a queue | 96 | * finish_wait - clean up after waiting in a queue |
97 | * @q: waitqueue waited on | 97 | * @q: waitqueue waited on |
98 | * @wait: wait descriptor | 98 | * @wait: wait descriptor |
@@ -127,11 +127,11 @@ void finish_wait(wait_queue_head_t *q, wait_queue_t *wait) | |||
127 | } | 127 | } |
128 | EXPORT_SYMBOL(finish_wait); | 128 | EXPORT_SYMBOL(finish_wait); |
129 | 129 | ||
130 | /* | 130 | /** |
131 | * abort_exclusive_wait - abort exclusive waiting in a queue | 131 | * abort_exclusive_wait - abort exclusive waiting in a queue |
132 | * @q: waitqueue waited on | 132 | * @q: waitqueue waited on |
133 | * @wait: wait descriptor | 133 | * @wait: wait descriptor |
134 | * @state: runstate of the waiter to be woken | 134 | * @mode: runstate of the waiter to be woken |
135 | * @key: key to identify a wait bit queue or %NULL | 135 | * @key: key to identify a wait bit queue or %NULL |
136 | * | 136 | * |
137 | * Sets current thread back to running state and removes | 137 | * Sets current thread back to running state and removes |
diff --git a/kernel/watchdog.c b/kernel/watchdog.c new file mode 100644 index 000000000000..bafba687a6d8 --- /dev/null +++ b/kernel/watchdog.c | |||
@@ -0,0 +1,566 @@ | |||
1 | /* | ||
2 | * Detect hard and soft lockups on a system | ||
3 | * | ||
4 | * started by Don Zickus, Copyright (C) 2010 Red Hat, Inc. | ||
5 | * | ||
6 | * this code detects hard lockups: incidents where, on a CPU, | ||
7 | * the kernel does not respond to anything except NMI. | ||
8 | * | ||
9 | * Note: Most of this code is borrowed heavily from softlockup.c, | ||
10 | * so thanks to Ingo for the initial implementation. | ||
11 | * Some chunks also taken from arch/x86/kernel/apic/nmi.c, thanks | ||
12 | * to those contributors as well. | ||
13 | */ | ||
14 | |||
15 | #include <linux/mm.h> | ||
16 | #include <linux/cpu.h> | ||
17 | #include <linux/nmi.h> | ||
18 | #include <linux/init.h> | ||
19 | #include <linux/delay.h> | ||
20 | #include <linux/freezer.h> | ||
21 | #include <linux/kthread.h> | ||
22 | #include <linux/lockdep.h> | ||
23 | #include <linux/notifier.h> | ||
24 | #include <linux/module.h> | ||
25 | #include <linux/sysctl.h> | ||
26 | |||
27 | #include <asm/irq_regs.h> | ||
28 | #include <linux/perf_event.h> | ||
29 | |||
30 | int watchdog_enabled; | ||
31 | int __read_mostly softlockup_thresh = 60; | ||
32 | |||
33 | static DEFINE_PER_CPU(unsigned long, watchdog_touch_ts); | ||
34 | static DEFINE_PER_CPU(struct task_struct *, softlockup_watchdog); | ||
35 | static DEFINE_PER_CPU(struct hrtimer, watchdog_hrtimer); | ||
36 | static DEFINE_PER_CPU(bool, softlockup_touch_sync); | ||
37 | static DEFINE_PER_CPU(bool, soft_watchdog_warn); | ||
38 | #ifdef CONFIG_HARDLOCKUP_DETECTOR | ||
39 | static DEFINE_PER_CPU(bool, hard_watchdog_warn); | ||
40 | static DEFINE_PER_CPU(bool, watchdog_nmi_touch); | ||
41 | static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts); | ||
42 | static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts_saved); | ||
43 | static DEFINE_PER_CPU(struct perf_event *, watchdog_ev); | ||
44 | #endif | ||
45 | |||
46 | static int __initdata no_watchdog; | ||
47 | |||
48 | |||
49 | /* boot commands */ | ||
50 | /* | ||
51 | * Should we panic when a soft-lockup or hard-lockup occurs: | ||
52 | */ | ||
53 | #ifdef CONFIG_HARDLOCKUP_DETECTOR | ||
54 | static int hardlockup_panic; | ||
55 | |||
56 | static int __init hardlockup_panic_setup(char *str) | ||
57 | { | ||
58 | if (!strncmp(str, "panic", 5)) | ||
59 | hardlockup_panic = 1; | ||
60 | return 1; | ||
61 | } | ||
62 | __setup("nmi_watchdog=", hardlockup_panic_setup); | ||
63 | #endif | ||
64 | |||
65 | unsigned int __read_mostly softlockup_panic = | ||
66 | CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC_VALUE; | ||
67 | |||
68 | static int __init softlockup_panic_setup(char *str) | ||
69 | { | ||
70 | softlockup_panic = simple_strtoul(str, NULL, 0); | ||
71 | |||
72 | return 1; | ||
73 | } | ||
74 | __setup("softlockup_panic=", softlockup_panic_setup); | ||
75 | |||
76 | static int __init nowatchdog_setup(char *str) | ||
77 | { | ||
78 | no_watchdog = 1; | ||
79 | return 1; | ||
80 | } | ||
81 | __setup("nowatchdog", nowatchdog_setup); | ||
82 | |||
83 | /* deprecated */ | ||
84 | static int __init nosoftlockup_setup(char *str) | ||
85 | { | ||
86 | no_watchdog = 1; | ||
87 | return 1; | ||
88 | } | ||
89 | __setup("nosoftlockup", nosoftlockup_setup); | ||
90 | /* */ | ||
91 | |||
92 | |||
93 | /* | ||
94 | * Returns seconds, approximately. We don't need nanosecond | ||
95 | * resolution, and we don't need to waste time with a big divide when | ||
96 | * 2^30ns == 1.074s. | ||
97 | */ | ||
98 | static unsigned long get_timestamp(int this_cpu) | ||
99 | { | ||
100 | return cpu_clock(this_cpu) >> 30LL; /* 2^30 ~= 10^9 */ | ||
101 | } | ||
102 | |||
103 | static unsigned long get_sample_period(void) | ||
104 | { | ||
105 | /* | ||
106 | * convert softlockup_thresh from seconds to ns | ||
107 | * the divide by 5 is to give hrtimer 5 chances to | ||
108 | * increment before the hardlockup detector generates | ||
109 | * a warning | ||
110 | */ | ||
111 | return softlockup_thresh / 5 * NSEC_PER_SEC; | ||
112 | } | ||
113 | |||
114 | /* Commands for resetting the watchdog */ | ||
115 | static void __touch_watchdog(void) | ||
116 | { | ||
117 | int this_cpu = smp_processor_id(); | ||
118 | |||
119 | __get_cpu_var(watchdog_touch_ts) = get_timestamp(this_cpu); | ||
120 | } | ||
121 | |||
122 | void touch_softlockup_watchdog(void) | ||
123 | { | ||
124 | __raw_get_cpu_var(watchdog_touch_ts) = 0; | ||
125 | } | ||
126 | EXPORT_SYMBOL(touch_softlockup_watchdog); | ||
127 | |||
128 | void touch_all_softlockup_watchdogs(void) | ||
129 | { | ||
130 | int cpu; | ||
131 | |||
132 | /* | ||
133 | * this is done lockless | ||
134 | * do we care if a 0 races with a timestamp? | ||
135 | * all it means is the softlockup check starts one cycle later | ||
136 | */ | ||
137 | for_each_online_cpu(cpu) | ||
138 | per_cpu(watchdog_touch_ts, cpu) = 0; | ||
139 | } | ||
140 | |||
141 | #ifdef CONFIG_HARDLOCKUP_DETECTOR | ||
142 | void touch_nmi_watchdog(void) | ||
143 | { | ||
144 | if (watchdog_enabled) { | ||
145 | unsigned cpu; | ||
146 | |||
147 | for_each_present_cpu(cpu) { | ||
148 | if (per_cpu(watchdog_nmi_touch, cpu) != true) | ||
149 | per_cpu(watchdog_nmi_touch, cpu) = true; | ||
150 | } | ||
151 | } | ||
152 | touch_softlockup_watchdog(); | ||
153 | } | ||
154 | EXPORT_SYMBOL(touch_nmi_watchdog); | ||
155 | |||
156 | #endif | ||
157 | |||
158 | void touch_softlockup_watchdog_sync(void) | ||
159 | { | ||
160 | __raw_get_cpu_var(softlockup_touch_sync) = true; | ||
161 | __raw_get_cpu_var(watchdog_touch_ts) = 0; | ||
162 | } | ||
163 | |||
164 | #ifdef CONFIG_HARDLOCKUP_DETECTOR | ||
165 | /* watchdog detector functions */ | ||
166 | static int is_hardlockup(void) | ||
167 | { | ||
168 | unsigned long hrint = __get_cpu_var(hrtimer_interrupts); | ||
169 | |||
170 | if (__get_cpu_var(hrtimer_interrupts_saved) == hrint) | ||
171 | return 1; | ||
172 | |||
173 | __get_cpu_var(hrtimer_interrupts_saved) = hrint; | ||
174 | return 0; | ||
175 | } | ||
176 | #endif | ||
177 | |||
178 | static int is_softlockup(unsigned long touch_ts) | ||
179 | { | ||
180 | unsigned long now = get_timestamp(smp_processor_id()); | ||
181 | |||
182 | /* Warn about unreasonable delays: */ | ||
183 | if (time_after(now, touch_ts + softlockup_thresh)) | ||
184 | return now - touch_ts; | ||
185 | |||
186 | return 0; | ||
187 | } | ||
188 | |||
189 | #ifdef CONFIG_HARDLOCKUP_DETECTOR | ||
190 | static struct perf_event_attr wd_hw_attr = { | ||
191 | .type = PERF_TYPE_HARDWARE, | ||
192 | .config = PERF_COUNT_HW_CPU_CYCLES, | ||
193 | .size = sizeof(struct perf_event_attr), | ||
194 | .pinned = 1, | ||
195 | .disabled = 1, | ||
196 | }; | ||
197 | |||
198 | /* Callback function for perf event subsystem */ | ||
199 | static void watchdog_overflow_callback(struct perf_event *event, int nmi, | ||
200 | struct perf_sample_data *data, | ||
201 | struct pt_regs *regs) | ||
202 | { | ||
203 | /* Ensure the watchdog never gets throttled */ | ||
204 | event->hw.interrupts = 0; | ||
205 | |||
206 | if (__get_cpu_var(watchdog_nmi_touch) == true) { | ||
207 | __get_cpu_var(watchdog_nmi_touch) = false; | ||
208 | return; | ||
209 | } | ||
210 | |||
211 | /* check for a hardlockup | ||
212 | * This is done by making sure our timer interrupt | ||
213 | * is incrementing. The timer interrupt should have | ||
214 | * fired multiple times before we overflowed. If it hasn't | ||
215 | * then this is a good indication the cpu is stuck | ||
216 | */ | ||
217 | if (is_hardlockup()) { | ||
218 | int this_cpu = smp_processor_id(); | ||
219 | |||
220 | /* only print hardlockups once */ | ||
221 | if (__get_cpu_var(hard_watchdog_warn) == true) | ||
222 | return; | ||
223 | |||
224 | if (hardlockup_panic) | ||
225 | panic("Watchdog detected hard LOCKUP on cpu %d", this_cpu); | ||
226 | else | ||
227 | WARN(1, "Watchdog detected hard LOCKUP on cpu %d", this_cpu); | ||
228 | |||
229 | __get_cpu_var(hard_watchdog_warn) = true; | ||
230 | return; | ||
231 | } | ||
232 | |||
233 | __get_cpu_var(hard_watchdog_warn) = false; | ||
234 | return; | ||
235 | } | ||
236 | static void watchdog_interrupt_count(void) | ||
237 | { | ||
238 | __get_cpu_var(hrtimer_interrupts)++; | ||
239 | } | ||
240 | #else | ||
241 | static inline void watchdog_interrupt_count(void) { return; } | ||
242 | #endif /* CONFIG_HARDLOCKUP_DETECTOR */ | ||
243 | |||
244 | /* watchdog kicker functions */ | ||
245 | static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer) | ||
246 | { | ||
247 | unsigned long touch_ts = __get_cpu_var(watchdog_touch_ts); | ||
248 | struct pt_regs *regs = get_irq_regs(); | ||
249 | int duration; | ||
250 | |||
251 | /* kick the hardlockup detector */ | ||
252 | watchdog_interrupt_count(); | ||
253 | |||
254 | /* kick the softlockup detector */ | ||
255 | wake_up_process(__get_cpu_var(softlockup_watchdog)); | ||
256 | |||
257 | /* .. and repeat */ | ||
258 | hrtimer_forward_now(hrtimer, ns_to_ktime(get_sample_period())); | ||
259 | |||
260 | if (touch_ts == 0) { | ||
261 | if (unlikely(__get_cpu_var(softlockup_touch_sync))) { | ||
262 | /* | ||
263 | * If the time stamp was touched atomically | ||
264 | * make sure the scheduler tick is up to date. | ||
265 | */ | ||
266 | __get_cpu_var(softlockup_touch_sync) = false; | ||
267 | sched_clock_tick(); | ||
268 | } | ||
269 | __touch_watchdog(); | ||
270 | return HRTIMER_RESTART; | ||
271 | } | ||
272 | |||
273 | /* check for a softlockup | ||
274 | * This is done by making sure a high priority task is | ||
275 | * being scheduled. The task touches the watchdog to | ||
276 | * indicate it is getting cpu time. If it hasn't then | ||
277 | * this is a good indication some task is hogging the cpu | ||
278 | */ | ||
279 | duration = is_softlockup(touch_ts); | ||
280 | if (unlikely(duration)) { | ||
281 | /* only warn once */ | ||
282 | if (__get_cpu_var(soft_watchdog_warn) == true) | ||
283 | return HRTIMER_RESTART; | ||
284 | |||
285 | printk(KERN_ERR "BUG: soft lockup - CPU#%d stuck for %us! [%s:%d]\n", | ||
286 | smp_processor_id(), duration, | ||
287 | current->comm, task_pid_nr(current)); | ||
288 | print_modules(); | ||
289 | print_irqtrace_events(current); | ||
290 | if (regs) | ||
291 | show_regs(regs); | ||
292 | else | ||
293 | dump_stack(); | ||
294 | |||
295 | if (softlockup_panic) | ||
296 | panic("softlockup: hung tasks"); | ||
297 | __get_cpu_var(soft_watchdog_warn) = true; | ||
298 | } else | ||
299 | __get_cpu_var(soft_watchdog_warn) = false; | ||
300 | |||
301 | return HRTIMER_RESTART; | ||
302 | } | ||
303 | |||
304 | |||
305 | /* | ||
306 | * The watchdog thread - touches the timestamp. | ||
307 | */ | ||
308 | static int watchdog(void *unused) | ||
309 | { | ||
310 | struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 }; | ||
311 | struct hrtimer *hrtimer = &__raw_get_cpu_var(watchdog_hrtimer); | ||
312 | |||
313 | sched_setscheduler(current, SCHED_FIFO, ¶m); | ||
314 | |||
315 | /* initialize timestamp */ | ||
316 | __touch_watchdog(); | ||
317 | |||
318 | /* kick off the timer for the hardlockup detector */ | ||
319 | /* done here because hrtimer_start can only pin to smp_processor_id() */ | ||
320 | hrtimer_start(hrtimer, ns_to_ktime(get_sample_period()), | ||
321 | HRTIMER_MODE_REL_PINNED); | ||
322 | |||
323 | set_current_state(TASK_INTERRUPTIBLE); | ||
324 | /* | ||
325 | * Run briefly once per second to reset the softlockup timestamp. | ||
326 | * If this gets delayed for more than 60 seconds then the | ||
327 | * debug-printout triggers in watchdog_timer_fn(). | ||
328 | */ | ||
329 | while (!kthread_should_stop()) { | ||
330 | __touch_watchdog(); | ||
331 | schedule(); | ||
332 | |||
333 | if (kthread_should_stop()) | ||
334 | break; | ||
335 | |||
336 | set_current_state(TASK_INTERRUPTIBLE); | ||
337 | } | ||
338 | __set_current_state(TASK_RUNNING); | ||
339 | |||
340 | return 0; | ||
341 | } | ||
342 | |||
343 | |||
344 | #ifdef CONFIG_HARDLOCKUP_DETECTOR | ||
345 | static int watchdog_nmi_enable(int cpu) | ||
346 | { | ||
347 | struct perf_event_attr *wd_attr; | ||
348 | struct perf_event *event = per_cpu(watchdog_ev, cpu); | ||
349 | |||
350 | /* is it already setup and enabled? */ | ||
351 | if (event && event->state > PERF_EVENT_STATE_OFF) | ||
352 | goto out; | ||
353 | |||
354 | /* it is setup but not enabled */ | ||
355 | if (event != NULL) | ||
356 | goto out_enable; | ||
357 | |||
358 | /* Try to register using hardware perf events */ | ||
359 | wd_attr = &wd_hw_attr; | ||
360 | wd_attr->sample_period = hw_nmi_get_sample_period(); | ||
361 | event = perf_event_create_kernel_counter(wd_attr, cpu, NULL, watchdog_overflow_callback); | ||
362 | if (!IS_ERR(event)) { | ||
363 | printk(KERN_INFO "NMI watchdog enabled, takes one hw-pmu counter.\n"); | ||
364 | goto out_save; | ||
365 | } | ||
366 | |||
367 | printk(KERN_ERR "NMI watchdog failed to create perf event on cpu%i: %p\n", cpu, event); | ||
368 | return PTR_ERR(event); | ||
369 | |||
370 | /* success path */ | ||
371 | out_save: | ||
372 | per_cpu(watchdog_ev, cpu) = event; | ||
373 | out_enable: | ||
374 | perf_event_enable(per_cpu(watchdog_ev, cpu)); | ||
375 | out: | ||
376 | return 0; | ||
377 | } | ||
378 | |||
379 | static void watchdog_nmi_disable(int cpu) | ||
380 | { | ||
381 | struct perf_event *event = per_cpu(watchdog_ev, cpu); | ||
382 | |||
383 | if (event) { | ||
384 | perf_event_disable(event); | ||
385 | per_cpu(watchdog_ev, cpu) = NULL; | ||
386 | |||
387 | /* should be in cleanup, but blocks oprofile */ | ||
388 | perf_event_release_kernel(event); | ||
389 | } | ||
390 | return; | ||
391 | } | ||
392 | #else | ||
393 | static int watchdog_nmi_enable(int cpu) { return 0; } | ||
394 | static void watchdog_nmi_disable(int cpu) { return; } | ||
395 | #endif /* CONFIG_HARDLOCKUP_DETECTOR */ | ||
396 | |||
397 | /* prepare/enable/disable routines */ | ||
398 | static int watchdog_prepare_cpu(int cpu) | ||
399 | { | ||
400 | struct hrtimer *hrtimer = &per_cpu(watchdog_hrtimer, cpu); | ||
401 | |||
402 | WARN_ON(per_cpu(softlockup_watchdog, cpu)); | ||
403 | hrtimer_init(hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); | ||
404 | hrtimer->function = watchdog_timer_fn; | ||
405 | |||
406 | return 0; | ||
407 | } | ||
408 | |||
409 | static int watchdog_enable(int cpu) | ||
410 | { | ||
411 | struct task_struct *p = per_cpu(softlockup_watchdog, cpu); | ||
412 | int err; | ||
413 | |||
414 | /* enable the perf event */ | ||
415 | err = watchdog_nmi_enable(cpu); | ||
416 | if (err) | ||
417 | return err; | ||
418 | |||
419 | /* create the watchdog thread */ | ||
420 | if (!p) { | ||
421 | p = kthread_create(watchdog, (void *)(unsigned long)cpu, "watchdog/%d", cpu); | ||
422 | if (IS_ERR(p)) { | ||
423 | printk(KERN_ERR "softlockup watchdog for %i failed\n", cpu); | ||
424 | return PTR_ERR(p); | ||
425 | } | ||
426 | kthread_bind(p, cpu); | ||
427 | per_cpu(watchdog_touch_ts, cpu) = 0; | ||
428 | per_cpu(softlockup_watchdog, cpu) = p; | ||
429 | wake_up_process(p); | ||
430 | } | ||
431 | |||
432 | /* if any cpu succeeds, watchdog is considered enabled for the system */ | ||
433 | watchdog_enabled = 1; | ||
434 | |||
435 | return 0; | ||
436 | } | ||
437 | |||
438 | static void watchdog_disable(int cpu) | ||
439 | { | ||
440 | struct task_struct *p = per_cpu(softlockup_watchdog, cpu); | ||
441 | struct hrtimer *hrtimer = &per_cpu(watchdog_hrtimer, cpu); | ||
442 | |||
443 | /* | ||
444 | * cancel the timer first to stop incrementing the stats | ||
445 | * and waking up the kthread | ||
446 | */ | ||
447 | hrtimer_cancel(hrtimer); | ||
448 | |||
449 | /* disable the perf event */ | ||
450 | watchdog_nmi_disable(cpu); | ||
451 | |||
452 | /* stop the watchdog thread */ | ||
453 | if (p) { | ||
454 | per_cpu(softlockup_watchdog, cpu) = NULL; | ||
455 | kthread_stop(p); | ||
456 | } | ||
457 | } | ||
458 | |||
459 | static void watchdog_enable_all_cpus(void) | ||
460 | { | ||
461 | int cpu; | ||
462 | int result = 0; | ||
463 | |||
464 | for_each_online_cpu(cpu) | ||
465 | result += watchdog_enable(cpu); | ||
466 | |||
467 | if (result) | ||
468 | printk(KERN_ERR "watchdog: failed to be enabled on some cpus\n"); | ||
469 | |||
470 | } | ||
471 | |||
472 | static void watchdog_disable_all_cpus(void) | ||
473 | { | ||
474 | int cpu; | ||
475 | |||
476 | if (no_watchdog) | ||
477 | return; | ||
478 | |||
479 | for_each_online_cpu(cpu) | ||
480 | watchdog_disable(cpu); | ||
481 | |||
482 | /* if all watchdogs are disabled, then they are disabled for the system */ | ||
483 | watchdog_enabled = 0; | ||
484 | } | ||
485 | |||
486 | |||
487 | /* sysctl functions */ | ||
488 | #ifdef CONFIG_SYSCTL | ||
489 | /* | ||
490 | * proc handler for /proc/sys/kernel/nmi_watchdog | ||
491 | */ | ||
492 | |||
493 | int proc_dowatchdog_enabled(struct ctl_table *table, int write, | ||
494 | void __user *buffer, size_t *length, loff_t *ppos) | ||
495 | { | ||
496 | proc_dointvec(table, write, buffer, length, ppos); | ||
497 | |||
498 | if (watchdog_enabled) | ||
499 | watchdog_enable_all_cpus(); | ||
500 | else | ||
501 | watchdog_disable_all_cpus(); | ||
502 | return 0; | ||
503 | } | ||
504 | |||
505 | int proc_dowatchdog_thresh(struct ctl_table *table, int write, | ||
506 | void __user *buffer, | ||
507 | size_t *lenp, loff_t *ppos) | ||
508 | { | ||
509 | return proc_dointvec_minmax(table, write, buffer, lenp, ppos); | ||
510 | } | ||
511 | #endif /* CONFIG_SYSCTL */ | ||
512 | |||
513 | |||
514 | /* | ||
515 | * Create/destroy watchdog threads as CPUs come and go: | ||
516 | */ | ||
517 | static int __cpuinit | ||
518 | cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) | ||
519 | { | ||
520 | int hotcpu = (unsigned long)hcpu; | ||
521 | int err = 0; | ||
522 | |||
523 | switch (action) { | ||
524 | case CPU_UP_PREPARE: | ||
525 | case CPU_UP_PREPARE_FROZEN: | ||
526 | err = watchdog_prepare_cpu(hotcpu); | ||
527 | break; | ||
528 | case CPU_ONLINE: | ||
529 | case CPU_ONLINE_FROZEN: | ||
530 | err = watchdog_enable(hotcpu); | ||
531 | break; | ||
532 | #ifdef CONFIG_HOTPLUG_CPU | ||
533 | case CPU_UP_CANCELED: | ||
534 | case CPU_UP_CANCELED_FROZEN: | ||
535 | watchdog_disable(hotcpu); | ||
536 | break; | ||
537 | case CPU_DEAD: | ||
538 | case CPU_DEAD_FROZEN: | ||
539 | watchdog_disable(hotcpu); | ||
540 | break; | ||
541 | #endif /* CONFIG_HOTPLUG_CPU */ | ||
542 | } | ||
543 | return notifier_from_errno(err); | ||
544 | } | ||
545 | |||
546 | static struct notifier_block __cpuinitdata cpu_nfb = { | ||
547 | .notifier_call = cpu_callback | ||
548 | }; | ||
549 | |||
550 | static int __init spawn_watchdog_task(void) | ||
551 | { | ||
552 | void *cpu = (void *)(long)smp_processor_id(); | ||
553 | int err; | ||
554 | |||
555 | if (no_watchdog) | ||
556 | return 0; | ||
557 | |||
558 | err = cpu_callback(&cpu_nfb, CPU_UP_PREPARE, cpu); | ||
559 | WARN_ON(notifier_to_errno(err)); | ||
560 | |||
561 | cpu_callback(&cpu_nfb, CPU_ONLINE, cpu); | ||
562 | register_cpu_notifier(&cpu_nfb); | ||
563 | |||
564 | return 0; | ||
565 | } | ||
566 | early_initcall(spawn_watchdog_task); | ||
diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 327d2deb4451..90db1bd1a978 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c | |||
@@ -1,19 +1,26 @@ | |||
1 | /* | 1 | /* |
2 | * linux/kernel/workqueue.c | 2 | * kernel/workqueue.c - generic async execution with shared worker pool |
3 | * | 3 | * |
4 | * Generic mechanism for defining kernel helper threads for running | 4 | * Copyright (C) 2002 Ingo Molnar |
5 | * arbitrary tasks in process context. | ||
6 | * | 5 | * |
7 | * Started by Ingo Molnar, Copyright (C) 2002 | 6 | * Derived from the taskqueue/keventd code by: |
7 | * David Woodhouse <dwmw2@infradead.org> | ||
8 | * Andrew Morton | ||
9 | * Kai Petzke <wpp@marie.physik.tu-berlin.de> | ||
10 | * Theodore Ts'o <tytso@mit.edu> | ||
8 | * | 11 | * |
9 | * Derived from the taskqueue/keventd code by: | 12 | * Made to use alloc_percpu by Christoph Lameter. |
10 | * | 13 | * |
11 | * David Woodhouse <dwmw2@infradead.org> | 14 | * Copyright (C) 2010 SUSE Linux Products GmbH |
12 | * Andrew Morton | 15 | * Copyright (C) 2010 Tejun Heo <tj@kernel.org> |
13 | * Kai Petzke <wpp@marie.physik.tu-berlin.de> | ||
14 | * Theodore Ts'o <tytso@mit.edu> | ||
15 | * | 16 | * |
16 | * Made to use alloc_percpu by Christoph Lameter. | 17 | * This is the generic async execution mechanism. Work items are |
18 | * executed in process context. The worker pool is shared and | ||
19 | * automatically managed. There is one worker pool for each CPU and | ||
20 | * one extra for works which are better served by workers which are | ||
21 | * not bound to any specific CPU. | ||
22 | * | ||
23 | * Please read Documentation/workqueue.txt for details. | ||
17 | */ | 24 | */ |
18 | 25 | ||
19 | #include <linux/module.h> | 26 | #include <linux/module.h> |
@@ -33,41 +40,276 @@ | |||
33 | #include <linux/kallsyms.h> | 40 | #include <linux/kallsyms.h> |
34 | #include <linux/debug_locks.h> | 41 | #include <linux/debug_locks.h> |
35 | #include <linux/lockdep.h> | 42 | #include <linux/lockdep.h> |
36 | #define CREATE_TRACE_POINTS | 43 | #include <linux/idr.h> |
37 | #include <trace/events/workqueue.h> | 44 | |
45 | #include "workqueue_sched.h" | ||
46 | |||
47 | enum { | ||
48 | /* global_cwq flags */ | ||
49 | GCWQ_MANAGE_WORKERS = 1 << 0, /* need to manage workers */ | ||
50 | GCWQ_MANAGING_WORKERS = 1 << 1, /* managing workers */ | ||
51 | GCWQ_DISASSOCIATED = 1 << 2, /* cpu can't serve workers */ | ||
52 | GCWQ_FREEZING = 1 << 3, /* freeze in progress */ | ||
53 | GCWQ_HIGHPRI_PENDING = 1 << 4, /* highpri works on queue */ | ||
54 | |||
55 | /* worker flags */ | ||
56 | WORKER_STARTED = 1 << 0, /* started */ | ||
57 | WORKER_DIE = 1 << 1, /* die die die */ | ||
58 | WORKER_IDLE = 1 << 2, /* is idle */ | ||
59 | WORKER_PREP = 1 << 3, /* preparing to run works */ | ||
60 | WORKER_ROGUE = 1 << 4, /* not bound to any cpu */ | ||
61 | WORKER_REBIND = 1 << 5, /* mom is home, come back */ | ||
62 | WORKER_CPU_INTENSIVE = 1 << 6, /* cpu intensive */ | ||
63 | WORKER_UNBOUND = 1 << 7, /* worker is unbound */ | ||
64 | |||
65 | WORKER_NOT_RUNNING = WORKER_PREP | WORKER_ROGUE | WORKER_REBIND | | ||
66 | WORKER_CPU_INTENSIVE | WORKER_UNBOUND, | ||
67 | |||
68 | /* gcwq->trustee_state */ | ||
69 | TRUSTEE_START = 0, /* start */ | ||
70 | TRUSTEE_IN_CHARGE = 1, /* trustee in charge of gcwq */ | ||
71 | TRUSTEE_BUTCHER = 2, /* butcher workers */ | ||
72 | TRUSTEE_RELEASE = 3, /* release workers */ | ||
73 | TRUSTEE_DONE = 4, /* trustee is done */ | ||
74 | |||
75 | BUSY_WORKER_HASH_ORDER = 6, /* 64 pointers */ | ||
76 | BUSY_WORKER_HASH_SIZE = 1 << BUSY_WORKER_HASH_ORDER, | ||
77 | BUSY_WORKER_HASH_MASK = BUSY_WORKER_HASH_SIZE - 1, | ||
78 | |||
79 | MAX_IDLE_WORKERS_RATIO = 4, /* 1/4 of busy can be idle */ | ||
80 | IDLE_WORKER_TIMEOUT = 300 * HZ, /* keep idle ones for 5 mins */ | ||
81 | |||
82 | MAYDAY_INITIAL_TIMEOUT = HZ / 100, /* call for help after 10ms */ | ||
83 | MAYDAY_INTERVAL = HZ / 10, /* and then every 100ms */ | ||
84 | CREATE_COOLDOWN = HZ, /* time to breathe after fail */ | ||
85 | TRUSTEE_COOLDOWN = HZ / 10, /* for trustee draining */ | ||
86 | |||
87 | /* | ||
88 | * Rescue workers are used only on emergencies and shared by | ||
89 | * all cpus. Give -20. | ||
90 | */ | ||
91 | RESCUER_NICE_LEVEL = -20, | ||
92 | }; | ||
38 | 93 | ||
39 | /* | 94 | /* |
40 | * The per-CPU workqueue (if single thread, we always use the first | 95 | * Structure fields follow one of the following exclusion rules. |
41 | * possible cpu). | 96 | * |
97 | * I: Modifiable by initialization/destruction paths and read-only for | ||
98 | * everyone else. | ||
99 | * | ||
100 | * P: Preemption protected. Disabling preemption is enough and should | ||
101 | * only be modified and accessed from the local cpu. | ||
102 | * | ||
103 | * L: gcwq->lock protected. Access with gcwq->lock held. | ||
104 | * | ||
105 | * X: During normal operation, modification requires gcwq->lock and | ||
106 | * should be done only from local cpu. Either disabling preemption | ||
107 | * on local cpu or grabbing gcwq->lock is enough for read access. | ||
108 | * If GCWQ_DISASSOCIATED is set, it's identical to L. | ||
109 | * | ||
110 | * F: wq->flush_mutex protected. | ||
111 | * | ||
112 | * W: workqueue_lock protected. | ||
42 | */ | 113 | */ |
43 | struct cpu_workqueue_struct { | ||
44 | 114 | ||
45 | spinlock_t lock; | 115 | struct global_cwq; |
46 | 116 | ||
47 | struct list_head worklist; | 117 | /* |
48 | wait_queue_head_t more_work; | 118 | * The poor guys doing the actual heavy lifting. All on-duty workers |
49 | struct work_struct *current_work; | 119 | * are either serving the manager role, on idle list or on busy hash. |
120 | */ | ||
121 | struct worker { | ||
122 | /* on idle list while idle, on busy hash table while busy */ | ||
123 | union { | ||
124 | struct list_head entry; /* L: while idle */ | ||
125 | struct hlist_node hentry; /* L: while busy */ | ||
126 | }; | ||
50 | 127 | ||
51 | struct workqueue_struct *wq; | 128 | struct work_struct *current_work; /* L: work being processed */ |
52 | struct task_struct *thread; | 129 | struct cpu_workqueue_struct *current_cwq; /* L: current_work's cwq */ |
53 | } ____cacheline_aligned; | 130 | struct list_head scheduled; /* L: scheduled works */ |
131 | struct task_struct *task; /* I: worker task */ | ||
132 | struct global_cwq *gcwq; /* I: the associated gcwq */ | ||
133 | /* 64 bytes boundary on 64bit, 32 on 32bit */ | ||
134 | unsigned long last_active; /* L: last active timestamp */ | ||
135 | unsigned int flags; /* X: flags */ | ||
136 | int id; /* I: worker id */ | ||
137 | struct work_struct rebind_work; /* L: rebind worker to cpu */ | ||
138 | }; | ||
139 | |||
140 | /* | ||
141 | * Global per-cpu workqueue. There's one and only one for each cpu | ||
142 | * and all works are queued and processed here regardless of their | ||
143 | * target workqueues. | ||
144 | */ | ||
145 | struct global_cwq { | ||
146 | spinlock_t lock; /* the gcwq lock */ | ||
147 | struct list_head worklist; /* L: list of pending works */ | ||
148 | unsigned int cpu; /* I: the associated cpu */ | ||
149 | unsigned int flags; /* L: GCWQ_* flags */ | ||
150 | |||
151 | int nr_workers; /* L: total number of workers */ | ||
152 | int nr_idle; /* L: currently idle ones */ | ||
153 | |||
154 | /* workers are chained either in the idle_list or busy_hash */ | ||
155 | struct list_head idle_list; /* X: list of idle workers */ | ||
156 | struct hlist_head busy_hash[BUSY_WORKER_HASH_SIZE]; | ||
157 | /* L: hash of busy workers */ | ||
158 | |||
159 | struct timer_list idle_timer; /* L: worker idle timeout */ | ||
160 | struct timer_list mayday_timer; /* L: SOS timer for dworkers */ | ||
161 | |||
162 | struct ida worker_ida; /* L: for worker IDs */ | ||
163 | |||
164 | struct task_struct *trustee; /* L: for gcwq shutdown */ | ||
165 | unsigned int trustee_state; /* L: trustee state */ | ||
166 | wait_queue_head_t trustee_wait; /* trustee wait */ | ||
167 | struct worker *first_idle; /* L: first idle worker */ | ||
168 | } ____cacheline_aligned_in_smp; | ||
169 | |||
170 | /* | ||
171 | * The per-CPU workqueue. The lower WORK_STRUCT_FLAG_BITS of | ||
172 | * work_struct->data are used for flags and thus cwqs need to be | ||
173 | * aligned at two's power of the number of flag bits. | ||
174 | */ | ||
175 | struct cpu_workqueue_struct { | ||
176 | struct global_cwq *gcwq; /* I: the associated gcwq */ | ||
177 | struct workqueue_struct *wq; /* I: the owning workqueue */ | ||
178 | int work_color; /* L: current color */ | ||
179 | int flush_color; /* L: flushing color */ | ||
180 | int nr_in_flight[WORK_NR_COLORS]; | ||
181 | /* L: nr of in_flight works */ | ||
182 | int nr_active; /* L: nr of active works */ | ||
183 | int max_active; /* L: max active works */ | ||
184 | struct list_head delayed_works; /* L: delayed works */ | ||
185 | }; | ||
186 | |||
187 | /* | ||
188 | * Structure used to wait for workqueue flush. | ||
189 | */ | ||
190 | struct wq_flusher { | ||
191 | struct list_head list; /* F: list of flushers */ | ||
192 | int flush_color; /* F: flush color waiting for */ | ||
193 | struct completion done; /* flush completion */ | ||
194 | }; | ||
195 | |||
196 | /* | ||
197 | * All cpumasks are assumed to be always set on UP and thus can't be | ||
198 | * used to determine whether there's something to be done. | ||
199 | */ | ||
200 | #ifdef CONFIG_SMP | ||
201 | typedef cpumask_var_t mayday_mask_t; | ||
202 | #define mayday_test_and_set_cpu(cpu, mask) \ | ||
203 | cpumask_test_and_set_cpu((cpu), (mask)) | ||
204 | #define mayday_clear_cpu(cpu, mask) cpumask_clear_cpu((cpu), (mask)) | ||
205 | #define for_each_mayday_cpu(cpu, mask) for_each_cpu((cpu), (mask)) | ||
206 | #define alloc_mayday_mask(maskp, gfp) zalloc_cpumask_var((maskp), (gfp)) | ||
207 | #define free_mayday_mask(mask) free_cpumask_var((mask)) | ||
208 | #else | ||
209 | typedef unsigned long mayday_mask_t; | ||
210 | #define mayday_test_and_set_cpu(cpu, mask) test_and_set_bit(0, &(mask)) | ||
211 | #define mayday_clear_cpu(cpu, mask) clear_bit(0, &(mask)) | ||
212 | #define for_each_mayday_cpu(cpu, mask) if ((cpu) = 0, (mask)) | ||
213 | #define alloc_mayday_mask(maskp, gfp) true | ||
214 | #define free_mayday_mask(mask) do { } while (0) | ||
215 | #endif | ||
54 | 216 | ||
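On UP the mayday mask degenerates to a single bit, so the test-and-set, clear and iteration helpers above all operate on bit 0 of one unsigned long. The standalone sketch below (illustrative only, not the kernel macros; the cpu argument is dropped since there is only cpu 0) shows that degenerate case.

/* Standalone sketch (not the kernel macros): on UP one bit stands in
 * for the whole cpumask. */
#include <stdbool.h>
#include <stdio.h>

typedef unsigned long mayday_mask_t;

static bool mayday_test_and_set_cpu(mayday_mask_t *mask)
{
	bool was_set = (*mask & 1UL) != 0;

	*mask |= 1UL;
	return was_set;
}

static void mayday_clear_cpu(mayday_mask_t *mask)
{
	*mask &= ~1UL;
}

int main(void)
{
	mayday_mask_t mask = 0;
	int cpu;

	if (!mayday_test_and_set_cpu(&mask))
		printf("cpu 0 asked for a rescuer\n");

	/* the UP for_each_mayday_cpu() body runs once iff the bit is set */
	for (cpu = 0; mask && cpu < 1; cpu++)
		printf("rescuer servicing cpu %d\n", cpu);

	mayday_clear_cpu(&mask);
	return 0;
}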
55 | /* | 217 | /* |
56 | * The externally visible workqueue abstraction is an array of | 218 | * The externally visible workqueue abstraction is an array of |
57 | * per-CPU workqueues: | 219 | * per-CPU workqueues: |
58 | */ | 220 | */ |
59 | struct workqueue_struct { | 221 | struct workqueue_struct { |
60 | struct cpu_workqueue_struct *cpu_wq; | 222 | unsigned int flags; /* I: WQ_* flags */ |
61 | struct list_head list; | 223 | union { |
62 | const char *name; | 224 | struct cpu_workqueue_struct __percpu *pcpu; |
63 | int singlethread; | 225 | struct cpu_workqueue_struct *single; |
64 | int freezeable; /* Freeze threads during suspend */ | 226 | unsigned long v; |
65 | int rt; | 227 | } cpu_wq; /* I: cwq's */ |
228 | struct list_head list; /* W: list of all workqueues */ | ||
229 | |||
230 | struct mutex flush_mutex; /* protects wq flushing */ | ||
231 | int work_color; /* F: current work color */ | ||
232 | int flush_color; /* F: current flush color */ | ||
233 | atomic_t nr_cwqs_to_flush; /* flush in progress */ | ||
234 | struct wq_flusher *first_flusher; /* F: first flusher */ | ||
235 | struct list_head flusher_queue; /* F: flush waiters */ | ||
236 | struct list_head flusher_overflow; /* F: flush overflow list */ | ||
237 | |||
238 | mayday_mask_t mayday_mask; /* cpus requesting rescue */ | ||
239 | struct worker *rescuer; /* I: rescue worker */ | ||
240 | |||
241 | int saved_max_active; /* W: saved cwq max_active */ | ||
242 | const char *name; /* I: workqueue name */ | ||
66 | #ifdef CONFIG_LOCKDEP | 243 | #ifdef CONFIG_LOCKDEP |
67 | struct lockdep_map lockdep_map; | 244 | struct lockdep_map lockdep_map; |
68 | #endif | 245 | #endif |
69 | }; | 246 | }; |
70 | 247 | ||
248 | struct workqueue_struct *system_wq __read_mostly; | ||
249 | struct workqueue_struct *system_long_wq __read_mostly; | ||
250 | struct workqueue_struct *system_nrt_wq __read_mostly; | ||
251 | struct workqueue_struct *system_unbound_wq __read_mostly; | ||
252 | EXPORT_SYMBOL_GPL(system_wq); | ||
253 | EXPORT_SYMBOL_GPL(system_long_wq); | ||
254 | EXPORT_SYMBOL_GPL(system_nrt_wq); | ||
255 | EXPORT_SYMBOL_GPL(system_unbound_wq); | ||
256 | |||
257 | #define CREATE_TRACE_POINTS | ||
258 | #include <trace/events/workqueue.h> | ||
259 | |||
260 | #define for_each_busy_worker(worker, i, pos, gcwq) \ | ||
261 | for (i = 0; i < BUSY_WORKER_HASH_SIZE; i++) \ | ||
262 | hlist_for_each_entry(worker, pos, &gcwq->busy_hash[i], hentry) | ||
263 | |||
264 | static inline int __next_gcwq_cpu(int cpu, const struct cpumask *mask, | ||
265 | unsigned int sw) | ||
266 | { | ||
267 | if (cpu < nr_cpu_ids) { | ||
268 | if (sw & 1) { | ||
269 | cpu = cpumask_next(cpu, mask); | ||
270 | if (cpu < nr_cpu_ids) | ||
271 | return cpu; | ||
272 | } | ||
273 | if (sw & 2) | ||
274 | return WORK_CPU_UNBOUND; | ||
275 | } | ||
276 | return WORK_CPU_NONE; | ||
277 | } | ||
278 | |||
279 | static inline int __next_wq_cpu(int cpu, const struct cpumask *mask, | ||
280 | struct workqueue_struct *wq) | ||
281 | { | ||
282 | return __next_gcwq_cpu(cpu, mask, !(wq->flags & WQ_UNBOUND) ? 1 : 2); | ||
283 | } | ||
284 | |||
285 | /* | ||
286 | * CPU iterators | ||
287 | * | ||
288 | * An extra gcwq is defined for an invalid cpu number | ||
289 | * (WORK_CPU_UNBOUND) to host workqueues which are not bound to any | ||
290 | * specific CPU. The following iterators are similar to | ||
291 | * for_each_*_cpu() iterators but also considers the unbound gcwq. | ||
292 | * | ||
293 | * for_each_gcwq_cpu() : possible CPUs + WORK_CPU_UNBOUND | ||
294 | * for_each_online_gcwq_cpu() : online CPUs + WORK_CPU_UNBOUND | ||
295 | * for_each_cwq_cpu() : possible CPUs for bound workqueues, | ||
296 | * WORK_CPU_UNBOUND for unbound workqueues | ||
297 | */ | ||
298 | #define for_each_gcwq_cpu(cpu) \ | ||
299 | for ((cpu) = __next_gcwq_cpu(-1, cpu_possible_mask, 3); \ | ||
300 | (cpu) < WORK_CPU_NONE; \ | ||
301 | (cpu) = __next_gcwq_cpu((cpu), cpu_possible_mask, 3)) | ||
302 | |||
303 | #define for_each_online_gcwq_cpu(cpu) \ | ||
304 | for ((cpu) = __next_gcwq_cpu(-1, cpu_online_mask, 3); \ | ||
305 | (cpu) < WORK_CPU_NONE; \ | ||
306 | (cpu) = __next_gcwq_cpu((cpu), cpu_online_mask, 3)) | ||
307 | |||
308 | #define for_each_cwq_cpu(cpu, wq) \ | ||
309 | for ((cpu) = __next_wq_cpu(-1, cpu_possible_mask, (wq)); \ | ||
310 | (cpu) < WORK_CPU_NONE; \ | ||
311 | (cpu) = __next_wq_cpu((cpu), cpu_possible_mask, (wq))) | ||
312 | |||
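The iterators above walk the possible (or online) CPUs and then visit one extra pseudo-CPU for the unbound gcwq before terminating with WORK_CPU_NONE. A simplified standalone sketch of that walk follows; NR_CPUS, the mask and the sentinel values are stand-ins, not the kernel's constants.

/* Standalone sketch of the gcwq CPU walk: each "possible" CPU, then
 * one unbound pseudo-CPU, then stop. */
#include <stdio.h>

enum { NR_CPUS = 4, CPU_UNBOUND = NR_CPUS, CPU_NONE = NR_CPUS + 1 };

static const unsigned int possible_mask = 0xb;   /* cpus 0, 1 and 3 */

static int next_gcwq_cpu(int cpu)
{
	for (cpu++; cpu < NR_CPUS; cpu++)
		if (possible_mask & (1u << cpu))
			return cpu;                      /* next bound cpu */
	return cpu == NR_CPUS ? CPU_UNBOUND : CPU_NONE;
}

int main(void)
{
	int cpu;

	for (cpu = next_gcwq_cpu(-1); cpu < CPU_NONE; cpu = next_gcwq_cpu(cpu))
		printf("gcwq for %s%d\n",
		       cpu == CPU_UNBOUND ? "unbound pseudo-cpu " : "cpu ", cpu);
	return 0;
}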
71 | #ifdef CONFIG_DEBUG_OBJECTS_WORK | 313 | #ifdef CONFIG_DEBUG_OBJECTS_WORK |
72 | 314 | ||
73 | static struct debug_obj_descr work_debug_descr; | 315 | static struct debug_obj_descr work_debug_descr; |
@@ -107,7 +349,7 @@ static int work_fixup_activate(void *addr, enum debug_obj_state state) | |||
107 | * statically initialized. We just make sure that it | 349 | * statically initialized. We just make sure that it |
108 | * is tracked in the object tracker. | 350 | * is tracked in the object tracker. |
109 | */ | 351 | */ |
110 | if (test_bit(WORK_STRUCT_STATIC, work_data_bits(work))) { | 352 | if (test_bit(WORK_STRUCT_STATIC_BIT, work_data_bits(work))) { |
111 | debug_object_init(work, &work_debug_descr); | 353 | debug_object_init(work, &work_debug_descr); |
112 | debug_object_activate(work, &work_debug_descr); | 354 | debug_object_activate(work, &work_debug_descr); |
113 | return 0; | 355 | return 0; |
@@ -181,94 +423,586 @@ static inline void debug_work_deactivate(struct work_struct *work) { } | |||
181 | /* Serializes the accesses to the list of workqueues. */ | 423 | /* Serializes the accesses to the list of workqueues. */ |
182 | static DEFINE_SPINLOCK(workqueue_lock); | 424 | static DEFINE_SPINLOCK(workqueue_lock); |
183 | static LIST_HEAD(workqueues); | 425 | static LIST_HEAD(workqueues); |
426 | static bool workqueue_freezing; /* W: have wqs started freezing? */ | ||
184 | 427 | ||
185 | static int singlethread_cpu __read_mostly; | ||
186 | static const struct cpumask *cpu_singlethread_map __read_mostly; | ||
187 | /* | 428 | /* |
188 | * _cpu_down() first removes CPU from cpu_online_map, then CPU_DEAD | 429 | * The almighty global cpu workqueues. nr_running is the only field |
189 | * flushes cwq->worklist. This means that flush_workqueue/wait_on_work | 430 | * which is expected to be used frequently by other cpus via |
190 | * which comes in between can't use for_each_online_cpu(). We could | 431 | * try_to_wake_up(). Put it in a separate cacheline. |
191 | * use cpu_possible_map, the cpumask below is more a documentation | ||
192 | * than optimization. | ||
193 | */ | 432 | */ |
194 | static cpumask_var_t cpu_populated_map __read_mostly; | 433 | static DEFINE_PER_CPU(struct global_cwq, global_cwq); |
434 | static DEFINE_PER_CPU_SHARED_ALIGNED(atomic_t, gcwq_nr_running); | ||
195 | 435 | ||
196 | /* If it's single threaded, it isn't in the list of workqueues. */ | 436 | /* |
197 | static inline int is_wq_single_threaded(struct workqueue_struct *wq) | 437 | * Global cpu workqueue and nr_running counter for unbound gcwq. The |
438 | * gcwq is always online, has GCWQ_DISASSOCIATED set, and all its | ||
439 | * workers have WORKER_UNBOUND set. | ||
440 | */ | ||
441 | static struct global_cwq unbound_global_cwq; | ||
442 | static atomic_t unbound_gcwq_nr_running = ATOMIC_INIT(0); /* always 0 */ | ||
443 | |||
444 | static int worker_thread(void *__worker); | ||
445 | |||
446 | static struct global_cwq *get_gcwq(unsigned int cpu) | ||
447 | { | ||
448 | if (cpu != WORK_CPU_UNBOUND) | ||
449 | return &per_cpu(global_cwq, cpu); | ||
450 | else | ||
451 | return &unbound_global_cwq; | ||
452 | } | ||
453 | |||
454 | static atomic_t *get_gcwq_nr_running(unsigned int cpu) | ||
455 | { | ||
456 | if (cpu != WORK_CPU_UNBOUND) | ||
457 | return &per_cpu(gcwq_nr_running, cpu); | ||
458 | else | ||
459 | return &unbound_gcwq_nr_running; | ||
460 | } | ||
461 | |||
462 | static struct cpu_workqueue_struct *get_cwq(unsigned int cpu, | ||
463 | struct workqueue_struct *wq) | ||
464 | { | ||
465 | if (!(wq->flags & WQ_UNBOUND)) { | ||
466 | if (likely(cpu < nr_cpu_ids)) { | ||
467 | #ifdef CONFIG_SMP | ||
468 | return per_cpu_ptr(wq->cpu_wq.pcpu, cpu); | ||
469 | #else | ||
470 | return wq->cpu_wq.single; | ||
471 | #endif | ||
472 | } | ||
473 | } else if (likely(cpu == WORK_CPU_UNBOUND)) | ||
474 | return wq->cpu_wq.single; | ||
475 | return NULL; | ||
476 | } | ||
477 | |||
478 | static unsigned int work_color_to_flags(int color) | ||
198 | { | 479 | { |
199 | return wq->singlethread; | 480 | return color << WORK_STRUCT_COLOR_SHIFT; |
200 | } | 481 | } |
201 | 482 | ||
202 | static const struct cpumask *wq_cpu_map(struct workqueue_struct *wq) | 483 | static int get_work_color(struct work_struct *work) |
203 | { | 484 | { |
204 | return is_wq_single_threaded(wq) | 485 | return (*work_data_bits(work) >> WORK_STRUCT_COLOR_SHIFT) & |
205 | ? cpu_singlethread_map : cpu_populated_map; | 486 | ((1 << WORK_STRUCT_COLOR_BITS) - 1); |
206 | } | 487 | } |
207 | 488 | ||
208 | static | 489 | static int work_next_color(int color) |
209 | struct cpu_workqueue_struct *wq_per_cpu(struct workqueue_struct *wq, int cpu) | ||
210 | { | 490 | { |
211 | if (unlikely(is_wq_single_threaded(wq))) | 491 | return (color + 1) % WORK_NR_COLORS; |
212 | cpu = singlethread_cpu; | ||
213 | return per_cpu_ptr(wq->cpu_wq, cpu); | ||
214 | } | 492 | } |
215 | 493 | ||
216 | /* | 494 | /* |
217 | * Set the workqueue on which a work item is to be run | 495 | * A work's data points to the cwq with WORK_STRUCT_CWQ set while the |
218 | * - Must *only* be called if the pending flag is set | 496 | * work is on queue. Once execution starts, WORK_STRUCT_CWQ is |
497 | * cleared and the work data contains the cpu number it was last on. | ||
498 | * | ||
499 | * set_work_{cwq|cpu}() and clear_work_data() can be used to set the | ||
500 | * cwq, cpu or clear work->data. These functions should only be | ||
501 | * called while the work is owned - ie. while the PENDING bit is set. | ||
502 | * | ||
503 | * get_work_[g]cwq() can be used to obtain the gcwq or cwq | ||
504 | * corresponding to a work. gcwq is available once the work has been | ||
505 | * queued anywhere after initialization. cwq is available only from | ||
506 | * queueing until execution starts. | ||
219 | */ | 507 | */ |
220 | static inline void set_wq_data(struct work_struct *work, | 508 | static inline void set_work_data(struct work_struct *work, unsigned long data, |
221 | struct cpu_workqueue_struct *cwq) | 509 | unsigned long flags) |
222 | { | 510 | { |
223 | unsigned long new; | ||
224 | |||
225 | BUG_ON(!work_pending(work)); | 511 | BUG_ON(!work_pending(work)); |
512 | atomic_long_set(&work->data, data | flags | work_static(work)); | ||
513 | } | ||
514 | |||
515 | static void set_work_cwq(struct work_struct *work, | ||
516 | struct cpu_workqueue_struct *cwq, | ||
517 | unsigned long extra_flags) | ||
518 | { | ||
519 | set_work_data(work, (unsigned long)cwq, | ||
520 | WORK_STRUCT_PENDING | WORK_STRUCT_CWQ | extra_flags); | ||
521 | } | ||
522 | |||
523 | static void set_work_cpu(struct work_struct *work, unsigned int cpu) | ||
524 | { | ||
525 | set_work_data(work, cpu << WORK_STRUCT_FLAG_BITS, WORK_STRUCT_PENDING); | ||
526 | } | ||
527 | |||
528 | static void clear_work_data(struct work_struct *work) | ||
529 | { | ||
530 | set_work_data(work, WORK_STRUCT_NO_CPU, 0); | ||
531 | } | ||
532 | |||
533 | static struct cpu_workqueue_struct *get_work_cwq(struct work_struct *work) | ||
534 | { | ||
535 | unsigned long data = atomic_long_read(&work->data); | ||
536 | |||
537 | if (data & WORK_STRUCT_CWQ) | ||
538 | return (void *)(data & WORK_STRUCT_WQ_DATA_MASK); | ||
539 | else | ||
540 | return NULL; | ||
541 | } | ||
542 | |||
543 | static struct global_cwq *get_work_gcwq(struct work_struct *work) | ||
544 | { | ||
545 | unsigned long data = atomic_long_read(&work->data); | ||
546 | unsigned int cpu; | ||
547 | |||
548 | if (data & WORK_STRUCT_CWQ) | ||
549 | return ((struct cpu_workqueue_struct *) | ||
550 | (data & WORK_STRUCT_WQ_DATA_MASK))->gcwq; | ||
551 | |||
552 | cpu = data >> WORK_STRUCT_FLAG_BITS; | ||
553 | if (cpu == WORK_CPU_NONE) | ||
554 | return NULL; | ||
555 | |||
556 | BUG_ON(cpu >= nr_cpu_ids && cpu != WORK_CPU_UNBOUND); | ||
557 | return get_gcwq(cpu); | ||
558 | } | ||
559 | |||
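The comment above describes how a single atomic long doubles as a flag word plus either a cwq pointer (while the work is queued) or the last CPU number (once execution starts). The standalone sketch below shows the same packing trick; FLAG_BITS and the flag values are illustrative, not the kernel's WORK_STRUCT_* layout.

/* Standalone sketch of the work->data packing: low bits hold flags,
 * the rest holds an aligned pointer or a CPU number. */
#include <assert.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

#define FLAG_BITS    8
#define FLAG_PENDING (1UL << 0)
#define FLAG_CWQ     (1UL << 1)
#define FLAG_MASK    ((1UL << FLAG_BITS) - 1)

struct cwq { int dummy; };

static uintptr_t set_work_cwq(const struct cwq *cwq)
{
	assert(((uintptr_t)cwq & FLAG_MASK) == 0);   /* alignment gives room */
	return (uintptr_t)cwq | FLAG_PENDING | FLAG_CWQ;
}

static uintptr_t set_work_cpu(unsigned int cpu)
{
	return ((uintptr_t)cpu << FLAG_BITS) | FLAG_PENDING;
}

static const struct cwq *get_work_cwq(uintptr_t data)
{
	return (data & FLAG_CWQ) ? (const struct cwq *)(data & ~FLAG_MASK) : NULL;
}

int main(void)
{
	struct cwq *cwq = aligned_alloc(1UL << FLAG_BITS, 1UL << FLAG_BITS);
	uintptr_t data = set_work_cwq(cwq);          /* queued state */

	printf("queued:  cwq=%p\n", (const void *)get_work_cwq(data));

	data = set_work_cpu(3);                      /* execution has started */
	printf("running: cwq=%p last_cpu=%lu\n",
	       (const void *)get_work_cwq(data),
	       (unsigned long)(data >> FLAG_BITS));

	free(cwq);
	return 0;
}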
560 | /* | ||
561 | * Policy functions. These define the policies on how the global | ||
562 | * worker pool is managed. Unless noted otherwise, these functions | ||
563 | * assume that they're being called with gcwq->lock held. | ||
564 | */ | ||
565 | |||
566 | static bool __need_more_worker(struct global_cwq *gcwq) | ||
567 | { | ||
568 | return !atomic_read(get_gcwq_nr_running(gcwq->cpu)) || | ||
569 | gcwq->flags & GCWQ_HIGHPRI_PENDING; | ||
570 | } | ||
571 | |||
572 | /* | ||
573 | * Need to wake up a worker? Called from anything but currently | ||
574 | * running workers. | ||
575 | */ | ||
576 | static bool need_more_worker(struct global_cwq *gcwq) | ||
577 | { | ||
578 | return !list_empty(&gcwq->worklist) && __need_more_worker(gcwq); | ||
579 | } | ||
580 | |||
581 | /* Can I start working? Called from busy but !running workers. */ | ||
582 | static bool may_start_working(struct global_cwq *gcwq) | ||
583 | { | ||
584 | return gcwq->nr_idle; | ||
585 | } | ||
586 | |||
587 | /* Do I need to keep working? Called from currently running workers. */ | ||
588 | static bool keep_working(struct global_cwq *gcwq) | ||
589 | { | ||
590 | atomic_t *nr_running = get_gcwq_nr_running(gcwq->cpu); | ||
591 | |||
592 | return !list_empty(&gcwq->worklist) && | ||
593 | (atomic_read(nr_running) <= 1 || | ||
594 | gcwq->flags & GCWQ_HIGHPRI_PENDING); | ||
595 | } | ||
596 | |||
597 | /* Do we need a new worker? Called from manager. */ | ||
598 | static bool need_to_create_worker(struct global_cwq *gcwq) | ||
599 | { | ||
600 | return need_more_worker(gcwq) && !may_start_working(gcwq); | ||
601 | } | ||
602 | |||
603 | /* Do I need to be the manager? */ | ||
604 | static bool need_to_manage_workers(struct global_cwq *gcwq) | ||
605 | { | ||
606 | return need_to_create_worker(gcwq) || gcwq->flags & GCWQ_MANAGE_WORKERS; | ||
607 | } | ||
226 | 608 | ||
227 | new = (unsigned long) cwq | (1UL << WORK_STRUCT_PENDING); | 609 | /* Do we have too many workers and should some go away? */ |
228 | new |= WORK_STRUCT_FLAG_MASK & *work_data_bits(work); | 610 | static bool too_many_workers(struct global_cwq *gcwq) |
229 | atomic_long_set(&work->data, new); | 611 | { |
612 | bool managing = gcwq->flags & GCWQ_MANAGING_WORKERS; | ||
613 | int nr_idle = gcwq->nr_idle + managing; /* manager is considered idle */ | ||
614 | int nr_busy = gcwq->nr_workers - nr_idle; | ||
615 | |||
616 | return nr_idle > 2 && (nr_idle - 2) * MAX_IDLE_WORKERS_RATIO >= nr_busy; | ||
230 | } | 617 | } |
231 | 618 | ||
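The idle-trimming policy above allows at most two idle workers plus a quarter of the busy count before declaring the pool oversized (MAX_IDLE_WORKERS_RATIO from the enum earlier). A standalone sketch of just the arithmetic:

/* Standalone sketch of the "too many workers" arithmetic. */
#include <stdbool.h>
#include <stdio.h>

#define MAX_IDLE_WORKERS_RATIO 4

static bool too_many_workers(int nr_workers, int nr_idle)
{
	int nr_busy = nr_workers - nr_idle;

	return nr_idle > 2 && (nr_idle - 2) * MAX_IDLE_WORKERS_RATIO >= nr_busy;
}

int main(void)
{
	printf("16 workers, 4 idle -> %s\n", too_many_workers(16, 4) ? "trim" : "keep");
	printf("16 workers, 6 idle -> %s\n", too_many_workers(16, 6) ? "trim" : "keep");
	printf(" 3 workers, 3 idle -> %s\n", too_many_workers(3, 3) ? "trim" : "keep");
	return 0;
}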
232 | /* | 619 | /* |
233 | * Clear WORK_STRUCT_PENDING and the workqueue on which it was queued. | 620 | * Wake up functions. |
621 | */ | ||
622 | |||
623 | /* Return the first worker. Safe with preemption disabled */ | ||
624 | static struct worker *first_worker(struct global_cwq *gcwq) | ||
625 | { | ||
626 | if (unlikely(list_empty(&gcwq->idle_list))) | ||
627 | return NULL; | ||
628 | |||
629 | return list_first_entry(&gcwq->idle_list, struct worker, entry); | ||
630 | } | ||
631 | |||
632 | /** | ||
633 | * wake_up_worker - wake up an idle worker | ||
634 | * @gcwq: gcwq to wake worker for | ||
635 | * | ||
636 | * Wake up the first idle worker of @gcwq. | ||
637 | * | ||
638 | * CONTEXT: | ||
639 | * spin_lock_irq(gcwq->lock). | ||
640 | */ | ||
641 | static void wake_up_worker(struct global_cwq *gcwq) | ||
642 | { | ||
643 | struct worker *worker = first_worker(gcwq); | ||
644 | |||
645 | if (likely(worker)) | ||
646 | wake_up_process(worker->task); | ||
647 | } | ||
648 | |||
649 | /** | ||
650 | * wq_worker_waking_up - a worker is waking up | ||
651 | * @task: task waking up | ||
652 | * @cpu: CPU @task is waking up to | ||
653 | * | ||
654 | * This function is called during try_to_wake_up() when a worker is | ||
655 | * being awoken. | ||
656 | * | ||
657 | * CONTEXT: | ||
658 | * spin_lock_irq(rq->lock) | ||
659 | */ | ||
660 | void wq_worker_waking_up(struct task_struct *task, unsigned int cpu) | ||
661 | { | ||
662 | struct worker *worker = kthread_data(task); | ||
663 | |||
664 | if (likely(!(worker->flags & WORKER_NOT_RUNNING))) | ||
665 | atomic_inc(get_gcwq_nr_running(cpu)); | ||
666 | } | ||
667 | |||
668 | /** | ||
669 | * wq_worker_sleeping - a worker is going to sleep | ||
670 | * @task: task going to sleep | ||
671 | * @cpu: CPU in question, must be the current CPU number | ||
672 | * | ||
673 | * This function is called during schedule() when a busy worker is | ||
674 | * going to sleep. Worker on the same cpu can be woken up by | ||
675 | * returning pointer to its task. | ||
676 | * | ||
677 | * CONTEXT: | ||
678 | * spin_lock_irq(rq->lock) | ||
679 | * | ||
680 | * RETURNS: | ||
681 | * Worker task on @cpu to wake up, %NULL if none. | ||
682 | */ | ||
683 | struct task_struct *wq_worker_sleeping(struct task_struct *task, | ||
684 | unsigned int cpu) | ||
685 | { | ||
686 | struct worker *worker = kthread_data(task), *to_wakeup = NULL; | ||
687 | struct global_cwq *gcwq = get_gcwq(cpu); | ||
688 | atomic_t *nr_running = get_gcwq_nr_running(cpu); | ||
689 | |||
690 | if (unlikely(worker->flags & WORKER_NOT_RUNNING)) | ||
691 | return NULL; | ||
692 | |||
693 | /* this can only happen on the local cpu */ | ||
694 | BUG_ON(cpu != raw_smp_processor_id()); | ||
695 | |||
696 | /* | ||
697 | * The counterpart of the following dec_and_test, implied mb, | ||
698 | * worklist not empty test sequence is in insert_work(). | ||
699 | * Please read comment there. | ||
700 | * | ||
701 | * NOT_RUNNING is clear. This means that trustee is not in | ||
702 | * charge and we're running on the local cpu w/ rq lock held | ||
703 | * and preemption disabled, which in turn means that no one else | ||
704 | * could be manipulating idle_list, so dereferencing idle_list | ||
705 | * without gcwq lock is safe. | ||
706 | */ | ||
707 | if (atomic_dec_and_test(nr_running) && !list_empty(&gcwq->worklist)) | ||
708 | to_wakeup = first_worker(gcwq); | ||
709 | return to_wakeup ? to_wakeup->task : NULL; | ||
710 | } | ||
711 | |||
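wq_worker_sleeping() above is the heart of the concurrency management: when the last running worker on a CPU blocks and works are still pending, the first idle worker is handed the CPU. The standalone model below uses plain counters in place of the atomic nr_running and the gcwq lists, so it only illustrates the decision, not the locking.

/* Standalone sketch: wake an idle worker when the last runner blocks
 * while work is still pending. */
#include <stdio.h>

static int nr_running = 1;   /* workers currently on the CPU */
static int nr_pending = 3;   /* works still on the worklist */
static int nr_idle    = 2;   /* parked idle workers */

static const char *worker_sleeping(void)
{
	/* counterpart of atomic_dec_and_test(nr_running) in the hook above */
	if (--nr_running == 0 && nr_pending > 0 && nr_idle > 0) {
		nr_idle--;
		nr_running++;
		return "wake first idle worker";
	}
	return "no wakeup needed";
}

int main(void)
{
	printf("busy worker blocks: %s (running=%d idle=%d)\n",
	       worker_sleeping(), nr_running, nr_idle);
	return 0;
}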
712 | /** | ||
713 | * worker_set_flags - set worker flags and adjust nr_running accordingly | ||
714 | * @worker: self | ||
715 | * @flags: flags to set | ||
716 | * @wakeup: wakeup an idle worker if necessary | ||
717 | * | ||
718 | * Set @flags in @worker->flags and adjust nr_running accordingly. If | ||
719 | * nr_running becomes zero and @wakeup is %true, an idle worker is | ||
720 | * woken up. | ||
721 | * | ||
722 | * CONTEXT: | ||
723 | * spin_lock_irq(gcwq->lock) | ||
234 | */ | 724 | */ |
235 | static inline void clear_wq_data(struct work_struct *work) | 725 | static inline void worker_set_flags(struct worker *worker, unsigned int flags, |
726 | bool wakeup) | ||
236 | { | 727 | { |
237 | unsigned long flags = *work_data_bits(work) & | 728 | struct global_cwq *gcwq = worker->gcwq; |
238 | (1UL << WORK_STRUCT_STATIC); | 729 | |
239 | atomic_long_set(&work->data, flags); | 730 | WARN_ON_ONCE(worker->task != current); |
731 | |||
732 | /* | ||
733 | * If transitioning into NOT_RUNNING, adjust nr_running and | ||
734 | * wake up an idle worker as necessary if requested by | ||
735 | * @wakeup. | ||
736 | */ | ||
737 | if ((flags & WORKER_NOT_RUNNING) && | ||
738 | !(worker->flags & WORKER_NOT_RUNNING)) { | ||
739 | atomic_t *nr_running = get_gcwq_nr_running(gcwq->cpu); | ||
740 | |||
741 | if (wakeup) { | ||
742 | if (atomic_dec_and_test(nr_running) && | ||
743 | !list_empty(&gcwq->worklist)) | ||
744 | wake_up_worker(gcwq); | ||
745 | } else | ||
746 | atomic_dec(nr_running); | ||
747 | } | ||
748 | |||
749 | worker->flags |= flags; | ||
240 | } | 750 | } |
241 | 751 | ||
242 | static inline | 752 | /** |
243 | struct cpu_workqueue_struct *get_wq_data(struct work_struct *work) | 753 | * worker_clr_flags - clear worker flags and adjust nr_running accordingly |
754 | * @worker: self | ||
755 | * @flags: flags to clear | ||
756 | * | ||
757 | * Clear @flags in @worker->flags and adjust nr_running accordingly. | ||
758 | * | ||
759 | * CONTEXT: | ||
760 | * spin_lock_irq(gcwq->lock) | ||
761 | */ | ||
762 | static inline void worker_clr_flags(struct worker *worker, unsigned int flags) | ||
244 | { | 763 | { |
245 | return (void *) (atomic_long_read(&work->data) & WORK_STRUCT_WQ_DATA_MASK); | 764 | struct global_cwq *gcwq = worker->gcwq; |
765 | unsigned int oflags = worker->flags; | ||
766 | |||
767 | WARN_ON_ONCE(worker->task != current); | ||
768 | |||
769 | worker->flags &= ~flags; | ||
770 | |||
771 | /* if transitioning out of NOT_RUNNING, increment nr_running */ | ||
772 | if ((flags & WORKER_NOT_RUNNING) && (oflags & WORKER_NOT_RUNNING)) | ||
773 | if (!(worker->flags & WORKER_NOT_RUNNING)) | ||
774 | atomic_inc(get_gcwq_nr_running(gcwq->cpu)); | ||
246 | } | 775 | } |
247 | 776 | ||
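Note that worker_set_flags()/worker_clr_flags() only touch nr_running when a worker crosses the RUNNING/NOT_RUNNING boundary, not on every flag change. A standalone sketch of that bookkeeping (flag values are illustrative, and the wakeup path is omitted):

/* Standalone sketch of the nr_running bookkeeping on flag transitions. */
#include <stdio.h>

#define WORKER_PREP          (1u << 0)
#define WORKER_CPU_INTENSIVE (1u << 1)
#define WORKER_NOT_RUNNING   (WORKER_PREP | WORKER_CPU_INTENSIVE)

static int nr_running;

static void set_flags(unsigned int *wflags, unsigned int flags)
{
	if ((flags & WORKER_NOT_RUNNING) && !(*wflags & WORKER_NOT_RUNNING))
		nr_running--;                    /* entered NOT_RUNNING */
	*wflags |= flags;
}

static void clr_flags(unsigned int *wflags, unsigned int flags)
{
	unsigned int oflags = *wflags;

	*wflags &= ~flags;
	if ((flags & WORKER_NOT_RUNNING) && (oflags & WORKER_NOT_RUNNING) &&
	    !(*wflags & WORKER_NOT_RUNNING))
		nr_running++;                    /* left NOT_RUNNING entirely */
}

int main(void)
{
	unsigned int flags = 0;

	nr_running = 1;                              /* one running worker */
	set_flags(&flags, WORKER_CPU_INTENSIVE);     /* 1 -> 0 */
	set_flags(&flags, WORKER_PREP);              /* stays 0 */
	clr_flags(&flags, WORKER_CPU_INTENSIVE);     /* still NOT_RUNNING */
	clr_flags(&flags, WORKER_PREP);              /* 0 -> 1 */
	printf("nr_running = %d\n", nr_running);     /* prints 1 */
	return 0;
}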
777 | /** | ||
778 | * busy_worker_head - return the busy hash head for a work | ||
779 | * @gcwq: gcwq of interest | ||
780 | * @work: work to be hashed | ||
781 | * | ||
782 | * Return hash head of @gcwq for @work. | ||
783 | * | ||
784 | * CONTEXT: | ||
785 | * spin_lock_irq(gcwq->lock). | ||
786 | * | ||
787 | * RETURNS: | ||
788 | * Pointer to the hash head. | ||
789 | */ | ||
790 | static struct hlist_head *busy_worker_head(struct global_cwq *gcwq, | ||
791 | struct work_struct *work) | ||
792 | { | ||
793 | const int base_shift = ilog2(sizeof(struct work_struct)); | ||
794 | unsigned long v = (unsigned long)work; | ||
795 | |||
796 | /* simple shift and fold hash, do we need something better? */ | ||
797 | v >>= base_shift; | ||
798 | v += v >> BUSY_WORKER_HASH_ORDER; | ||
799 | v &= BUSY_WORKER_HASH_MASK; | ||
800 | |||
801 | return &gcwq->busy_hash[v]; | ||
802 | } | ||
803 | |||
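busy_worker_head() hashes the work pointer by discarding the alignment bits, folding the upper bits back in and masking down to one of the 64 buckets. A standalone userspace version of the same shift-and-fold; the base_shift argument is a stand-in for ilog2(sizeof(struct work_struct)).

/* Standalone sketch of the shift-and-fold busy-worker hash. */
#include <stdint.h>
#include <stdio.h>

#define HASH_ORDER 6
#define HASH_MASK  ((1u << HASH_ORDER) - 1)

static unsigned int busy_hash(const void *work, unsigned int base_shift)
{
	uintptr_t v = (uintptr_t)work;

	v >>= base_shift;            /* drop bits constant across all works */
	v += v >> HASH_ORDER;        /* fold higher bits into the low ones */
	return (unsigned int)(v & HASH_MASK);
}

int main(void)
{
	struct { long a, b, c, d; } works[4];        /* stand-in work structs */
	int i;

	for (i = 0; i < 4; i++)
		printf("work %d -> bucket %u\n", i, busy_hash(&works[i], 5));
	return 0;
}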
804 | /** | ||
805 | * __find_worker_executing_work - find worker which is executing a work | ||
806 | * @gcwq: gcwq of interest | ||
807 | * @bwh: hash head as returned by busy_worker_head() | ||
808 | * @work: work to find worker for | ||
809 | * | ||
810 | * Find a worker which is executing @work on @gcwq. @bwh should be | ||
811 | * the hash head obtained by calling busy_worker_head() with the same | ||
812 | * work. | ||
813 | * | ||
814 | * CONTEXT: | ||
815 | * spin_lock_irq(gcwq->lock). | ||
816 | * | ||
817 | * RETURNS: | ||
818 | * Pointer to worker which is executing @work if found, NULL | ||
819 | * otherwise. | ||
820 | */ | ||
821 | static struct worker *__find_worker_executing_work(struct global_cwq *gcwq, | ||
822 | struct hlist_head *bwh, | ||
823 | struct work_struct *work) | ||
824 | { | ||
825 | struct worker *worker; | ||
826 | struct hlist_node *tmp; | ||
827 | |||
828 | hlist_for_each_entry(worker, tmp, bwh, hentry) | ||
829 | if (worker->current_work == work) | ||
830 | return worker; | ||
831 | return NULL; | ||
832 | } | ||
833 | |||
834 | /** | ||
835 | * find_worker_executing_work - find worker which is executing a work | ||
836 | * @gcwq: gcwq of interest | ||
837 | * @work: work to find worker for | ||
838 | * | ||
839 | * Find a worker which is executing @work on @gcwq. This function is | ||
840 | * identical to __find_worker_executing_work() except that this | ||
841 | * function calculates @bwh itself. | ||
842 | * | ||
843 | * CONTEXT: | ||
844 | * spin_lock_irq(gcwq->lock). | ||
845 | * | ||
846 | * RETURNS: | ||
847 | * Pointer to worker which is executing @work if found, NULL | ||
848 | * otherwise. | ||
849 | */ | ||
850 | static struct worker *find_worker_executing_work(struct global_cwq *gcwq, | ||
851 | struct work_struct *work) | ||
852 | { | ||
853 | return __find_worker_executing_work(gcwq, busy_worker_head(gcwq, work), | ||
854 | work); | ||
855 | } | ||
856 | |||
857 | /** | ||
858 | * gcwq_determine_ins_pos - find insertion position | ||
859 | * @gcwq: gcwq of interest | ||
860 | * @cwq: cwq a work is being queued for | ||
861 | * | ||
862 | * A work for @cwq is about to be queued on @gcwq, determine insertion | ||
863 | * position for the work. If @cwq is for HIGHPRI wq, the work is | ||
864 | * queued at the head of the queue but in FIFO order with respect to | ||
865 | * other HIGHPRI works; otherwise, at the end of the queue. This | ||
866 | * function also sets GCWQ_HIGHPRI_PENDING flag to hint @gcwq that | ||
867 | * there are HIGHPRI works pending. | ||
868 | * | ||
869 | * CONTEXT: | ||
870 | * spin_lock_irq(gcwq->lock). | ||
871 | * | ||
872 | * RETURNS: | ||
873 | * Pointer to insertion position. | ||
874 | */ | ||
875 | static inline struct list_head *gcwq_determine_ins_pos(struct global_cwq *gcwq, | ||
876 | struct cpu_workqueue_struct *cwq) | ||
877 | { | ||
878 | struct work_struct *twork; | ||
879 | |||
880 | if (likely(!(cwq->wq->flags & WQ_HIGHPRI))) | ||
881 | return &gcwq->worklist; | ||
882 | |||
883 | list_for_each_entry(twork, &gcwq->worklist, entry) { | ||
884 | struct cpu_workqueue_struct *tcwq = get_work_cwq(twork); | ||
885 | |||
886 | if (!(tcwq->wq->flags & WQ_HIGHPRI)) | ||
887 | break; | ||
888 | } | ||
889 | |||
890 | gcwq->flags |= GCWQ_HIGHPRI_PENDING; | ||
891 | return &twork->entry; | ||
892 | } | ||
893 | |||
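gcwq_determine_ins_pos() places a HIGHPRI work ahead of all normal works while preserving FIFO order among the HIGHPRI works themselves. The standalone sketch below reproduces that ordering rule with a flat array standing in for the gcwq worklist (the kernel of course uses the linked list and sets GCWQ_HIGHPRI_PENDING as well).

/* Standalone sketch: HIGHPRI works jump ahead of normal works but stay
 * FIFO among themselves. */
#include <stdio.h>
#include <string.h>

#define MAX 16

static struct { char name[8]; int highpri; } queue[MAX];
static int len;

static void enqueue(const char *name, int highpri)
{
	int pos = len;

	if (highpri) {
		/* insertion point: after the existing run of HIGHPRI works */
		for (pos = 0; pos < len && queue[pos].highpri; pos++)
			;
		memmove(&queue[pos + 1], &queue[pos],
			(size_t)(len - pos) * sizeof(queue[0]));
	}
	snprintf(queue[pos].name, sizeof(queue[pos].name), "%s", name);
	queue[pos].highpri = highpri;
	len++;
}

int main(void)
{
	int i;

	enqueue("n1", 0);
	enqueue("n2", 0);
	enqueue("h1", 1);          /* goes ahead of n1/n2 */
	enqueue("h2", 1);          /* behind h1, still ahead of n1/n2 */
	enqueue("n3", 0);

	for (i = 0; i < len; i++)
		printf("%s%s ", queue[i].name, queue[i].highpri ? "*" : "");
	printf("\n");              /* h1* h2* n1 n2 n3 */
	return 0;
}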
894 | /** | ||
895 | * insert_work - insert a work into gcwq | ||
896 | * @cwq: cwq @work belongs to | ||
897 | * @work: work to insert | ||
898 | * @head: insertion point | ||
899 | * @extra_flags: extra WORK_STRUCT_* flags to set | ||
900 | * | ||
901 | * Insert @work which belongs to @cwq into @gcwq after @head. | ||
902 | * @extra_flags is or'd to work_struct flags. | ||
903 | * | ||
904 | * CONTEXT: | ||
905 | * spin_lock_irq(gcwq->lock). | ||
906 | */ | ||
248 | static void insert_work(struct cpu_workqueue_struct *cwq, | 907 | static void insert_work(struct cpu_workqueue_struct *cwq, |
249 | struct work_struct *work, struct list_head *head) | 908 | struct work_struct *work, struct list_head *head, |
909 | unsigned int extra_flags) | ||
250 | { | 910 | { |
251 | trace_workqueue_insertion(cwq->thread, work); | 911 | struct global_cwq *gcwq = cwq->gcwq; |
912 | |||
913 | /* we own @work, set data and link */ | ||
914 | set_work_cwq(work, cwq, extra_flags); | ||
252 | 915 | ||
253 | set_wq_data(work, cwq); | ||
254 | /* | 916 | /* |
255 | * Ensure that we get the right work->data if we see the | 917 | * Ensure that we get the right work->data if we see the |
256 | * result of list_add() below, see try_to_grab_pending(). | 918 | * result of list_add() below, see try_to_grab_pending(). |
257 | */ | 919 | */ |
258 | smp_wmb(); | 920 | smp_wmb(); |
921 | |||
259 | list_add_tail(&work->entry, head); | 922 | list_add_tail(&work->entry, head); |
260 | wake_up(&cwq->more_work); | 923 | |
924 | /* | ||
925 | * Ensure either worker_sched_deactivated() sees the above | ||
926 | * list_add_tail() or we see zero nr_running to avoid workers | ||
927 | * lying around lazily while there are works to be processed. | ||
928 | */ | ||
929 | smp_mb(); | ||
930 | |||
931 | if (__need_more_worker(gcwq)) | ||
932 | wake_up_worker(gcwq); | ||
261 | } | 933 | } |
262 | 934 | ||
263 | static void __queue_work(struct cpu_workqueue_struct *cwq, | 935 | static void __queue_work(unsigned int cpu, struct workqueue_struct *wq, |
264 | struct work_struct *work) | 936 | struct work_struct *work) |
265 | { | 937 | { |
938 | struct global_cwq *gcwq; | ||
939 | struct cpu_workqueue_struct *cwq; | ||
940 | struct list_head *worklist; | ||
941 | unsigned int work_flags; | ||
266 | unsigned long flags; | 942 | unsigned long flags; |
267 | 943 | ||
268 | debug_work_activate(work); | 944 | debug_work_activate(work); |
269 | spin_lock_irqsave(&cwq->lock, flags); | 945 | |
270 | insert_work(cwq, work, &cwq->worklist); | 946 | if (WARN_ON_ONCE(wq->flags & WQ_DYING)) |
271 | spin_unlock_irqrestore(&cwq->lock, flags); | 947 | return; |
948 | |||
949 | /* determine gcwq to use */ | ||
950 | if (!(wq->flags & WQ_UNBOUND)) { | ||
951 | struct global_cwq *last_gcwq; | ||
952 | |||
953 | if (unlikely(cpu == WORK_CPU_UNBOUND)) | ||
954 | cpu = raw_smp_processor_id(); | ||
955 | |||
956 | /* | ||
957 | * It's multi cpu. If @wq is non-reentrant and @work | ||
958 | * was previously on a different cpu, it might still | ||
959 | * be running there, in which case the work needs to | ||
960 | * be queued on that cpu to guarantee non-reentrance. | ||
961 | */ | ||
962 | gcwq = get_gcwq(cpu); | ||
963 | if (wq->flags & WQ_NON_REENTRANT && | ||
964 | (last_gcwq = get_work_gcwq(work)) && last_gcwq != gcwq) { | ||
965 | struct worker *worker; | ||
966 | |||
967 | spin_lock_irqsave(&last_gcwq->lock, flags); | ||
968 | |||
969 | worker = find_worker_executing_work(last_gcwq, work); | ||
970 | |||
971 | if (worker && worker->current_cwq->wq == wq) | ||
972 | gcwq = last_gcwq; | ||
973 | else { | ||
974 | /* meh... not running there, queue here */ | ||
975 | spin_unlock_irqrestore(&last_gcwq->lock, flags); | ||
976 | spin_lock_irqsave(&gcwq->lock, flags); | ||
977 | } | ||
978 | } else | ||
979 | spin_lock_irqsave(&gcwq->lock, flags); | ||
980 | } else { | ||
981 | gcwq = get_gcwq(WORK_CPU_UNBOUND); | ||
982 | spin_lock_irqsave(&gcwq->lock, flags); | ||
983 | } | ||
984 | |||
985 | /* gcwq determined, get cwq and queue */ | ||
986 | cwq = get_cwq(gcwq->cpu, wq); | ||
987 | trace_workqueue_queue_work(cpu, cwq, work); | ||
988 | |||
989 | BUG_ON(!list_empty(&work->entry)); | ||
990 | |||
991 | cwq->nr_in_flight[cwq->work_color]++; | ||
992 | work_flags = work_color_to_flags(cwq->work_color); | ||
993 | |||
994 | if (likely(cwq->nr_active < cwq->max_active)) { | ||
995 | trace_workqueue_activate_work(work); | ||
996 | cwq->nr_active++; | ||
997 | worklist = gcwq_determine_ins_pos(gcwq, cwq); | ||
998 | } else { | ||
999 | work_flags |= WORK_STRUCT_DELAYED; | ||
1000 | worklist = &cwq->delayed_works; | ||
1001 | } | ||
1002 | |||
1003 | insert_work(cwq, work, worklist, work_flags); | ||
1004 | |||
1005 | spin_unlock_irqrestore(&gcwq->lock, flags); | ||
272 | } | 1006 | } |
273 | 1007 | ||
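One detail of __queue_work() worth calling out is the max_active throttle: a work only goes straight onto the gcwq worklist while cwq->nr_active is below cwq->max_active; otherwise it is marked WORK_STRUCT_DELAYED and parked on the per-cwq delayed list. A minimal standalone model of that branch (counters replace the real lists and locking):

/* Standalone sketch of the max_active throttle in __queue_work(). */
#include <stdio.h>

#define MAX_ACTIVE 2

static int nr_active;
static int nr_delayed;

static void queue_work(int id)
{
	if (nr_active < MAX_ACTIVE) {
		nr_active++;
		printf("work %d -> gcwq worklist (nr_active=%d)\n", id, nr_active);
	} else {
		nr_delayed++;
		printf("work %d -> delayed_works (nr_delayed=%d)\n", id, nr_delayed);
	}
}

int main(void)
{
	int id;

	for (id = 1; id <= 4; id++)
		queue_work(id);
	return 0;
}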
274 | /** | 1008 | /** |
@@ -308,9 +1042,8 @@ queue_work_on(int cpu, struct workqueue_struct *wq, struct work_struct *work) | |||
308 | { | 1042 | { |
309 | int ret = 0; | 1043 | int ret = 0; |
310 | 1044 | ||
311 | if (!test_and_set_bit(WORK_STRUCT_PENDING, work_data_bits(work))) { | 1045 | if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work))) { |
312 | BUG_ON(!list_empty(&work->entry)); | 1046 | __queue_work(cpu, wq, work); |
313 | __queue_work(wq_per_cpu(wq, cpu), work); | ||
314 | ret = 1; | 1047 | ret = 1; |
315 | } | 1048 | } |
316 | return ret; | 1049 | return ret; |
@@ -320,10 +1053,9 @@ EXPORT_SYMBOL_GPL(queue_work_on); | |||
320 | static void delayed_work_timer_fn(unsigned long __data) | 1053 | static void delayed_work_timer_fn(unsigned long __data) |
321 | { | 1054 | { |
322 | struct delayed_work *dwork = (struct delayed_work *)__data; | 1055 | struct delayed_work *dwork = (struct delayed_work *)__data; |
323 | struct cpu_workqueue_struct *cwq = get_wq_data(&dwork->work); | 1056 | struct cpu_workqueue_struct *cwq = get_work_cwq(&dwork->work); |
324 | struct workqueue_struct *wq = cwq->wq; | ||
325 | 1057 | ||
326 | __queue_work(wq_per_cpu(wq, smp_processor_id()), &dwork->work); | 1058 | __queue_work(smp_processor_id(), cwq->wq, &dwork->work); |
327 | } | 1059 | } |
328 | 1060 | ||
329 | /** | 1061 | /** |
@@ -360,14 +1092,31 @@ int queue_delayed_work_on(int cpu, struct workqueue_struct *wq, | |||
360 | struct timer_list *timer = &dwork->timer; | 1092 | struct timer_list *timer = &dwork->timer; |
361 | struct work_struct *work = &dwork->work; | 1093 | struct work_struct *work = &dwork->work; |
362 | 1094 | ||
363 | if (!test_and_set_bit(WORK_STRUCT_PENDING, work_data_bits(work))) { | 1095 | if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work))) { |
1096 | unsigned int lcpu; | ||
1097 | |||
364 | BUG_ON(timer_pending(timer)); | 1098 | BUG_ON(timer_pending(timer)); |
365 | BUG_ON(!list_empty(&work->entry)); | 1099 | BUG_ON(!list_empty(&work->entry)); |
366 | 1100 | ||
367 | timer_stats_timer_set_start_info(&dwork->timer); | 1101 | timer_stats_timer_set_start_info(&dwork->timer); |
368 | 1102 | ||
369 | /* This stores cwq for the moment, for the timer_fn */ | 1103 | /* |
370 | set_wq_data(work, wq_per_cpu(wq, raw_smp_processor_id())); | 1104 | * This stores cwq for the moment, for the timer_fn. |
1105 | * Note that the work's gcwq is preserved to allow | ||
1106 | * reentrance detection for delayed works. | ||
1107 | */ | ||
1108 | if (!(wq->flags & WQ_UNBOUND)) { | ||
1109 | struct global_cwq *gcwq = get_work_gcwq(work); | ||
1110 | |||
1111 | if (gcwq && gcwq->cpu != WORK_CPU_UNBOUND) | ||
1112 | lcpu = gcwq->cpu; | ||
1113 | else | ||
1114 | lcpu = raw_smp_processor_id(); | ||
1115 | } else | ||
1116 | lcpu = WORK_CPU_UNBOUND; | ||
1117 | |||
1118 | set_work_cwq(work, get_cwq(lcpu, wq), 0); | ||
1119 | |||
371 | timer->expires = jiffies + delay; | 1120 | timer->expires = jiffies + delay; |
372 | timer->data = (unsigned long)dwork; | 1121 | timer->data = (unsigned long)dwork; |
373 | timer->function = delayed_work_timer_fn; | 1122 | timer->function = delayed_work_timer_fn; |
@@ -382,80 +1131,889 @@ int queue_delayed_work_on(int cpu, struct workqueue_struct *wq, | |||
382 | } | 1131 | } |
383 | EXPORT_SYMBOL_GPL(queue_delayed_work_on); | 1132 | EXPORT_SYMBOL_GPL(queue_delayed_work_on); |
384 | 1133 | ||
385 | static void run_workqueue(struct cpu_workqueue_struct *cwq) | 1134 | /** |
1135 | * worker_enter_idle - enter idle state | ||
1136 | * @worker: worker which is entering idle state | ||
1137 | * | ||
1138 | * @worker is entering idle state. Update stats and idle timer if | ||
1139 | * necessary. | ||
1140 | * | ||
1141 | * LOCKING: | ||
1142 | * spin_lock_irq(gcwq->lock). | ||
1143 | */ | ||
1144 | static void worker_enter_idle(struct worker *worker) | ||
386 | { | 1145 | { |
387 | spin_lock_irq(&cwq->lock); | 1146 | struct global_cwq *gcwq = worker->gcwq; |
388 | while (!list_empty(&cwq->worklist)) { | 1147 | |
389 | struct work_struct *work = list_entry(cwq->worklist.next, | 1148 | BUG_ON(worker->flags & WORKER_IDLE); |
390 | struct work_struct, entry); | 1149 | BUG_ON(!list_empty(&worker->entry) && |
391 | work_func_t f = work->func; | 1150 | (worker->hentry.next || worker->hentry.pprev)); |
392 | #ifdef CONFIG_LOCKDEP | 1151 | |
1152 | /* can't use worker_set_flags(), also called from start_worker() */ | ||
1153 | worker->flags |= WORKER_IDLE; | ||
1154 | gcwq->nr_idle++; | ||
1155 | worker->last_active = jiffies; | ||
1156 | |||
1157 | /* idle_list is LIFO */ | ||
1158 | list_add(&worker->entry, &gcwq->idle_list); | ||
1159 | |||
1160 | if (likely(!(worker->flags & WORKER_ROGUE))) { | ||
1161 | if (too_many_workers(gcwq) && !timer_pending(&gcwq->idle_timer)) | ||
1162 | mod_timer(&gcwq->idle_timer, | ||
1163 | jiffies + IDLE_WORKER_TIMEOUT); | ||
1164 | } else | ||
1165 | wake_up_all(&gcwq->trustee_wait); | ||
1166 | |||
1167 | /* sanity check nr_running */ | ||
1168 | WARN_ON_ONCE(gcwq->nr_workers == gcwq->nr_idle && | ||
1169 | atomic_read(get_gcwq_nr_running(gcwq->cpu))); | ||
1170 | } | ||
1171 | |||
1172 | /** | ||
1173 | * worker_leave_idle - leave idle state | ||
1174 | * @worker: worker which is leaving idle state | ||
1175 | * | ||
1176 | * @worker is leaving idle state. Update stats. | ||
1177 | * | ||
1178 | * LOCKING: | ||
1179 | * spin_lock_irq(gcwq->lock). | ||
1180 | */ | ||
1181 | static void worker_leave_idle(struct worker *worker) | ||
1182 | { | ||
1183 | struct global_cwq *gcwq = worker->gcwq; | ||
1184 | |||
1185 | BUG_ON(!(worker->flags & WORKER_IDLE)); | ||
1186 | worker_clr_flags(worker, WORKER_IDLE); | ||
1187 | gcwq->nr_idle--; | ||
1188 | list_del_init(&worker->entry); | ||
1189 | } | ||
1190 | |||
1191 | /** | ||
1192 | * worker_maybe_bind_and_lock - bind worker to its cpu if possible and lock gcwq | ||
1193 | * @worker: self | ||
1194 | * | ||
1195 | * Works which are scheduled while the cpu is online must at least be | ||
1196 | * scheduled to a worker which is bound to the cpu so that if they are | ||
1197 | * flushed from cpu callbacks while cpu is going down, they are | ||
1198 | * guaranteed to execute on the cpu. | ||
1199 | * | ||
1200 | * This function is to be used by rogue workers and rescuers to bind | ||
1201 | * themselves to the target cpu and may race with cpu going down or | ||
1202 | * coming online. kthread_bind() can't be used because it may put the | ||
1203 | * worker to already dead cpu and set_cpus_allowed_ptr() can't be used | ||
1204 | * verbatim as it's best effort and blocking and gcwq may be | ||
1205 | * [dis]associated in the meantime. | ||
1206 | * | ||
1207 | * This function tries set_cpus_allowed() and locks gcwq and verifies | ||
1208 | * the binding against GCWQ_DISASSOCIATED which is set during | ||
1209 | * CPU_DYING and cleared during CPU_ONLINE, so if the worker enters | ||
1210 | * idle state or fetches works without dropping lock, it can guarantee | ||
1211 | * the scheduling requirement described in the first paragraph. | ||
1212 | * | ||
1213 | * CONTEXT: | ||
1214 | * Might sleep. Called without any lock but returns with gcwq->lock | ||
1215 | * held. | ||
1216 | * | ||
1217 | * RETURNS: | ||
1218 | * %true if the associated gcwq is online (@worker is successfully | ||
1219 | * bound), %false if offline. | ||
1220 | */ | ||
1221 | static bool worker_maybe_bind_and_lock(struct worker *worker) | ||
1222 | __acquires(&gcwq->lock) | ||
1223 | { | ||
1224 | struct global_cwq *gcwq = worker->gcwq; | ||
1225 | struct task_struct *task = worker->task; | ||
1226 | |||
1227 | while (true) { | ||
393 | /* | 1228 | /* |
394 | * It is permissible to free the struct work_struct | 1229 | * The following call may fail, succeed or succeed |
395 | * from inside the function that is called from it, | 1230 | * without actually migrating the task to the cpu if |
396 | * this we need to take into account for lockdep too. | 1231 | * it races with cpu hotunplug operation. Verify |
397 | * To avoid bogus "held lock freed" warnings as well | 1232 | * against GCWQ_DISASSOCIATED. |
398 | * as problems when looking into work->lockdep_map, | ||
399 | * make a copy and use that here. | ||
400 | */ | 1233 | */ |
401 | struct lockdep_map lockdep_map = work->lockdep_map; | 1234 | if (!(gcwq->flags & GCWQ_DISASSOCIATED)) |
402 | #endif | 1235 | set_cpus_allowed_ptr(task, get_cpu_mask(gcwq->cpu)); |
403 | trace_workqueue_execution(cwq->thread, work); | 1236 | |
404 | debug_work_deactivate(work); | 1237 | spin_lock_irq(&gcwq->lock); |
405 | cwq->current_work = work; | 1238 | if (gcwq->flags & GCWQ_DISASSOCIATED) |
406 | list_del_init(cwq->worklist.next); | 1239 | return false; |
407 | spin_unlock_irq(&cwq->lock); | 1240 | if (task_cpu(task) == gcwq->cpu && |
408 | 1241 | cpumask_equal(¤t->cpus_allowed, | |
409 | BUG_ON(get_wq_data(work) != cwq); | 1242 | get_cpu_mask(gcwq->cpu))) |
410 | work_clear_pending(work); | 1243 | return true; |
411 | lock_map_acquire(&cwq->wq->lockdep_map); | 1244 | spin_unlock_irq(&gcwq->lock); |
412 | lock_map_acquire(&lockdep_map); | 1245 | |
413 | f(work); | 1246 | /* CPU has come up inbetween, retry migration */ |
414 | lock_map_release(&lockdep_map); | 1247 | cpu_relax(); |
415 | lock_map_release(&cwq->wq->lockdep_map); | 1248 | } |
416 | 1249 | } | |
417 | if (unlikely(in_atomic() || lockdep_depth(current) > 0)) { | 1250 | |
418 | printk(KERN_ERR "BUG: workqueue leaked lock or atomic: " | 1251 | /* |
419 | "%s/0x%08x/%d\n", | 1252 | * Function for worker->rebind_work used to rebind rogue busy workers |
420 | current->comm, preempt_count(), | 1253 | * to the associated cpu which is coming back online. This is |
421 | task_pid_nr(current)); | 1254 | * scheduled by cpu up but can race with other cpu hotplug operations |
422 | printk(KERN_ERR " last function: "); | 1255 | * and may be executed twice without intervening cpu down. |
423 | print_symbol("%s\n", (unsigned long)f); | 1256 | */ |
424 | debug_show_held_locks(current); | 1257 | static void worker_rebind_fn(struct work_struct *work) |
425 | dump_stack(); | 1258 | { |
1259 | struct worker *worker = container_of(work, struct worker, rebind_work); | ||
1260 | struct global_cwq *gcwq = worker->gcwq; | ||
1261 | |||
1262 | if (worker_maybe_bind_and_lock(worker)) | ||
1263 | worker_clr_flags(worker, WORKER_REBIND); | ||
1264 | |||
1265 | spin_unlock_irq(&gcwq->lock); | ||
1266 | } | ||
1267 | |||
1268 | static struct worker *alloc_worker(void) | ||
1269 | { | ||
1270 | struct worker *worker; | ||
1271 | |||
1272 | worker = kzalloc(sizeof(*worker), GFP_KERNEL); | ||
1273 | if (worker) { | ||
1274 | INIT_LIST_HEAD(&worker->entry); | ||
1275 | INIT_LIST_HEAD(&worker->scheduled); | ||
1276 | INIT_WORK(&worker->rebind_work, worker_rebind_fn); | ||
1277 | /* on creation a worker is in !idle && prep state */ | ||
1278 | worker->flags = WORKER_PREP; | ||
1279 | } | ||
1280 | return worker; | ||
1281 | } | ||
1282 | |||
1283 | /** | ||
1284 | * create_worker - create a new workqueue worker | ||
1285 | * @gcwq: gcwq the new worker will belong to | ||
1286 | * @bind: whether to set affinity to @cpu or not | ||
1287 | * | ||
1288 | * Create a new worker which is bound to @gcwq. The returned worker | ||
1289 | * can be started by calling start_worker() or destroyed using | ||
1290 | * destroy_worker(). | ||
1291 | * | ||
1292 | * CONTEXT: | ||
1293 | * Might sleep. Does GFP_KERNEL allocations. | ||
1294 | * | ||
1295 | * RETURNS: | ||
1296 | * Pointer to the newly created worker. | ||
1297 | */ | ||
1298 | static struct worker *create_worker(struct global_cwq *gcwq, bool bind) | ||
1299 | { | ||
1300 | bool on_unbound_cpu = gcwq->cpu == WORK_CPU_UNBOUND; | ||
1301 | struct worker *worker = NULL; | ||
1302 | int id = -1; | ||
1303 | |||
1304 | spin_lock_irq(&gcwq->lock); | ||
1305 | while (ida_get_new(&gcwq->worker_ida, &id)) { | ||
1306 | spin_unlock_irq(&gcwq->lock); | ||
1307 | if (!ida_pre_get(&gcwq->worker_ida, GFP_KERNEL)) | ||
1308 | goto fail; | ||
1309 | spin_lock_irq(&gcwq->lock); | ||
1310 | } | ||
1311 | spin_unlock_irq(&gcwq->lock); | ||
1312 | |||
1313 | worker = alloc_worker(); | ||
1314 | if (!worker) | ||
1315 | goto fail; | ||
1316 | |||
1317 | worker->gcwq = gcwq; | ||
1318 | worker->id = id; | ||
1319 | |||
1320 | if (!on_unbound_cpu) | ||
1321 | worker->task = kthread_create(worker_thread, worker, | ||
1322 | "kworker/%u:%d", gcwq->cpu, id); | ||
1323 | else | ||
1324 | worker->task = kthread_create(worker_thread, worker, | ||
1325 | "kworker/u:%d", id); | ||
1326 | if (IS_ERR(worker->task)) | ||
1327 | goto fail; | ||
1328 | |||
1329 | /* | ||
1330 | * A rogue worker will become a regular one if CPU comes | ||
1331 | * online later on. Make sure every worker has | ||
1332 | * PF_THREAD_BOUND set. | ||
1333 | */ | ||
1334 | if (bind && !on_unbound_cpu) | ||
1335 | kthread_bind(worker->task, gcwq->cpu); | ||
1336 | else { | ||
1337 | worker->task->flags |= PF_THREAD_BOUND; | ||
1338 | if (on_unbound_cpu) | ||
1339 | worker->flags |= WORKER_UNBOUND; | ||
1340 | } | ||
1341 | |||
1342 | return worker; | ||
1343 | fail: | ||
1344 | if (id >= 0) { | ||
1345 | spin_lock_irq(&gcwq->lock); | ||
1346 | ida_remove(&gcwq->worker_ida, id); | ||
1347 | spin_unlock_irq(&gcwq->lock); | ||
1348 | } | ||
1349 | kfree(worker); | ||
1350 | return NULL; | ||
1351 | } | ||
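
The worker-ID allocation above follows the classic ida_pre_get()/ida_get_new() retry dance: preload outside the lock, allocate under it, and retry on -EAGAIN. A minimal sketch of the same pattern against a hypothetical driver-private ID space (names are illustrative, not from this patch):

#include <linux/gfp.h>
#include <linux/idr.h>
#include <linux/spinlock.h>

static DEFINE_IDA(example_ida);		/* hypothetical ID space */
static DEFINE_SPINLOCK(example_lock);	/* hypothetical lock guarding it */

static int example_alloc_id(void)
{
	int id, ret;

	do {
		/* may sleep with GFP_KERNEL; must run outside the spinlock */
		if (!ida_pre_get(&example_ida, GFP_KERNEL))
			return -ENOMEM;
		spin_lock(&example_lock);
		ret = ida_get_new(&example_ida, &id);
		spin_unlock(&example_lock);
	} while (ret == -EAGAIN);

	return ret ? ret : id;
}
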
1352 | |||
1353 | /** | ||
1354 | * start_worker - start a newly created worker | ||
1355 | * @worker: worker to start | ||
1356 | * | ||
1357 | * Make the gcwq aware of @worker and start it. | ||
1358 | * | ||
1359 | * CONTEXT: | ||
1360 | * spin_lock_irq(gcwq->lock). | ||
1361 | */ | ||
1362 | static void start_worker(struct worker *worker) | ||
1363 | { | ||
1364 | worker->flags |= WORKER_STARTED; | ||
1365 | worker->gcwq->nr_workers++; | ||
1366 | worker_enter_idle(worker); | ||
1367 | wake_up_process(worker->task); | ||
1368 | } | ||
1369 | |||
1370 | /** | ||
1371 | * destroy_worker - destroy a workqueue worker | ||
1372 | * @worker: worker to be destroyed | ||
1373 | * | ||
1374 | * Destroy @worker and adjust @gcwq stats accordingly. | ||
1375 | * | ||
1376 | * CONTEXT: | ||
1377 | * spin_lock_irq(gcwq->lock) which is released and regrabbed. | ||
1378 | */ | ||
1379 | static void destroy_worker(struct worker *worker) | ||
1380 | { | ||
1381 | struct global_cwq *gcwq = worker->gcwq; | ||
1382 | int id = worker->id; | ||
1383 | |||
1384 | /* sanity check frenzy */ | ||
1385 | BUG_ON(worker->current_work); | ||
1386 | BUG_ON(!list_empty(&worker->scheduled)); | ||
1387 | |||
1388 | if (worker->flags & WORKER_STARTED) | ||
1389 | gcwq->nr_workers--; | ||
1390 | if (worker->flags & WORKER_IDLE) | ||
1391 | gcwq->nr_idle--; | ||
1392 | |||
1393 | list_del_init(&worker->entry); | ||
1394 | worker->flags |= WORKER_DIE; | ||
1395 | |||
1396 | spin_unlock_irq(&gcwq->lock); | ||
1397 | |||
1398 | kthread_stop(worker->task); | ||
1399 | kfree(worker); | ||
1400 | |||
1401 | spin_lock_irq(&gcwq->lock); | ||
1402 | ida_remove(&gcwq->worker_ida, id); | ||
1403 | } | ||
1404 | |||
1405 | static void idle_worker_timeout(unsigned long __gcwq) | ||
1406 | { | ||
1407 | struct global_cwq *gcwq = (void *)__gcwq; | ||
1408 | |||
1409 | spin_lock_irq(&gcwq->lock); | ||
1410 | |||
1411 | if (too_many_workers(gcwq)) { | ||
1412 | struct worker *worker; | ||
1413 | unsigned long expires; | ||
1414 | |||
1415 | /* idle_list is kept in LIFO order, check the last one */ | ||
1416 | worker = list_entry(gcwq->idle_list.prev, struct worker, entry); | ||
1417 | expires = worker->last_active + IDLE_WORKER_TIMEOUT; | ||
1418 | |||
1419 | if (time_before(jiffies, expires)) | ||
1420 | mod_timer(&gcwq->idle_timer, expires); | ||
1421 | else { | ||
1422 | /* it's been idle for too long, wake up manager */ | ||
1423 | gcwq->flags |= GCWQ_MANAGE_WORKERS; | ||
1424 | wake_up_worker(gcwq); | ||
426 | } | 1425 | } |
1426 | } | ||
1427 | |||
1428 | spin_unlock_irq(&gcwq->lock); | ||
1429 | } | ||
1430 | |||
1431 | static bool send_mayday(struct work_struct *work) | ||
1432 | { | ||
1433 | struct cpu_workqueue_struct *cwq = get_work_cwq(work); | ||
1434 | struct workqueue_struct *wq = cwq->wq; | ||
1435 | unsigned int cpu; | ||
1436 | |||
1437 | if (!(wq->flags & WQ_RESCUER)) | ||
1438 | return false; | ||
1439 | |||
1440 | /* mayday mayday mayday */ | ||
1441 | cpu = cwq->gcwq->cpu; | ||
1442 | /* WORK_CPU_UNBOUND can't be set in cpumask, use cpu 0 instead */ | ||
1443 | if (cpu == WORK_CPU_UNBOUND) | ||
1444 | cpu = 0; | ||
1445 | if (!mayday_test_and_set_cpu(cpu, wq->mayday_mask)) | ||
1446 | wake_up_process(wq->rescuer->task); | ||
1447 | return true; | ||
1448 | } | ||
1449 | |||
1450 | static void gcwq_mayday_timeout(unsigned long __gcwq) | ||
1451 | { | ||
1452 | struct global_cwq *gcwq = (void *)__gcwq; | ||
1453 | struct work_struct *work; | ||
1454 | |||
1455 | spin_lock_irq(&gcwq->lock); | ||
427 | 1456 | ||
428 | spin_lock_irq(&cwq->lock); | 1457 | if (need_to_create_worker(gcwq)) { |
429 | cwq->current_work = NULL; | 1458 | /* |
1459 | * We've been trying to create a new worker but | ||
1460 | * haven't been successful. We might be hitting an | ||
1461 | * allocation deadlock. Send distress signals to | ||
1462 | * rescuers. | ||
1463 | */ | ||
1464 | list_for_each_entry(work, &gcwq->worklist, entry) | ||
1465 | send_mayday(work); | ||
430 | } | 1466 | } |
431 | spin_unlock_irq(&cwq->lock); | 1467 | |
1468 | spin_unlock_irq(&gcwq->lock); | ||
1469 | |||
1470 | mod_timer(&gcwq->mayday_timer, jiffies + MAYDAY_INTERVAL); | ||
432 | } | 1471 | } |
433 | 1472 | ||
434 | static int worker_thread(void *__cwq) | 1473 | /** |
1474 | * maybe_create_worker - create a new worker if necessary | ||
1475 | * @gcwq: gcwq to create a new worker for | ||
1476 | * | ||
1477 | * Create a new worker for @gcwq if necessary. @gcwq is guaranteed to | ||
1478 | * have at least one idle worker on return from this function. If | ||
1479 | * creating a new worker takes longer than MAYDAY_INTERVAL, mayday is | ||
1480 | * sent to all rescuers with works scheduled on @gcwq to resolve | ||
1481 | * possible allocation deadlock. | ||
1482 | * | ||
1483 | * On return, need_to_create_worker() is guaranteed to be false and | ||
1484 | * may_start_working() true. | ||
1485 | * | ||
1486 | * LOCKING: | ||
1487 | * spin_lock_irq(gcwq->lock) which may be released and regrabbed | ||
1488 | * multiple times. Does GFP_KERNEL allocations. Called only from | ||
1489 | * manager. | ||
1490 | * | ||
1491 | * RETURNS: | ||
1492 | * false if no action was taken and gcwq->lock stayed locked, true | ||
1493 | * otherwise. | ||
1494 | */ | ||
1495 | static bool maybe_create_worker(struct global_cwq *gcwq) | ||
1496 | __releases(&gcwq->lock) | ||
1497 | __acquires(&gcwq->lock) | ||
435 | { | 1498 | { |
436 | struct cpu_workqueue_struct *cwq = __cwq; | 1499 | if (!need_to_create_worker(gcwq)) |
437 | DEFINE_WAIT(wait); | 1500 | return false; |
1501 | restart: | ||
1502 | spin_unlock_irq(&gcwq->lock); | ||
1503 | |||
1504 | /* if we don't make progress in MAYDAY_INITIAL_TIMEOUT, call for help */ | ||
1505 | mod_timer(&gcwq->mayday_timer, jiffies + MAYDAY_INITIAL_TIMEOUT); | ||
1506 | |||
1507 | while (true) { | ||
1508 | struct worker *worker; | ||
1509 | |||
1510 | worker = create_worker(gcwq, true); | ||
1511 | if (worker) { | ||
1512 | del_timer_sync(&gcwq->mayday_timer); | ||
1513 | spin_lock_irq(&gcwq->lock); | ||
1514 | start_worker(worker); | ||
1515 | BUG_ON(need_to_create_worker(gcwq)); | ||
1516 | return true; | ||
1517 | } | ||
1518 | |||
1519 | if (!need_to_create_worker(gcwq)) | ||
1520 | break; | ||
1521 | |||
1522 | __set_current_state(TASK_INTERRUPTIBLE); | ||
1523 | schedule_timeout(CREATE_COOLDOWN); | ||
1524 | |||
1525 | if (!need_to_create_worker(gcwq)) | ||
1526 | break; | ||
1527 | } | ||
1528 | |||
1529 | del_timer_sync(&gcwq->mayday_timer); | ||
1530 | spin_lock_irq(&gcwq->lock); | ||
1531 | if (need_to_create_worker(gcwq)) | ||
1532 | goto restart; | ||
1533 | return true; | ||
1534 | } | ||
438 | 1535 | ||
439 | if (cwq->wq->freezeable) | 1536 | /** |
440 | set_freezable(); | 1537 | * maybe_destroy_worker - destroy workers which have been idle for a while |
1538 | * @gcwq: gcwq to destroy workers for | ||
1539 | * | ||
1540 | * Destroy @gcwq workers which have been idle for longer than | ||
1541 | * IDLE_WORKER_TIMEOUT. | ||
1542 | * | ||
1543 | * LOCKING: | ||
1544 | * spin_lock_irq(gcwq->lock) which may be released and regrabbed | ||
1545 | * multiple times. Called only from manager. | ||
1546 | * | ||
1547 | * RETURNS: | ||
1548 | * false if no action was taken and gcwq->lock stayed locked, true | ||
1549 | * otherwise. | ||
1550 | */ | ||
1551 | static bool maybe_destroy_workers(struct global_cwq *gcwq) | ||
1552 | { | ||
1553 | bool ret = false; | ||
441 | 1554 | ||
442 | for (;;) { | 1555 | while (too_many_workers(gcwq)) { |
443 | prepare_to_wait(&cwq->more_work, &wait, TASK_INTERRUPTIBLE); | 1556 | struct worker *worker; |
444 | if (!freezing(current) && | 1557 | unsigned long expires; |
445 | !kthread_should_stop() && | ||
446 | list_empty(&cwq->worklist)) | ||
447 | schedule(); | ||
448 | finish_wait(&cwq->more_work, &wait); | ||
449 | 1558 | ||
450 | try_to_freeze(); | 1559 | worker = list_entry(gcwq->idle_list.prev, struct worker, entry); |
1560 | expires = worker->last_active + IDLE_WORKER_TIMEOUT; | ||
451 | 1561 | ||
452 | if (kthread_should_stop()) | 1562 | if (time_before(jiffies, expires)) { |
1563 | mod_timer(&gcwq->idle_timer, expires); | ||
453 | break; | 1564 | break; |
1565 | } | ||
454 | 1566 | ||
455 | run_workqueue(cwq); | 1567 | destroy_worker(worker); |
1568 | ret = true; | ||
456 | } | 1569 | } |
457 | 1570 | ||
458 | return 0; | 1571 | return ret; |
1572 | } | ||
1573 | |||
1574 | /** | ||
1575 | * manage_workers - manage worker pool | ||
1576 | * @worker: self | ||
1577 | * | ||
1578 | * Assume the manager role and manage gcwq worker pool @worker belongs | ||
1579 | * to. At any given time, there can be only zero or one manager per | ||
1580 | * gcwq. The exclusion is handled automatically by this function. | ||
1581 | * | ||
1582 | * The caller can safely start processing works on false return. On | ||
1583 | * true return, it's guaranteed that need_to_create_worker() is false | ||
1584 | * and may_start_working() is true. | ||
1585 | * | ||
1586 | * CONTEXT: | ||
1587 | * spin_lock_irq(gcwq->lock) which may be released and regrabbed | ||
1588 | * multiple times. Does GFP_KERNEL allocations. | ||
1589 | * | ||
1590 | * RETURNS: | ||
1591 | * false if no action was taken and gcwq->lock stayed locked, true if | ||
1592 | * some action was taken. | ||
1593 | */ | ||
1594 | static bool manage_workers(struct worker *worker) | ||
1595 | { | ||
1596 | struct global_cwq *gcwq = worker->gcwq; | ||
1597 | bool ret = false; | ||
1598 | |||
1599 | if (gcwq->flags & GCWQ_MANAGING_WORKERS) | ||
1600 | return ret; | ||
1601 | |||
1602 | gcwq->flags &= ~GCWQ_MANAGE_WORKERS; | ||
1603 | gcwq->flags |= GCWQ_MANAGING_WORKERS; | ||
1604 | |||
1605 | /* | ||
1606 | * Destroy and then create so that may_start_working() is true | ||
1607 | * on return. | ||
1608 | */ | ||
1609 | ret |= maybe_destroy_workers(gcwq); | ||
1610 | ret |= maybe_create_worker(gcwq); | ||
1611 | |||
1612 | gcwq->flags &= ~GCWQ_MANAGING_WORKERS; | ||
1613 | |||
1614 | /* | ||
1615 | * The trustee might be waiting to take over the manager | ||
1616 | * position, tell it we're done. | ||
1617 | */ | ||
1618 | if (unlikely(gcwq->trustee)) | ||
1619 | wake_up_all(&gcwq->trustee_wait); | ||
1620 | |||
1621 | return ret; | ||
1622 | } | ||
1623 | |||
1624 | /** | ||
1625 | * move_linked_works - move linked works to a list | ||
1626 | * @work: start of series of works to be scheduled | ||
1627 | * @head: target list to append @work to | ||
1628 | * @nextp: out parameter for nested worklist walking | ||
1629 | * | ||
1630 | * Schedule linked works starting from @work to @head. Work series to | ||
1631 | * be scheduled starts at @work and includes any consecutive work with | ||
1632 | * WORK_STRUCT_LINKED set in its predecessor. | ||
1633 | * | ||
1634 | * If @nextp is not NULL, it's updated to point to the next work of | ||
1635 | * the last scheduled work. This allows move_linked_works() to be | ||
1636 | * nested inside outer list_for_each_entry_safe(). | ||
1637 | * | ||
1638 | * CONTEXT: | ||
1639 | * spin_lock_irq(gcwq->lock). | ||
1640 | */ | ||
1641 | static void move_linked_works(struct work_struct *work, struct list_head *head, | ||
1642 | struct work_struct **nextp) | ||
1643 | { | ||
1644 | struct work_struct *n; | ||
1645 | |||
1646 | /* | ||
1647 | * Linked worklist will always end before the end of the list, | ||
1648 | * use NULL for list head. | ||
1649 | */ | ||
1650 | list_for_each_entry_safe_from(work, n, NULL, entry) { | ||
1651 | list_move_tail(&work->entry, head); | ||
1652 | if (!(*work_data_bits(work) & WORK_STRUCT_LINKED)) | ||
1653 | break; | ||
1654 | } | ||
1655 | |||
1656 | /* | ||
1657 | * If we're already inside safe list traversal and have moved | ||
1658 | * multiple works to the scheduled queue, the next position | ||
1659 | * needs to be updated. | ||
1660 | */ | ||
1661 | if (nextp) | ||
1662 | *nextp = n; | ||
1663 | } | ||
1664 | |||
1665 | static void cwq_activate_first_delayed(struct cpu_workqueue_struct *cwq) | ||
1666 | { | ||
1667 | struct work_struct *work = list_first_entry(&cwq->delayed_works, | ||
1668 | struct work_struct, entry); | ||
1669 | struct list_head *pos = gcwq_determine_ins_pos(cwq->gcwq, cwq); | ||
1670 | |||
1671 | trace_workqueue_activate_work(work); | ||
1672 | move_linked_works(work, pos, NULL); | ||
1673 | __clear_bit(WORK_STRUCT_DELAYED_BIT, work_data_bits(work)); | ||
1674 | cwq->nr_active++; | ||
1675 | } | ||
1676 | |||
1677 | /** | ||
1678 | * cwq_dec_nr_in_flight - decrement cwq's nr_in_flight | ||
1679 | * @cwq: cwq of interest | ||
1680 | * @color: color of work which left the queue | ||
1681 | * @delayed: for a delayed work | ||
1682 | * | ||
1683 | * A work either has completed or is removed from pending queue, | ||
1684 | * decrement nr_in_flight of its cwq and handle workqueue flushing. | ||
1685 | * | ||
1686 | * CONTEXT: | ||
1687 | * spin_lock_irq(gcwq->lock). | ||
1688 | */ | ||
1689 | static void cwq_dec_nr_in_flight(struct cpu_workqueue_struct *cwq, int color, | ||
1690 | bool delayed) | ||
1691 | { | ||
1692 | /* ignore uncolored works */ | ||
1693 | if (color == WORK_NO_COLOR) | ||
1694 | return; | ||
1695 | |||
1696 | cwq->nr_in_flight[color]--; | ||
1697 | |||
1698 | if (!delayed) { | ||
1699 | cwq->nr_active--; | ||
1700 | if (!list_empty(&cwq->delayed_works)) { | ||
1701 | /* one down, submit a delayed one */ | ||
1702 | if (cwq->nr_active < cwq->max_active) | ||
1703 | cwq_activate_first_delayed(cwq); | ||
1704 | } | ||
1705 | } | ||
1706 | |||
1707 | /* is flush in progress and are we at the flushing tip? */ | ||
1708 | if (likely(cwq->flush_color != color)) | ||
1709 | return; | ||
1710 | |||
1711 | /* are there still in-flight works? */ | ||
1712 | if (cwq->nr_in_flight[color]) | ||
1713 | return; | ||
1714 | |||
1715 | /* this cwq is done, clear flush_color */ | ||
1716 | cwq->flush_color = -1; | ||
1717 | |||
1718 | /* | ||
1719 | * If this was the last cwq, wake up the first flusher. It | ||
1720 | * will handle the rest. | ||
1721 | */ | ||
1722 | if (atomic_dec_and_test(&cwq->wq->nr_cwqs_to_flush)) | ||
1723 | complete(&cwq->wq->first_flusher->done); | ||
1724 | } | ||
1725 | |||
1726 | /** | ||
1727 | * process_one_work - process single work | ||
1728 | * @worker: self | ||
1729 | * @work: work to process | ||
1730 | * | ||
1731 | * Process @work. This function contains all the logic necessary to | ||
1732 | * process a single work including synchronization against and | ||
1733 | * interaction with other workers on the same cpu, queueing and | ||
1734 | * flushing. As long as context requirement is met, any worker can | ||
1735 | * call this function to process a work. | ||
1736 | * | ||
1737 | * CONTEXT: | ||
1738 | * spin_lock_irq(gcwq->lock) which is released and regrabbed. | ||
1739 | */ | ||
1740 | static void process_one_work(struct worker *worker, struct work_struct *work) | ||
1741 | __releases(&gcwq->lock) | ||
1742 | __acquires(&gcwq->lock) | ||
1743 | { | ||
1744 | struct cpu_workqueue_struct *cwq = get_work_cwq(work); | ||
1745 | struct global_cwq *gcwq = cwq->gcwq; | ||
1746 | struct hlist_head *bwh = busy_worker_head(gcwq, work); | ||
1747 | bool cpu_intensive = cwq->wq->flags & WQ_CPU_INTENSIVE; | ||
1748 | work_func_t f = work->func; | ||
1749 | int work_color; | ||
1750 | struct worker *collision; | ||
1751 | #ifdef CONFIG_LOCKDEP | ||
1752 | /* | ||
1753 | * It is permissible to free the struct work_struct from | ||
1754 | * inside the function that is called from it, this we need to | ||
1755 | * take into account for lockdep too. To avoid bogus "held | ||
1756 | * lock freed" warnings as well as problems when looking into | ||
1757 | * work->lockdep_map, make a copy and use that here. | ||
1758 | */ | ||
1759 | struct lockdep_map lockdep_map = work->lockdep_map; | ||
1760 | #endif | ||
1761 | /* | ||
1762 | * A single work shouldn't be executed concurrently by | ||
1763 | * multiple workers on a single cpu. Check whether anyone is | ||
1764 | * already processing the work. If so, defer the work to the | ||
1765 | * currently executing one. | ||
1766 | */ | ||
1767 | collision = __find_worker_executing_work(gcwq, bwh, work); | ||
1768 | if (unlikely(collision)) { | ||
1769 | move_linked_works(work, &collision->scheduled, NULL); | ||
1770 | return; | ||
1771 | } | ||
1772 | |||
1773 | /* claim and process */ | ||
1774 | debug_work_deactivate(work); | ||
1775 | hlist_add_head(&worker->hentry, bwh); | ||
1776 | worker->current_work = work; | ||
1777 | worker->current_cwq = cwq; | ||
1778 | work_color = get_work_color(work); | ||
1779 | |||
1780 | /* record the current cpu number in the work data and dequeue */ | ||
1781 | set_work_cpu(work, gcwq->cpu); | ||
1782 | list_del_init(&work->entry); | ||
1783 | |||
1784 | /* | ||
1785 | * If HIGHPRI_PENDING, check the next work, and, if HIGHPRI, | ||
1786 | * wake up another worker; otherwise, clear HIGHPRI_PENDING. | ||
1787 | */ | ||
1788 | if (unlikely(gcwq->flags & GCWQ_HIGHPRI_PENDING)) { | ||
1789 | struct work_struct *nwork = list_first_entry(&gcwq->worklist, | ||
1790 | struct work_struct, entry); | ||
1791 | |||
1792 | if (!list_empty(&gcwq->worklist) && | ||
1793 | get_work_cwq(nwork)->wq->flags & WQ_HIGHPRI) | ||
1794 | wake_up_worker(gcwq); | ||
1795 | else | ||
1796 | gcwq->flags &= ~GCWQ_HIGHPRI_PENDING; | ||
1797 | } | ||
1798 | |||
1799 | /* | ||
1800 | * CPU intensive works don't participate in concurrency | ||
1801 | * management. They're the scheduler's responsibility. | ||
1802 | */ | ||
1803 | if (unlikely(cpu_intensive)) | ||
1804 | worker_set_flags(worker, WORKER_CPU_INTENSIVE, true); | ||
1805 | |||
1806 | spin_unlock_irq(&gcwq->lock); | ||
1807 | |||
1808 | work_clear_pending(work); | ||
1809 | lock_map_acquire(&cwq->wq->lockdep_map); | ||
1810 | lock_map_acquire(&lockdep_map); | ||
1811 | trace_workqueue_execute_start(work); | ||
1812 | f(work); | ||
1813 | /* | ||
1814 | * While we must be careful to not use "work" after this, the trace | ||
1815 | * point will only record its address. | ||
1816 | */ | ||
1817 | trace_workqueue_execute_end(work); | ||
1818 | lock_map_release(&lockdep_map); | ||
1819 | lock_map_release(&cwq->wq->lockdep_map); | ||
1820 | |||
1821 | if (unlikely(in_atomic() || lockdep_depth(current) > 0)) { | ||
1822 | printk(KERN_ERR "BUG: workqueue leaked lock or atomic: " | ||
1823 | "%s/0x%08x/%d\n", | ||
1824 | current->comm, preempt_count(), task_pid_nr(current)); | ||
1825 | printk(KERN_ERR " last function: "); | ||
1826 | print_symbol("%s\n", (unsigned long)f); | ||
1827 | debug_show_held_locks(current); | ||
1828 | dump_stack(); | ||
1829 | } | ||
1830 | |||
1831 | spin_lock_irq(&gcwq->lock); | ||
1832 | |||
1833 | /* clear cpu intensive status */ | ||
1834 | if (unlikely(cpu_intensive)) | ||
1835 | worker_clr_flags(worker, WORKER_CPU_INTENSIVE); | ||
1836 | |||
1837 | /* we're done with it, release */ | ||
1838 | hlist_del_init(&worker->hentry); | ||
1839 | worker->current_work = NULL; | ||
1840 | worker->current_cwq = NULL; | ||
1841 | cwq_dec_nr_in_flight(cwq, work_color, false); | ||
1842 | } | ||
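
The lockdep_map copy at the top of process_one_work() exists because a work function is allowed to free its own work_struct. A hedged sketch of such a self-freeing handler, using hypothetical names, shows the pattern that comment is guarding against:

#include <linux/kernel.h>
#include <linux/slab.h>
#include <linux/workqueue.h>

/* hypothetical one-shot request embedding its work item */
struct oneshot_req {
	struct work_struct work;
	int payload;
};

static void oneshot_fn(struct work_struct *work)
{
	struct oneshot_req *req = container_of(work, struct oneshot_req, work);

	pr_info("handling payload %d\n", req->payload);
	kfree(req);	/* legal: nothing touches *work after the handler frees it */
}
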
1843 | |||
1844 | /** | ||
1845 | * process_scheduled_works - process scheduled works | ||
1846 | * @worker: self | ||
1847 | * | ||
1848 | * Process all scheduled works. Please note that the scheduled list | ||
1849 | * may change while processing a work, so this function repeatedly | ||
1850 | * fetches a work from the top and executes it. | ||
1851 | * | ||
1852 | * CONTEXT: | ||
1853 | * spin_lock_irq(gcwq->lock) which may be released and regrabbed | ||
1854 | * multiple times. | ||
1855 | */ | ||
1856 | static void process_scheduled_works(struct worker *worker) | ||
1857 | { | ||
1858 | while (!list_empty(&worker->scheduled)) { | ||
1859 | struct work_struct *work = list_first_entry(&worker->scheduled, | ||
1860 | struct work_struct, entry); | ||
1861 | process_one_work(worker, work); | ||
1862 | } | ||
1863 | } | ||
1864 | |||
1865 | /** | ||
1866 | * worker_thread - the worker thread function | ||
1867 | * @__worker: self | ||
1868 | * | ||
1869 | * The gcwq worker thread function. There's a single dynamic pool of | ||
1870 | * these per cpu. These workers process all works regardless of | ||
1871 | * their specific target workqueue. The only exception is works which | ||
1872 | * belong to workqueues with a rescuer which will be explained in | ||
1873 | * rescuer_thread(). | ||
1874 | */ | ||
1875 | static int worker_thread(void *__worker) | ||
1876 | { | ||
1877 | struct worker *worker = __worker; | ||
1878 | struct global_cwq *gcwq = worker->gcwq; | ||
1879 | |||
1880 | /* tell the scheduler that this is a workqueue worker */ | ||
1881 | worker->task->flags |= PF_WQ_WORKER; | ||
1882 | woke_up: | ||
1883 | spin_lock_irq(&gcwq->lock); | ||
1884 | |||
1885 | /* DIE can be set only while we're idle, checking here is enough */ | ||
1886 | if (worker->flags & WORKER_DIE) { | ||
1887 | spin_unlock_irq(&gcwq->lock); | ||
1888 | worker->task->flags &= ~PF_WQ_WORKER; | ||
1889 | return 0; | ||
1890 | } | ||
1891 | |||
1892 | worker_leave_idle(worker); | ||
1893 | recheck: | ||
1894 | /* no more worker necessary? */ | ||
1895 | if (!need_more_worker(gcwq)) | ||
1896 | goto sleep; | ||
1897 | |||
1898 | /* do we need to manage? */ | ||
1899 | if (unlikely(!may_start_working(gcwq)) && manage_workers(worker)) | ||
1900 | goto recheck; | ||
1901 | |||
1902 | /* | ||
1903 | * ->scheduled list can only be filled while a worker is | ||
1904 | * preparing to process a work or actually processing it. | ||
1905 | * Make sure nobody diddled with it while I was sleeping. | ||
1906 | */ | ||
1907 | BUG_ON(!list_empty(&worker->scheduled)); | ||
1908 | |||
1909 | /* | ||
1910 | * When control reaches this point, we're guaranteed to have | ||
1911 | * at least one idle worker or that someone else has already | ||
1912 | * assumed the manager role. | ||
1913 | */ | ||
1914 | worker_clr_flags(worker, WORKER_PREP); | ||
1915 | |||
1916 | do { | ||
1917 | struct work_struct *work = | ||
1918 | list_first_entry(&gcwq->worklist, | ||
1919 | struct work_struct, entry); | ||
1920 | |||
1921 | if (likely(!(*work_data_bits(work) & WORK_STRUCT_LINKED))) { | ||
1922 | /* optimization path, not strictly necessary */ | ||
1923 | process_one_work(worker, work); | ||
1924 | if (unlikely(!list_empty(&worker->scheduled))) | ||
1925 | process_scheduled_works(worker); | ||
1926 | } else { | ||
1927 | move_linked_works(work, &worker->scheduled, NULL); | ||
1928 | process_scheduled_works(worker); | ||
1929 | } | ||
1930 | } while (keep_working(gcwq)); | ||
1931 | |||
1932 | worker_set_flags(worker, WORKER_PREP, false); | ||
1933 | sleep: | ||
1934 | if (unlikely(need_to_manage_workers(gcwq)) && manage_workers(worker)) | ||
1935 | goto recheck; | ||
1936 | |||
1937 | /* | ||
1938 | * gcwq->lock is held and there's no work to process and no | ||
1939 | * need to manage, sleep. Workers are woken up only while | ||
1940 | * holding gcwq->lock or from local cpu, so setting the | ||
1941 | * current state before releasing gcwq->lock is enough to | ||
1942 | * prevent losing any event. | ||
1943 | */ | ||
1944 | worker_enter_idle(worker); | ||
1945 | __set_current_state(TASK_INTERRUPTIBLE); | ||
1946 | spin_unlock_irq(&gcwq->lock); | ||
1947 | schedule(); | ||
1948 | goto woke_up; | ||
1949 | } | ||
1950 | |||
1951 | /** | ||
1952 | * rescuer_thread - the rescuer thread function | ||
1953 | * @__wq: the associated workqueue | ||
1954 | * | ||
1955 | * Workqueue rescuer thread function. There's one rescuer for each | ||
1956 | * workqueue which has WQ_RESCUER set. | ||
1957 | * | ||
1958 | * Regular work processing on a gcwq may block trying to create a new | ||
1959 | * worker which uses GFP_KERNEL allocation which has slight chance of | ||
1960 | * developing into deadlock if some works currently on the same queue | ||
1961 | * need to be processed to satisfy the GFP_KERNEL allocation. This is | ||
1962 | * the problem rescuer solves. | ||
1963 | * | ||
1964 | * When such condition is possible, the gcwq summons rescuers of all | ||
1965 | * workqueues which have works queued on the gcwq and let them process | ||
1966 | * those works so that forward progress can be guaranteed. | ||
1967 | * | ||
1968 | * This should happen rarely. | ||
1969 | */ | ||
1970 | static int rescuer_thread(void *__wq) | ||
1971 | { | ||
1972 | struct workqueue_struct *wq = __wq; | ||
1973 | struct worker *rescuer = wq->rescuer; | ||
1974 | struct list_head *scheduled = &rescuer->scheduled; | ||
1975 | bool is_unbound = wq->flags & WQ_UNBOUND; | ||
1976 | unsigned int cpu; | ||
1977 | |||
1978 | set_user_nice(current, RESCUER_NICE_LEVEL); | ||
1979 | repeat: | ||
1980 | set_current_state(TASK_INTERRUPTIBLE); | ||
1981 | |||
1982 | if (kthread_should_stop()) | ||
1983 | return 0; | ||
1984 | |||
1985 | /* | ||
1986 | * See whether any cpu is asking for help. Unbounded | ||
1987 | * workqueues use cpu 0 in mayday_mask for CPU_UNBOUND. | ||
1988 | */ | ||
1989 | for_each_mayday_cpu(cpu, wq->mayday_mask) { | ||
1990 | unsigned int tcpu = is_unbound ? WORK_CPU_UNBOUND : cpu; | ||
1991 | struct cpu_workqueue_struct *cwq = get_cwq(tcpu, wq); | ||
1992 | struct global_cwq *gcwq = cwq->gcwq; | ||
1993 | struct work_struct *work, *n; | ||
1994 | |||
1995 | __set_current_state(TASK_RUNNING); | ||
1996 | mayday_clear_cpu(cpu, wq->mayday_mask); | ||
1997 | |||
1998 | /* migrate to the target cpu if possible */ | ||
1999 | rescuer->gcwq = gcwq; | ||
2000 | worker_maybe_bind_and_lock(rescuer); | ||
2001 | |||
2002 | /* | ||
2003 | * Slurp in all works issued via this workqueue and | ||
2004 | * process'em. | ||
2005 | */ | ||
2006 | BUG_ON(!list_empty(&rescuer->scheduled)); | ||
2007 | list_for_each_entry_safe(work, n, &gcwq->worklist, entry) | ||
2008 | if (get_work_cwq(work) == cwq) | ||
2009 | move_linked_works(work, scheduled, &n); | ||
2010 | |||
2011 | process_scheduled_works(rescuer); | ||
2012 | spin_unlock_irq(&gcwq->lock); | ||
2013 | } | ||
2014 | |||
2015 | schedule(); | ||
2016 | goto repeat; | ||
459 | } | 2017 | } |
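
As a hedged illustration of when the rescuer matters: a workqueue whose works must make progress during memory reclaim would be created with the WQ_RESCUER flag so the thread above can step in when worker creation stalls. Names below are hypothetical, and the sketch assumes the alloc_workqueue() interface introduced alongside this rework:

#include <linux/init.h>
#include <linux/workqueue.h>

static struct workqueue_struct *reclaim_wq;	/* hypothetical */

static int __init reclaim_wq_init(void)
{
	/* one in-flight work is enough; the rescuer guarantees forward progress */
	reclaim_wq = alloc_workqueue("example_reclaim", WQ_RESCUER, 1);
	return reclaim_wq ? 0 : -ENOMEM;
}
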
460 | 2018 | ||
461 | struct wq_barrier { | 2019 | struct wq_barrier { |
@@ -469,44 +2027,137 @@ static void wq_barrier_func(struct work_struct *work) | |||
469 | complete(&barr->done); | 2027 | complete(&barr->done); |
470 | } | 2028 | } |
471 | 2029 | ||
2030 | /** | ||
2031 | * insert_wq_barrier - insert a barrier work | ||
2032 | * @cwq: cwq to insert barrier into | ||
2033 | * @barr: wq_barrier to insert | ||
2034 | * @target: target work to attach @barr to | ||
2035 | * @worker: worker currently executing @target, NULL if @target is not executing | ||
2036 | * | ||
2037 | * @barr is linked to @target such that @barr is completed only after | ||
2038 | * @target finishes execution. Please note that the ordering | ||
2039 | * guarantee is observed only with respect to @target and on the local | ||
2040 | * cpu. | ||
2041 | * | ||
2042 | * Currently, a queued barrier can't be canceled. This is because | ||
2043 | * try_to_grab_pending() can't determine whether the work to be | ||
2044 | * grabbed is at the head of the queue and thus can't clear LINKED | ||
2045 | * flag of the previous work while there must be a valid next work | ||
2046 | * after a work with LINKED flag set. | ||
2047 | * | ||
2048 | * Note that when @worker is non-NULL, @target may be modified | ||
2049 | * underneath us, so we can't reliably determine cwq from @target. | ||
2050 | * | ||
2051 | * CONTEXT: | ||
2052 | * spin_lock_irq(gcwq->lock). | ||
2053 | */ | ||
472 | static void insert_wq_barrier(struct cpu_workqueue_struct *cwq, | 2054 | static void insert_wq_barrier(struct cpu_workqueue_struct *cwq, |
473 | struct wq_barrier *barr, struct list_head *head) | 2055 | struct wq_barrier *barr, |
2056 | struct work_struct *target, struct worker *worker) | ||
474 | { | 2057 | { |
2058 | struct list_head *head; | ||
2059 | unsigned int linked = 0; | ||
2060 | |||
475 | /* | 2061 | /* |
476 | * debugobject calls are safe here even with cwq->lock locked | 2062 | * debugobject calls are safe here even with gcwq->lock locked |
477 | * as we know for sure that this will not trigger any of the | 2063 | * as we know for sure that this will not trigger any of the |
478 | * checks and call back into the fixup functions where we | 2064 | * checks and call back into the fixup functions where we |
479 | * might deadlock. | 2065 | * might deadlock. |
480 | */ | 2066 | */ |
481 | INIT_WORK_ON_STACK(&barr->work, wq_barrier_func); | 2067 | INIT_WORK_ONSTACK(&barr->work, wq_barrier_func); |
482 | __set_bit(WORK_STRUCT_PENDING, work_data_bits(&barr->work)); | 2068 | __set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(&barr->work)); |
483 | |||
484 | init_completion(&barr->done); | 2069 | init_completion(&barr->done); |
485 | 2070 | ||
2071 | /* | ||
2072 | * If @target is currently being executed, schedule the | ||
2073 | * barrier to the worker; otherwise, put it after @target. | ||
2074 | */ | ||
2075 | if (worker) | ||
2076 | head = worker->scheduled.next; | ||
2077 | else { | ||
2078 | unsigned long *bits = work_data_bits(target); | ||
2079 | |||
2080 | head = target->entry.next; | ||
2081 | /* there can already be other linked works, inherit and set */ | ||
2082 | linked = *bits & WORK_STRUCT_LINKED; | ||
2083 | __set_bit(WORK_STRUCT_LINKED_BIT, bits); | ||
2084 | } | ||
2085 | |||
486 | debug_work_activate(&barr->work); | 2086 | debug_work_activate(&barr->work); |
487 | insert_work(cwq, &barr->work, head); | 2087 | insert_work(cwq, &barr->work, head, |
2088 | work_color_to_flags(WORK_NO_COLOR) | linked); | ||
488 | } | 2089 | } |
489 | 2090 | ||
490 | static int flush_cpu_workqueue(struct cpu_workqueue_struct *cwq) | 2091 | /** |
2092 | * flush_workqueue_prep_cwqs - prepare cwqs for workqueue flushing | ||
2093 | * @wq: workqueue being flushed | ||
2094 | * @flush_color: new flush color, < 0 for no-op | ||
2095 | * @work_color: new work color, < 0 for no-op | ||
2096 | * | ||
2097 | * Prepare cwqs for workqueue flushing. | ||
2098 | * | ||
2099 | * If @flush_color is non-negative, flush_color on all cwqs should be | ||
2100 | * -1. If no cwq has in-flight commands at the specified color, all | ||
2101 | * cwq->flush_color's stay at -1 and %false is returned. If any cwq | ||
2102 | * has in flight commands, its cwq->flush_color is set to | ||
2103 | * @flush_color, @wq->nr_cwqs_to_flush is updated accordingly, cwq | ||
2104 | * wakeup logic is armed and %true is returned. | ||
2105 | * | ||
2106 | * The caller should have initialized @wq->first_flusher prior to | ||
2107 | * calling this function with non-negative @flush_color. If | ||
2108 | * @flush_color is negative, no flush color update is done and %false | ||
2109 | * is returned. | ||
2110 | * | ||
2111 | * If @work_color is non-negative, all cwqs should have the same | ||
2112 | * work_color which is previous to @work_color and all will be | ||
2113 | * advanced to @work_color. | ||
2114 | * | ||
2115 | * CONTEXT: | ||
2116 | * mutex_lock(wq->flush_mutex). | ||
2117 | * | ||
2118 | * RETURNS: | ||
2119 | * %true if @flush_color >= 0 and there's something to flush. %false | ||
2120 | * otherwise. | ||
2121 | */ | ||
2122 | static bool flush_workqueue_prep_cwqs(struct workqueue_struct *wq, | ||
2123 | int flush_color, int work_color) | ||
491 | { | 2124 | { |
492 | int active = 0; | 2125 | bool wait = false; |
493 | struct wq_barrier barr; | 2126 | unsigned int cpu; |
494 | |||
495 | WARN_ON(cwq->thread == current); | ||
496 | 2127 | ||
497 | spin_lock_irq(&cwq->lock); | 2128 | if (flush_color >= 0) { |
498 | if (!list_empty(&cwq->worklist) || cwq->current_work != NULL) { | 2129 | BUG_ON(atomic_read(&wq->nr_cwqs_to_flush)); |
499 | insert_wq_barrier(cwq, &barr, &cwq->worklist); | 2130 | atomic_set(&wq->nr_cwqs_to_flush, 1); |
500 | active = 1; | ||
501 | } | 2131 | } |
502 | spin_unlock_irq(&cwq->lock); | ||
503 | 2132 | ||
504 | if (active) { | 2133 | for_each_cwq_cpu(cpu, wq) { |
505 | wait_for_completion(&barr.done); | 2134 | struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq); |
506 | destroy_work_on_stack(&barr.work); | 2135 | struct global_cwq *gcwq = cwq->gcwq; |
2136 | |||
2137 | spin_lock_irq(&gcwq->lock); | ||
2138 | |||
2139 | if (flush_color >= 0) { | ||
2140 | BUG_ON(cwq->flush_color != -1); | ||
2141 | |||
2142 | if (cwq->nr_in_flight[flush_color]) { | ||
2143 | cwq->flush_color = flush_color; | ||
2144 | atomic_inc(&wq->nr_cwqs_to_flush); | ||
2145 | wait = true; | ||
2146 | } | ||
2147 | } | ||
2148 | |||
2149 | if (work_color >= 0) { | ||
2150 | BUG_ON(work_color != work_next_color(cwq->work_color)); | ||
2151 | cwq->work_color = work_color; | ||
2152 | } | ||
2153 | |||
2154 | spin_unlock_irq(&gcwq->lock); | ||
507 | } | 2155 | } |
508 | 2156 | ||
509 | return active; | 2157 | if (flush_color >= 0 && atomic_dec_and_test(&wq->nr_cwqs_to_flush)) |
2158 | complete(&wq->first_flusher->done); | ||
2159 | |||
2160 | return wait; | ||
510 | } | 2161 | } |
511 | 2162 | ||
512 | /** | 2163 | /** |
@@ -518,158 +2169,340 @@ static int flush_cpu_workqueue(struct cpu_workqueue_struct *cwq) | |||
518 | * | 2169 | * |
519 | * We sleep until all works which were queued on entry have been handled, | 2170 | * We sleep until all works which were queued on entry have been handled, |
520 | * but we are not livelocked by new incoming ones. | 2171 | * but we are not livelocked by new incoming ones. |
521 | * | ||
522 | * This function used to run the workqueues itself. Now we just wait for the | ||
523 | * helper threads to do it. | ||
524 | */ | 2172 | */ |
525 | void flush_workqueue(struct workqueue_struct *wq) | 2173 | void flush_workqueue(struct workqueue_struct *wq) |
526 | { | 2174 | { |
527 | const struct cpumask *cpu_map = wq_cpu_map(wq); | 2175 | struct wq_flusher this_flusher = { |
528 | int cpu; | 2176 | .list = LIST_HEAD_INIT(this_flusher.list), |
2177 | .flush_color = -1, | ||
2178 | .done = COMPLETION_INITIALIZER_ONSTACK(this_flusher.done), | ||
2179 | }; | ||
2180 | int next_color; | ||
529 | 2181 | ||
530 | might_sleep(); | ||
531 | lock_map_acquire(&wq->lockdep_map); | 2182 | lock_map_acquire(&wq->lockdep_map); |
532 | lock_map_release(&wq->lockdep_map); | 2183 | lock_map_release(&wq->lockdep_map); |
533 | for_each_cpu(cpu, cpu_map) | 2184 | |
534 | flush_cpu_workqueue(per_cpu_ptr(wq->cpu_wq, cpu)); | 2185 | mutex_lock(&wq->flush_mutex); |
2186 | |||
2187 | /* | ||
2188 | * Start-to-wait phase | ||
2189 | */ | ||
2190 | next_color = work_next_color(wq->work_color); | ||
2191 | |||
2192 | if (next_color != wq->flush_color) { | ||
2193 | /* | ||
2194 | * Color space is not full. The current work_color | ||
2195 | * becomes our flush_color and work_color is advanced | ||
2196 | * by one. | ||
2197 | */ | ||
2198 | BUG_ON(!list_empty(&wq->flusher_overflow)); | ||
2199 | this_flusher.flush_color = wq->work_color; | ||
2200 | wq->work_color = next_color; | ||
2201 | |||
2202 | if (!wq->first_flusher) { | ||
2203 | /* no flush in progress, become the first flusher */ | ||
2204 | BUG_ON(wq->flush_color != this_flusher.flush_color); | ||
2205 | |||
2206 | wq->first_flusher = &this_flusher; | ||
2207 | |||
2208 | if (!flush_workqueue_prep_cwqs(wq, wq->flush_color, | ||
2209 | wq->work_color)) { | ||
2210 | /* nothing to flush, done */ | ||
2211 | wq->flush_color = next_color; | ||
2212 | wq->first_flusher = NULL; | ||
2213 | goto out_unlock; | ||
2214 | } | ||
2215 | } else { | ||
2216 | /* wait in queue */ | ||
2217 | BUG_ON(wq->flush_color == this_flusher.flush_color); | ||
2218 | list_add_tail(&this_flusher.list, &wq->flusher_queue); | ||
2219 | flush_workqueue_prep_cwqs(wq, -1, wq->work_color); | ||
2220 | } | ||
2221 | } else { | ||
2222 | /* | ||
2223 | * Oops, color space is full, wait on overflow queue. | ||
2224 | * The next flush completion will assign us | ||
2225 | * flush_color and transfer to flusher_queue. | ||
2226 | */ | ||
2227 | list_add_tail(&this_flusher.list, &wq->flusher_overflow); | ||
2228 | } | ||
2229 | |||
2230 | mutex_unlock(&wq->flush_mutex); | ||
2231 | |||
2232 | wait_for_completion(&this_flusher.done); | ||
2233 | |||
2234 | /* | ||
2235 | * Wake-up-and-cascade phase | ||
2236 | * | ||
2237 | * First flushers are responsible for cascading flushes and | ||
2238 | * handling overflow. Non-first flushers can simply return. | ||
2239 | */ | ||
2240 | if (wq->first_flusher != &this_flusher) | ||
2241 | return; | ||
2242 | |||
2243 | mutex_lock(&wq->flush_mutex); | ||
2244 | |||
2245 | /* we might have raced, check again with mutex held */ | ||
2246 | if (wq->first_flusher != &this_flusher) | ||
2247 | goto out_unlock; | ||
2248 | |||
2249 | wq->first_flusher = NULL; | ||
2250 | |||
2251 | BUG_ON(!list_empty(&this_flusher.list)); | ||
2252 | BUG_ON(wq->flush_color != this_flusher.flush_color); | ||
2253 | |||
2254 | while (true) { | ||
2255 | struct wq_flusher *next, *tmp; | ||
2256 | |||
2257 | /* complete all the flushers sharing the current flush color */ | ||
2258 | list_for_each_entry_safe(next, tmp, &wq->flusher_queue, list) { | ||
2259 | if (next->flush_color != wq->flush_color) | ||
2260 | break; | ||
2261 | list_del_init(&next->list); | ||
2262 | complete(&next->done); | ||
2263 | } | ||
2264 | |||
2265 | BUG_ON(!list_empty(&wq->flusher_overflow) && | ||
2266 | wq->flush_color != work_next_color(wq->work_color)); | ||
2267 | |||
2268 | /* this flush_color is finished, advance by one */ | ||
2269 | wq->flush_color = work_next_color(wq->flush_color); | ||
2270 | |||
2271 | /* one color has been freed, handle overflow queue */ | ||
2272 | if (!list_empty(&wq->flusher_overflow)) { | ||
2273 | /* | ||
2274 | * Assign the same color to all overflowed | ||
2275 | * flushers, advance work_color and append to | ||
2276 | * flusher_queue. This is the start-to-wait | ||
2277 | * phase for these overflowed flushers. | ||
2278 | */ | ||
2279 | list_for_each_entry(tmp, &wq->flusher_overflow, list) | ||
2280 | tmp->flush_color = wq->work_color; | ||
2281 | |||
2282 | wq->work_color = work_next_color(wq->work_color); | ||
2283 | |||
2284 | list_splice_tail_init(&wq->flusher_overflow, | ||
2285 | &wq->flusher_queue); | ||
2286 | flush_workqueue_prep_cwqs(wq, -1, wq->work_color); | ||
2287 | } | ||
2288 | |||
2289 | if (list_empty(&wq->flusher_queue)) { | ||
2290 | BUG_ON(wq->flush_color != wq->work_color); | ||
2291 | break; | ||
2292 | } | ||
2293 | |||
2294 | /* | ||
2295 | * Need to flush more colors. Make the next flusher | ||
2296 | * the new first flusher and arm cwqs. | ||
2297 | */ | ||
2298 | BUG_ON(wq->flush_color == wq->work_color); | ||
2299 | BUG_ON(wq->flush_color != next->flush_color); | ||
2300 | |||
2301 | list_del_init(&next->list); | ||
2302 | wq->first_flusher = next; | ||
2303 | |||
2304 | if (flush_workqueue_prep_cwqs(wq, wq->flush_color, -1)) | ||
2305 | break; | ||
2306 | |||
2307 | /* | ||
2308 | * Meh... this color is already done, clear first | ||
2309 | * flusher and repeat cascading. | ||
2310 | */ | ||
2311 | wq->first_flusher = NULL; | ||
2312 | } | ||
2313 | |||
2314 | out_unlock: | ||
2315 | mutex_unlock(&wq->flush_mutex); | ||
535 | } | 2316 | } |
536 | EXPORT_SYMBOL_GPL(flush_workqueue); | 2317 | EXPORT_SYMBOL_GPL(flush_workqueue); |
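
From a caller's point of view the flush-color machinery is invisible; a typical driver simply flushes its queue before tearing it down. A minimal sketch with hypothetical names:

#include <linux/workqueue.h>

static struct workqueue_struct *example_wq;	/* hypothetical */
static struct work_struct example_work;

static void example_fn(struct work_struct *work)
{
	/* ... process one request ... */
}

static int example_setup(void)
{
	example_wq = create_workqueue("example");
	if (!example_wq)
		return -ENOMEM;
	INIT_WORK(&example_work, example_fn);
	queue_work(example_wq, &example_work);
	return 0;
}

static void example_teardown(void)
{
	flush_workqueue(example_wq);	/* waits for all works queued on entry */
	destroy_workqueue(example_wq);
}
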
537 | 2318 | ||
538 | /** | 2319 | static bool start_flush_work(struct work_struct *work, struct wq_barrier *barr, |
539 | * flush_work - block until a work_struct's callback has terminated | 2320 | bool wait_executing) |
540 | * @work: the work which is to be flushed | ||
541 | * | ||
542 | * Returns false if @work has already terminated. | ||
543 | * | ||
544 | * It is expected that, prior to calling flush_work(), the caller has | ||
545 | * arranged for the work to not be requeued, otherwise it doesn't make | ||
546 | * sense to use this function. | ||
547 | */ | ||
548 | int flush_work(struct work_struct *work) | ||
549 | { | 2321 | { |
2322 | struct worker *worker = NULL; | ||
2323 | struct global_cwq *gcwq; | ||
550 | struct cpu_workqueue_struct *cwq; | 2324 | struct cpu_workqueue_struct *cwq; |
551 | struct list_head *prev; | ||
552 | struct wq_barrier barr; | ||
553 | 2325 | ||
554 | might_sleep(); | 2326 | might_sleep(); |
555 | cwq = get_wq_data(work); | 2327 | gcwq = get_work_gcwq(work); |
556 | if (!cwq) | 2328 | if (!gcwq) |
557 | return 0; | 2329 | return false; |
558 | |||
559 | lock_map_acquire(&cwq->wq->lockdep_map); | ||
560 | lock_map_release(&cwq->wq->lockdep_map); | ||
561 | 2330 | ||
562 | prev = NULL; | 2331 | spin_lock_irq(&gcwq->lock); |
563 | spin_lock_irq(&cwq->lock); | ||
564 | if (!list_empty(&work->entry)) { | 2332 | if (!list_empty(&work->entry)) { |
565 | /* | 2333 | /* |
566 | * See the comment near try_to_grab_pending()->smp_rmb(). | 2334 | * See the comment near try_to_grab_pending()->smp_rmb(). |
567 | * If it was re-queued under us we are not going to wait. | 2335 | * If it was re-queued to a different gcwq under us, we |
2336 | * are not going to wait. | ||
568 | */ | 2337 | */ |
569 | smp_rmb(); | 2338 | smp_rmb(); |
570 | if (unlikely(cwq != get_wq_data(work))) | 2339 | cwq = get_work_cwq(work); |
571 | goto out; | 2340 | if (unlikely(!cwq || gcwq != cwq->gcwq)) |
572 | prev = &work->entry; | 2341 | goto already_gone; |
573 | } else { | 2342 | } else if (wait_executing) { |
574 | if (cwq->current_work != work) | 2343 | worker = find_worker_executing_work(gcwq, work); |
575 | goto out; | 2344 | if (!worker) |
576 | prev = &cwq->worklist; | 2345 | goto already_gone; |
577 | } | 2346 | cwq = worker->current_cwq; |
578 | insert_wq_barrier(cwq, &barr, prev->next); | 2347 | } else |
579 | out: | 2348 | goto already_gone; |
580 | spin_unlock_irq(&cwq->lock); | 2349 | |
581 | if (!prev) | 2350 | insert_wq_barrier(cwq, barr, work, worker); |
582 | return 0; | 2351 | spin_unlock_irq(&gcwq->lock); |
583 | 2352 | ||
584 | wait_for_completion(&barr.done); | 2353 | lock_map_acquire(&cwq->wq->lockdep_map); |
585 | destroy_work_on_stack(&barr.work); | 2354 | lock_map_release(&cwq->wq->lockdep_map); |
586 | return 1; | 2355 | return true; |
2356 | already_gone: | ||
2357 | spin_unlock_irq(&gcwq->lock); | ||
2358 | return false; | ||
2359 | } | ||
2360 | |||
2361 | /** | ||
2362 | * flush_work - wait for a work to finish executing the last queueing instance | ||
2363 | * @work: the work to flush | ||
2364 | * | ||
2365 | * Wait until @work has finished execution. This function considers | ||
2366 | * only the last queueing instance of @work. If @work has been | ||
2367 | * enqueued across different CPUs on a non-reentrant workqueue or on | ||
2368 | * multiple workqueues, @work might still be executing on return on | ||
2369 | * some of the CPUs from earlier queueing. | ||
2370 | * | ||
2371 | * If @work was queued only on a non-reentrant, ordered or unbound | ||
2372 | * workqueue, @work is guaranteed to be idle on return if it hasn't | ||
2373 | * been requeued since flush started. | ||
2374 | * | ||
2375 | * RETURNS: | ||
2376 | * %true if flush_work() waited for the work to finish execution, | ||
2377 | * %false if it was already idle. | ||
2378 | */ | ||
2379 | bool flush_work(struct work_struct *work) | ||
2380 | { | ||
2381 | struct wq_barrier barr; | ||
2382 | |||
2383 | if (start_flush_work(work, &barr, true)) { | ||
2384 | wait_for_completion(&barr.done); | ||
2385 | destroy_work_on_stack(&barr.work); | ||
2386 | return true; | ||
2387 | } else | ||
2388 | return false; | ||
587 | } | 2389 | } |
588 | EXPORT_SYMBOL_GPL(flush_work); | 2390 | EXPORT_SYMBOL_GPL(flush_work); |
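
A hedged usage sketch of the per-work variant: queue one item, then wait specifically for that queueing instance. Names are hypothetical and the work is assumed to have been set up with INIT_WORK() elsewhere:

#include <linux/workqueue.h>

static struct work_struct tx_work;	/* hypothetical, INIT_WORK()ed at probe */

static void tx_submit_and_wait(void)
{
	schedule_work(&tx_work);
	/* waits only for this (the last) queueing instance of tx_work */
	flush_work(&tx_work);
}
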
589 | 2391 | ||
2392 | static bool wait_on_cpu_work(struct global_cwq *gcwq, struct work_struct *work) | ||
2393 | { | ||
2394 | struct wq_barrier barr; | ||
2395 | struct worker *worker; | ||
2396 | |||
2397 | spin_lock_irq(&gcwq->lock); | ||
2398 | |||
2399 | worker = find_worker_executing_work(gcwq, work); | ||
2400 | if (unlikely(worker)) | ||
2401 | insert_wq_barrier(worker->current_cwq, &barr, work, worker); | ||
2402 | |||
2403 | spin_unlock_irq(&gcwq->lock); | ||
2404 | |||
2405 | if (unlikely(worker)) { | ||
2406 | wait_for_completion(&barr.done); | ||
2407 | destroy_work_on_stack(&barr.work); | ||
2408 | return true; | ||
2409 | } else | ||
2410 | return false; | ||
2411 | } | ||
2412 | |||
2413 | static bool wait_on_work(struct work_struct *work) | ||
2414 | { | ||
2415 | bool ret = false; | ||
2416 | int cpu; | ||
2417 | |||
2418 | might_sleep(); | ||
2419 | |||
2420 | lock_map_acquire(&work->lockdep_map); | ||
2421 | lock_map_release(&work->lockdep_map); | ||
2422 | |||
2423 | for_each_gcwq_cpu(cpu) | ||
2424 | ret |= wait_on_cpu_work(get_gcwq(cpu), work); | ||
2425 | return ret; | ||
2426 | } | ||
2427 | |||
2428 | /** | ||
2429 | * flush_work_sync - wait until a work has finished execution | ||
2430 | * @work: the work to flush | ||
2431 | * | ||
2432 | * Wait until @work has finished execution. On return, it's | ||
2433 | * guaranteed that all queueing instances of @work which happened | ||
2434 | * before this function is called are finished. In other words, if | ||
2435 | * @work hasn't been requeued since this function was called, @work is | ||
2436 | * guaranteed to be idle on return. | ||
2437 | * | ||
2438 | * RETURNS: | ||
2439 | * %true if flush_work_sync() waited for the work to finish execution, | ||
2440 | * %false if it was already idle. | ||
2441 | */ | ||
2442 | bool flush_work_sync(struct work_struct *work) | ||
2443 | { | ||
2444 | struct wq_barrier barr; | ||
2445 | bool pending, waited; | ||
2446 | |||
2447 | /* we'll wait for executions separately, queue barr only if pending */ | ||
2448 | pending = start_flush_work(work, &barr, false); | ||
2449 | |||
2450 | /* wait for executions to finish */ | ||
2451 | waited = wait_on_work(work); | ||
2452 | |||
2453 | /* wait for the pending one */ | ||
2454 | if (pending) { | ||
2455 | wait_for_completion(&barr.done); | ||
2456 | destroy_work_on_stack(&barr.work); | ||
2457 | } | ||
2458 | |||
2459 | return pending || waited; | ||
2460 | } | ||
2461 | EXPORT_SYMBOL_GPL(flush_work_sync); | ||
2462 | |||
590 | /* | 2463 | /* |
591 | * Upon a successful return (>= 0), the caller "owns" WORK_STRUCT_PENDING bit, | 2464 | * Upon a successful return (>= 0), the caller "owns" WORK_STRUCT_PENDING bit, |
592 | * so this work can't be re-armed in any way. | 2465 | * so this work can't be re-armed in any way. |
593 | */ | 2466 | */ |
594 | static int try_to_grab_pending(struct work_struct *work) | 2467 | static int try_to_grab_pending(struct work_struct *work) |
595 | { | 2468 | { |
596 | struct cpu_workqueue_struct *cwq; | 2469 | struct global_cwq *gcwq; |
597 | int ret = -1; | 2470 | int ret = -1; |
598 | 2471 | ||
599 | if (!test_and_set_bit(WORK_STRUCT_PENDING, work_data_bits(work))) | 2472 | if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work))) |
600 | return 0; | 2473 | return 0; |
601 | 2474 | ||
602 | /* | 2475 | /* |
603 | * The queueing is in progress, or it is already queued. Try to | 2476 | * The queueing is in progress, or it is already queued. Try to |
604 | * steal it from ->worklist without clearing WORK_STRUCT_PENDING. | 2477 | * steal it from ->worklist without clearing WORK_STRUCT_PENDING. |
605 | */ | 2478 | */ |
606 | 2479 | gcwq = get_work_gcwq(work); | |
607 | cwq = get_wq_data(work); | 2480 | if (!gcwq) |
608 | if (!cwq) | ||
609 | return ret; | 2481 | return ret; |
610 | 2482 | ||
611 | spin_lock_irq(&cwq->lock); | 2483 | spin_lock_irq(&gcwq->lock); |
612 | if (!list_empty(&work->entry)) { | 2484 | if (!list_empty(&work->entry)) { |
613 | /* | 2485 | /* |
614 | * This work is queued, but perhaps we locked the wrong cwq. | 2486 | * This work is queued, but perhaps we locked the wrong gcwq. |
615 | * In that case we must see the new value after rmb(), see | 2487 | * In that case we must see the new value after rmb(), see |
616 | * insert_work()->wmb(). | 2488 | * insert_work()->wmb(). |
617 | */ | 2489 | */ |
618 | smp_rmb(); | 2490 | smp_rmb(); |
619 | if (cwq == get_wq_data(work)) { | 2491 | if (gcwq == get_work_gcwq(work)) { |
620 | debug_work_deactivate(work); | 2492 | debug_work_deactivate(work); |
621 | list_del_init(&work->entry); | 2493 | list_del_init(&work->entry); |
2494 | cwq_dec_nr_in_flight(get_work_cwq(work), | ||
2495 | get_work_color(work), | ||
2496 | *work_data_bits(work) & WORK_STRUCT_DELAYED); | ||
622 | ret = 1; | 2497 | ret = 1; |
623 | } | 2498 | } |
624 | } | 2499 | } |
625 | spin_unlock_irq(&cwq->lock); | 2500 | spin_unlock_irq(&gcwq->lock); |
626 | 2501 | ||
627 | return ret; | 2502 | return ret; |
628 | } | 2503 | } |
629 | 2504 | ||
630 | static void wait_on_cpu_work(struct cpu_workqueue_struct *cwq, | 2505 | static bool __cancel_work_timer(struct work_struct *work, |
631 | struct work_struct *work) | ||
632 | { | ||
633 | struct wq_barrier barr; | ||
634 | int running = 0; | ||
635 | |||
636 | spin_lock_irq(&cwq->lock); | ||
637 | if (unlikely(cwq->current_work == work)) { | ||
638 | insert_wq_barrier(cwq, &barr, cwq->worklist.next); | ||
639 | running = 1; | ||
640 | } | ||
641 | spin_unlock_irq(&cwq->lock); | ||
642 | |||
643 | if (unlikely(running)) { | ||
644 | wait_for_completion(&barr.done); | ||
645 | destroy_work_on_stack(&barr.work); | ||
646 | } | ||
647 | } | ||
648 | |||
649 | static void wait_on_work(struct work_struct *work) | ||
650 | { | ||
651 | struct cpu_workqueue_struct *cwq; | ||
652 | struct workqueue_struct *wq; | ||
653 | const struct cpumask *cpu_map; | ||
654 | int cpu; | ||
655 | |||
656 | might_sleep(); | ||
657 | |||
658 | lock_map_acquire(&work->lockdep_map); | ||
659 | lock_map_release(&work->lockdep_map); | ||
660 | |||
661 | cwq = get_wq_data(work); | ||
662 | if (!cwq) | ||
663 | return; | ||
664 | |||
665 | wq = cwq->wq; | ||
666 | cpu_map = wq_cpu_map(wq); | ||
667 | |||
668 | for_each_cpu(cpu, cpu_map) | ||
669 | wait_on_cpu_work(per_cpu_ptr(wq->cpu_wq, cpu), work); | ||
670 | } | ||
671 | |||
672 | static int __cancel_work_timer(struct work_struct *work, | ||
673 | struct timer_list* timer) | 2506 | struct timer_list* timer) |
674 | { | 2507 | { |
675 | int ret; | 2508 | int ret; |
@@ -681,54 +2514,91 @@ static int __cancel_work_timer(struct work_struct *work, | |||
681 | wait_on_work(work); | 2514 | wait_on_work(work); |
682 | } while (unlikely(ret < 0)); | 2515 | } while (unlikely(ret < 0)); |
683 | 2516 | ||
684 | clear_wq_data(work); | 2517 | clear_work_data(work); |
685 | return ret; | 2518 | return ret; |
686 | } | 2519 | } |
687 | 2520 | ||
688 | /** | 2521 | /** |
689 | * cancel_work_sync - block until a work_struct's callback has terminated | 2522 | * cancel_work_sync - cancel a work and wait for it to finish |
690 | * @work: the work which is to be flushed | 2523 | * @work: the work to cancel |
691 | * | ||
692 | * Returns true if @work was pending. | ||
693 | * | 2524 | * |
694 | * cancel_work_sync() will cancel the work if it is queued. If the work's | 2525 | * Cancel @work and wait for its execution to finish. This function |
695 | * callback appears to be running, cancel_work_sync() will block until it | 2526 | * can be used even if the work re-queues itself or migrates to |
696 | * has completed. | 2527 | * another workqueue. On return from this function, @work is |
2528 | * guaranteed to be not pending or executing on any CPU. | ||
697 | * | 2529 | * |
698 | * It is possible to use this function if the work re-queues itself. It can | 2530 | * cancel_work_sync(&delayed_work->work) must not be used for |
699 | * cancel the work even if it migrates to another workqueue, however in that | 2531 | * delayed_work's. Use cancel_delayed_work_sync() instead. |
700 | * case it only guarantees that work->func() has completed on the last queued | ||
701 | * workqueue. | ||
702 | * | ||
703 | * cancel_work_sync(&delayed_work->work) should be used only if ->timer is not | ||
704 | * pending, otherwise it goes into a busy-wait loop until the timer expires. | ||
705 | * | 2532 | * |
706 | * The caller must ensure that workqueue_struct on which this work was last | 2533 | * The caller must ensure that the workqueue on which @work was last |
707 | * queued can't be destroyed before this function returns. | 2534 | * queued can't be destroyed before this function returns. |
2535 | * | ||
2536 | * RETURNS: | ||
2537 | * %true if @work was pending, %false otherwise. | ||
708 | */ | 2538 | */ |
709 | int cancel_work_sync(struct work_struct *work) | 2539 | bool cancel_work_sync(struct work_struct *work) |
710 | { | 2540 | { |
711 | return __cancel_work_timer(work, NULL); | 2541 | return __cancel_work_timer(work, NULL); |
712 | } | 2542 | } |
713 | EXPORT_SYMBOL_GPL(cancel_work_sync); | 2543 | EXPORT_SYMBOL_GPL(cancel_work_sync); |
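
Because cancel_work_sync() copes with a work that requeues itself, the usual teardown pattern pairs it with a stop flag. A sketch under hypothetical names:

#include <linux/workqueue.h>

static struct work_struct rx_work;	/* hypothetical, INIT_WORK()ed at probe */
static bool rx_stopping;		/* hypothetical teardown flag */

static void rx_fn(struct work_struct *work)
{
	/* ... drain the receive ring ... */
	if (!rx_stopping)
		schedule_work(&rx_work);	/* rearm while active */
}

static void rx_teardown(void)
{
	rx_stopping = true;
	/* on return, rx_work is neither pending nor executing anywhere */
	cancel_work_sync(&rx_work);
}
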
714 | 2544 | ||
715 | /** | 2545 | /** |
716 | * cancel_delayed_work_sync - reliably kill off a delayed work. | 2546 | * flush_delayed_work - wait for a dwork to finish executing the last queueing |
717 | * @dwork: the delayed work struct | 2547 | * @dwork: the delayed work to flush |
718 | * | 2548 | * |
719 | * Returns true if @dwork was pending. | 2549 | * Delayed timer is cancelled and the pending work is queued for |
2550 | * immediate execution. Like flush_work(), this function only | ||
2551 | * considers the last queueing instance of @dwork. | ||
720 | * | 2552 | * |
721 | * It is possible to use this function if @dwork rearms itself via queue_work() | 2553 | * RETURNS: |
722 | * or queue_delayed_work(). See also the comment for cancel_work_sync(). | 2554 | * %true if flush_work() waited for the work to finish execution, |
2555 | * %false if it was already idle. | ||
723 | */ | 2556 | */ |
724 | int cancel_delayed_work_sync(struct delayed_work *dwork) | 2557 | bool flush_delayed_work(struct delayed_work *dwork) |
2558 | { | ||
2559 | if (del_timer_sync(&dwork->timer)) | ||
2560 | __queue_work(raw_smp_processor_id(), | ||
2561 | get_work_cwq(&dwork->work)->wq, &dwork->work); | ||
2562 | return flush_work(&dwork->work); | ||
2563 | } | ||
2564 | EXPORT_SYMBOL(flush_delayed_work); | ||
2565 | |||
2566 | /** | ||
2567 | * flush_delayed_work_sync - wait for a dwork to finish | ||
2568 | * @dwork: the delayed work to flush | ||
2569 | * | ||
2570 | * Delayed timer is cancelled and the pending work is queued for | ||
2571 | * execution immediately. Other than timer handling, its behavior | ||
2572 | * is identical to flush_work_sync(). | ||
2573 | * | ||
2574 | * RETURNS: | ||
2575 | * %true if flush_work_sync() waited for the work to finish execution, | ||
2576 | * %false if it was already idle. | ||
2577 | */ | ||
2578 | bool flush_delayed_work_sync(struct delayed_work *dwork) | ||
2579 | { | ||
2580 | if (del_timer_sync(&dwork->timer)) | ||
2581 | __queue_work(raw_smp_processor_id(), | ||
2582 | get_work_cwq(&dwork->work)->wq, &dwork->work); | ||
2583 | return flush_work_sync(&dwork->work); | ||
2584 | } | ||
2585 | EXPORT_SYMBOL(flush_delayed_work_sync); | ||
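
Both flush helpers above first cancel the timer with del_timer_sync() and, if it was still pending, queue the work for immediate execution before flushing. A usage sketch under the assumption of a self re-arming statistics collector; stats_dwork and stats_fn() are illustrative:

#include <linux/jiffies.h>
#include <linux/kernel.h>
#include <linux/workqueue.h>

static void stats_fn(struct work_struct *work);
static DECLARE_DELAYED_WORK(stats_dwork, stats_fn);

static void stats_fn(struct work_struct *work)
{
        /* gather statistics, then re-arm to run again in a second */
        schedule_delayed_work(&stats_dwork, HZ);
}

/* force the last queued instance to run now and wait for it to finish */
static void stats_sync_now(void)
{
        if (flush_delayed_work(&stats_dwork))
                pr_debug("stats: waited for execution\n");
        else
                pr_debug("stats: was already idle\n");
}
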
2586 | |||
2587 | /** | ||
2588 | * cancel_delayed_work_sync - cancel a delayed work and wait for it to finish | ||
2589 | * @dwork: the delayed work to cancel | ||
2590 | * | ||
2591 | * This is cancel_work_sync() for delayed works. | ||
2592 | * | ||
2593 | * RETURNS: | ||
2594 | * %true if @dwork was pending, %false otherwise. | ||
2595 | */ | ||
2596 | bool cancel_delayed_work_sync(struct delayed_work *dwork) | ||
725 | { | 2597 | { |
726 | return __cancel_work_timer(&dwork->work, &dwork->timer); | 2598 | return __cancel_work_timer(&dwork->work, &dwork->timer); |
727 | } | 2599 | } |
728 | EXPORT_SYMBOL(cancel_delayed_work_sync); | 2600 | EXPORT_SYMBOL(cancel_delayed_work_sync); |
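
The matching teardown for a delayed work is a single call; a sketch continuing the hypothetical stats_dwork example above:

/*
 * module exit path: the timer is deleted, the work is cancelled and,
 * if an instance is already running, it is waited for
 */
static void stats_exit(void)
{
        cancel_delayed_work_sync(&stats_dwork);
}
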
729 | 2601 | ||
730 | static struct workqueue_struct *keventd_wq __read_mostly; | ||
731 | |||
732 | /** | 2602 | /** |
733 | * schedule_work - put work task in global workqueue | 2603 | * schedule_work - put work task in global workqueue |
734 | * @work: job to be done | 2604 | * @work: job to be done |
@@ -742,7 +2612,7 @@ static struct workqueue_struct *keventd_wq __read_mostly; | |||
742 | */ | 2612 | */ |
743 | int schedule_work(struct work_struct *work) | 2613 | int schedule_work(struct work_struct *work) |
744 | { | 2614 | { |
745 | return queue_work(keventd_wq, work); | 2615 | return queue_work(system_wq, work); |
746 | } | 2616 | } |
747 | EXPORT_SYMBOL(schedule_work); | 2617 | EXPORT_SYMBOL(schedule_work); |
748 | 2618 | ||
@@ -755,7 +2625,7 @@ EXPORT_SYMBOL(schedule_work); | |||
755 | */ | 2625 | */ |
756 | int schedule_work_on(int cpu, struct work_struct *work) | 2626 | int schedule_work_on(int cpu, struct work_struct *work) |
757 | { | 2627 | { |
758 | return queue_work_on(cpu, keventd_wq, work); | 2628 | return queue_work_on(cpu, system_wq, work); |
759 | } | 2629 | } |
760 | EXPORT_SYMBOL(schedule_work_on); | 2630 | EXPORT_SYMBOL(schedule_work_on); |
761 | 2631 | ||
@@ -770,29 +2640,11 @@ EXPORT_SYMBOL(schedule_work_on); | |||
770 | int schedule_delayed_work(struct delayed_work *dwork, | 2640 | int schedule_delayed_work(struct delayed_work *dwork, |
771 | unsigned long delay) | 2641 | unsigned long delay) |
772 | { | 2642 | { |
773 | return queue_delayed_work(keventd_wq, dwork, delay); | 2643 | return queue_delayed_work(system_wq, dwork, delay); |
774 | } | 2644 | } |
775 | EXPORT_SYMBOL(schedule_delayed_work); | 2645 | EXPORT_SYMBOL(schedule_delayed_work); |
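
With keventd_wq replaced by system_wq, the schedule_*() wrappers above are now thin aliases for queueing on the global system workqueue. An illustrative sketch of the equivalences; ping_fn(), ping_work and ping_dwork are hypothetical:

#include <linux/jiffies.h>
#include <linux/workqueue.h>

static void ping_fn(struct work_struct *work)
{
        /* ... */
}
static DECLARE_WORK(ping_work, ping_fn);
static DECLARE_DELAYED_WORK(ping_dwork, ping_fn);

static void ping_examples(void)
{
        schedule_work(&ping_work);                  /* queue_work(system_wq, ...) */
        schedule_work_on(0, &ping_work);            /* queue_work_on(0, system_wq, ...) */
        schedule_delayed_work(&ping_dwork, 2 * HZ); /* queue_delayed_work(system_wq, ...) */
}
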
776 | 2646 | ||
777 | /** | 2647 | /** |
778 | * flush_delayed_work - block until a dwork_struct's callback has terminated | ||
779 | * @dwork: the delayed work which is to be flushed | ||
780 | * | ||
781 | * Any timeout is cancelled, and any pending work is run immediately. | ||
782 | */ | ||
783 | void flush_delayed_work(struct delayed_work *dwork) | ||
784 | { | ||
785 | if (del_timer_sync(&dwork->timer)) { | ||
786 | struct cpu_workqueue_struct *cwq; | ||
787 | cwq = wq_per_cpu(get_wq_data(&dwork->work)->wq, get_cpu()); | ||
788 | __queue_work(cwq, &dwork->work); | ||
789 | put_cpu(); | ||
790 | } | ||
791 | flush_work(&dwork->work); | ||
792 | } | ||
793 | EXPORT_SYMBOL(flush_delayed_work); | ||
794 | |||
795 | /** | ||
796 | * schedule_delayed_work_on - queue work in global workqueue on CPU after delay | 2648 | * schedule_delayed_work_on - queue work in global workqueue on CPU after delay |
797 | * @cpu: cpu to use | 2649 | * @cpu: cpu to use |
798 | * @dwork: job to be done | 2650 | * @dwork: job to be done |
@@ -804,24 +2656,25 @@ EXPORT_SYMBOL(flush_delayed_work); | |||
804 | int schedule_delayed_work_on(int cpu, | 2656 | int schedule_delayed_work_on(int cpu, |
805 | struct delayed_work *dwork, unsigned long delay) | 2657 | struct delayed_work *dwork, unsigned long delay) |
806 | { | 2658 | { |
807 | return queue_delayed_work_on(cpu, keventd_wq, dwork, delay); | 2659 | return queue_delayed_work_on(cpu, system_wq, dwork, delay); |
808 | } | 2660 | } |
809 | EXPORT_SYMBOL(schedule_delayed_work_on); | 2661 | EXPORT_SYMBOL(schedule_delayed_work_on); |
810 | 2662 | ||
811 | /** | 2663 | /** |
812 | * schedule_on_each_cpu - call a function on each online CPU from keventd | 2664 | * schedule_on_each_cpu - execute a function synchronously on each online CPU |
813 | * @func: the function to call | 2665 | * @func: the function to call |
814 | * | 2666 | * |
815 | * Returns zero on success. | 2667 | * schedule_on_each_cpu() executes @func on each online CPU using the |
816 | * Returns -ve errno on failure. | 2668 | * system workqueue and blocks until all CPUs have completed. |
817 | * | ||
818 | * schedule_on_each_cpu() is very slow. | 2669 | * schedule_on_each_cpu() is very slow. |
2670 | * | ||
2671 | * RETURNS: | ||
2672 | * 0 on success, -errno on failure. | ||
819 | */ | 2673 | */ |
820 | int schedule_on_each_cpu(work_func_t func) | 2674 | int schedule_on_each_cpu(work_func_t func) |
821 | { | 2675 | { |
822 | int cpu; | 2676 | int cpu; |
823 | int orig = -1; | 2677 | struct work_struct __percpu *works; |
824 | struct work_struct *works; | ||
825 | 2678 | ||
826 | works = alloc_percpu(struct work_struct); | 2679 | works = alloc_percpu(struct work_struct); |
827 | if (!works) | 2680 | if (!works) |
@@ -829,23 +2682,12 @@ int schedule_on_each_cpu(work_func_t func) | |||
829 | 2682 | ||
830 | get_online_cpus(); | 2683 | get_online_cpus(); |
831 | 2684 | ||
832 | /* | ||
833 | * When running in keventd don't schedule a work item on | ||
834 | * itself. Can just call directly because the work queue is | ||
835 | * already bound. This also is faster. | ||
836 | */ | ||
837 | if (current_is_keventd()) | ||
838 | orig = raw_smp_processor_id(); | ||
839 | |||
840 | for_each_online_cpu(cpu) { | 2685 | for_each_online_cpu(cpu) { |
841 | struct work_struct *work = per_cpu_ptr(works, cpu); | 2686 | struct work_struct *work = per_cpu_ptr(works, cpu); |
842 | 2687 | ||
843 | INIT_WORK(work, func); | 2688 | INIT_WORK(work, func); |
844 | if (cpu != orig) | 2689 | schedule_work_on(cpu, work); |
845 | schedule_work_on(cpu, work); | ||
846 | } | 2690 | } |
847 | if (orig >= 0) | ||
848 | func(per_cpu_ptr(works, orig)); | ||
849 | 2691 | ||
850 | for_each_online_cpu(cpu) | 2692 | for_each_online_cpu(cpu) |
851 | flush_work(per_cpu_ptr(works, cpu)); | 2693 | flush_work(per_cpu_ptr(works, cpu)); |
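
The rewritten loop above no longer special-cases keventd: it queues one work item per online CPU and then flushes each one. A hypothetical caller; drain_local_cache() and drain_all_caches() are illustrative:

#include <linux/kernel.h>
#include <linux/smp.h>
#include <linux/workqueue.h>

static void drain_local_cache(struct work_struct *unused)
{
        /* invoked once on every online CPU, in workqueue context */
        pr_info("draining per-cpu cache on cpu %d\n", smp_processor_id());
}

static int drain_all_caches(void)
{
        /* slow path only: allocates a per-cpu work and blocks until all CPUs finish */
        return schedule_on_each_cpu(drain_local_cache);
}
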
@@ -881,7 +2723,7 @@ int schedule_on_each_cpu(work_func_t func) | |||
881 | */ | 2723 | */ |
882 | void flush_scheduled_work(void) | 2724 | void flush_scheduled_work(void) |
883 | { | 2725 | { |
884 | flush_workqueue(keventd_wq); | 2726 | flush_workqueue(system_wq); |
885 | } | 2727 | } |
886 | EXPORT_SYMBOL(flush_scheduled_work); | 2728 | EXPORT_SYMBOL(flush_scheduled_work); |
887 | 2729 | ||
@@ -913,170 +2755,178 @@ EXPORT_SYMBOL_GPL(execute_in_process_context); | |||
913 | 2755 | ||
914 | int keventd_up(void) | 2756 | int keventd_up(void) |
915 | { | 2757 | { |
916 | return keventd_wq != NULL; | 2758 | return system_wq != NULL; |
917 | } | 2759 | } |
918 | 2760 | ||
919 | int current_is_keventd(void) | 2761 | static int alloc_cwqs(struct workqueue_struct *wq) |
920 | { | 2762 | { |
921 | struct cpu_workqueue_struct *cwq; | 2763 | /* |
922 | int cpu = raw_smp_processor_id(); /* preempt-safe: keventd is per-cpu */ | 2764 | * cwqs are forced aligned according to WORK_STRUCT_FLAG_BITS. |
923 | int ret = 0; | 2765 | * Make sure that the alignment isn't lower than that of |
924 | 2766 | * unsigned long long. | |
925 | BUG_ON(!keventd_wq); | 2767 | */ |
2768 | const size_t size = sizeof(struct cpu_workqueue_struct); | ||
2769 | const size_t align = max_t(size_t, 1 << WORK_STRUCT_FLAG_BITS, | ||
2770 | __alignof__(unsigned long long)); | ||
2771 | #ifdef CONFIG_SMP | ||
2772 | bool percpu = !(wq->flags & WQ_UNBOUND); | ||
2773 | #else | ||
2774 | bool percpu = false; | ||
2775 | #endif | ||
926 | 2776 | ||
927 | cwq = per_cpu_ptr(keventd_wq->cpu_wq, cpu); | 2777 | if (percpu) |
928 | if (current == cwq->thread) | 2778 | wq->cpu_wq.pcpu = __alloc_percpu(size, align); |
929 | ret = 1; | 2779 | else { |
2780 | void *ptr; | ||
930 | 2781 | ||
931 | return ret; | 2782 | /* |
2783 | * Allocate enough room to align cwq and put an extra | ||
2784 | * pointer at the end pointing back to the originally | ||
2785 | * allocated pointer which will be used for free. | ||
2786 | */ | ||
2787 | ptr = kzalloc(size + align + sizeof(void *), GFP_KERNEL); | ||
2788 | if (ptr) { | ||
2789 | wq->cpu_wq.single = PTR_ALIGN(ptr, align); | ||
2790 | *(void **)(wq->cpu_wq.single + 1) = ptr; | ||
2791 | } | ||
2792 | } | ||
932 | 2793 | ||
2794 | /* just in case, make sure it's actually aligned | ||
2795 | * - this is affected by PERCPU() alignment in vmlinux.lds.S | ||
2796 | */ | ||
2797 | BUG_ON(!IS_ALIGNED(wq->cpu_wq.v, align)); | ||
2798 | return wq->cpu_wq.v ? 0 : -ENOMEM; | ||
933 | } | 2799 | } |
934 | 2800 | ||
935 | static struct cpu_workqueue_struct * | 2801 | static void free_cwqs(struct workqueue_struct *wq) |
936 | init_cpu_workqueue(struct workqueue_struct *wq, int cpu) | ||
937 | { | 2802 | { |
938 | struct cpu_workqueue_struct *cwq = per_cpu_ptr(wq->cpu_wq, cpu); | 2803 | #ifdef CONFIG_SMP |
939 | 2804 | bool percpu = !(wq->flags & WQ_UNBOUND); | |
940 | cwq->wq = wq; | 2805 | #else |
941 | spin_lock_init(&cwq->lock); | 2806 | bool percpu = false; |
942 | INIT_LIST_HEAD(&cwq->worklist); | 2807 | #endif |
943 | init_waitqueue_head(&cwq->more_work); | ||
944 | 2808 | ||
945 | return cwq; | 2809 | if (percpu) |
2810 | free_percpu(wq->cpu_wq.pcpu); | ||
2811 | else if (wq->cpu_wq.single) { | ||
2812 | /* the pointer to free is stored right after the cwq */ | ||
2813 | kfree(*(void **)(wq->cpu_wq.single + 1)); | ||
2814 | } | ||
946 | } | 2815 | } |
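
For the non-percpu case, alloc_cwqs()/free_cwqs() above over-allocate, align the object with PTR_ALIGN() and stash the original kmalloc pointer right behind the aligned object so the free path can recover it. The same pattern in isolation; alloc_aligned()/free_aligned() are illustrative names, not kernel APIs:

#include <linux/kernel.h>
#include <linux/slab.h>

/* return a zeroed object of @size bytes aligned to @align (a power of two) */
static void *alloc_aligned(size_t size, size_t align)
{
        /* room for the object, worst-case alignment slack and a back pointer */
        void *ptr = kzalloc(size + align + sizeof(void *), GFP_KERNEL);
        void *obj;

        if (!ptr)
                return NULL;
        obj = PTR_ALIGN(ptr, align);
        *(void **)(obj + size) = ptr;   /* remember what to kfree() later */
        return obj;
}

static void free_aligned(void *obj, size_t size)
{
        if (obj)
                kfree(*(void **)(obj + size));
}
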
947 | 2816 | ||
948 | static int create_workqueue_thread(struct cpu_workqueue_struct *cwq, int cpu) | 2817 | static int wq_clamp_max_active(int max_active, unsigned int flags, |
2818 | const char *name) | ||
949 | { | 2819 | { |
950 | struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 }; | 2820 | int lim = flags & WQ_UNBOUND ? WQ_UNBOUND_MAX_ACTIVE : WQ_MAX_ACTIVE; |
951 | struct workqueue_struct *wq = cwq->wq; | ||
952 | const char *fmt = is_wq_single_threaded(wq) ? "%s" : "%s/%d"; | ||
953 | struct task_struct *p; | ||
954 | 2821 | ||
955 | p = kthread_create(worker_thread, cwq, fmt, wq->name, cpu); | 2822 | if (max_active < 1 || max_active > lim) |
956 | /* | 2823 | printk(KERN_WARNING "workqueue: max_active %d requested for %s " |
957 | * Nobody can add the work_struct to this cwq, | 2824 | "is out of range, clamping between %d and %d\n", |
958 | * if (caller is __create_workqueue) | 2825 | max_active, name, 1, lim); |
959 | * nobody should see this wq | ||
960 | * else // caller is CPU_UP_PREPARE | ||
961 | * cpu is not on cpu_online_map | ||
962 | * so we can abort safely. | ||
963 | */ | ||
964 | if (IS_ERR(p)) | ||
965 | return PTR_ERR(p); | ||
966 | if (cwq->wq->rt) | ||
967 | sched_setscheduler_nocheck(p, SCHED_FIFO, ¶m); | ||
968 | cwq->thread = p; | ||
969 | 2826 | ||
970 | trace_workqueue_creation(cwq->thread, cpu); | 2827 | return clamp_val(max_active, 1, lim); |
971 | |||
972 | return 0; | ||
973 | } | 2828 | } |
974 | 2829 | ||
975 | static void start_workqueue_thread(struct cpu_workqueue_struct *cwq, int cpu) | 2830 | struct workqueue_struct *__alloc_workqueue_key(const char *name, |
2831 | unsigned int flags, | ||
2832 | int max_active, | ||
2833 | struct lock_class_key *key, | ||
2834 | const char *lock_name) | ||
976 | { | 2835 | { |
977 | struct task_struct *p = cwq->thread; | 2836 | struct workqueue_struct *wq; |
2837 | unsigned int cpu; | ||
978 | 2838 | ||
979 | if (p != NULL) { | 2839 | /* |
980 | if (cpu >= 0) | 2840 | * Workqueues which may be used during memory reclaim should |
981 | kthread_bind(p, cpu); | 2841 | * have a rescuer to guarantee forward progress. |
982 | wake_up_process(p); | 2842 | */ |
983 | } | 2843 | if (flags & WQ_MEM_RECLAIM) |
984 | } | 2844 | flags |= WQ_RESCUER; |
985 | 2845 | ||
986 | struct workqueue_struct *__create_workqueue_key(const char *name, | 2846 | /* |
987 | int singlethread, | 2847 | * Unbound workqueues aren't concurrency managed and should be |
988 | int freezeable, | 2848 | * dispatched to workers immediately. |
989 | int rt, | 2849 | */ |
990 | struct lock_class_key *key, | 2850 | if (flags & WQ_UNBOUND) |
991 | const char *lock_name) | 2851 | flags |= WQ_HIGHPRI; |
992 | { | 2852 | |
993 | struct workqueue_struct *wq; | 2853 | max_active = max_active ?: WQ_DFL_ACTIVE; |
994 | struct cpu_workqueue_struct *cwq; | 2854 | max_active = wq_clamp_max_active(max_active, flags, name); |
995 | int err = 0, cpu; | ||
996 | 2855 | ||
997 | wq = kzalloc(sizeof(*wq), GFP_KERNEL); | 2856 | wq = kzalloc(sizeof(*wq), GFP_KERNEL); |
998 | if (!wq) | 2857 | if (!wq) |
999 | return NULL; | 2858 | goto err; |
1000 | 2859 | ||
1001 | wq->cpu_wq = alloc_percpu(struct cpu_workqueue_struct); | 2860 | wq->flags = flags; |
1002 | if (!wq->cpu_wq) { | 2861 | wq->saved_max_active = max_active; |
1003 | kfree(wq); | 2862 | mutex_init(&wq->flush_mutex); |
1004 | return NULL; | 2863 | atomic_set(&wq->nr_cwqs_to_flush, 0); |
1005 | } | 2864 | INIT_LIST_HEAD(&wq->flusher_queue); |
2865 | INIT_LIST_HEAD(&wq->flusher_overflow); | ||
1006 | 2866 | ||
1007 | wq->name = name; | 2867 | wq->name = name; |
1008 | lockdep_init_map(&wq->lockdep_map, lock_name, key, 0); | 2868 | lockdep_init_map(&wq->lockdep_map, lock_name, key, 0); |
1009 | wq->singlethread = singlethread; | ||
1010 | wq->freezeable = freezeable; | ||
1011 | wq->rt = rt; | ||
1012 | INIT_LIST_HEAD(&wq->list); | 2869 | INIT_LIST_HEAD(&wq->list); |
1013 | 2870 | ||
1014 | if (singlethread) { | 2871 | if (alloc_cwqs(wq) < 0) |
1015 | cwq = init_cpu_workqueue(wq, singlethread_cpu); | 2872 | goto err; |
1016 | err = create_workqueue_thread(cwq, singlethread_cpu); | 2873 | |
1017 | start_workqueue_thread(cwq, -1); | 2874 | for_each_cwq_cpu(cpu, wq) { |
1018 | } else { | 2875 | struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq); |
1019 | cpu_maps_update_begin(); | 2876 | struct global_cwq *gcwq = get_gcwq(cpu); |
1020 | /* | 2877 | |
1021 | * We must place this wq on list even if the code below fails. | 2878 | BUG_ON((unsigned long)cwq & WORK_STRUCT_FLAG_MASK); |
1022 | * cpu_down(cpu) can remove cpu from cpu_populated_map before | 2879 | cwq->gcwq = gcwq; |
1023 | * destroy_workqueue() takes the lock, in that case we leak | 2880 | cwq->wq = wq; |
1024 | * cwq[cpu]->thread. | 2881 | cwq->flush_color = -1; |
1025 | */ | 2882 | cwq->max_active = max_active; |
1026 | spin_lock(&workqueue_lock); | 2883 | INIT_LIST_HEAD(&cwq->delayed_works); |
1027 | list_add(&wq->list, &workqueues); | ||
1028 | spin_unlock(&workqueue_lock); | ||
1029 | /* | ||
1030 | * We must initialize cwqs for each possible cpu even if we | ||
1031 | * are going to call destroy_workqueue() finally. Otherwise | ||
1032 | * cpu_up() can hit the uninitialized cwq once we drop the | ||
1033 | * lock. | ||
1034 | */ | ||
1035 | for_each_possible_cpu(cpu) { | ||
1036 | cwq = init_cpu_workqueue(wq, cpu); | ||
1037 | if (err || !cpu_online(cpu)) | ||
1038 | continue; | ||
1039 | err = create_workqueue_thread(cwq, cpu); | ||
1040 | start_workqueue_thread(cwq, cpu); | ||
1041 | } | ||
1042 | cpu_maps_update_done(); | ||
1043 | } | 2884 | } |
1044 | 2885 | ||
1045 | if (err) { | 2886 | if (flags & WQ_RESCUER) { |
1046 | destroy_workqueue(wq); | 2887 | struct worker *rescuer; |
1047 | wq = NULL; | 2888 | |
2889 | if (!alloc_mayday_mask(&wq->mayday_mask, GFP_KERNEL)) | ||
2890 | goto err; | ||
2891 | |||
2892 | wq->rescuer = rescuer = alloc_worker(); | ||
2893 | if (!rescuer) | ||
2894 | goto err; | ||
2895 | |||
2896 | rescuer->task = kthread_create(rescuer_thread, wq, "%s", name); | ||
2897 | if (IS_ERR(rescuer->task)) | ||
2898 | goto err; | ||
2899 | |||
2900 | rescuer->task->flags |= PF_THREAD_BOUND; | ||
2901 | wake_up_process(rescuer->task); | ||
1048 | } | 2902 | } |
1049 | return wq; | ||
1050 | } | ||
1051 | EXPORT_SYMBOL_GPL(__create_workqueue_key); | ||
1052 | 2903 | ||
1053 | static void cleanup_workqueue_thread(struct cpu_workqueue_struct *cwq) | ||
1054 | { | ||
1055 | /* | 2904 | /* |
1056 | * Our caller is either destroy_workqueue() or CPU_POST_DEAD, | 2905 | * workqueue_lock protects global freeze state and workqueues |
1057 | * cpu_add_remove_lock protects cwq->thread. | 2906 | * list. Grab it, set max_active accordingly and add the new |
2907 | * workqueue to workqueues list. | ||
1058 | */ | 2908 | */ |
1059 | if (cwq->thread == NULL) | 2909 | spin_lock(&workqueue_lock); |
1060 | return; | ||
1061 | 2910 | ||
1062 | lock_map_acquire(&cwq->wq->lockdep_map); | 2911 | if (workqueue_freezing && wq->flags & WQ_FREEZEABLE) |
1063 | lock_map_release(&cwq->wq->lockdep_map); | 2912 | for_each_cwq_cpu(cpu, wq) |
2913 | get_cwq(cpu, wq)->max_active = 0; | ||
1064 | 2914 | ||
1065 | flush_cpu_workqueue(cwq); | 2915 | list_add(&wq->list, &workqueues); |
1066 | /* | 2916 | |
1067 | * If the caller is CPU_POST_DEAD and cwq->worklist was not empty, | 2917 | spin_unlock(&workqueue_lock); |
1068 | * a concurrent flush_workqueue() can insert a barrier after us. | 2918 | |
1069 | * However, in that case run_workqueue() won't return and check | 2919 | return wq; |
1070 | * kthread_should_stop() until it flushes all work_struct's. | 2920 | err: |
1071 | * When ->worklist becomes empty it is safe to exit because no | 2921 | if (wq) { |
1072 | * more work_structs can be queued on this cwq: flush_workqueue | 2922 | free_cwqs(wq); |
1073 | * checks list_empty(), and a "normal" queue_work() can't use | 2923 | free_mayday_mask(wq->mayday_mask); |
1074 | * a dead CPU. | 2924 | kfree(wq->rescuer); |
1075 | */ | 2925 | kfree(wq); |
1076 | trace_workqueue_destruction(cwq->thread); | 2926 | } |
1077 | kthread_stop(cwq->thread); | 2927 | return NULL; |
1078 | cwq->thread = NULL; | ||
1079 | } | 2928 | } |
2929 | EXPORT_SYMBOL_GPL(__alloc_workqueue_key); | ||
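
Callers normally reach __alloc_workqueue_key() through the alloc_workqueue() wrapper, onto which the older create_workqueue() macros are also mapped. A hedged example of creating queues with the flags handled above; the queue names are illustrative:

#include <linux/errno.h>
#include <linux/workqueue.h>

static struct workqueue_struct *io_wq;
static struct workqueue_struct *scan_wq;

static int __init my_queues_init(void)
{
        /* must make forward progress under memory pressure, so it gets a rescuer */
        io_wq = alloc_workqueue("my_io", WQ_MEM_RECLAIM, 1);

        /* not bound to any CPU; max_active 0 means WQ_DFL_ACTIVE, then clamped */
        scan_wq = alloc_workqueue("my_scan", WQ_UNBOUND, 0);

        if (!io_wq || !scan_wq) {
                if (io_wq)
                        destroy_workqueue(io_wq);
                if (scan_wq)
                        destroy_workqueue(scan_wq);
                return -ENOMEM;
        }
        return 0;
}
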
1080 | 2930 | ||
1081 | /** | 2931 | /** |
1082 | * destroy_workqueue - safely terminate a workqueue | 2932 | * destroy_workqueue - safely terminate a workqueue |
@@ -1086,72 +2936,520 @@ static void cleanup_workqueue_thread(struct cpu_workqueue_struct *cwq) | |||
1086 | */ | 2936 | */ |
1087 | void destroy_workqueue(struct workqueue_struct *wq) | 2937 | void destroy_workqueue(struct workqueue_struct *wq) |
1088 | { | 2938 | { |
1089 | const struct cpumask *cpu_map = wq_cpu_map(wq); | 2939 | unsigned int cpu; |
1090 | int cpu; | 2940 | |
2941 | wq->flags |= WQ_DYING; | ||
2942 | flush_workqueue(wq); | ||
1091 | 2943 | ||
1092 | cpu_maps_update_begin(); | 2944 | /* |
2945 | * wq list is used to freeze wq, remove from list after | ||
2946 | * flushing is complete in case freeze races us. | ||
2947 | */ | ||
1093 | spin_lock(&workqueue_lock); | 2948 | spin_lock(&workqueue_lock); |
1094 | list_del(&wq->list); | 2949 | list_del(&wq->list); |
1095 | spin_unlock(&workqueue_lock); | 2950 | spin_unlock(&workqueue_lock); |
1096 | 2951 | ||
1097 | for_each_cpu(cpu, cpu_map) | 2952 | /* sanity check */ |
1098 | cleanup_workqueue_thread(per_cpu_ptr(wq->cpu_wq, cpu)); | 2953 | for_each_cwq_cpu(cpu, wq) { |
1099 | cpu_maps_update_done(); | 2954 | struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq); |
2955 | int i; | ||
2956 | |||
2957 | for (i = 0; i < WORK_NR_COLORS; i++) | ||
2958 | BUG_ON(cwq->nr_in_flight[i]); | ||
2959 | BUG_ON(cwq->nr_active); | ||
2960 | BUG_ON(!list_empty(&cwq->delayed_works)); | ||
2961 | } | ||
2962 | |||
2963 | if (wq->flags & WQ_RESCUER) { | ||
2964 | kthread_stop(wq->rescuer->task); | ||
2965 | free_mayday_mask(wq->mayday_mask); | ||
2966 | kfree(wq->rescuer); | ||
2967 | } | ||
1100 | 2968 | ||
1101 | free_percpu(wq->cpu_wq); | 2969 | free_cwqs(wq); |
1102 | kfree(wq); | 2970 | kfree(wq); |
1103 | } | 2971 | } |
1104 | EXPORT_SYMBOL_GPL(destroy_workqueue); | 2972 | EXPORT_SYMBOL_GPL(destroy_workqueue); |
1105 | 2973 | ||
2974 | /** | ||
2975 | * workqueue_set_max_active - adjust max_active of a workqueue | ||
2976 | * @wq: target workqueue | ||
2977 | * @max_active: new max_active value. | ||
2978 | * | ||
2979 | * Set max_active of @wq to @max_active. | ||
2980 | * | ||
2981 | * CONTEXT: | ||
2982 | * Don't call from IRQ context. | ||
2983 | */ | ||
2984 | void workqueue_set_max_active(struct workqueue_struct *wq, int max_active) | ||
2985 | { | ||
2986 | unsigned int cpu; | ||
2987 | |||
2988 | max_active = wq_clamp_max_active(max_active, wq->flags, wq->name); | ||
2989 | |||
2990 | spin_lock(&workqueue_lock); | ||
2991 | |||
2992 | wq->saved_max_active = max_active; | ||
2993 | |||
2994 | for_each_cwq_cpu(cpu, wq) { | ||
2995 | struct global_cwq *gcwq = get_gcwq(cpu); | ||
2996 | |||
2997 | spin_lock_irq(&gcwq->lock); | ||
2998 | |||
2999 | if (!(wq->flags & WQ_FREEZEABLE) || | ||
3000 | !(gcwq->flags & GCWQ_FREEZING)) | ||
3001 | get_cwq(gcwq->cpu, wq)->max_active = max_active; | ||
3002 | |||
3003 | spin_unlock_irq(&gcwq->lock); | ||
3004 | } | ||
3005 | |||
3006 | spin_unlock(&workqueue_lock); | ||
3007 | } | ||
3008 | EXPORT_SYMBOL_GPL(workqueue_set_max_active); | ||
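
A sketch of adjusting concurrency at run time, for example from a module parameter or sysfs store handler; crypt_wq and set_crypt_parallelism() are hypothetical:

#include <linux/errno.h>
#include <linux/workqueue.h>

static struct workqueue_struct *crypt_wq;       /* hypothetical WQ_UNBOUND queue */

static int set_crypt_parallelism(int nr)
{
        if (!crypt_wq)
                return -ENODEV;
        /* the value is clamped internally; must not be called from IRQ context */
        workqueue_set_max_active(crypt_wq, nr);
        return 0;
}
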
3009 | |||
3010 | /** | ||
3011 | * workqueue_congested - test whether a workqueue is congested | ||
3012 | * @cpu: CPU in question | ||
3013 | * @wq: target workqueue | ||
3014 | * | ||
3015 | * Test whether @wq's cpu workqueue for @cpu is congested. There is | ||
3016 | * no synchronization around this function and the test result is | ||
3017 | * unreliable and only useful as advisory hints or for debugging. | ||
3018 | * | ||
3019 | * RETURNS: | ||
3020 | * %true if congested, %false otherwise. | ||
3021 | */ | ||
3022 | bool workqueue_congested(unsigned int cpu, struct workqueue_struct *wq) | ||
3023 | { | ||
3024 | struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq); | ||
3025 | |||
3026 | return !list_empty(&cwq->delayed_works); | ||
3027 | } | ||
3028 | EXPORT_SYMBOL_GPL(workqueue_congested); | ||
3029 | |||
3030 | /** | ||
3031 | * work_cpu - return the last known associated cpu for @work | ||
3032 | * @work: the work of interest | ||
3033 | * | ||
3034 | * RETURNS: | ||
3035 | * CPU number if @work was ever queued. WORK_CPU_NONE otherwise. | ||
3036 | */ | ||
3037 | unsigned int work_cpu(struct work_struct *work) | ||
3038 | { | ||
3039 | struct global_cwq *gcwq = get_work_gcwq(work); | ||
3040 | |||
3041 | return gcwq ? gcwq->cpu : WORK_CPU_NONE; | ||
3042 | } | ||
3043 | EXPORT_SYMBOL_GPL(work_cpu); | ||
3044 | |||
3045 | /** | ||
3046 | * work_busy - test whether a work is currently pending or running | ||
3047 | * @work: the work to be tested | ||
3048 | * | ||
3049 | * Test whether @work is currently pending or running. There is no | ||
3050 | * synchronization around this function and the test result is | ||
3051 | * unreliable and only useful as advisory hints or for debugging. | ||
3052 | * Especially for reentrant wqs, the pending state might hide the | ||
3053 | * running state. | ||
3054 | * | ||
3055 | * RETURNS: | ||
3056 | * OR'd bitmask of WORK_BUSY_* bits. | ||
3057 | */ | ||
3058 | unsigned int work_busy(struct work_struct *work) | ||
3059 | { | ||
3060 | struct global_cwq *gcwq = get_work_gcwq(work); | ||
3061 | unsigned long flags; | ||
3062 | unsigned int ret = 0; | ||
3063 | |||
3064 | if (!gcwq) | ||
3065 | return false; | ||
3066 | |||
3067 | spin_lock_irqsave(&gcwq->lock, flags); | ||
3068 | |||
3069 | if (work_pending(work)) | ||
3070 | ret |= WORK_BUSY_PENDING; | ||
3071 | if (find_worker_executing_work(gcwq, work)) | ||
3072 | ret |= WORK_BUSY_RUNNING; | ||
3073 | |||
3074 | spin_unlock_irqrestore(&gcwq->lock, flags); | ||
3075 | |||
3076 | return ret; | ||
3077 | } | ||
3078 | EXPORT_SYMBOL_GPL(work_busy); | ||
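
Both workqueue_congested() and work_busy() are explicitly advisory. A debugging-oriented sketch of decoding the returned bitmask; report_work_state() is an illustrative name:

#include <linux/kernel.h>
#include <linux/workqueue.h>

static void report_work_state(struct work_struct *work)
{
        unsigned int busy = work_busy(work);

        /* a snapshot only: the state may change as soon as gcwq->lock is dropped */
        pr_debug("work %p:%s%s%s\n", work,
                 (busy & WORK_BUSY_PENDING) ? " pending" : "",
                 (busy & WORK_BUSY_RUNNING) ? " running" : "",
                 busy ? "" : " idle");
}
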
3079 | |||
3080 | /* | ||
3081 | * CPU hotplug. | ||
3082 | * | ||
3083 | * There are two challenges in supporting CPU hotplug. Firstly, there | ||
3084 | * are a lot of assumptions on strong associations among work, cwq and | ||
3085 | * gcwq which make migrating pending and scheduled works very | ||
3086 | * difficult to implement without impacting hot paths. Secondly, | ||
3087 | * gcwqs serve mix of short, long and very long running works making | ||
3088 | * blocked draining impractical. | ||
3089 | * | ||
3090 | * This is solved by allowing a gcwq to be detached from CPU, running | ||
3091 | * it with unbound (rogue) workers and allowing it to be reattached | ||
3092 | * later if the cpu comes back online. A separate thread is created | ||
3093 | * to govern a gcwq in such state and is called the trustee of the | ||
3094 | * gcwq. | ||
3095 | * | ||
3096 | * Trustee states and their descriptions. | ||
3097 | * | ||
3098 | * START Command state used on startup. On CPU_DOWN_PREPARE, a | ||
3099 | * new trustee is started with this state. | ||
3100 | * | ||
3101 | * IN_CHARGE Once started, trustee will enter this state after | ||
3102 | * assuming the manager role and making all existing | ||
3103 | * workers rogue. DOWN_PREPARE waits for trustee to | ||
3104 | * enter this state. After reaching IN_CHARGE, trustee | ||
3105 | * tries to execute the pending worklist until it's empty | ||
3106 | * and the state is set to BUTCHER, or the state is set | ||
3107 | * to RELEASE. | ||
3108 | * | ||
3109 | * BUTCHER Command state which is set by the cpu callback after | ||
3110 | * the cpu has gone down. Once this state is set, the trustee | ||
3111 | * knows that there will be no new works on the worklist | ||
3112 | * and once the worklist is empty it can proceed to | ||
3113 | * killing idle workers. | ||
3114 | * | ||
3115 | * RELEASE Command state which is set by the cpu callback if the | ||
3116 | * cpu down has been canceled or it has come online | ||
3117 | * again. After recognizing this state, trustee stops | ||
3118 | * trying to drain or butcher and clears ROGUE, rebinds | ||
3119 | * all remaining workers back to the cpu and releases | ||
3120 | * manager role. | ||
3121 | * | ||
3122 | * DONE Trustee will enter this state after BUTCHER or RELEASE | ||
3123 | * is complete. | ||
3124 | * | ||
3125 | * trustee CPU draining | ||
3126 | * took over down complete | ||
3127 | * START -----------> IN_CHARGE -----------> BUTCHER -----------> DONE | ||
3128 | * | | ^ | ||
3129 | * | CPU is back online v return workers | | ||
3130 | * ----------------> RELEASE -------------- | ||
3131 | */ | ||
3132 | |||
3133 | /** | ||
3134 | * trustee_wait_event_timeout - timed event wait for trustee | ||
3135 | * @cond: condition to wait for | ||
3136 | * @timeout: timeout in jiffies | ||
3137 | * | ||
3138 | * wait_event_timeout() for trustee to use. Handles locking and | ||
3139 | * checks for RELEASE request. | ||
3140 | * | ||
3141 | * CONTEXT: | ||
3142 | * spin_lock_irq(gcwq->lock) which may be released and regrabbed | ||
3143 | * multiple times. To be used by trustee. | ||
3144 | * | ||
3145 | * RETURNS: | ||
3146 | * Positive indicating left time if @cond is satisfied, 0 if timed | ||
3147 | * out, -1 if canceled. | ||
3148 | */ | ||
3149 | #define trustee_wait_event_timeout(cond, timeout) ({ \ | ||
3150 | long __ret = (timeout); \ | ||
3151 | while (!((cond) || (gcwq->trustee_state == TRUSTEE_RELEASE)) && \ | ||
3152 | __ret) { \ | ||
3153 | spin_unlock_irq(&gcwq->lock); \ | ||
3154 | __wait_event_timeout(gcwq->trustee_wait, (cond) || \ | ||
3155 | (gcwq->trustee_state == TRUSTEE_RELEASE), \ | ||
3156 | __ret); \ | ||
3157 | spin_lock_irq(&gcwq->lock); \ | ||
3158 | } \ | ||
3159 | gcwq->trustee_state == TRUSTEE_RELEASE ? -1 : (__ret); \ | ||
3160 | }) | ||
3161 | |||
3162 | /** | ||
3163 | * trustee_wait_event - event wait for trustee | ||
3164 | * @cond: condition to wait for | ||
3165 | * | ||
3166 | * wait_event() for trustee to use. Automatically handles locking and | ||
3167 | * checks for RELEASE request. | ||
3168 | * | ||
3169 | * CONTEXT: | ||
3170 | * spin_lock_irq(gcwq->lock) which may be released and regrabbed | ||
3171 | * multiple times. To be used by trustee. | ||
3172 | * | ||
3173 | * RETURNS: | ||
3174 | * 0 if @cond is satisfied, -1 if canceled. | ||
3175 | */ | ||
3176 | #define trustee_wait_event(cond) ({ \ | ||
3177 | long __ret1; \ | ||
3178 | __ret1 = trustee_wait_event_timeout(cond, MAX_SCHEDULE_TIMEOUT);\ | ||
3179 | __ret1 < 0 ? -1 : 0; \ | ||
3180 | }) | ||
3181 | |||
3182 | static int __cpuinit trustee_thread(void *__gcwq) | ||
3183 | { | ||
3184 | struct global_cwq *gcwq = __gcwq; | ||
3185 | struct worker *worker; | ||
3186 | struct work_struct *work; | ||
3187 | struct hlist_node *pos; | ||
3188 | long rc; | ||
3189 | int i; | ||
3190 | |||
3191 | BUG_ON(gcwq->cpu != smp_processor_id()); | ||
3192 | |||
3193 | spin_lock_irq(&gcwq->lock); | ||
3194 | /* | ||
3195 | * Claim the manager position and make all workers rogue. | ||
3196 | * Trustee must be bound to the target cpu and can't be | ||
3197 | * cancelled. | ||
3198 | */ | ||
3199 | BUG_ON(gcwq->cpu != smp_processor_id()); | ||
3200 | rc = trustee_wait_event(!(gcwq->flags & GCWQ_MANAGING_WORKERS)); | ||
3201 | BUG_ON(rc < 0); | ||
3202 | |||
3203 | gcwq->flags |= GCWQ_MANAGING_WORKERS; | ||
3204 | |||
3205 | list_for_each_entry(worker, &gcwq->idle_list, entry) | ||
3206 | worker->flags |= WORKER_ROGUE; | ||
3207 | |||
3208 | for_each_busy_worker(worker, i, pos, gcwq) | ||
3209 | worker->flags |= WORKER_ROGUE; | ||
3210 | |||
3211 | /* | ||
3212 | * Call schedule() so that we cross rq->lock and thus can | ||
3213 | * guarantee sched callbacks see the rogue flag. This is | ||
3214 | * necessary as scheduler callbacks may be invoked from other | ||
3215 | * cpus. | ||
3216 | */ | ||
3217 | spin_unlock_irq(&gcwq->lock); | ||
3218 | schedule(); | ||
3219 | spin_lock_irq(&gcwq->lock); | ||
3220 | |||
3221 | /* | ||
3222 | * Sched callbacks are disabled now. Zap nr_running. After | ||
3223 | * this, nr_running stays zero and need_more_worker() and | ||
3224 | * keep_working() are always true as long as the worklist is | ||
3225 | * not empty. | ||
3226 | */ | ||
3227 | atomic_set(get_gcwq_nr_running(gcwq->cpu), 0); | ||
3228 | |||
3229 | spin_unlock_irq(&gcwq->lock); | ||
3230 | del_timer_sync(&gcwq->idle_timer); | ||
3231 | spin_lock_irq(&gcwq->lock); | ||
3232 | |||
3233 | /* | ||
3234 | * We're now in charge. Notify and proceed to drain. We need | ||
3235 | * to keep the gcwq running during the whole CPU down | ||
3236 | * procedure as other cpu hotunplug callbacks may need to | ||
3237 | * flush currently running tasks. | ||
3238 | */ | ||
3239 | gcwq->trustee_state = TRUSTEE_IN_CHARGE; | ||
3240 | wake_up_all(&gcwq->trustee_wait); | ||
3241 | |||
3242 | /* | ||
3243 | * The original cpu is in the process of dying and may go away | ||
3244 | * anytime now. When that happens, we and all workers would | ||
3245 | * be migrated to other cpus. Try draining any left work. We | ||
3246 | * want to get it over with ASAP - spam rescuers, wake up as | ||
3247 | * many idlers as necessary and create new ones till the | ||
3248 | * worklist is empty. Note that if the gcwq is frozen, there | ||
3249 | * may be frozen works in freezeable cwqs. Don't declare | ||
3250 | * completion while frozen. | ||
3251 | */ | ||
3252 | while (gcwq->nr_workers != gcwq->nr_idle || | ||
3253 | gcwq->flags & GCWQ_FREEZING || | ||
3254 | gcwq->trustee_state == TRUSTEE_IN_CHARGE) { | ||
3255 | int nr_works = 0; | ||
3256 | |||
3257 | list_for_each_entry(work, &gcwq->worklist, entry) { | ||
3258 | send_mayday(work); | ||
3259 | nr_works++; | ||
3260 | } | ||
3261 | |||
3262 | list_for_each_entry(worker, &gcwq->idle_list, entry) { | ||
3263 | if (!nr_works--) | ||
3264 | break; | ||
3265 | wake_up_process(worker->task); | ||
3266 | } | ||
3267 | |||
3268 | if (need_to_create_worker(gcwq)) { | ||
3269 | spin_unlock_irq(&gcwq->lock); | ||
3270 | worker = create_worker(gcwq, false); | ||
3271 | spin_lock_irq(&gcwq->lock); | ||
3272 | if (worker) { | ||
3273 | worker->flags |= WORKER_ROGUE; | ||
3274 | start_worker(worker); | ||
3275 | } | ||
3276 | } | ||
3277 | |||
3278 | /* give a breather */ | ||
3279 | if (trustee_wait_event_timeout(false, TRUSTEE_COOLDOWN) < 0) | ||
3280 | break; | ||
3281 | } | ||
3282 | |||
3283 | /* | ||
3284 | * Either all works have been scheduled and cpu is down, or | ||
3285 | * cpu down has already been canceled. Wait for and butcher | ||
3286 | * all workers till we're canceled. | ||
3287 | */ | ||
3288 | do { | ||
3289 | rc = trustee_wait_event(!list_empty(&gcwq->idle_list)); | ||
3290 | while (!list_empty(&gcwq->idle_list)) | ||
3291 | destroy_worker(list_first_entry(&gcwq->idle_list, | ||
3292 | struct worker, entry)); | ||
3293 | } while (gcwq->nr_workers && rc >= 0); | ||
3294 | |||
3295 | /* | ||
3296 | * At this point, either draining has completed and no worker | ||
3297 | * is left, or cpu down has been canceled or the cpu is being | ||
3298 | * brought back up. There shouldn't be any idle one left. | ||
3299 | * Tell the remaining busy ones to rebind once it finishes the | ||
3300 | * currently scheduled works by scheduling the rebind_work. | ||
3301 | */ | ||
3302 | WARN_ON(!list_empty(&gcwq->idle_list)); | ||
3303 | |||
3304 | for_each_busy_worker(worker, i, pos, gcwq) { | ||
3305 | struct work_struct *rebind_work = &worker->rebind_work; | ||
3306 | |||
3307 | /* | ||
3308 | * Rebind_work may race with future cpu hotplug | ||
3309 | * operations. Use a separate flag to mark that | ||
3310 | * rebinding is scheduled. | ||
3311 | */ | ||
3312 | worker->flags |= WORKER_REBIND; | ||
3313 | worker->flags &= ~WORKER_ROGUE; | ||
3314 | |||
3315 | /* queue rebind_work, wq doesn't matter, use the default one */ | ||
3316 | if (test_and_set_bit(WORK_STRUCT_PENDING_BIT, | ||
3317 | work_data_bits(rebind_work))) | ||
3318 | continue; | ||
3319 | |||
3320 | debug_work_activate(rebind_work); | ||
3321 | insert_work(get_cwq(gcwq->cpu, system_wq), rebind_work, | ||
3322 | worker->scheduled.next, | ||
3323 | work_color_to_flags(WORK_NO_COLOR)); | ||
3324 | } | ||
3325 | |||
3326 | /* relinquish manager role */ | ||
3327 | gcwq->flags &= ~GCWQ_MANAGING_WORKERS; | ||
3328 | |||
3329 | /* notify completion */ | ||
3330 | gcwq->trustee = NULL; | ||
3331 | gcwq->trustee_state = TRUSTEE_DONE; | ||
3332 | wake_up_all(&gcwq->trustee_wait); | ||
3333 | spin_unlock_irq(&gcwq->lock); | ||
3334 | return 0; | ||
3335 | } | ||
3336 | |||
3337 | /** | ||
3338 | * wait_trustee_state - wait for trustee to enter the specified state | ||
3339 | * @gcwq: gcwq the trustee of interest belongs to | ||
3340 | * @state: target state to wait for | ||
3341 | * | ||
3342 | * Wait for the trustee to reach @state. DONE is already matched. | ||
3343 | * | ||
3344 | * CONTEXT: | ||
3345 | * spin_lock_irq(gcwq->lock) which may be released and regrabbed | ||
3346 | * multiple times. To be used by cpu_callback. | ||
3347 | */ | ||
3348 | static void __cpuinit wait_trustee_state(struct global_cwq *gcwq, int state) | ||
3349 | __releases(&gcwq->lock) | ||
3350 | __acquires(&gcwq->lock) | ||
3351 | { | ||
3352 | if (!(gcwq->trustee_state == state || | ||
3353 | gcwq->trustee_state == TRUSTEE_DONE)) { | ||
3354 | spin_unlock_irq(&gcwq->lock); | ||
3355 | __wait_event(gcwq->trustee_wait, | ||
3356 | gcwq->trustee_state == state || | ||
3357 | gcwq->trustee_state == TRUSTEE_DONE); | ||
3358 | spin_lock_irq(&gcwq->lock); | ||
3359 | } | ||
3360 | } | ||
3361 | |||
1106 | static int __devinit workqueue_cpu_callback(struct notifier_block *nfb, | 3362 | static int __devinit workqueue_cpu_callback(struct notifier_block *nfb, |
1107 | unsigned long action, | 3363 | unsigned long action, |
1108 | void *hcpu) | 3364 | void *hcpu) |
1109 | { | 3365 | { |
1110 | unsigned int cpu = (unsigned long)hcpu; | 3366 | unsigned int cpu = (unsigned long)hcpu; |
1111 | struct cpu_workqueue_struct *cwq; | 3367 | struct global_cwq *gcwq = get_gcwq(cpu); |
1112 | struct workqueue_struct *wq; | 3368 | struct task_struct *new_trustee = NULL; |
1113 | int err = 0; | 3369 | struct worker *uninitialized_var(new_worker); |
3370 | unsigned long flags; | ||
1114 | 3371 | ||
1115 | action &= ~CPU_TASKS_FROZEN; | 3372 | action &= ~CPU_TASKS_FROZEN; |
1116 | 3373 | ||
1117 | switch (action) { | 3374 | switch (action) { |
3375 | case CPU_DOWN_PREPARE: | ||
3376 | new_trustee = kthread_create(trustee_thread, gcwq, | ||
3377 | "workqueue_trustee/%d\n", cpu); | ||
3378 | if (IS_ERR(new_trustee)) | ||
3379 | return notifier_from_errno(PTR_ERR(new_trustee)); | ||
3380 | kthread_bind(new_trustee, cpu); | ||
3381 | /* fall through */ | ||
1118 | case CPU_UP_PREPARE: | 3382 | case CPU_UP_PREPARE: |
1119 | cpumask_set_cpu(cpu, cpu_populated_map); | 3383 | BUG_ON(gcwq->first_idle); |
1120 | } | 3384 | new_worker = create_worker(gcwq, false); |
1121 | undo: | 3385 | if (!new_worker) { |
1122 | list_for_each_entry(wq, &workqueues, list) { | 3386 | if (new_trustee) |
1123 | cwq = per_cpu_ptr(wq->cpu_wq, cpu); | 3387 | kthread_stop(new_trustee); |
1124 | 3388 | return NOTIFY_BAD; | |
1125 | switch (action) { | ||
1126 | case CPU_UP_PREPARE: | ||
1127 | err = create_workqueue_thread(cwq, cpu); | ||
1128 | if (!err) | ||
1129 | break; | ||
1130 | printk(KERN_ERR "workqueue [%s] for %i failed\n", | ||
1131 | wq->name, cpu); | ||
1132 | action = CPU_UP_CANCELED; | ||
1133 | err = -ENOMEM; | ||
1134 | goto undo; | ||
1135 | |||
1136 | case CPU_ONLINE: | ||
1137 | start_workqueue_thread(cwq, cpu); | ||
1138 | break; | ||
1139 | |||
1140 | case CPU_UP_CANCELED: | ||
1141 | start_workqueue_thread(cwq, -1); | ||
1142 | case CPU_POST_DEAD: | ||
1143 | cleanup_workqueue_thread(cwq); | ||
1144 | break; | ||
1145 | } | 3389 | } |
1146 | } | 3390 | } |
1147 | 3391 | ||
3392 | /* some are called w/ irq disabled, don't disturb irq status */ | ||
3393 | spin_lock_irqsave(&gcwq->lock, flags); | ||
3394 | |||
1148 | switch (action) { | 3395 | switch (action) { |
1149 | case CPU_UP_CANCELED: | 3396 | case CPU_DOWN_PREPARE: |
3397 | /* initialize trustee and tell it to acquire the gcwq */ | ||
3398 | BUG_ON(gcwq->trustee || gcwq->trustee_state != TRUSTEE_DONE); | ||
3399 | gcwq->trustee = new_trustee; | ||
3400 | gcwq->trustee_state = TRUSTEE_START; | ||
3401 | wake_up_process(gcwq->trustee); | ||
3402 | wait_trustee_state(gcwq, TRUSTEE_IN_CHARGE); | ||
3403 | /* fall through */ | ||
3404 | case CPU_UP_PREPARE: | ||
3405 | BUG_ON(gcwq->first_idle); | ||
3406 | gcwq->first_idle = new_worker; | ||
3407 | break; | ||
3408 | |||
3409 | case CPU_DYING: | ||
3410 | /* | ||
3411 | * Before this, the trustee and all workers except for | ||
3412 | * the ones which are still executing works from | ||
3413 | * before the last CPU down must be on the cpu. After | ||
3414 | * this, they'll all be diasporas. | ||
3415 | */ | ||
3416 | gcwq->flags |= GCWQ_DISASSOCIATED; | ||
3417 | break; | ||
3418 | |||
1150 | case CPU_POST_DEAD: | 3419 | case CPU_POST_DEAD: |
1151 | cpumask_clear_cpu(cpu, cpu_populated_map); | 3420 | gcwq->trustee_state = TRUSTEE_BUTCHER; |
3421 | /* fall through */ | ||
3422 | case CPU_UP_CANCELED: | ||
3423 | destroy_worker(gcwq->first_idle); | ||
3424 | gcwq->first_idle = NULL; | ||
3425 | break; | ||
3426 | |||
3427 | case CPU_DOWN_FAILED: | ||
3428 | case CPU_ONLINE: | ||
3429 | gcwq->flags &= ~GCWQ_DISASSOCIATED; | ||
3430 | if (gcwq->trustee_state != TRUSTEE_DONE) { | ||
3431 | gcwq->trustee_state = TRUSTEE_RELEASE; | ||
3432 | wake_up_process(gcwq->trustee); | ||
3433 | wait_trustee_state(gcwq, TRUSTEE_DONE); | ||
3434 | } | ||
3435 | |||
3436 | /* | ||
3437 | * Trustee is done and there might be no worker left. | ||
3438 | * Put the first_idle in and request a real manager to | ||
3439 | * take a look. | ||
3440 | */ | ||
3441 | spin_unlock_irq(&gcwq->lock); | ||
3442 | kthread_bind(gcwq->first_idle->task, cpu); | ||
3443 | spin_lock_irq(&gcwq->lock); | ||
3444 | gcwq->flags |= GCWQ_MANAGE_WORKERS; | ||
3445 | start_worker(gcwq->first_idle); | ||
3446 | gcwq->first_idle = NULL; | ||
3447 | break; | ||
1152 | } | 3448 | } |
1153 | 3449 | ||
1154 | return notifier_from_errno(err); | 3450 | spin_unlock_irqrestore(&gcwq->lock, flags); |
3451 | |||
3452 | return notifier_from_errno(0); | ||
1155 | } | 3453 | } |
1156 | 3454 | ||
1157 | #ifdef CONFIG_SMP | 3455 | #ifdef CONFIG_SMP |
@@ -1201,14 +3499,200 @@ long work_on_cpu(unsigned int cpu, long (*fn)(void *), void *arg) | |||
1201 | EXPORT_SYMBOL_GPL(work_on_cpu); | 3499 | EXPORT_SYMBOL_GPL(work_on_cpu); |
1202 | #endif /* CONFIG_SMP */ | 3500 | #endif /* CONFIG_SMP */ |
1203 | 3501 | ||
1204 | void __init init_workqueues(void) | 3502 | #ifdef CONFIG_FREEZER |
3503 | |||
3504 | /** | ||
3505 | * freeze_workqueues_begin - begin freezing workqueues | ||
3506 | * | ||
3507 | * Start freezing workqueues. After this function returns, all | ||
3508 | * freezeable workqueues will queue new works to their frozen_works | ||
3509 | * list instead of gcwq->worklist. | ||
3510 | * | ||
3511 | * CONTEXT: | ||
3512 | * Grabs and releases workqueue_lock and gcwq->lock's. | ||
3513 | */ | ||
3514 | void freeze_workqueues_begin(void) | ||
3515 | { | ||
3516 | unsigned int cpu; | ||
3517 | |||
3518 | spin_lock(&workqueue_lock); | ||
3519 | |||
3520 | BUG_ON(workqueue_freezing); | ||
3521 | workqueue_freezing = true; | ||
3522 | |||
3523 | for_each_gcwq_cpu(cpu) { | ||
3524 | struct global_cwq *gcwq = get_gcwq(cpu); | ||
3525 | struct workqueue_struct *wq; | ||
3526 | |||
3527 | spin_lock_irq(&gcwq->lock); | ||
3528 | |||
3529 | BUG_ON(gcwq->flags & GCWQ_FREEZING); | ||
3530 | gcwq->flags |= GCWQ_FREEZING; | ||
3531 | |||
3532 | list_for_each_entry(wq, &workqueues, list) { | ||
3533 | struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq); | ||
3534 | |||
3535 | if (cwq && wq->flags & WQ_FREEZEABLE) | ||
3536 | cwq->max_active = 0; | ||
3537 | } | ||
3538 | |||
3539 | spin_unlock_irq(&gcwq->lock); | ||
3540 | } | ||
3541 | |||
3542 | spin_unlock(&workqueue_lock); | ||
3543 | } | ||
3544 | |||
3545 | /** | ||
3546 | * freeze_workqueues_busy - are freezeable workqueues still busy? | ||
3547 | * | ||
3548 | * Check whether freezing is complete. This function must be called | ||
3549 | * between freeze_workqueues_begin() and thaw_workqueues(). | ||
3550 | * | ||
3551 | * CONTEXT: | ||
3552 | * Grabs and releases workqueue_lock. | ||
3553 | * | ||
3554 | * RETURNS: | ||
3555 | * %true if some freezeable workqueues are still busy. %false if | ||
3556 | * freezing is complete. | ||
3557 | */ | ||
3558 | bool freeze_workqueues_busy(void) | ||
3559 | { | ||
3560 | unsigned int cpu; | ||
3561 | bool busy = false; | ||
3562 | |||
3563 | spin_lock(&workqueue_lock); | ||
3564 | |||
3565 | BUG_ON(!workqueue_freezing); | ||
3566 | |||
3567 | for_each_gcwq_cpu(cpu) { | ||
3568 | struct workqueue_struct *wq; | ||
3569 | /* | ||
3570 | * nr_active is monotonically decreasing. It's safe | ||
3571 | * to peek without lock. | ||
3572 | */ | ||
3573 | list_for_each_entry(wq, &workqueues, list) { | ||
3574 | struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq); | ||
3575 | |||
3576 | if (!cwq || !(wq->flags & WQ_FREEZEABLE)) | ||
3577 | continue; | ||
3578 | |||
3579 | BUG_ON(cwq->nr_active < 0); | ||
3580 | if (cwq->nr_active) { | ||
3581 | busy = true; | ||
3582 | goto out_unlock; | ||
3583 | } | ||
3584 | } | ||
3585 | } | ||
3586 | out_unlock: | ||
3587 | spin_unlock(&workqueue_lock); | ||
3588 | return busy; | ||
3589 | } | ||
3590 | |||
3591 | /** | ||
3592 | * thaw_workqueues - thaw workqueues | ||
3593 | * | ||
3594 | * Thaw workqueues. Normal queueing is restored and all collected | ||
3595 | * frozen works are transferred to their respective gcwq worklists. | ||
3596 | * | ||
3597 | * CONTEXT: | ||
3598 | * Grabs and releases workqueue_lock and gcwq->lock's. | ||
3599 | */ | ||
3600 | void thaw_workqueues(void) | ||
1205 | { | 3601 | { |
1206 | alloc_cpumask_var(&cpu_populated_map, GFP_KERNEL); | 3602 | unsigned int cpu; |
1207 | 3603 | ||
1208 | cpumask_copy(cpu_populated_map, cpu_online_mask); | 3604 | spin_lock(&workqueue_lock); |
1209 | singlethread_cpu = cpumask_first(cpu_possible_mask); | 3605 | |
1210 | cpu_singlethread_map = cpumask_of(singlethread_cpu); | 3606 | if (!workqueue_freezing) |
1211 | hotcpu_notifier(workqueue_cpu_callback, 0); | 3607 | goto out_unlock; |
1212 | keventd_wq = create_workqueue("events"); | 3608 | |
1213 | BUG_ON(!keventd_wq); | 3609 | for_each_gcwq_cpu(cpu) { |
3610 | struct global_cwq *gcwq = get_gcwq(cpu); | ||
3611 | struct workqueue_struct *wq; | ||
3612 | |||
3613 | spin_lock_irq(&gcwq->lock); | ||
3614 | |||
3615 | BUG_ON(!(gcwq->flags & GCWQ_FREEZING)); | ||
3616 | gcwq->flags &= ~GCWQ_FREEZING; | ||
3617 | |||
3618 | list_for_each_entry(wq, &workqueues, list) { | ||
3619 | struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq); | ||
3620 | |||
3621 | if (!cwq || !(wq->flags & WQ_FREEZEABLE)) | ||
3622 | continue; | ||
3623 | |||
3624 | /* restore max_active and repopulate worklist */ | ||
3625 | cwq->max_active = wq->saved_max_active; | ||
3626 | |||
3627 | while (!list_empty(&cwq->delayed_works) && | ||
3628 | cwq->nr_active < cwq->max_active) | ||
3629 | cwq_activate_first_delayed(cwq); | ||
3630 | } | ||
3631 | |||
3632 | wake_up_worker(gcwq); | ||
3633 | |||
3634 | spin_unlock_irq(&gcwq->lock); | ||
3635 | } | ||
3636 | |||
3637 | workqueue_freezing = false; | ||
3638 | out_unlock: | ||
3639 | spin_unlock(&workqueue_lock); | ||
3640 | } | ||
3641 | #endif /* CONFIG_FREEZER */ | ||
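
These three entry points are meant to be driven in sequence by the PM freezer. A condensed sketch of the calling convention; the retry loop and timeout are illustrative, the real caller lives in the suspend/hibernation code:

#include <linux/delay.h>
#include <linux/errno.h>
#include <linux/workqueue.h>

static int freeze_wq_sketch(void)
{
        int retries = 100;

        freeze_workqueues_begin();              /* new works now pile up as delayed */

        while (freeze_workqueues_busy()) {      /* wait for in-flight works to drain */
                if (!--retries) {
                        thaw_workqueues();      /* give up and restore normal queueing */
                        return -EBUSY;
                }
                msleep(10);
        }
        return 0;                               /* thaw_workqueues() is called on resume */
}
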
3642 | |||
3643 | static int __init init_workqueues(void) | ||
3644 | { | ||
3645 | unsigned int cpu; | ||
3646 | int i; | ||
3647 | |||
3648 | cpu_notifier(workqueue_cpu_callback, CPU_PRI_WORKQUEUE); | ||
3649 | |||
3650 | /* initialize gcwqs */ | ||
3651 | for_each_gcwq_cpu(cpu) { | ||
3652 | struct global_cwq *gcwq = get_gcwq(cpu); | ||
3653 | |||
3654 | spin_lock_init(&gcwq->lock); | ||
3655 | INIT_LIST_HEAD(&gcwq->worklist); | ||
3656 | gcwq->cpu = cpu; | ||
3657 | gcwq->flags |= GCWQ_DISASSOCIATED; | ||
3658 | |||
3659 | INIT_LIST_HEAD(&gcwq->idle_list); | ||
3660 | for (i = 0; i < BUSY_WORKER_HASH_SIZE; i++) | ||
3661 | INIT_HLIST_HEAD(&gcwq->busy_hash[i]); | ||
3662 | |||
3663 | init_timer_deferrable(&gcwq->idle_timer); | ||
3664 | gcwq->idle_timer.function = idle_worker_timeout; | ||
3665 | gcwq->idle_timer.data = (unsigned long)gcwq; | ||
3666 | |||
3667 | setup_timer(&gcwq->mayday_timer, gcwq_mayday_timeout, | ||
3668 | (unsigned long)gcwq); | ||
3669 | |||
3670 | ida_init(&gcwq->worker_ida); | ||
3671 | |||
3672 | gcwq->trustee_state = TRUSTEE_DONE; | ||
3673 | init_waitqueue_head(&gcwq->trustee_wait); | ||
3674 | } | ||
3675 | |||
3676 | /* create the initial worker */ | ||
3677 | for_each_online_gcwq_cpu(cpu) { | ||
3678 | struct global_cwq *gcwq = get_gcwq(cpu); | ||
3679 | struct worker *worker; | ||
3680 | |||
3681 | if (cpu != WORK_CPU_UNBOUND) | ||
3682 | gcwq->flags &= ~GCWQ_DISASSOCIATED; | ||
3683 | worker = create_worker(gcwq, true); | ||
3684 | BUG_ON(!worker); | ||
3685 | spin_lock_irq(&gcwq->lock); | ||
3686 | start_worker(worker); | ||
3687 | spin_unlock_irq(&gcwq->lock); | ||
3688 | } | ||
3689 | |||
3690 | system_wq = alloc_workqueue("events", 0, 0); | ||
3691 | system_long_wq = alloc_workqueue("events_long", 0, 0); | ||
3692 | system_nrt_wq = alloc_workqueue("events_nrt", WQ_NON_REENTRANT, 0); | ||
3693 | system_unbound_wq = alloc_workqueue("events_unbound", WQ_UNBOUND, | ||
3694 | WQ_UNBOUND_MAX_ACTIVE); | ||
3695 | BUG_ON(!system_wq || !system_long_wq || !system_nrt_wq); | ||
3696 | return 0; | ||
1214 | } | 3697 | } |
3698 | early_initcall(init_workqueues); | ||
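
init_workqueues() now brings up the shared system workqueues used by the schedule_*() helpers and, increasingly, by drivers directly. A rough, illustrative guide to picking one; example_work is hypothetical:

#include <linux/workqueue.h>

static void example_fn(struct work_struct *work)
{
        /* ... */
}
static DECLARE_WORK(example_work, example_fn);

static void pick_a_system_wq(void)
{
        /* pick exactly one queue, depending on the work item's characteristics */
        queue_work(system_wq, &example_work);   /* default: short, bounded items */
        /* system_long_wq    - items that may stay in flight for a long time     */
        /* system_nrt_wq     - items that must not be reentered on another CPU   */
        /* system_unbound_wq - items that don't need CPU locality                */
}
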
diff --git a/kernel/workqueue_sched.h b/kernel/workqueue_sched.h new file mode 100644 index 000000000000..2d10fc98dc79 --- /dev/null +++ b/kernel/workqueue_sched.h | |||
@@ -0,0 +1,9 @@ | |||
1 | /* | ||
2 | * kernel/workqueue_sched.h | ||
3 | * | ||
4 | * Scheduler hooks for concurrency managed workqueue. Only to be | ||
5 | * included from sched.c and workqueue.c. | ||
6 | */ | ||
7 | void wq_worker_waking_up(struct task_struct *task, unsigned int cpu); | ||
8 | struct task_struct *wq_worker_sleeping(struct task_struct *task, | ||
9 | unsigned int cpu); | ||